aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2008-07-18 05:39:39 -0400
committerDavid S. Miller <davem@davemloft.net>2008-07-18 05:39:39 -0400
commit49997d75152b3d23c53b0fa730599f2f74c92c65 (patch)
tree46e93126170d02cfec9505172e545732c1b69656 /fs
parenta0c80b80e0fb48129e4e9d6a9ede914f9ff1850d (diff)
parent5b664cb235e97afbf34db9c4d77f08ebd725335e (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: Documentation/powerpc/booting-without-of.txt drivers/atm/Makefile drivers/net/fs_enet/fs_enet-main.c drivers/pci/pci-acpi.c net/8021q/vlan.c net/iucv/iucv.c
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c18
-rw-r--r--fs/Kconfig151
-rw-r--r--fs/Makefile2
-rw-r--r--fs/bio-integrity.c719
-rw-r--r--fs/bio.c88
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/char_dev.c7
-rw-r--r--fs/cifs/cifsacl.c10
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/inode.c20
-rw-r--r--fs/compat_ioctl.c6
-rw-r--r--fs/configfs/configfs_internal.h4
-rw-r--r--fs/configfs/dir.c147
-rw-r--r--fs/configfs/inode.c2
-rw-r--r--fs/configfs/symlink.c16
-rw-r--r--fs/dlm/config.c45
-rw-r--r--fs/dlm/user.c9
-rw-r--r--fs/ecryptfs/file.c3
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/balloc.c209
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c451
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c146
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/file.c6
-rw-r--r--fs/fat/inode.c26
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fs-writeback.c22
-rw-r--r--fs/gfs2/Kconfig18
-rw-r--r--fs/gfs2/Makefile1
-rw-r--r--fs/gfs2/gfs2.h5
-rw-r--r--fs/gfs2/glock.c1643
-rw-r--r--fs/gfs2/glock.h11
-rw-r--r--fs/gfs2/glops.c70
-rw-r--r--fs/gfs2/incore.h38
-rw-r--r--fs/gfs2/inode.c11
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/locking.c52
-rw-r--r--fs/gfs2/locking/dlm/lock.c368
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h18
-rw-r--r--fs/gfs2/locking/dlm/mount.c14
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c13
-rw-r--r--fs/gfs2/locking/dlm/thread.c331
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c238
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c14
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/ops_address.c40
-rw-r--r--fs/gfs2/ops_file.c42
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/ops_inode.c25
-rw-r--r--fs/gfs2/ops_super.c4
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/recovery.c5
-rw-r--r--fs/gfs2/rgrp.c108
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/sys.c16
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c294
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/jfs/jfs_debug.c62
-rw-r--r--fs/jfs/jfs_debug.h10
-rw-r--r--fs/jfs/jfs_dtree.h3
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_logmgr.c35
-rw-r--r--fs/jfs/jfs_metapage.c36
-rw-r--r--fs/jfs/jfs_txnmgr.c68
-rw-r--r--fs/jfs/jfs_xtree.c36
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c7
-rw-r--r--fs/libfs.c28
-rw-r--r--fs/lockd/clntproc.c8
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svclock.c7
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/mpage.c14
-rw-r--r--fs/msdos/namei.c35
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/ncpfs/file.c12
-rw-r--r--fs/nfs/callback.c34
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/dir.c90
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/file.c161
-rw-r--r--fs/nfs/inode.c79
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/iostat.h119
-rw-r--r--fs/nfs/nfs3acl.c9
-rw-r--r--fs/nfs/nfs3proc.c275
-rw-r--r--fs/nfs/nfs4proc.c265
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfsroot.c10
-rw-r--r--fs/nfs/proc.c28
-rw-r--r--fs/nfs/super.c882
-rw-r--r--fs/nfs/write.c322
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/ocfs2/aops.c13
-rw-r--r--fs/ocfs2/cluster/heartbeat.c17
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/cluster/nodemanager.c45
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlmglue.c136
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/ocfs2/ocfs2.h12
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/stack_user.c22
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/open.c37
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/proc/proc_misc.c16
-rw-r--r--fs/proc/task_mmu.c86
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-mmu.c1
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/read_write.c38
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/smbfs/file.c11
-rw-r--r--fs/splice.c17
-rw-r--r--fs/ubifs/Kconfig72
-rw-r--r--fs/ubifs/Makefile9
-rw-r--r--fs/ubifs/budget.c731
-rw-r--r--fs/ubifs/commit.c677
-rw-r--r--fs/ubifs/compress.c253
-rw-r--r--fs/ubifs/debug.c2289
-rw-r--r--fs/ubifs/debug.h403
-rw-r--r--fs/ubifs/dir.c1240
-rw-r--r--fs/ubifs/file.c1275
-rw-r--r--fs/ubifs/find.c975
-rw-r--r--fs/ubifs/gc.c773
-rw-r--r--fs/ubifs/io.c914
-rw-r--r--fs/ubifs/ioctl.c204
-rw-r--r--fs/ubifs/journal.c1387
-rw-r--r--fs/ubifs/key.h533
-rw-r--r--fs/ubifs/log.c805
-rw-r--r--fs/ubifs/lprops.c1357
-rw-r--r--fs/ubifs/lpt.c2243
-rw-r--r--fs/ubifs/lpt_commit.c1648
-rw-r--r--fs/ubifs/master.c387
-rw-r--r--fs/ubifs/misc.h342
-rw-r--r--fs/ubifs/orphan.c958
-rw-r--r--fs/ubifs/recovery.c1519
-rw-r--r--fs/ubifs/replay.c1075
-rw-r--r--fs/ubifs/sb.c629
-rw-r--r--fs/ubifs/scan.c362
-rw-r--r--fs/ubifs/shrinker.c322
-rw-r--r--fs/ubifs/super.c1951
-rw-r--r--fs/ubifs/tnc.c2956
-rw-r--r--fs/ubifs/tnc_commit.c1103
-rw-r--r--fs/ubifs/tnc_misc.c494
-rw-r--r--fs/ubifs/ubifs-media.h745
-rw-r--r--fs/ubifs/ubifs.h1649
-rw-r--r--fs/ubifs/xattr.c581
-rw-r--r--fs/vfat/namei.c35
-rw-r--r--fs/xfs/xfs_log.c15
182 files changed, 39659 insertions, 4464 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd01d90cada5..57997fa14e69 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
51int v9fs_file_open(struct inode *inode, struct file *file); 51int v9fs_file_open(struct inode *inode, struct file *file);
52void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); 52void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
53void v9fs_dentry_release(struct dentry *); 53void v9fs_dentry_release(struct dentry *);
54int v9fs_uflags2omode(int uflags); 54int v9fs_uflags2omode(int uflags, int extended);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 0d55affe37d4..52944d2249a4 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
59 59
60 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); 60 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
61 v9ses = v9fs_inode2v9ses(inode); 61 v9ses = v9fs_inode2v9ses(inode);
62 omode = v9fs_uflags2omode(file->f_flags); 62 omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
63 fid = file->private_data; 63 fid = file->private_data;
64 if (!fid) { 64 if (!fid) {
65 fid = v9fs_fid_clone(file->f_path.dentry); 65 fid = v9fs_fid_clone(file->f_path.dentry);
@@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
75 inode->i_size = 0; 75 inode->i_size = 0;
76 inode->i_blocks = 0; 76 inode->i_blocks = 0;
77 } 77 }
78 if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
79 generic_file_llseek(file, 0, SEEK_END);
78 } 80 }
79 81
80 file->private_data = fid; 82 file->private_data = fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 40fa807bd929..c95295c65045 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
132/** 132/**
133 * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits 133 * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits
134 * @uflags: flags to convert 134 * @uflags: flags to convert
135 * 135 * @extended: if .u extensions are active
136 */ 136 */
137 137
138int v9fs_uflags2omode(int uflags) 138int v9fs_uflags2omode(int uflags, int extended)
139{ 139{
140 int ret; 140 int ret;
141 141
@@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags)
155 break; 155 break;
156 } 156 }
157 157
158 if (uflags & O_EXCL)
159 ret |= P9_OEXCL;
160
161 if (uflags & O_TRUNC) 158 if (uflags & O_TRUNC)
162 ret |= P9_OTRUNC; 159 ret |= P9_OTRUNC;
163 160
164 if (uflags & O_APPEND) 161 if (extended) {
165 ret |= P9_OAPPEND; 162 if (uflags & O_EXCL)
163 ret |= P9_OEXCL;
164
165 if (uflags & O_APPEND)
166 ret |= P9_OAPPEND;
167 }
166 168
167 return ret; 169 return ret;
168} 170}
@@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
506 flags = O_RDWR; 508 flags = O_RDWR;
507 509
508 fid = v9fs_create(v9ses, dir, dentry, NULL, perm, 510 fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
509 v9fs_uflags2omode(flags)); 511 v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
510 if (IS_ERR(fid)) { 512 if (IS_ERR(fid)) {
511 err = PTR_ERR(fid); 513 err = PTR_ERR(fid);
512 fid = NULL; 514 fid = NULL;
diff --git a/fs/Kconfig b/fs/Kconfig
index cf12c403b8c7..37db79a2ff95 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -470,6 +470,14 @@ config OCFS2_FS_USERSPACE_CLUSTER
470 It is safe to say Y, as the clustering method is run-time 470 It is safe to say Y, as the clustering method is run-time
471 selectable. 471 selectable.
472 472
473config OCFS2_FS_STATS
474 bool "OCFS2 statistics"
475 depends on OCFS2_FS
476 default y
477 help
478 This option allows some fs statistics to be captured. Enabling
479 this option may increase the memory consumption.
480
473config OCFS2_DEBUG_MASKLOG 481config OCFS2_DEBUG_MASKLOG
474 bool "OCFS2 logging support" 482 bool "OCFS2 logging support"
475 depends on OCFS2_FS 483 depends on OCFS2_FS
@@ -830,7 +838,7 @@ config NTFS_FS
830 from the project web site. 838 from the project web site.
831 839
832 For more information see <file:Documentation/filesystems/ntfs.txt> 840 For more information see <file:Documentation/filesystems/ntfs.txt>
833 and <http://linux-ntfs.sourceforge.net/>. 841 and <http://www.linux-ntfs.org/>.
834 842
835 To compile this file system support as a module, choose M here: the 843 To compile this file system support as a module, choose M here: the
836 module will be called ntfs. 844 module will be called ntfs.
@@ -930,7 +938,7 @@ config PROC_KCORE
930 938
931config PROC_VMCORE 939config PROC_VMCORE
932 bool "/proc/vmcore support (EXPERIMENTAL)" 940 bool "/proc/vmcore support (EXPERIMENTAL)"
933 depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP 941 depends on PROC_FS && CRASH_DUMP
934 default y 942 default y
935 help 943 help
936 Exports the dump image of crashed kernel in ELF format. 944 Exports the dump image of crashed kernel in ELF format.
@@ -1375,6 +1383,9 @@ config JFFS2_CMODE_FAVOURLZO
1375 1383
1376endchoice 1384endchoice
1377 1385
1386# UBIFS File system configuration
1387source "fs/ubifs/Kconfig"
1388
1378config CRAMFS 1389config CRAMFS
1379 tristate "Compressed ROM file system support (cramfs)" 1390 tristate "Compressed ROM file system support (cramfs)"
1380 depends on BLOCK 1391 depends on BLOCK
@@ -1544,10 +1555,6 @@ config UFS_FS
1544 The recently released UFS2 variant (used in FreeBSD 5.x) is 1555 The recently released UFS2 variant (used in FreeBSD 5.x) is
1545 READ-ONLY supported. 1556 READ-ONLY supported.
1546 1557
1547 If you only intend to mount files from some other Unix over the
1548 network using NFS, you don't need the UFS file system support (but
1549 you need NFS file system support obviously).
1550
1551 Note that this option is generally not needed for floppies, since a 1558 Note that this option is generally not needed for floppies, since a
1552 good portable way to transport files and directories between unixes 1559 good portable way to transport files and directories between unixes
1553 (and even other operating systems) is given by the tar program ("man 1560 (and even other operating systems) is given by the tar program ("man
@@ -1587,6 +1594,7 @@ menuconfig NETWORK_FILESYSTEMS
1587 Say Y here to get to see options for network filesystems and 1594 Say Y here to get to see options for network filesystems and
1588 filesystem-related networking code, such as NFS daemon and 1595 filesystem-related networking code, such as NFS daemon and
1589 RPCSEC security modules. 1596 RPCSEC security modules.
1597
1590 This option alone does not add any kernel code. 1598 This option alone does not add any kernel code.
1591 1599
1592 If you say N, all options in this submenu will be skipped and 1600 If you say N, all options in this submenu will be skipped and
@@ -1595,76 +1603,92 @@ menuconfig NETWORK_FILESYSTEMS
1595if NETWORK_FILESYSTEMS 1603if NETWORK_FILESYSTEMS
1596 1604
1597config NFS_FS 1605config NFS_FS
1598 tristate "NFS file system support" 1606 tristate "NFS client support"
1599 depends on INET 1607 depends on INET
1600 select LOCKD 1608 select LOCKD
1601 select SUNRPC 1609 select SUNRPC
1602 select NFS_ACL_SUPPORT if NFS_V3_ACL 1610 select NFS_ACL_SUPPORT if NFS_V3_ACL
1603 help 1611 help
1604 If you are connected to some other (usually local) Unix computer 1612 Choose Y here if you want to access files residing on other
1605 (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing 1613 computers using Sun's Network File System protocol. To compile
1606 on that computer (the NFS server) using the Network File Sharing 1614 this file system support as a module, choose M here: the module
1607 protocol, say Y. "Mounting files" means that the client can access 1615 will be called nfs.
1608 the files with usual UNIX commands as if they were sitting on the
1609 client's hard disk. For this to work, the server must run the
1610 programs nfsd and mountd (but does not need to have NFS file system
1611 support enabled in its kernel). NFS is explained in the Network
1612 Administrator's Guide, available from
1613 <http://www.tldp.org/docs.html#guide>, on its man page: "man
1614 nfs", and in the NFS-HOWTO.
1615
1616 A superior but less widely used alternative to NFS is provided by
1617 the Coda file system; see "Coda file system support" below.
1618 1616
1619 If you say Y here, you should have said Y to TCP/IP networking also. 1617 To mount file systems exported by NFS servers, you also need to
1620 This option would enlarge your kernel by about 27 KB. 1618 install the user space mount.nfs command which can be found in
1619 the Linux nfs-utils package, available from http://linux-nfs.org/.
1620 Information about using the mount command is available in the
1621 mount(8) man page. More detail about the Linux NFS client
1622 implementation is available via the nfs(5) man page.
1621 1623
1622 To compile this file system support as a module, choose M here: the 1624 Below you can choose which versions of the NFS protocol are
1623 module will be called nfs. 1625 available in the kernel to mount NFS servers. Support for NFS
1626 version 2 (RFC 1094) is always available when NFS_FS is selected.
1624 1627
1625 If you are configuring a diskless machine which will mount its root 1628 To configure a system which mounts its root file system via NFS
1626 file system over NFS at boot time, say Y here and to "Kernel 1629 at boot time, say Y here, select "Kernel level IP
1627 level IP autoconfiguration" above and to "Root file system on NFS" 1630 autoconfiguration" in the NETWORK menu, and select "Root file
1628 below. You cannot compile this driver as a module in this case. 1631 system on NFS" below. You cannot compile this file system as a
1629 There are two packages designed for booting diskless machines over 1632 module in this case.
1630 the net: netboot, available from
1631 <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
1632 available from <http://ftp1.sourceforge.net/etherboot/>.
1633 1633
1634 If you don't know what all this is about, say N. 1634 If unsure, say N.
1635 1635
1636config NFS_V3 1636config NFS_V3
1637 bool "Provide NFSv3 client support" 1637 bool "NFS client support for NFS version 3"
1638 depends on NFS_FS 1638 depends on NFS_FS
1639 help 1639 help
1640 Say Y here if you want your NFS client to be able to speak version 1640 This option enables support for version 3 of the NFS protocol
1641 3 of the NFS protocol. 1641 (RFC 1813) in the kernel's NFS client.
1642 1642
1643 If unsure, say Y. 1643 If unsure, say Y.
1644 1644
1645config NFS_V3_ACL 1645config NFS_V3_ACL
1646 bool "Provide client support for the NFSv3 ACL protocol extension" 1646 bool "NFS client support for the NFSv3 ACL protocol extension"
1647 depends on NFS_V3 1647 depends on NFS_V3
1648 help 1648 help
1649 Implement the NFSv3 ACL protocol extension for manipulating POSIX 1649 Some NFS servers support an auxiliary NFSv3 ACL protocol that
1650 Access Control Lists. The server should also be compiled with 1650 Sun added to Solaris but never became an official part of the
1651 the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. 1651 NFS version 3 protocol. This protocol extension allows
1652 applications on NFS clients to manipulate POSIX Access Control
1653 Lists on files residing on NFS servers. NFS servers enforce
1654 ACLs on local files whether this protocol is available or not.
1655
1656 Choose Y here if your NFS server supports the Solaris NFSv3 ACL
1657 protocol extension and you want your NFS client to allow
1658 applications to access and modify ACLs on files on the server.
1659
1660 Most NFS servers don't support the Solaris NFSv3 ACL protocol
1661 extension. You can choose N here or specify the "noacl" mount
1662 option to prevent your NFS client from trying to use the NFSv3
1663 ACL protocol.
1652 1664
1653 If unsure, say N. 1665 If unsure, say N.
1654 1666
1655config NFS_V4 1667config NFS_V4
1656 bool "Provide NFSv4 client support (EXPERIMENTAL)" 1668 bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
1657 depends on NFS_FS && EXPERIMENTAL 1669 depends on NFS_FS && EXPERIMENTAL
1658 select RPCSEC_GSS_KRB5 1670 select RPCSEC_GSS_KRB5
1659 help 1671 help
1660 Say Y here if you want your NFS client to be able to speak the newer 1672 This option enables support for version 4 of the NFS protocol
1661 version 4 of the NFS protocol. 1673 (RFC 3530) in the kernel's NFS client.
1662 1674
1663 Note: Requires auxiliary userspace daemons which may be found on 1675 To mount NFS servers using NFSv4, you also need to install user
1664 http://www.citi.umich.edu/projects/nfsv4/ 1676 space programs which can be found in the Linux nfs-utils package,
1677 available from http://linux-nfs.org/.
1665 1678
1666 If unsure, say N. 1679 If unsure, say N.
1667 1680
1681config ROOT_NFS
1682 bool "Root file system on NFS"
1683 depends on NFS_FS=y && IP_PNP
1684 help
1685 If you want your system to mount its root file system via NFS,
1686 choose Y here. This is common practice for managing systems
1687 without local permanent storage. For details, read
1688 <file:Documentation/filesystems/nfsroot.txt>.
1689
1690 Most people say N here.
1691
1668config NFSD 1692config NFSD
1669 tristate "NFS server support" 1693 tristate "NFS server support"
1670 depends on INET 1694 depends on INET
@@ -1746,20 +1770,6 @@ config NFSD_V4
1746 1770
1747 If unsure, say N. 1771 If unsure, say N.
1748 1772
1749config ROOT_NFS
1750 bool "Root file system on NFS"
1751 depends on NFS_FS=y && IP_PNP
1752 help
1753 If you want your Linux box to mount its whole root file system (the
1754 one containing the directory /) from some other computer over the
1755 net via NFS (presumably because your box doesn't have a hard disk),
1756 say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
1757 details. It is likely that in this case, you also want to say Y to
1758 "Kernel level IP autoconfiguration" so that your box can discover
1759 its network address at boot time.
1760
1761 Most people say N here.
1762
1763config LOCKD 1773config LOCKD
1764 tristate 1774 tristate
1765 1775
@@ -1800,27 +1810,6 @@ config SUNRPC_XPRT_RDMA
1800 1810
1801 If unsure, say N. 1811 If unsure, say N.
1802 1812
1803config SUNRPC_BIND34
1804 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
1805 depends on SUNRPC && EXPERIMENTAL
1806 default n
1807 help
1808 RPC requests over IPv6 networks require support for larger
1809 addresses when performing an RPC bind. Sun added support for
1810 IPv6 addressing by creating two new versions of the rpcbind
1811 protocol (RFC 1833).
1812
1813 This option enables support in the kernel RPC client for
1814 querying rpcbind servers via versions 3 and 4 of the rpcbind
1815 protocol. The kernel automatically falls back to version 2
1816 if a remote rpcbind service does not support versions 3 or 4.
1817 By themselves, these new versions do not provide support for
1818 RPC over IPv6, but the new protocol versions are necessary to
1819 support it.
1820
1821 If unsure, say N to get traditional behavior (version 2 rpcbind
1822 requests only).
1823
1824config RPCSEC_GSS_KRB5 1813config RPCSEC_GSS_KRB5
1825 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" 1814 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
1826 depends on SUNRPC && EXPERIMENTAL 1815 depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da1..3b2178b4bb66 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
19obj-y += no-block.o 19obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
22obj-$(CONFIG_INOTIFY) += inotify.o 23obj-$(CONFIG_INOTIFY) += inotify.o
23obj-$(CONFIG_INOTIFY_USER) += inotify_user.o 24obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
24obj-$(CONFIG_EPOLL) += eventpoll.o 25obj-$(CONFIG_EPOLL) += eventpoll.o
@@ -100,6 +101,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
100obj-$(CONFIG_UFS_FS) += ufs/ 101obj-$(CONFIG_UFS_FS) += ufs/
101obj-$(CONFIG_EFS_FS) += efs/ 102obj-$(CONFIG_EFS_FS) += efs/
102obj-$(CONFIG_JFFS2_FS) += jffs2/ 103obj-$(CONFIG_JFFS2_FS) += jffs2/
104obj-$(CONFIG_UBIFS_FS) += ubifs/
103obj-$(CONFIG_AFFS_FS) += affs/ 105obj-$(CONFIG_AFFS_FS) += affs/
104obj-$(CONFIG_ROMFS_FS) += romfs/ 106obj-$(CONFIG_ROMFS_FS) += romfs/
105obj-$(CONFIG_QNX4FS_FS) += qnx4/ 107obj-$(CONFIG_QNX4FS_FS) += qnx4/
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
1/*
2 * bio-integrity.c - bio data integrity extensions
3 *
4 * Copyright (C) 2007, 2008 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version
9 * 2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
19 * USA.
20 *
21 */
22
23#include <linux/blkdev.h>
24#include <linux/mempool.h>
25#include <linux/bio.h>
26#include <linux/workqueue.h>
27
28static struct kmem_cache *bio_integrity_slab __read_mostly;
29static struct workqueue_struct *kintegrityd_wq;
30
31/**
32 * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
33 * @bio: bio to attach integrity metadata to
34 * @gfp_mask: Memory allocation mask
35 * @nr_vecs: Number of integrity metadata scatter-gather elements
36 * @bs: bio_set to allocate from
37 *
38 * Description: This function prepares a bio for attaching integrity
39 * metadata. nr_vecs specifies the maximum number of pages containing
40 * integrity metadata that can be attached.
41 */
42struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
43 gfp_t gfp_mask,
44 unsigned int nr_vecs,
45 struct bio_set *bs)
46{
47 struct bio_integrity_payload *bip;
48 struct bio_vec *iv;
49 unsigned long idx;
50
51 BUG_ON(bio == NULL);
52
53 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
54 if (unlikely(bip == NULL)) {
55 printk(KERN_ERR "%s: could not alloc bip\n", __func__);
56 return NULL;
57 }
58
59 memset(bip, 0, sizeof(*bip));
60
61 iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
62 if (unlikely(iv == NULL)) {
63 printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
64 mempool_free(bip, bs->bio_integrity_pool);
65 return NULL;
66 }
67
68 bip->bip_pool = idx;
69 bip->bip_vec = iv;
70 bip->bip_bio = bio;
71 bio->bi_integrity = bip;
72
73 return bip;
74}
75EXPORT_SYMBOL(bio_integrity_alloc_bioset);
76
77/**
78 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
79 * @bio: bio to attach integrity metadata to
80 * @gfp_mask: Memory allocation mask
81 * @nr_vecs: Number of integrity metadata scatter-gather elements
82 *
83 * Description: This function prepares a bio for attaching integrity
84 * metadata. nr_vecs specifies the maximum number of pages containing
85 * integrity metadata that can be attached.
86 */
87struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
88 gfp_t gfp_mask,
89 unsigned int nr_vecs)
90{
91 return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
92}
93EXPORT_SYMBOL(bio_integrity_alloc);
94
95/**
96 * bio_integrity_free - Free bio integrity payload
97 * @bio: bio containing bip to be freed
98 * @bs: bio_set this bio was allocated from
99 *
100 * Description: Used to free the integrity portion of a bio. Usually
101 * called from bio_free().
102 */
103void bio_integrity_free(struct bio *bio, struct bio_set *bs)
104{
105 struct bio_integrity_payload *bip = bio->bi_integrity;
106
107 BUG_ON(bip == NULL);
108
109 /* A cloned bio doesn't own the integrity metadata */
110 if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
111 kfree(bip->bip_buf);
112
113 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
114 mempool_free(bip, bs->bio_integrity_pool);
115
116 bio->bi_integrity = NULL;
117}
118EXPORT_SYMBOL(bio_integrity_free);
119
120/**
121 * bio_integrity_add_page - Attach integrity metadata
122 * @bio: bio to update
123 * @page: page containing integrity metadata
124 * @len: number of bytes of integrity metadata in page
125 * @offset: start offset within page
126 *
127 * Description: Attach a page containing integrity metadata to bio.
128 */
129int bio_integrity_add_page(struct bio *bio, struct page *page,
130 unsigned int len, unsigned int offset)
131{
132 struct bio_integrity_payload *bip = bio->bi_integrity;
133 struct bio_vec *iv;
134
135 if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
136 printk(KERN_ERR "%s: bip_vec full\n", __func__);
137 return 0;
138 }
139
140 iv = bip_vec_idx(bip, bip->bip_vcnt);
141 BUG_ON(iv == NULL);
142 BUG_ON(iv->bv_page != NULL);
143
144 iv->bv_page = page;
145 iv->bv_len = len;
146 iv->bv_offset = offset;
147 bip->bip_vcnt++;
148
149 return len;
150}
151EXPORT_SYMBOL(bio_integrity_add_page);
152
153/**
154 * bio_integrity_enabled - Check whether integrity can be passed
155 * @bio: bio to check
156 *
157 * Description: Determines whether bio_integrity_prep() can be called
158 * on this bio or not. bio data direction and target device must be
159 * set prior to calling. The function honors the write_generate and
160 * read_verify flags in sysfs.
161 */
162int bio_integrity_enabled(struct bio *bio)
163{
164 /* Already protected? */
165 if (bio_integrity(bio))
166 return 0;
167
168 return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
169}
170EXPORT_SYMBOL(bio_integrity_enabled);
171
172/**
173 * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
174 * @bi: blk_integrity profile for device
175 * @sectors: Number of 512 sectors to convert
176 *
177 * Description: The block layer calculates everything in 512 byte
178 * sectors but integrity metadata is done in terms of the hardware
179 * sector size of the storage device. Convert the block layer sectors
180 * to physical sectors.
181 */
182static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
183 unsigned int sectors)
184{
185 /* At this point there are only 512b or 4096b DIF/EPP devices */
186 if (bi->sector_size == 4096)
187 return sectors >>= 3;
188
189 return sectors;
190}
191
192/**
193 * bio_integrity_tag_size - Retrieve integrity tag space
194 * @bio: bio to inspect
195 *
196 * Description: Returns the maximum number of tag bytes that can be
197 * attached to this bio. Filesystems can use this to determine how
198 * much metadata to attach to an I/O.
199 */
200unsigned int bio_integrity_tag_size(struct bio *bio)
201{
202 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
203
204 BUG_ON(bio->bi_size == 0);
205
206 return bi->tag_size * (bio->bi_size / bi->sector_size);
207}
208EXPORT_SYMBOL(bio_integrity_tag_size);
209
210int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
211{
212 struct bio_integrity_payload *bip = bio->bi_integrity;
213 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
214 unsigned int nr_sectors;
215
216 BUG_ON(bip->bip_buf == NULL);
217
218 if (bi->tag_size == 0)
219 return -1;
220
221 nr_sectors = bio_integrity_hw_sectors(bi,
222 DIV_ROUND_UP(len, bi->tag_size));
223
224 if (nr_sectors * bi->tuple_size > bip->bip_size) {
225 printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
226 __func__, nr_sectors * bi->tuple_size, bip->bip_size);
227 return -1;
228 }
229
230 if (set)
231 bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
232 else
233 bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
234
235 return 0;
236}
237
238/**
239 * bio_integrity_set_tag - Attach a tag buffer to a bio
240 * @bio: bio to attach buffer to
241 * @tag_buf: Pointer to a buffer containing tag data
242 * @len: Length of the included buffer
243 *
244 * Description: Use this function to tag a bio by leveraging the extra
245 * space provided by devices formatted with integrity protection. The
246 * size of the integrity buffer must be <= the size reported by
247 * bio_integrity_tag_size().
248 */
249int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
250{
251 BUG_ON(bio_data_dir(bio) != WRITE);
252
253 return bio_integrity_tag(bio, tag_buf, len, 1);
254}
255EXPORT_SYMBOL(bio_integrity_set_tag);
256
257/**
258 * bio_integrity_get_tag - Retrieve a tag buffer from a bio
259 * @bio: bio to retrieve buffer from
260 * @tag_buf: Pointer to a buffer for the tag data
261 * @len: Length of the target buffer
262 *
263 * Description: Use this function to retrieve the tag buffer from a
264 * completed I/O. The size of the integrity buffer must be <= the
265 * size reported by bio_integrity_tag_size().
266 */
267int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
268{
269 BUG_ON(bio_data_dir(bio) != READ);
270
271 return bio_integrity_tag(bio, tag_buf, len, 0);
272}
273EXPORT_SYMBOL(bio_integrity_get_tag);
274
275/**
276 * bio_integrity_generate - Generate integrity metadata for a bio
277 * @bio: bio to generate integrity metadata for
278 *
279 * Description: Generates integrity metadata for a bio by calling the
280 * block device's generation callback function. The bio must have a
281 * bip attached with enough room to accommodate the generated
282 * integrity metadata.
283 */
284static void bio_integrity_generate(struct bio *bio)
285{
286 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
287 struct blk_integrity_exchg bix;
288 struct bio_vec *bv;
289 sector_t sector = bio->bi_sector;
290 unsigned int i, sectors, total;
291 void *prot_buf = bio->bi_integrity->bip_buf;
292
293 total = 0;
294 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
295 bix.sector_size = bi->sector_size;
296
297 bio_for_each_segment(bv, bio, i) {
298 void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
299 bix.data_buf = kaddr + bv->bv_offset;
300 bix.data_size = bv->bv_len;
301 bix.prot_buf = prot_buf;
302 bix.sector = sector;
303
304 bi->generate_fn(&bix);
305
306 sectors = bv->bv_len / bi->sector_size;
307 sector += sectors;
308 prot_buf += sectors * bi->tuple_size;
309 total += sectors * bi->tuple_size;
310 BUG_ON(total > bio->bi_integrity->bip_size);
311
312 kunmap_atomic(kaddr, KM_USER0);
313 }
314}
315
316/**
317 * bio_integrity_prep - Prepare bio for integrity I/O
318 * @bio: bio to prepare
319 *
320 * Description: Allocates a buffer for integrity metadata, maps the
321 * pages and attaches them to a bio. The bio must have data
322 * direction, target device and start sector set priot to calling. In
323 * the WRITE case, integrity metadata will be generated using the
324 * block device's integrity function. In the READ case, the buffer
325 * will be prepared for DMA and a suitable end_io handler set up.
326 */
327int bio_integrity_prep(struct bio *bio)
328{
329 struct bio_integrity_payload *bip;
330 struct blk_integrity *bi;
331 struct request_queue *q;
332 void *buf;
333 unsigned long start, end;
334 unsigned int len, nr_pages;
335 unsigned int bytes, offset, i;
336 unsigned int sectors;
337
338 bi = bdev_get_integrity(bio->bi_bdev);
339 q = bdev_get_queue(bio->bi_bdev);
340 BUG_ON(bi == NULL);
341 BUG_ON(bio_integrity(bio));
342
343 sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
344
345 /* Allocate kernel buffer for protection data */
346 len = sectors * blk_integrity_tuple_size(bi);
347 buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
348 if (unlikely(buf == NULL)) {
349 printk(KERN_ERR "could not allocate integrity buffer\n");
350 return -EIO;
351 }
352
353 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
354 start = ((unsigned long) buf) >> PAGE_SHIFT;
355 nr_pages = end - start;
356
357 /* Allocate bio integrity payload and integrity vectors */
358 bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
359 if (unlikely(bip == NULL)) {
360 printk(KERN_ERR "could not allocate data integrity bioset\n");
361 kfree(buf);
362 return -EIO;
363 }
364
365 bip->bip_buf = buf;
366 bip->bip_size = len;
367 bip->bip_sector = bio->bi_sector;
368
369 /* Map it */
370 offset = offset_in_page(buf);
371 for (i = 0 ; i < nr_pages ; i++) {
372 int ret;
373 bytes = PAGE_SIZE - offset;
374
375 if (len <= 0)
376 break;
377
378 if (bytes > len)
379 bytes = len;
380
381 ret = bio_integrity_add_page(bio, virt_to_page(buf),
382 bytes, offset);
383
384 if (ret == 0)
385 return 0;
386
387 if (ret < bytes)
388 break;
389
390 buf += bytes;
391 len -= bytes;
392 offset = 0;
393 }
394
395 /* Install custom I/O completion handler if read verify is enabled */
396 if (bio_data_dir(bio) == READ) {
397 bip->bip_end_io = bio->bi_end_io;
398 bio->bi_end_io = bio_integrity_endio;
399 }
400
401 /* Auto-generate integrity metadata if this is a write */
402 if (bio_data_dir(bio) == WRITE)
403 bio_integrity_generate(bio);
404
405 return 0;
406}
407EXPORT_SYMBOL(bio_integrity_prep);
408
409/**
410 * bio_integrity_verify - Verify integrity metadata for a bio
411 * @bio: bio to verify
412 *
413 * Description: This function is called to verify the integrity of a
414 * bio. The data in the bio io_vec is compared to the integrity
415 * metadata returned by the HBA.
416 */
417static int bio_integrity_verify(struct bio *bio)
418{
419 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
420 struct blk_integrity_exchg bix;
421 struct bio_vec *bv;
422 sector_t sector = bio->bi_integrity->bip_sector;
423 unsigned int i, sectors, total, ret;
424 void *prot_buf = bio->bi_integrity->bip_buf;
425
426 ret = total = 0;
427 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
428 bix.sector_size = bi->sector_size;
429
430 bio_for_each_segment(bv, bio, i) {
431 void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
432 bix.data_buf = kaddr + bv->bv_offset;
433 bix.data_size = bv->bv_len;
434 bix.prot_buf = prot_buf;
435 bix.sector = sector;
436
437 ret = bi->verify_fn(&bix);
438
439 if (ret) {
440 kunmap_atomic(kaddr, KM_USER0);
441 break;
442 }
443
444 sectors = bv->bv_len / bi->sector_size;
445 sector += sectors;
446 prot_buf += sectors * bi->tuple_size;
447 total += sectors * bi->tuple_size;
448 BUG_ON(total > bio->bi_integrity->bip_size);
449
450 kunmap_atomic(kaddr, KM_USER0);
451 }
452
453 return ret;
454}
455
456/**
457 * bio_integrity_verify_fn - Integrity I/O completion worker
458 * @work: Work struct stored in bio to be verified
459 *
460 * Description: This workqueue function is called to complete a READ
461 * request. The function verifies the transferred integrity metadata
462 * and then calls the original bio end_io function.
463 */
464static void bio_integrity_verify_fn(struct work_struct *work)
465{
466 struct bio_integrity_payload *bip =
467 container_of(work, struct bio_integrity_payload, bip_work);
468 struct bio *bio = bip->bip_bio;
469 int error = bip->bip_error;
470
471 if (bio_integrity_verify(bio)) {
472 clear_bit(BIO_UPTODATE, &bio->bi_flags);
473 error = -EIO;
474 }
475
476 /* Restore original bio completion handler */
477 bio->bi_end_io = bip->bip_end_io;
478
479 if (bio->bi_end_io)
480 bio->bi_end_io(bio, error);
481}
482
483/**
484 * bio_integrity_endio - Integrity I/O completion function
485 * @bio: Protected bio
486 * @error: Pointer to errno
487 *
488 * Description: Completion for integrity I/O
489 *
490 * Normally I/O completion is done in interrupt context. However,
491 * verifying I/O integrity is a time-consuming task which must be run
492 * in process context. This function postpones completion
493 * accordingly.
494 */
495void bio_integrity_endio(struct bio *bio, int error)
496{
497 struct bio_integrity_payload *bip = bio->bi_integrity;
498
499 BUG_ON(bip->bip_bio != bio);
500
501 bip->bip_error = error;
502 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
503 queue_work(kintegrityd_wq, &bip->bip_work);
504}
505EXPORT_SYMBOL(bio_integrity_endio);
506
507/**
508 * bio_integrity_mark_head - Advance bip_vec skip bytes
509 * @bip: Integrity vector to advance
510 * @skip: Number of bytes to advance it
511 */
512void bio_integrity_mark_head(struct bio_integrity_payload *bip,
513 unsigned int skip)
514{
515 struct bio_vec *iv;
516 unsigned int i;
517
518 bip_for_each_vec(iv, bip, i) {
519 if (skip == 0) {
520 bip->bip_idx = i;
521 return;
522 } else if (skip >= iv->bv_len) {
523 skip -= iv->bv_len;
524 } else { /* skip < iv->bv_len) */
525 iv->bv_offset += skip;
526 iv->bv_len -= skip;
527 bip->bip_idx = i;
528 return;
529 }
530 }
531}
532
533/**
534 * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
535 * @bip: Integrity vector to truncate
536 * @len: New length of integrity vector
537 */
538void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
539 unsigned int len)
540{
541 struct bio_vec *iv;
542 unsigned int i;
543
544 bip_for_each_vec(iv, bip, i) {
545 if (len == 0) {
546 bip->bip_vcnt = i;
547 return;
548 } else if (len >= iv->bv_len) {
549 len -= iv->bv_len;
550 } else { /* len < iv->bv_len) */
551 iv->bv_len = len;
552 len = 0;
553 }
554 }
555}
556
557/**
558 * bio_integrity_advance - Advance integrity vector
559 * @bio: bio whose integrity vector to update
560 * @bytes_done: number of data bytes that have been completed
561 *
562 * Description: This function calculates how many integrity bytes the
563 * number of completed data bytes correspond to and advances the
564 * integrity vector accordingly.
565 */
566void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
567{
568 struct bio_integrity_payload *bip = bio->bi_integrity;
569 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
570 unsigned int nr_sectors;
571
572 BUG_ON(bip == NULL);
573 BUG_ON(bi == NULL);
574
575 nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
576 bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
577}
578EXPORT_SYMBOL(bio_integrity_advance);
579
580/**
581 * bio_integrity_trim - Trim integrity vector
582 * @bio: bio whose integrity vector to update
583 * @offset: offset to first data sector
584 * @sectors: number of data sectors
585 *
586 * Description: Used to trim the integrity vector in a cloned bio.
587 * The ivec will be advanced corresponding to 'offset' data sectors
588 * and the length will be truncated corresponding to 'len' data
589 * sectors.
590 */
591void bio_integrity_trim(struct bio *bio, unsigned int offset,
592 unsigned int sectors)
593{
594 struct bio_integrity_payload *bip = bio->bi_integrity;
595 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
596 unsigned int nr_sectors;
597
598 BUG_ON(bip == NULL);
599 BUG_ON(bi == NULL);
600 BUG_ON(!bio_flagged(bio, BIO_CLONED));
601
602 nr_sectors = bio_integrity_hw_sectors(bi, sectors);
603 bip->bip_sector = bip->bip_sector + offset;
604 bio_integrity_mark_head(bip, offset * bi->tuple_size);
605 bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
606}
607EXPORT_SYMBOL(bio_integrity_trim);
608
609/**
610 * bio_integrity_split - Split integrity metadata
611 * @bio: Protected bio
612 * @bp: Resulting bio_pair
613 * @sectors: Offset
614 *
615 * Description: Splits an integrity page into a bio_pair.
616 */
617void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
618{
619 struct blk_integrity *bi;
620 struct bio_integrity_payload *bip = bio->bi_integrity;
621 unsigned int nr_sectors;
622
623 if (bio_integrity(bio) == 0)
624 return;
625
626 bi = bdev_get_integrity(bio->bi_bdev);
627 BUG_ON(bi == NULL);
628 BUG_ON(bip->bip_vcnt != 1);
629
630 nr_sectors = bio_integrity_hw_sectors(bi, sectors);
631
632 bp->bio1.bi_integrity = &bp->bip1;
633 bp->bio2.bi_integrity = &bp->bip2;
634
635 bp->iv1 = bip->bip_vec[0];
636 bp->iv2 = bip->bip_vec[0];
637
638 bp->bip1.bip_vec = &bp->iv1;
639 bp->bip2.bip_vec = &bp->iv2;
640
641 bp->iv1.bv_len = sectors * bi->tuple_size;
642 bp->iv2.bv_offset += sectors * bi->tuple_size;
643 bp->iv2.bv_len -= sectors * bi->tuple_size;
644
645 bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
646 bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
647
648 bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
649 bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
650}
651EXPORT_SYMBOL(bio_integrity_split);
652
653/**
654 * bio_integrity_clone - Callback for cloning bios with integrity metadata
655 * @bio: New bio
656 * @bio_src: Original bio
657 * @bs: bio_set to allocate bip from
658 *
659 * Description: Called to allocate a bip when cloning a bio
660 */
661int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
662 struct bio_set *bs)
663{
664 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
665 struct bio_integrity_payload *bip;
666
667 BUG_ON(bip_src == NULL);
668
669 bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
670
671 if (bip == NULL)
672 return -EIO;
673
674 memcpy(bip->bip_vec, bip_src->bip_vec,
675 bip_src->bip_vcnt * sizeof(struct bio_vec));
676
677 bip->bip_sector = bip_src->bip_sector;
678 bip->bip_vcnt = bip_src->bip_vcnt;
679 bip->bip_idx = bip_src->bip_idx;
680
681 return 0;
682}
683EXPORT_SYMBOL(bio_integrity_clone);
684
685int bioset_integrity_create(struct bio_set *bs, int pool_size)
686{
687 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
688 bio_integrity_slab);
689 if (!bs->bio_integrity_pool)
690 return -1;
691
692 return 0;
693}
694EXPORT_SYMBOL(bioset_integrity_create);
695
696void bioset_integrity_free(struct bio_set *bs)
697{
698 if (bs->bio_integrity_pool)
699 mempool_destroy(bs->bio_integrity_pool);
700}
701EXPORT_SYMBOL(bioset_integrity_free);
702
703void __init bio_integrity_init_slab(void)
704{
705 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
706 SLAB_HWCACHE_ALIGN|SLAB_PANIC);
707}
708EXPORT_SYMBOL(bio_integrity_init_slab);
709
710static int __init integrity_init(void)
711{
712 kintegrityd_wq = create_workqueue("kintegrityd");
713
714 if (!kintegrityd_wq)
715 panic("Failed to create kintegrityd\n");
716
717 return 0;
718}
719subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <scsi/sg.h> /* for struct sg_iovec */ 29#include <scsi/sg.h> /* for struct sg_iovec */
30 30
31#define BIO_POOL_SIZE 2
32
33static struct kmem_cache *bio_slab __read_mostly; 31static struct kmem_cache *bio_slab __read_mostly;
34 32
35#define BIOVEC_NR_POOLS 6
36
37/*
38 * a small number of entries is fine, not going to be performance critical.
39 * basically we just need to survive
40 */
41#define BIO_SPLIT_ENTRIES 2
42mempool_t *bio_split_pool __read_mostly; 33mempool_t *bio_split_pool __read_mostly;
43 34
44struct biovec_slab {
45 int nr_vecs;
46 char *name;
47 struct kmem_cache *slab;
48};
49
50/* 35/*
51 * if you change this list, also change bvec_alloc or things will 36 * if you change this list, also change bvec_alloc or things will
52 * break badly! cannot be bigger than what you can fit into an 37 * break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
60#undef BV 45#undef BV
61 46
62/* 47/*
63 * bio_set is used to allow other portions of the IO system to
64 * allocate their own private memory pools for bio and iovec structures.
65 * These memory pools in turn all allocate from the bio_slab
66 * and the bvec_slabs[].
67 */
68struct bio_set {
69 mempool_t *bio_pool;
70 mempool_t *bvec_pools[BIOVEC_NR_POOLS];
71};
72
73/*
74 * fs_bio_set is the bio_set containing bio and iovec memory pools used by 48 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
75 * IO code that does not need private memory pools. 49 * IO code that does not need private memory pools.
76 */ 50 */
77static struct bio_set *fs_bio_set; 51struct bio_set *fs_bio_set;
52
53unsigned int bvec_nr_vecs(unsigned short idx)
54{
55 return bvec_slabs[idx].nr_vecs;
56}
78 57
79static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 58struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
80{ 59{
81 struct bio_vec *bvl; 60 struct bio_vec *bvl;
82 61
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
117 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); 96 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
118 } 97 }
119 98
99 if (bio_integrity(bio))
100 bio_integrity_free(bio, bio_set);
101
120 mempool_free(bio, bio_set->bio_pool); 102 mempool_free(bio, bio_set->bio_pool);
121} 103}
122 104
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
275{ 257{
276 struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); 258 struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
277 259
278 if (b) { 260 if (!b)
279 b->bi_destructor = bio_fs_destructor; 261 return NULL;
280 __bio_clone(b, bio); 262
263 b->bi_destructor = bio_fs_destructor;
264 __bio_clone(b, bio);
265
266 if (bio_integrity(bio)) {
267 int ret;
268
269 ret = bio_integrity_clone(b, bio, fs_bio_set);
270
271 if (ret < 0)
272 return NULL;
281 } 273 }
282 274
283 return b; 275 return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
333 if (page == prev->bv_page && 325 if (page == prev->bv_page &&
334 offset == prev->bv_offset + prev->bv_len) { 326 offset == prev->bv_offset + prev->bv_len) {
335 prev->bv_len += len; 327 prev->bv_len += len;
336 if (q->merge_bvec_fn && 328
337 q->merge_bvec_fn(q, bio, prev) < len) { 329 if (q->merge_bvec_fn) {
338 prev->bv_len -= len; 330 struct bvec_merge_data bvm = {
339 return 0; 331 .bi_bdev = bio->bi_bdev,
332 .bi_sector = bio->bi_sector,
333 .bi_size = bio->bi_size,
334 .bi_rw = bio->bi_rw,
335 };
336
337 if (q->merge_bvec_fn(q, &bvm, prev) < len) {
338 prev->bv_len -= len;
339 return 0;
340 }
340 } 341 }
341 342
342 goto done; 343 goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
377 * queue to get further control 378 * queue to get further control
378 */ 379 */
379 if (q->merge_bvec_fn) { 380 if (q->merge_bvec_fn) {
381 struct bvec_merge_data bvm = {
382 .bi_bdev = bio->bi_bdev,
383 .bi_sector = bio->bi_sector,
384 .bi_size = bio->bi_size,
385 .bi_rw = bio->bi_rw,
386 };
387
380 /* 388 /*
381 * merge_bvec_fn() returns number of bytes it can accept 389 * merge_bvec_fn() returns number of bytes it can accept
382 * at this offset 390 * at this offset
383 */ 391 */
384 if (q->merge_bvec_fn(q, bio, bvec) < len) { 392 if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
385 bvec->bv_page = NULL; 393 bvec->bv_page = NULL;
386 bvec->bv_len = 0; 394 bvec->bv_len = 0;
387 bvec->bv_offset = 0; 395 bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1249 bp->bio1.bi_private = bi; 1257 bp->bio1.bi_private = bi;
1250 bp->bio2.bi_private = pool; 1258 bp->bio2.bi_private = pool;
1251 1259
1260 if (bio_integrity(bi))
1261 bio_integrity_split(bi, bp, first_sectors);
1262
1252 return bp; 1263 return bp;
1253} 1264}
1254 1265
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
1290 if (bs->bio_pool) 1301 if (bs->bio_pool)
1291 mempool_destroy(bs->bio_pool); 1302 mempool_destroy(bs->bio_pool);
1292 1303
1304 bioset_integrity_free(bs);
1293 biovec_free_pools(bs); 1305 biovec_free_pools(bs);
1294 1306
1295 kfree(bs); 1307 kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
1306 if (!bs->bio_pool) 1318 if (!bs->bio_pool)
1307 goto bad; 1319 goto bad;
1308 1320
1321 if (bioset_integrity_create(bs, bio_pool_size))
1322 goto bad;
1323
1309 if (!biovec_create_pools(bs, bvec_pool_size)) 1324 if (!biovec_create_pools(bs, bvec_pool_size))
1310 return bs; 1325 return bs;
1311 1326
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
1332{ 1347{
1333 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 1348 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
1334 1349
1350 bio_integrity_init_slab();
1335 biovec_init_slabs(); 1351 biovec_init_slabs();
1336 1352
1337 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); 1353 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..d48caee12e2a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg)
1464 1464
1465void invalidate_bh_lrus(void) 1465void invalidate_bh_lrus(void)
1466{ 1466{
1467 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1467 on_each_cpu(invalidate_bh_lru, NULL, 1);
1468} 1468}
1469EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1469EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1470 1470
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1691 */ 1691 */
1692 clear_buffer_dirty(bh); 1692 clear_buffer_dirty(bh);
1693 set_buffer_uptodate(bh); 1693 set_buffer_uptodate(bh);
1694 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { 1694 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1695 buffer_dirty(bh)) {
1695 WARN_ON(bh->b_size != blocksize); 1696 WARN_ON(bh->b_size != blocksize);
1696 err = get_block(inode, block, bh, 1); 1697 err = get_block(inode, block, bh, 1);
1697 if (err) 1698 if (err)
1698 goto recover; 1699 goto recover;
1700 clear_buffer_delay(bh);
1699 if (buffer_new(bh)) { 1701 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */ 1702 /* blockdev mappings never come here */
1701 clear_buffer_new(bh); 1703 clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
1774 bh = head; 1776 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */ 1777 /* Recovery: lock and submit the mapped buffers */
1776 do { 1778 do {
1777 if (buffer_mapped(bh) && buffer_dirty(bh)) { 1779 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1780 !buffer_delay(bh)) {
1778 lock_buffer(bh); 1781 lock_buffer(bh);
1779 mark_buffer_async_write(bh); 1782 mark_buffer_async_write(bh);
1780 } else { 1783 } else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
2061 struct page *page, void *fsdata) 2064 struct page *page, void *fsdata)
2062{ 2065{
2063 struct inode *inode = mapping->host; 2066 struct inode *inode = mapping->host;
2067 int i_size_changed = 0;
2064 2068
2065 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2069 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2066 2070
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
2073 */ 2077 */
2074 if (pos+copied > inode->i_size) { 2078 if (pos+copied > inode->i_size) {
2075 i_size_write(inode, pos+copied); 2079 i_size_write(inode, pos+copied);
2076 mark_inode_dirty(inode); 2080 i_size_changed = 1;
2077 } 2081 }
2078 2082
2079 unlock_page(page); 2083 unlock_page(page);
2080 page_cache_release(page); 2084 page_cache_release(page);
2081 2085
2086 /*
2087 * Don't mark the inode dirty under page lock. First, it unnecessarily
2088 * makes the holding time of page lock longer. Second, it forces lock
2089 * ordering of page lock and transaction start for journaling
2090 * filesystems.
2091 */
2092 if (i_size_changed)
2093 mark_inode_dirty(inode);
2094
2082 return copied; 2095 return copied;
2083} 2096}
2084EXPORT_SYMBOL(generic_write_end); 2097EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
373 return -ENXIO; 373 return -ENXIO;
374 new = container_of(kobj, struct cdev, kobj); 374 new = container_of(kobj, struct cdev, kobj);
375 spin_lock(&cdev_lock); 375 spin_lock(&cdev_lock);
376 /* Check i_cdev again in case somebody beat us to it while
377 we dropped the lock. */
376 p = inode->i_cdev; 378 p = inode->i_cdev;
377 if (!p) { 379 if (!p) {
378 inode->i_cdev = p = new; 380 inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
392 cdev_put(p); 394 cdev_put(p);
393 return -ENXIO; 395 return -ENXIO;
394 } 396 }
395 if (filp->f_op->open) { 397 if (filp->f_op->open)
396 lock_kernel();
397 ret = filp->f_op->open(inode,filp); 398 ret = filp->f_op->open(inode,filp);
398 unlock_kernel();
399 }
400 if (ret) 399 if (ret)
401 cdev_put(p); 400 cdev_put(p);
402 return ret; 401 return ret;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff5400..0e9fc2ba90ee 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
34static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 34static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
35 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 35 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
36 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 36 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
37 {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, 37 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
38 {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, 38 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
39 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, 39 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
40 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, 40 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
41 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } 41 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
42; 42;
43 43
44 44
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405ae..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
612 if (retval < 0) 612 if (retval < 0)
613 return (loff_t)retval; 613 return (loff_t)retval;
614 } 614 }
615 return remote_llseek(file, offset, origin); 615 return generic_file_llseek_unlocked(file, offset, origin);
616} 616}
617 617
618struct file_system_type cifs_fs_type = { 618struct file_system_type cifs_fs_type = {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 722be543ceec..2e904bd111c8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
219 rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, 219 rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
220 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 220 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
221 CIFS_MOUNT_MAP_SPECIAL_CHR); 221 CIFS_MOUNT_MAP_SPECIAL_CHR);
222 if (rc) { 222 if (rc == -EREMOTE && !is_dfs_referral) {
223 if (rc == -EREMOTE && !is_dfs_referral) { 223 is_dfs_referral = true;
224 is_dfs_referral = true; 224 cFYI(DBG2, ("DFS ref"));
225 cFYI(DBG2, ("DFS ref")); 225 /* for DFS, server does not give us real inode data */
226 /* for DFS, server does not give us real inode data */ 226 fill_fake_finddataunix(&find_data, sb);
227 fill_fake_finddataunix(&find_data, sb); 227 rc = 0;
228 rc = 0; 228 } else if (rc)
229 } 229 goto cgiiu_exit;
230 } 230
231 num_of_bytes = le64_to_cpu(find_data.NumOfBytes); 231 num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
232 end_of_file = le64_to_cpu(find_data.EndOfFile); 232 end_of_file = le64_to_cpu(find_data.EndOfFile);
233 233
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
236 *pinode = new_inode(sb); 236 *pinode = new_inode(sb);
237 if (*pinode == NULL) { 237 if (*pinode == NULL) {
238 rc = -ENOMEM; 238 rc = -ENOMEM;
239 goto cgiiu_exit; 239 goto cgiiu_exit;
240 } 240 }
241 /* Is an i_ino of zero legal? */ 241 /* Is an i_ino of zero legal? */
242 /* note ino incremented to unique num in new_inode */ 242 /* note ino incremented to unique num in new_inode */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 05ec7eef8690..ddefb8851a98 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -68,9 +68,11 @@
68#include <linux/capi.h> 68#include <linux/capi.h>
69#include <linux/gigaset_dev.h> 69#include <linux/gigaset_dev.h>
70 70
71#ifdef CONFIG_BLOCK
71#include <scsi/scsi.h> 72#include <scsi/scsi.h>
72#include <scsi/scsi_ioctl.h> 73#include <scsi/scsi_ioctl.h>
73#include <scsi/sg.h> 74#include <scsi/sg.h>
75#endif
74 76
75#include <asm/uaccess.h> 77#include <asm/uaccess.h>
76#include <linux/ethtool.h> 78#include <linux/ethtool.h>
@@ -1965,6 +1967,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
1965COMPATIBLE_IOCTL(PIO_UNISCRNMAP) 1967COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
1966COMPATIBLE_IOCTL(PIO_FONTRESET) 1968COMPATIBLE_IOCTL(PIO_FONTRESET)
1967COMPATIBLE_IOCTL(PIO_UNIMAPCLR) 1969COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
1970#ifdef CONFIG_BLOCK
1968/* Big S */ 1971/* Big S */
1969COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) 1972COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
1970COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) 1973COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
@@ -1974,6 +1977,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
1974COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) 1977COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
1975COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) 1978COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
1976COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) 1979COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
1980#endif
1977/* Big T */ 1981/* Big T */
1978COMPATIBLE_IOCTL(TUNSETNOCSUM) 1982COMPATIBLE_IOCTL(TUNSETNOCSUM)
1979COMPATIBLE_IOCTL(TUNSETDEBUG) 1983COMPATIBLE_IOCTL(TUNSETDEBUG)
@@ -2044,6 +2048,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN)
2044COMPATIBLE_IOCTL(SIOCSIFVLAN) 2048COMPATIBLE_IOCTL(SIOCSIFVLAN)
2045COMPATIBLE_IOCTL(SIOCBRADDBR) 2049COMPATIBLE_IOCTL(SIOCBRADDBR)
2046COMPATIBLE_IOCTL(SIOCBRDELBR) 2050COMPATIBLE_IOCTL(SIOCBRDELBR)
2051#ifdef CONFIG_BLOCK
2047/* SG stuff */ 2052/* SG stuff */
2048COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 2053COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
2049COMPATIBLE_IOCTL(SG_GET_TIMEOUT) 2054COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -2068,6 +2073,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET)
2068COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) 2073COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
2069COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) 2074COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
2070COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) 2075COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
2076#endif
2071/* PPP stuff */ 2077/* PPP stuff */
2072COMPATIBLE_IOCTL(PPPIOCGFLAGS) 2078COMPATIBLE_IOCTL(PPPIOCGFLAGS)
2073COMPATIBLE_IOCTL(PPPIOCSFLAGS) 2079COMPATIBLE_IOCTL(PPPIOCSFLAGS)
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index cca98609aa7f..da015c12e3ea 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/spinlock.h>
29 30
30struct configfs_dirent { 31struct configfs_dirent {
31 atomic_t s_count; 32 atomic_t s_count;
@@ -47,8 +48,11 @@ struct configfs_dirent {
47#define CONFIGFS_USET_DIR 0x0040 48#define CONFIGFS_USET_DIR 0x0040
48#define CONFIGFS_USET_DEFAULT 0x0080 49#define CONFIGFS_USET_DEFAULT 0x0080
49#define CONFIGFS_USET_DROPPING 0x0100 50#define CONFIGFS_USET_DROPPING 0x0100
51#define CONFIGFS_USET_IN_MKDIR 0x0200
50#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) 52#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
51 53
54extern spinlock_t configfs_dirent_lock;
55
52extern struct vfsmount * configfs_mount; 56extern struct vfsmount * configfs_mount;
53extern struct kmem_cache *configfs_dir_cachep; 57extern struct kmem_cache *configfs_dir_cachep;
54 58
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a48dc7dd8765..0e64312a084c 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -30,11 +30,25 @@
30#include <linux/mount.h> 30#include <linux/mount.h>
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/err.h>
33 34
34#include <linux/configfs.h> 35#include <linux/configfs.h>
35#include "configfs_internal.h" 36#include "configfs_internal.h"
36 37
37DECLARE_RWSEM(configfs_rename_sem); 38DECLARE_RWSEM(configfs_rename_sem);
39/*
40 * Protects mutations of configfs_dirent linkage together with proper i_mutex
41 * Also protects mutations of symlinks linkage to target configfs_dirent
42 * Mutators of configfs_dirent linkage must *both* have the proper inode locked
43 * and configfs_dirent_lock locked, in that order.
44 * This allows one to safely traverse configfs_dirent trees and symlinks without
45 * having to lock inodes.
46 *
47 * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
48 * unlocked is not reliable unless in detach_groups() called from
49 * rmdir()/unregister() and from configfs_attach_group()
50 */
51DEFINE_SPINLOCK(configfs_dirent_lock);
38 52
39static void configfs_d_iput(struct dentry * dentry, 53static void configfs_d_iput(struct dentry * dentry,
40 struct inode * inode) 54 struct inode * inode)
@@ -74,13 +88,20 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
74 88
75 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); 89 sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
76 if (!sd) 90 if (!sd)
77 return NULL; 91 return ERR_PTR(-ENOMEM);
78 92
79 atomic_set(&sd->s_count, 1); 93 atomic_set(&sd->s_count, 1);
80 INIT_LIST_HEAD(&sd->s_links); 94 INIT_LIST_HEAD(&sd->s_links);
81 INIT_LIST_HEAD(&sd->s_children); 95 INIT_LIST_HEAD(&sd->s_children);
82 list_add(&sd->s_sibling, &parent_sd->s_children);
83 sd->s_element = element; 96 sd->s_element = element;
97 spin_lock(&configfs_dirent_lock);
98 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
99 spin_unlock(&configfs_dirent_lock);
100 kmem_cache_free(configfs_dir_cachep, sd);
101 return ERR_PTR(-ENOENT);
102 }
103 list_add(&sd->s_sibling, &parent_sd->s_children);
104 spin_unlock(&configfs_dirent_lock);
84 105
85 return sd; 106 return sd;
86} 107}
@@ -118,8 +139,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
118 struct configfs_dirent * sd; 139 struct configfs_dirent * sd;
119 140
120 sd = configfs_new_dirent(parent_sd, element); 141 sd = configfs_new_dirent(parent_sd, element);
121 if (!sd) 142 if (IS_ERR(sd))
122 return -ENOMEM; 143 return PTR_ERR(sd);
123 144
124 sd->s_mode = mode; 145 sd->s_mode = mode;
125 sd->s_type = type; 146 sd->s_type = type;
@@ -173,7 +194,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
173 } else { 194 } else {
174 struct configfs_dirent *sd = d->d_fsdata; 195 struct configfs_dirent *sd = d->d_fsdata;
175 if (sd) { 196 if (sd) {
197 spin_lock(&configfs_dirent_lock);
176 list_del_init(&sd->s_sibling); 198 list_del_init(&sd->s_sibling);
199 spin_unlock(&configfs_dirent_lock);
177 configfs_put(sd); 200 configfs_put(sd);
178 } 201 }
179 } 202 }
@@ -224,7 +247,9 @@ int configfs_create_link(struct configfs_symlink *sl,
224 else { 247 else {
225 struct configfs_dirent *sd = dentry->d_fsdata; 248 struct configfs_dirent *sd = dentry->d_fsdata;
226 if (sd) { 249 if (sd) {
250 spin_lock(&configfs_dirent_lock);
227 list_del_init(&sd->s_sibling); 251 list_del_init(&sd->s_sibling);
252 spin_unlock(&configfs_dirent_lock);
228 configfs_put(sd); 253 configfs_put(sd);
229 } 254 }
230 } 255 }
@@ -238,7 +263,9 @@ static void remove_dir(struct dentry * d)
238 struct configfs_dirent * sd; 263 struct configfs_dirent * sd;
239 264
240 sd = d->d_fsdata; 265 sd = d->d_fsdata;
266 spin_lock(&configfs_dirent_lock);
241 list_del_init(&sd->s_sibling); 267 list_del_init(&sd->s_sibling);
268 spin_unlock(&configfs_dirent_lock);
242 configfs_put(sd); 269 configfs_put(sd);
243 if (d->d_inode) 270 if (d->d_inode)
244 simple_rmdir(parent->d_inode,d); 271 simple_rmdir(parent->d_inode,d);
@@ -331,13 +358,13 @@ static struct dentry * configfs_lookup(struct inode *dir,
331 358
332/* 359/*
333 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are 360 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
334 * attributes and are removed by rmdir(). We recurse, taking i_mutex 361 * attributes and are removed by rmdir(). We recurse, setting
335 * on all children that are candidates for default detach. If the 362 * CONFIGFS_USET_DROPPING on all children that are candidates for
336 * result is clean, then configfs_detach_group() will handle dropping 363 * default detach.
337 * i_mutex. If there is an error, the caller will clean up the i_mutex 364 * If there is an error, the caller will reset the flags via
338 * holders via configfs_detach_rollback(). 365 * configfs_detach_rollback().
339 */ 366 */
340static int configfs_detach_prep(struct dentry *dentry) 367static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
341{ 368{
342 struct configfs_dirent *parent_sd = dentry->d_fsdata; 369 struct configfs_dirent *parent_sd = dentry->d_fsdata;
343 struct configfs_dirent *sd; 370 struct configfs_dirent *sd;
@@ -352,15 +379,20 @@ static int configfs_detach_prep(struct dentry *dentry)
352 if (sd->s_type & CONFIGFS_NOT_PINNED) 379 if (sd->s_type & CONFIGFS_NOT_PINNED)
353 continue; 380 continue;
354 if (sd->s_type & CONFIGFS_USET_DEFAULT) { 381 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
355 mutex_lock(&sd->s_dentry->d_inode->i_mutex); 382 /* Abort if racing with mkdir() */
356 /* Mark that we've taken i_mutex */ 383 if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
384 if (wait_mutex)
385 *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
386 return -EAGAIN;
387 }
388 /* Mark that we're trying to drop the group */
357 sd->s_type |= CONFIGFS_USET_DROPPING; 389 sd->s_type |= CONFIGFS_USET_DROPPING;
358 390
359 /* 391 /*
360 * Yup, recursive. If there's a problem, blame 392 * Yup, recursive. If there's a problem, blame
361 * deep nesting of default_groups 393 * deep nesting of default_groups
362 */ 394 */
363 ret = configfs_detach_prep(sd->s_dentry); 395 ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
364 if (!ret) 396 if (!ret)
365 continue; 397 continue;
366 } else 398 } else
@@ -374,7 +406,7 @@ out:
374} 406}
375 407
376/* 408/*
377 * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is 409 * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
378 * set. 410 * set.
379 */ 411 */
380static void configfs_detach_rollback(struct dentry *dentry) 412static void configfs_detach_rollback(struct dentry *dentry)
@@ -385,11 +417,7 @@ static void configfs_detach_rollback(struct dentry *dentry)
385 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 417 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
386 if (sd->s_type & CONFIGFS_USET_DEFAULT) { 418 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
387 configfs_detach_rollback(sd->s_dentry); 419 configfs_detach_rollback(sd->s_dentry);
388 420 sd->s_type &= ~CONFIGFS_USET_DROPPING;
389 if (sd->s_type & CONFIGFS_USET_DROPPING) {
390 sd->s_type &= ~CONFIGFS_USET_DROPPING;
391 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
392 }
393 } 421 }
394 } 422 }
395} 423}
@@ -410,7 +438,9 @@ static void detach_attrs(struct config_item * item)
410 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 438 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
411 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) 439 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
412 continue; 440 continue;
441 spin_lock(&configfs_dirent_lock);
413 list_del_init(&sd->s_sibling); 442 list_del_init(&sd->s_sibling);
443 spin_unlock(&configfs_dirent_lock);
414 configfs_drop_dentry(sd, dentry); 444 configfs_drop_dentry(sd, dentry);
415 configfs_put(sd); 445 configfs_put(sd);
416 } 446 }
@@ -466,16 +496,12 @@ static void detach_groups(struct config_group *group)
466 496
467 child = sd->s_dentry; 497 child = sd->s_dentry;
468 498
499 mutex_lock(&child->d_inode->i_mutex);
500
469 configfs_detach_group(sd->s_element); 501 configfs_detach_group(sd->s_element);
470 child->d_inode->i_flags |= S_DEAD; 502 child->d_inode->i_flags |= S_DEAD;
471 503
472 /* 504 mutex_unlock(&child->d_inode->i_mutex);
473 * From rmdir/unregister, a configfs_detach_prep() pass
474 * has taken our i_mutex for us. Drop it.
475 * From mkdir/register cleanup, there is no sem held.
476 */
477 if (sd->s_type & CONFIGFS_USET_DROPPING)
478 mutex_unlock(&child->d_inode->i_mutex);
479 505
480 d_delete(child); 506 d_delete(child);
481 dput(child); 507 dput(child);
@@ -1047,25 +1073,24 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1047 group = NULL; 1073 group = NULL;
1048 item = NULL; 1074 item = NULL;
1049 if (type->ct_group_ops->make_group) { 1075 if (type->ct_group_ops->make_group) {
1050 group = type->ct_group_ops->make_group(to_config_group(parent_item), name); 1076 ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group);
1051 if (group) { 1077 if (!ret) {
1052 link_group(to_config_group(parent_item), group); 1078 link_group(to_config_group(parent_item), group);
1053 item = &group->cg_item; 1079 item = &group->cg_item;
1054 } 1080 }
1055 } else { 1081 } else {
1056 item = type->ct_group_ops->make_item(to_config_group(parent_item), name); 1082 ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item);
1057 if (item) 1083 if (!ret)
1058 link_obj(parent_item, item); 1084 link_obj(parent_item, item);
1059 } 1085 }
1060 mutex_unlock(&subsys->su_mutex); 1086 mutex_unlock(&subsys->su_mutex);
1061 1087
1062 kfree(name); 1088 kfree(name);
1063 if (!item) { 1089 if (ret) {
1064 /* 1090 /*
1065 * If item == NULL, then link_obj() was never called. 1091 * If ret != 0, then link_obj() was never called.
1066 * There are no extra references to clean up. 1092 * There are no extra references to clean up.
1067 */ 1093 */
1068 ret = -ENOMEM;
1069 goto out_put; 1094 goto out_put;
1070 } 1095 }
1071 1096
@@ -1093,11 +1118,26 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1093 */ 1118 */
1094 module_got = 1; 1119 module_got = 1;
1095 1120
1121 /*
1122 * Make racing rmdir() fail if it did not tag parent with
1123 * CONFIGFS_USET_DROPPING
1124 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
1125 * fail and let rmdir() terminate correctly
1126 */
1127 spin_lock(&configfs_dirent_lock);
1128 /* This will make configfs_detach_prep() fail */
1129 sd->s_type |= CONFIGFS_USET_IN_MKDIR;
1130 spin_unlock(&configfs_dirent_lock);
1131
1096 if (group) 1132 if (group)
1097 ret = configfs_attach_group(parent_item, item, dentry); 1133 ret = configfs_attach_group(parent_item, item, dentry);
1098 else 1134 else
1099 ret = configfs_attach_item(parent_item, item, dentry); 1135 ret = configfs_attach_item(parent_item, item, dentry);
1100 1136
1137 spin_lock(&configfs_dirent_lock);
1138 sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
1139 spin_unlock(&configfs_dirent_lock);
1140
1101out_unlink: 1141out_unlink:
1102 if (ret) { 1142 if (ret) {
1103 /* Tear down everything we built up */ 1143 /* Tear down everything we built up */
@@ -1161,12 +1201,27 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1161 return -EINVAL; 1201 return -EINVAL;
1162 } 1202 }
1163 1203
1164 ret = configfs_detach_prep(dentry); 1204 spin_lock(&configfs_dirent_lock);
1165 if (ret) { 1205 do {
1166 configfs_detach_rollback(dentry); 1206 struct mutex *wait_mutex;
1167 config_item_put(parent_item); 1207
1168 return ret; 1208 ret = configfs_detach_prep(dentry, &wait_mutex);
1169 } 1209 if (ret) {
1210 configfs_detach_rollback(dentry);
1211 spin_unlock(&configfs_dirent_lock);
1212 if (ret != -EAGAIN) {
1213 config_item_put(parent_item);
1214 return ret;
1215 }
1216
1217 /* Wait until the racing operation terminates */
1218 mutex_lock(wait_mutex);
1219 mutex_unlock(wait_mutex);
1220
1221 spin_lock(&configfs_dirent_lock);
1222 }
1223 } while (ret == -EAGAIN);
1224 spin_unlock(&configfs_dirent_lock);
1170 1225
1171 /* Get a working ref for the duration of this function */ 1226 /* Get a working ref for the duration of this function */
1172 item = configfs_get_config_item(dentry); 1227 item = configfs_get_config_item(dentry);
@@ -1258,7 +1313,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
1258 file->private_data = configfs_new_dirent(parent_sd, NULL); 1313 file->private_data = configfs_new_dirent(parent_sd, NULL);
1259 mutex_unlock(&dentry->d_inode->i_mutex); 1314 mutex_unlock(&dentry->d_inode->i_mutex);
1260 1315
1261 return file->private_data ? 0 : -ENOMEM; 1316 return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
1262 1317
1263} 1318}
1264 1319
@@ -1268,7 +1323,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
1268 struct configfs_dirent * cursor = file->private_data; 1323 struct configfs_dirent * cursor = file->private_data;
1269 1324
1270 mutex_lock(&dentry->d_inode->i_mutex); 1325 mutex_lock(&dentry->d_inode->i_mutex);
1326 spin_lock(&configfs_dirent_lock);
1271 list_del_init(&cursor->s_sibling); 1327 list_del_init(&cursor->s_sibling);
1328 spin_unlock(&configfs_dirent_lock);
1272 mutex_unlock(&dentry->d_inode->i_mutex); 1329 mutex_unlock(&dentry->d_inode->i_mutex);
1273 1330
1274 release_configfs_dirent(cursor); 1331 release_configfs_dirent(cursor);
@@ -1308,7 +1365,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1308 /* fallthrough */ 1365 /* fallthrough */
1309 default: 1366 default:
1310 if (filp->f_pos == 2) { 1367 if (filp->f_pos == 2) {
1368 spin_lock(&configfs_dirent_lock);
1311 list_move(q, &parent_sd->s_children); 1369 list_move(q, &parent_sd->s_children);
1370 spin_unlock(&configfs_dirent_lock);
1312 } 1371 }
1313 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 1372 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
1314 struct configfs_dirent *next; 1373 struct configfs_dirent *next;
@@ -1331,7 +1390,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1331 dt_type(next)) < 0) 1390 dt_type(next)) < 0)
1332 return 0; 1391 return 0;
1333 1392
1393 spin_lock(&configfs_dirent_lock);
1334 list_move(q, p); 1394 list_move(q, p);
1395 spin_unlock(&configfs_dirent_lock);
1335 p = q; 1396 p = q;
1336 filp->f_pos++; 1397 filp->f_pos++;
1337 } 1398 }
@@ -1362,6 +1423,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
1362 struct list_head *p; 1423 struct list_head *p;
1363 loff_t n = file->f_pos - 2; 1424 loff_t n = file->f_pos - 2;
1364 1425
1426 spin_lock(&configfs_dirent_lock);
1365 list_del(&cursor->s_sibling); 1427 list_del(&cursor->s_sibling);
1366 p = sd->s_children.next; 1428 p = sd->s_children.next;
1367 while (n && p != &sd->s_children) { 1429 while (n && p != &sd->s_children) {
@@ -1373,6 +1435,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
1373 p = p->next; 1435 p = p->next;
1374 } 1436 }
1375 list_add_tail(&cursor->s_sibling, p); 1437 list_add_tail(&cursor->s_sibling, p);
1438 spin_unlock(&configfs_dirent_lock);
1376 } 1439 }
1377 } 1440 }
1378 mutex_unlock(&dentry->d_inode->i_mutex); 1441 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1448,9 +1511,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1448 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, 1511 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
1449 I_MUTEX_PARENT); 1512 I_MUTEX_PARENT);
1450 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 1513 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
1451 if (configfs_detach_prep(dentry)) { 1514 spin_lock(&configfs_dirent_lock);
1515 if (configfs_detach_prep(dentry, NULL)) {
1452 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); 1516 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
1453 } 1517 }
1518 spin_unlock(&configfs_dirent_lock);
1454 configfs_detach_group(&group->cg_item); 1519 configfs_detach_group(&group->cg_item);
1455 dentry->d_inode->i_flags |= S_DEAD; 1520 dentry->d_inode->i_flags |= S_DEAD;
1456 mutex_unlock(&dentry->d_inode->i_mutex); 1521 mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index b9a1d810346d..4803ccc94480 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
247 if (!sd->s_element) 247 if (!sd->s_element)
248 continue; 248 continue;
249 if (!strcmp(configfs_get_name(sd), name)) { 249 if (!strcmp(configfs_get_name(sd), name)) {
250 spin_lock(&configfs_dirent_lock);
250 list_del_init(&sd->s_sibling); 251 list_del_init(&sd->s_sibling);
252 spin_unlock(&configfs_dirent_lock);
251 configfs_drop_dentry(sd, dir); 253 configfs_drop_dentry(sd, dir);
252 configfs_put(sd); 254 configfs_put(sd);
253 break; 255 break;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 2a731ef5f305..0004d18c40ac 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -77,12 +77,15 @@ static int create_link(struct config_item *parent_item,
77 sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); 77 sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
78 if (sl) { 78 if (sl) {
79 sl->sl_target = config_item_get(item); 79 sl->sl_target = config_item_get(item);
80 /* FIXME: needs a lock, I'd bet */ 80 spin_lock(&configfs_dirent_lock);
81 list_add(&sl->sl_list, &target_sd->s_links); 81 list_add(&sl->sl_list, &target_sd->s_links);
82 spin_unlock(&configfs_dirent_lock);
82 ret = configfs_create_link(sl, parent_item->ci_dentry, 83 ret = configfs_create_link(sl, parent_item->ci_dentry,
83 dentry); 84 dentry);
84 if (ret) { 85 if (ret) {
86 spin_lock(&configfs_dirent_lock);
85 list_del_init(&sl->sl_list); 87 list_del_init(&sl->sl_list);
88 spin_unlock(&configfs_dirent_lock);
86 config_item_put(item); 89 config_item_put(item);
87 kfree(sl); 90 kfree(sl);
88 } 91 }
@@ -137,8 +140,12 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
137 goto out_put; 140 goto out_put;
138 141
139 ret = type->ct_item_ops->allow_link(parent_item, target_item); 142 ret = type->ct_item_ops->allow_link(parent_item, target_item);
140 if (!ret) 143 if (!ret) {
141 ret = create_link(parent_item, target_item, dentry); 144 ret = create_link(parent_item, target_item, dentry);
145 if (ret && type->ct_item_ops->drop_link)
146 type->ct_item_ops->drop_link(parent_item,
147 target_item);
148 }
142 149
143 config_item_put(target_item); 150 config_item_put(target_item);
144 path_put(&nd.path); 151 path_put(&nd.path);
@@ -169,7 +176,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
169 parent_item = configfs_get_config_item(dentry->d_parent); 176 parent_item = configfs_get_config_item(dentry->d_parent);
170 type = parent_item->ci_type; 177 type = parent_item->ci_type;
171 178
179 spin_lock(&configfs_dirent_lock);
172 list_del_init(&sd->s_sibling); 180 list_del_init(&sd->s_sibling);
181 spin_unlock(&configfs_dirent_lock);
173 configfs_drop_dentry(sd, dentry->d_parent); 182 configfs_drop_dentry(sd, dentry->d_parent);
174 dput(dentry); 183 dput(dentry);
175 configfs_put(sd); 184 configfs_put(sd);
@@ -184,8 +193,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
184 type->ct_item_ops->drop_link(parent_item, 193 type->ct_item_ops->drop_link(parent_item,
185 sl->sl_target); 194 sl->sl_target);
186 195
187 /* FIXME: Needs lock */ 196 spin_lock(&configfs_dirent_lock);
188 list_del_init(&sl->sl_list); 197 list_del_init(&sl->sl_list);
198 spin_unlock(&configfs_dirent_lock);
189 199
190 /* Put reference from create_link() */ 200 /* Put reference from create_link() */
191 config_item_put(sl->sl_target); 201 config_item_put(sl->sl_target);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index eac23bd288b2..492d8caaaf25 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -41,16 +41,20 @@ struct comm;
41struct nodes; 41struct nodes;
42struct node; 42struct node;
43 43
44static struct config_group *make_cluster(struct config_group *, const char *); 44static int make_cluster(struct config_group *, const char *,
45 struct config_group **);
45static void drop_cluster(struct config_group *, struct config_item *); 46static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *); 47static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *); 48static int make_space(struct config_group *, const char *,
49 struct config_group **);
48static void drop_space(struct config_group *, struct config_item *); 50static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *); 51static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *); 52static int make_comm(struct config_group *, const char *,
53 struct config_item **);
51static void drop_comm(struct config_group *, struct config_item *); 54static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *); 55static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *); 56static int make_node(struct config_group *, const char *,
57 struct config_item **);
54static void drop_node(struct config_group *, struct config_item *); 58static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *); 59static void release_node(struct config_item *);
56 60
@@ -392,8 +396,8 @@ static struct node *to_node(struct config_item *i)
392 return i ? container_of(i, struct node, item) : NULL; 396 return i ? container_of(i, struct node, item) : NULL;
393} 397}
394 398
395static struct config_group *make_cluster(struct config_group *g, 399static int make_cluster(struct config_group *g, const char *name,
396 const char *name) 400 struct config_group **new_g)
397{ 401{
398 struct cluster *cl = NULL; 402 struct cluster *cl = NULL;
399 struct spaces *sps = NULL; 403 struct spaces *sps = NULL;
@@ -431,14 +435,15 @@ static struct config_group *make_cluster(struct config_group *g,
431 435
432 space_list = &sps->ss_group; 436 space_list = &sps->ss_group;
433 comm_list = &cms->cs_group; 437 comm_list = &cms->cs_group;
434 return &cl->group; 438 *new_g = &cl->group;
439 return 0;
435 440
436 fail: 441 fail:
437 kfree(cl); 442 kfree(cl);
438 kfree(gps); 443 kfree(gps);
439 kfree(sps); 444 kfree(sps);
440 kfree(cms); 445 kfree(cms);
441 return NULL; 446 return -ENOMEM;
442} 447}
443 448
444static void drop_cluster(struct config_group *g, struct config_item *i) 449static void drop_cluster(struct config_group *g, struct config_item *i)
@@ -466,7 +471,8 @@ static void release_cluster(struct config_item *i)
466 kfree(cl); 471 kfree(cl);
467} 472}
468 473
469static struct config_group *make_space(struct config_group *g, const char *name) 474static int make_space(struct config_group *g, const char *name,
475 struct config_group **new_g)
470{ 476{
471 struct space *sp = NULL; 477 struct space *sp = NULL;
472 struct nodes *nds = NULL; 478 struct nodes *nds = NULL;
@@ -489,13 +495,14 @@ static struct config_group *make_space(struct config_group *g, const char *name)
489 INIT_LIST_HEAD(&sp->members); 495 INIT_LIST_HEAD(&sp->members);
490 mutex_init(&sp->members_lock); 496 mutex_init(&sp->members_lock);
491 sp->members_count = 0; 497 sp->members_count = 0;
492 return &sp->group; 498 *new_g = &sp->group;
499 return 0;
493 500
494 fail: 501 fail:
495 kfree(sp); 502 kfree(sp);
496 kfree(gps); 503 kfree(gps);
497 kfree(nds); 504 kfree(nds);
498 return NULL; 505 return -ENOMEM;
499} 506}
500 507
501static void drop_space(struct config_group *g, struct config_item *i) 508static void drop_space(struct config_group *g, struct config_item *i)
@@ -522,19 +529,21 @@ static void release_space(struct config_item *i)
522 kfree(sp); 529 kfree(sp);
523} 530}
524 531
525static struct config_item *make_comm(struct config_group *g, const char *name) 532static int make_comm(struct config_group *g, const char *name,
533 struct config_item **new_i)
526{ 534{
527 struct comm *cm; 535 struct comm *cm;
528 536
529 cm = kzalloc(sizeof(struct comm), GFP_KERNEL); 537 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
530 if (!cm) 538 if (!cm)
531 return NULL; 539 return -ENOMEM;
532 540
533 config_item_init_type_name(&cm->item, name, &comm_type); 541 config_item_init_type_name(&cm->item, name, &comm_type);
534 cm->nodeid = -1; 542 cm->nodeid = -1;
535 cm->local = 0; 543 cm->local = 0;
536 cm->addr_count = 0; 544 cm->addr_count = 0;
537 return &cm->item; 545 *new_i = &cm->item;
546 return 0;
538} 547}
539 548
540static void drop_comm(struct config_group *g, struct config_item *i) 549static void drop_comm(struct config_group *g, struct config_item *i)
@@ -554,14 +563,15 @@ static void release_comm(struct config_item *i)
554 kfree(cm); 563 kfree(cm);
555} 564}
556 565
557static struct config_item *make_node(struct config_group *g, const char *name) 566static int make_node(struct config_group *g, const char *name,
567 struct config_item **new_i)
558{ 568{
559 struct space *sp = to_space(g->cg_item.ci_parent); 569 struct space *sp = to_space(g->cg_item.ci_parent);
560 struct node *nd; 570 struct node *nd;
561 571
562 nd = kzalloc(sizeof(struct node), GFP_KERNEL); 572 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
563 if (!nd) 573 if (!nd)
564 return NULL; 574 return -ENOMEM;
565 575
566 config_item_init_type_name(&nd->item, name, &node_type); 576 config_item_init_type_name(&nd->item, name, &node_type);
567 nd->nodeid = -1; 577 nd->nodeid = -1;
@@ -573,7 +583,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
573 sp->members_count++; 583 sp->members_count++;
574 mutex_unlock(&sp->members_lock); 584 mutex_unlock(&sp->members_lock);
575 585
576 return &nd->item; 586 *new_i = &nd->item;
587 return 0;
577} 588}
578 589
579static void drop_node(struct config_group *g, struct config_item *i) 590static void drop_node(struct config_group *g, struct config_item *i)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
15#include <linux/poll.h> 15#include <linux/poll.h>
16#include <linux/signal.h> 16#include <linux/signal.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
18#include <linux/dlm.h> 19#include <linux/dlm.h>
19#include <linux/dlm_device.h> 20#include <linux/dlm_device.h>
20 21
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
618 struct dlm_user_proc *proc; 619 struct dlm_user_proc *proc;
619 struct dlm_ls *ls; 620 struct dlm_ls *ls;
620 621
622 lock_kernel();
621 ls = dlm_find_lockspace_device(iminor(inode)); 623 ls = dlm_find_lockspace_device(iminor(inode));
622 if (!ls) 624 if (!ls) {
625 unlock_kernel();
623 return -ENOENT; 626 return -ENOENT;
627 }
624 628
625 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 629 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
626 if (!proc) { 630 if (!proc) {
627 dlm_put_lockspace(ls); 631 dlm_put_lockspace(ls);
632 unlock_kernel();
628 return -ENOMEM; 633 return -ENOMEM;
629 } 634 }
630 635
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
636 spin_lock_init(&proc->locks_spin); 641 spin_lock_init(&proc->locks_spin);
637 init_waitqueue_head(&proc->wait); 642 init_waitqueue_head(&proc->wait);
638 file->private_data = proc; 643 file->private_data = proc;
644 unlock_kernel();
639 645
640 return 0; 646 return 0;
641} 647}
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
870 876
871static int ctl_device_open(struct inode *inode, struct file *file) 877static int ctl_device_open(struct inode *inode, struct file *file)
872{ 878{
879 cycle_kernel_lock();
873 file->private_data = NULL; 880 file->private_data = NULL;
874 return 0; 881 return 0;
875} 882}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/fs_stack.h> 32#include <linux/fs_stack.h>
33#include <linux/smp_lock.h>
33#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
34 35
35/** 36/**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
277 int rc = 0; 278 int rc = 0;
278 struct file *lower_file = NULL; 279 struct file *lower_file = NULL;
279 280
281 lock_kernel();
280 lower_file = ecryptfs_file_to_lower(file); 282 lower_file = ecryptfs_file_to_lower(file);
281 if (lower_file->f_op && lower_file->f_op->fasync) 283 if (lower_file->f_op && lower_file->f_op->fasync)
282 rc = lower_file->f_op->fasync(fd, lower_file, flag); 284 rc = lower_file->f_op->fasync(fd, lower_file, flag);
285 unlock_kernel();
283 return rc; 286 return rc;
284} 287}
285 288
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 50c994a249a5..09a4522f65e6 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void)
575 int rc; 575 int rc;
576 576
577 atomic_set(&ecryptfs_num_miscdev_opens, 0); 577 atomic_set(&ecryptfs_num_miscdev_opens, 0);
578 mutex_lock(&ecryptfs_daemon_hash_mux);
579 rc = misc_register(&ecryptfs_miscdev); 578 rc = misc_register(&ecryptfs_miscdev);
580 if (rc) 579 if (rc)
581 printk(KERN_ERR "%s: Failed to register miscellaneous device " 580 printk(KERN_ERR "%s: Failed to register miscellaneous device "
582 "for communications with userspace daemons; rc = [%d]\n", 581 "for communications with userspace daemons; rc = [%d]\n",
583 __func__, rc); 582 __func__, rc);
584 mutex_unlock(&ecryptfs_daemon_hash_mux);
585 return rc; 583 return rc;
586} 584}
587 585
diff --git a/fs/exec.c b/fs/exec.c
index da94a6f05df3..fd9234379e8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -610,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
610 bprm->exec -= stack_shift; 610 bprm->exec -= stack_shift;
611 611
612 down_write(&mm->mmap_sem); 612 down_write(&mm->mmap_sem);
613 vm_flags = vma->vm_flags; 613 vm_flags = VM_STACK_FLAGS;
614 614
615 /* 615 /*
616 * Adjust stack execute permissions; explicitly enable for 616 * Adjust stack execute permissions; explicitly enable for
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fe3119a71ada..2845425077e8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2875 blk++; 2875 blk++;
2876 } 2876 }
2877out: 2877out:
2878 if (len == towrite) 2878 if (len == towrite) {
2879 mutex_unlock(&inode->i_mutex);
2879 return err; 2880 return err;
2881 }
2880 if (inode->i_size < off+len-towrite) { 2882 if (inode->i_size < off+len-towrite) {
2881 i_size_write(inode, off+len-towrite); 2883 i_size_write(inode, off+len-towrite);
2882 EXT3_I(inode)->i_disksize = inode->i_size; 2884 EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
47 ext4_group_t block_group) 47 ext4_group_t block_group)
48{ 48{
49 ext4_group_t actual_group; 49 ext4_group_t actual_group;
50 ext4_get_group_no_and_offset(sb, block, &actual_group, 0); 50 ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
51 if (actual_group == block_group) 51 if (actual_group == block_group)
52 return 1; 52 return 1;
53 return 0; 53 return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
121 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); 121 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
122 } 122 }
123 } else { /* For META_BG_BLOCK_GROUPS */ 123 } else { /* For META_BG_BLOCK_GROUPS */
124 int group_rel = (block_group - 124 bit_max += ext4_bg_num_gdb(sb, block_group);
125 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
126 EXT4_DESC_PER_BLOCK(sb);
127 if (group_rel == 0 || group_rel == 1 ||
128 (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
129 bit_max += 1;
130 } 125 }
131 126
132 if (block_group == sbi->s_groups_count - 1) { 127 if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
295 return 0; 290 return 0;
296} 291}
297/** 292/**
298 * read_block_bitmap() 293 * ext4_read_block_bitmap()
299 * @sb: super block 294 * @sb: super block
300 * @block_group: given block group 295 * @block_group: given block group
301 * 296 *
@@ -305,7 +300,7 @@ err_out:
305 * Return buffer_head on success or NULL in case of failure. 300 * Return buffer_head on success or NULL in case of failure.
306 */ 301 */
307struct buffer_head * 302struct buffer_head *
308read_block_bitmap(struct super_block *sb, ext4_group_t block_group) 303ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
309{ 304{
310 struct ext4_group_desc * desc; 305 struct ext4_group_desc * desc;
311 struct buffer_head * bh = NULL; 306 struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
409 prev = rsv; 404 prev = rsv;
410 } 405 }
411 printk("Window map complete.\n"); 406 printk("Window map complete.\n");
412 if (bad) 407 BUG_ON(bad);
413 BUG();
414} 408}
415#define rsv_window_dump(root, verbose) \ 409#define rsv_window_dump(root, verbose) \
416 __rsv_window_dump((root), (verbose), __func__) 410 __rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
694 count -= overflow; 688 count -= overflow;
695 } 689 }
696 brelse(bitmap_bh); 690 brelse(bitmap_bh);
697 bitmap_bh = read_block_bitmap(sb, block_group); 691 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
698 if (!bitmap_bh) 692 if (!bitmap_bh)
699 goto error_return; 693 goto error_return;
700 desc = ext4_get_group_desc (sb, block_group, &gd_bh); 694 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
810 spin_unlock(sb_bgl_lock(sbi, block_group)); 804 spin_unlock(sb_bgl_lock(sbi, block_group));
811 percpu_counter_add(&sbi->s_freeblocks_counter, count); 805 percpu_counter_add(&sbi->s_freeblocks_counter, count);
812 806
807 if (sbi->s_log_groups_per_flex) {
808 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
809 spin_lock(sb_bgl_lock(sbi, flex_group));
810 sbi->s_flex_groups[flex_group].free_blocks += count;
811 spin_unlock(sb_bgl_lock(sbi, flex_group));
812 }
813
813 /* We dirtied the bitmap block */ 814 /* We dirtied the bitmap block */
814 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 815 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
815 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 816 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
1598 1599
1599/** 1600/**
1600 * ext4_has_free_blocks() 1601 * ext4_has_free_blocks()
1601 * @sbi: in-core super block structure. 1602 * @sbi: in-core super block structure.
1603 * @nblocks: number of neeed blocks
1602 * 1604 *
1603 * Check if filesystem has at least 1 free block available for allocation. 1605 * Check if filesystem has free blocks available for allocation.
1606 * Return the number of blocks avaible for allocation for this request
1607 * On success, return nblocks
1604 */ 1608 */
1605static int ext4_has_free_blocks(struct ext4_sb_info *sbi) 1609ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1610 ext4_fsblk_t nblocks)
1606{ 1611{
1607 ext4_fsblk_t free_blocks, root_blocks; 1612 ext4_fsblk_t free_blocks;
1613 ext4_fsblk_t root_blocks = 0;
1608 1614
1609 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 1615 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1610 root_blocks = ext4_r_blocks_count(sbi->s_es); 1616
1611 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && 1617 if (!capable(CAP_SYS_RESOURCE) &&
1612 sbi->s_resuid != current->fsuid && 1618 sbi->s_resuid != current->fsuid &&
1613 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { 1619 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1614 return 0; 1620 root_blocks = ext4_r_blocks_count(sbi->s_es);
1615 } 1621#ifdef CONFIG_SMP
1616 return 1; 1622 if (free_blocks - root_blocks < FBC_BATCH)
1617} 1623 free_blocks =
1624 percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
1625#endif
1626 if (free_blocks - root_blocks < nblocks)
1627 return free_blocks - root_blocks;
1628 return nblocks;
1629 }
1630
1618 1631
1619/** 1632/**
1620 * ext4_should_retry_alloc() 1633 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
1630 */ 1643 */
1631int ext4_should_retry_alloc(struct super_block *sb, int *retries) 1644int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1632{ 1645{
1633 if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) 1646 if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
1634 return 0; 1647 return 0;
1635 1648
1636 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 1649 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1639} 1652}
1640 1653
1641/** 1654/**
1642 * ext4_new_blocks_old() -- core block(s) allocation function 1655 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1656 *
1643 * @handle: handle to this transaction 1657 * @handle: handle to this transaction
1644 * @inode: file inode 1658 * @inode: file inode
1645 * @goal: given target block(filesystem wide) 1659 * @goal: given target block(filesystem wide)
1646 * @count: target number of blocks to allocate 1660 * @count: target number of blocks to allocate
1647 * @errp: error code 1661 * @errp: error code
1648 * 1662 *
1649 * ext4_new_blocks uses a goal block to assist allocation. It tries to 1663 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1650 * allocate block(s) from the block group contains the goal block first. If that 1664 * the block bitmap directly to do block allocation. It tries to
1651 * fails, it will try to allocate block(s) from other block groups without 1665 * allocate block(s) from the block group contains the goal block first. If
1652 * any specific goal block. 1666 * that fails, it will try to allocate block(s) from other block groups
1667 * without any specific goal block.
1668 *
1669 * This function is called when -o nomballoc mount option is enabled
1653 * 1670 *
1654 */ 1671 */
1655ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, 1672ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1656 ext4_fsblk_t goal, unsigned long *count, int *errp) 1673 ext4_fsblk_t goal, unsigned long *count, int *errp)
1657{ 1674{
1658 struct buffer_head *bitmap_bh = NULL; 1675 struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1676 ext4_group_t ngroups; 1693 ext4_group_t ngroups;
1677 unsigned long num = *count; 1694 unsigned long num = *count;
1678 1695
1679 *errp = -ENOSPC;
1680 sb = inode->i_sb; 1696 sb = inode->i_sb;
1681 if (!sb) { 1697 if (!sb) {
1698 *errp = -ENODEV;
1682 printk("ext4_new_block: nonexistent device"); 1699 printk("ext4_new_block: nonexistent device");
1683 return 0; 1700 return 0;
1684 } 1701 }
1685 1702
1703 sbi = EXT4_SB(sb);
1704 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1705 /*
1706 * With delalloc we already reserved the blocks
1707 */
1708 *count = ext4_has_free_blocks(sbi, *count);
1709 }
1710 if (*count == 0) {
1711 *errp = -ENOSPC;
1712 return 0; /*return with ENOSPC error */
1713 }
1714 num = *count;
1715
1686 /* 1716 /*
1687 * Check quota for allocation of this block. 1717 * Check quota for allocation of this block.
1688 */ 1718 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1706 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) 1736 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1707 my_rsv = &block_i->rsv_window_node; 1737 my_rsv = &block_i->rsv_window_node;
1708 1738
1709 if (!ext4_has_free_blocks(sbi)) {
1710 *errp = -ENOSPC;
1711 goto out;
1712 }
1713
1714 /* 1739 /*
1715 * First, test whether the goal block is free. 1740 * First, test whether the goal block is free.
1716 */ 1741 */
@@ -1734,7 +1759,7 @@ retry_alloc:
1734 my_rsv = NULL; 1759 my_rsv = NULL;
1735 1760
1736 if (free_blocks > 0) { 1761 if (free_blocks > 0) {
1737 bitmap_bh = read_block_bitmap(sb, group_no); 1762 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1738 if (!bitmap_bh) 1763 if (!bitmap_bh)
1739 goto io_error; 1764 goto io_error;
1740 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, 1765 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
1770 continue; 1795 continue;
1771 1796
1772 brelse(bitmap_bh); 1797 brelse(bitmap_bh);
1773 bitmap_bh = read_block_bitmap(sb, group_no); 1798 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1774 if (!bitmap_bh) 1799 if (!bitmap_bh)
1775 goto io_error; 1800 goto io_error;
1776 /* 1801 /*
@@ -1882,7 +1907,15 @@ allocated:
1882 le16_add_cpu(&gdp->bg_free_blocks_count, -num); 1907 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1883 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); 1908 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1884 spin_unlock(sb_bgl_lock(sbi, group_no)); 1909 spin_unlock(sb_bgl_lock(sbi, group_no));
1885 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1910 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1911 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1912
1913 if (sbi->s_log_groups_per_flex) {
1914 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1915 spin_lock(sb_bgl_lock(sbi, flex_group));
1916 sbi->s_flex_groups[flex_group].free_blocks -= num;
1917 spin_unlock(sb_bgl_lock(sbi, flex_group));
1918 }
1886 1919
1887 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); 1920 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1888 err = ext4_journal_dirty_metadata(handle, gdp_bh); 1921 err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
1915 return 0; 1948 return 0;
1916} 1949}
1917 1950
1918ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, 1951#define EXT4_META_BLOCK 0x1
1919 ext4_fsblk_t goal, int *errp) 1952
1953static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1954 ext4_lblk_t iblock, ext4_fsblk_t goal,
1955 unsigned long *count, int *errp, int flags)
1920{ 1956{
1921 struct ext4_allocation_request ar; 1957 struct ext4_allocation_request ar;
1922 ext4_fsblk_t ret; 1958 ext4_fsblk_t ret;
1923 1959
1924 if (!test_opt(inode->i_sb, MBALLOC)) { 1960 if (!test_opt(inode->i_sb, MBALLOC)) {
1925 unsigned long count = 1; 1961 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1926 ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
1927 return ret;
1928 } 1962 }
1929 1963
1930 memset(&ar, 0, sizeof(ar)); 1964 memset(&ar, 0, sizeof(ar));
1965 /* Fill with neighbour allocated blocks */
1966
1931 ar.inode = inode; 1967 ar.inode = inode;
1932 ar.goal = goal; 1968 ar.goal = goal;
1933 ar.len = 1; 1969 ar.len = *count;
1970 ar.logical = iblock;
1971
1972 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
1973 /* enable in-core preallocation for data block allocation */
1974 ar.flags = EXT4_MB_HINT_DATA;
1975 else
1976 /* disable in-core preallocation for non-regular files */
1977 ar.flags = 0;
1978
1934 ret = ext4_mb_new_blocks(handle, &ar, errp); 1979 ret = ext4_mb_new_blocks(handle, &ar, errp);
1980 *count = ar.len;
1935 return ret; 1981 return ret;
1936} 1982}
1937 1983
1938ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1984/*
1985 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
1986 *
1987 * @handle: handle to this transaction
1988 * @inode: file inode
1989 * @goal: given target block(filesystem wide)
1990 * @count: total number of blocks need
1991 * @errp: error code
1992 *
1993 * Return 1st allocated block numberon success, *count stores total account
1994 * error stores in errp pointer
1995 */
1996ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1939 ext4_fsblk_t goal, unsigned long *count, int *errp) 1997 ext4_fsblk_t goal, unsigned long *count, int *errp)
1940{ 1998{
1941 struct ext4_allocation_request ar;
1942 ext4_fsblk_t ret; 1999 ext4_fsblk_t ret;
1943 2000 ret = do_blk_alloc(handle, inode, 0, goal,
1944 if (!test_opt(inode->i_sb, MBALLOC)) { 2001 count, errp, EXT4_META_BLOCK);
1945 ret = ext4_new_blocks_old(handle, inode, goal, count, errp); 2002 /*
1946 return ret; 2003 * Account for the allocated meta blocks
2004 */
2005 if (!(*errp)) {
2006 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2007 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2008 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1947 } 2009 }
1948
1949 memset(&ar, 0, sizeof(ar));
1950 ar.inode = inode;
1951 ar.goal = goal;
1952 ar.len = *count;
1953 ret = ext4_mb_new_blocks(handle, &ar, errp);
1954 *count = ar.len;
1955 return ret; 2010 return ret;
1956} 2011}
1957 2012
2013/*
2014 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
2015 *
2016 * @handle: handle to this transaction
2017 * @inode: file inode
2018 * @goal: given target block(filesystem wide)
2019 * @errp: error code
2020 *
2021 * Return allocated block number on success
2022 */
2023ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
2024 ext4_fsblk_t goal, int *errp)
2025{
2026 unsigned long count = 1;
2027 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
2028}
2029
2030/*
2031 * ext4_new_blocks() -- allocate data blocks
2032 *
2033 * @handle: handle to this transaction
2034 * @inode: file inode
2035 * @goal: given target block(filesystem wide)
2036 * @count: total number of blocks need
2037 * @errp: error code
2038 *
2039 * Return 1st allocated block numberon success, *count stores total account
2040 * error stores in errp pointer
2041 */
2042
2043ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
2044 ext4_lblk_t iblock, ext4_fsblk_t goal,
2045 unsigned long *count, int *errp)
2046{
2047 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
2048}
1958 2049
1959/** 2050/**
1960 * ext4_count_free_blocks() -- count filesystem free blocks 2051 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1986 continue; 2077 continue;
1987 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 2078 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1988 brelse(bitmap_bh); 2079 brelse(bitmap_bh);
1989 bitmap_bh = read_block_bitmap(sb, i); 2080 bitmap_bh = ext4_read_block_bitmap(sb, i);
1990 if (bitmap_bh == NULL) 2081 if (bitmap_bh == NULL)
1991 continue; 2082 continue;
1992 2083
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
129 struct buffer_head *bh = NULL; 129 struct buffer_head *bh = NULL;
130 130
131 map_bh.b_state = 0; 131 map_bh.b_state = 0;
132 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); 132 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
133 0, 0, 0);
133 if (err > 0) { 134 if (err > 0) {
134 pgoff_t index = map_bh.b_blocknr >> 135 pgoff_t index = map_bh.b_blocknr >>
135 (PAGE_CACHE_SHIFT - inode->i_blkbits); 136 (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
272 273
273 while (n) { 274 while (n) {
274 /* Do the node's children first */ 275 /* Do the node's children first */
275 if ((n)->rb_left) { 276 if (n->rb_left) {
276 n = n->rb_left; 277 n = n->rb_left;
277 continue; 278 continue;
278 } 279 }
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
301 parent->rb_right = NULL; 302 parent->rb_right = NULL;
302 n = parent; 303 n = parent;
303 } 304 }
304 root->rb_node = NULL;
305} 305}
306 306
307 307
308static struct dir_private_info *create_dir_info(loff_t pos) 308static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
309{ 309{
310 struct dir_private_info *p; 310 struct dir_private_info *p;
311 311
312 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); 312 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
313 if (!p) 313 if (!p)
314 return NULL; 314 return NULL;
315 p->root.rb_node = NULL;
316 p->curr_node = NULL;
317 p->extra_fname = NULL;
318 p->last_pos = 0;
319 p->curr_hash = pos2maj_hash(pos); 315 p->curr_hash = pos2maj_hash(pos);
320 p->curr_minor_hash = pos2min_hash(pos); 316 p->curr_minor_hash = pos2min_hash(pos);
321 p->next_hash = 0;
322 return p; 317 return p;
323} 318}
324 319
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
433 int ret; 428 int ret;
434 429
435 if (!info) { 430 if (!info) {
436 info = create_dir_info(filp->f_pos); 431 info = ext4_htree_create_dir_info(filp->f_pos);
437 if (!info) 432 if (!info)
438 return -ENOMEM; 433 return -ENOMEM;
439 filp->private_data = info; 434 filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
22#include "ext4_i.h" 22#include "ext4_i.h"
23 23
24/* 24/*
25 * The second extended filesystem constants/structures 25 * The fourth extended filesystem constants/structures
26 */ 26 */
27 27
28/* 28/*
@@ -45,7 +45,7 @@
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __FUNCTION__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk (KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
@@ -74,6 +74,9 @@
74#define EXT4_MB_HINT_GOAL_ONLY 256 74#define EXT4_MB_HINT_GOAL_ONLY 256
75/* goal is meaningful */ 75/* goal is meaningful */
76#define EXT4_MB_HINT_TRY_GOAL 512 76#define EXT4_MB_HINT_TRY_GOAL 512
77/* blocks already pre-reserved by delayed allocation */
78#define EXT4_MB_DELALLOC_RESERVED 1024
79
77 80
78struct ext4_allocation_request { 81struct ext4_allocation_request {
79 /* target inode for block we're allocating */ 82 /* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
170 __u32 bg_reserved2[3]; 173 __u32 bg_reserved2[3];
171}; 174};
172 175
176/*
177 * Structure of a flex block group info
178 */
179
180struct flex_groups {
181 __u32 free_inodes;
182 __u32 free_blocks;
183};
184
173#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ 185#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
174#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ 186#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
175#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ 187#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
527#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
528#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
529#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ 541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
530/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
531#ifndef _LINUX_EXT2_FS_H 544#ifndef _LINUX_EXT2_FS_H
532#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 545#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
647 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ 660 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
648 __le64 s_mmp_block; /* Block for multi-mount protection */ 661 __le64 s_mmp_block; /* Block for multi-mount protection */
649 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 662 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
650 __u32 s_reserved[163]; /* Padding to the end of the block */ 663 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
664 __u8 s_reserved_char_pad2;
665 __le16 s_reserved_pad;
666 __u32 s_reserved[162]; /* Padding to the end of the block */
651}; 667};
652 668
653#ifdef __KERNEL__ 669#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
958extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 974extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
959extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 975extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
960 ext4_group_t group); 976 ext4_group_t group);
961extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, 977extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
962 ext4_fsblk_t goal, int *errp); 978 ext4_fsblk_t goal, int *errp);
963extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, 979extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
964 ext4_fsblk_t goal, unsigned long *count, int *errp); 980 ext4_fsblk_t goal, unsigned long *count, int *errp);
965extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, 981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
966 ext4_fsblk_t goal, unsigned long *count, int *errp); 985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks);
967extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 988extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
968 ext4_fsblk_t block, unsigned long count, int metadata); 989 ext4_fsblk_t block, unsigned long count, int metadata);
969extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
1016extern void exit_ext4_mballoc(void); 1037extern void exit_ext4_mballoc(void);
1017extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1038extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1018 unsigned long, unsigned long, int, unsigned long *); 1039 unsigned long, unsigned long, int, unsigned long *);
1040extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
1041 ext4_group_t i, struct ext4_group_desc *desc);
1042extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1043 ext4_grpblk_t add);
1019 1044
1020 1045
1021/* inode.c */ 1046/* inode.c */
1047void ext4_da_release_space(struct inode *inode, int used, int to_free);
1022int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1048int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1023 struct buffer_head *bh, ext4_fsblk_t blocknr); 1049 struct buffer_head *bh, ext4_fsblk_t blocknr);
1024struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1050struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1033extern struct inode *ext4_iget(struct super_block *, unsigned long); 1059extern struct inode *ext4_iget(struct super_block *, unsigned long);
1034extern int ext4_write_inode (struct inode *, int); 1060extern int ext4_write_inode (struct inode *, int);
1035extern int ext4_setattr (struct dentry *, struct iattr *); 1061extern int ext4_setattr (struct dentry *, struct iattr *);
1062extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1063 struct kstat *stat);
1036extern void ext4_delete_inode (struct inode *); 1064extern void ext4_delete_inode (struct inode *);
1037extern int ext4_sync_inode (handle_t *, struct inode *); 1065extern int ext4_sync_inode (handle_t *, struct inode *);
1038extern void ext4_discard_reservation (struct inode *); 1066extern void ext4_discard_reservation (struct inode *);
1039extern void ext4_dirty_inode(struct inode *); 1067extern void ext4_dirty_inode(struct inode *);
1040extern int ext4_change_inode_journal_flag(struct inode *, int); 1068extern int ext4_change_inode_journal_flag(struct inode *, int);
1041extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1069extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1070extern int ext4_can_truncate(struct inode *inode);
1042extern void ext4_truncate (struct inode *); 1071extern void ext4_truncate (struct inode *);
1043extern void ext4_set_inode_flags(struct inode *); 1072extern void ext4_set_inode_flags(struct inode *);
1044extern void ext4_get_inode_flags(struct ext4_inode_info *); 1073extern void ext4_get_inode_flags(struct ext4_inode_info *);
1045extern void ext4_set_aops(struct inode *inode); 1074extern void ext4_set_aops(struct inode *inode);
1046extern int ext4_writepage_trans_blocks(struct inode *); 1075extern int ext4_writepage_trans_blocks(struct inode *);
1047extern int ext4_block_truncate_page(handle_t *handle, struct page *page, 1076extern int ext4_block_truncate_page(handle_t *handle,
1048 struct address_space *mapping, loff_t from); 1077 struct address_space *mapping, loff_t from);
1078extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1049 1079
1050/* ioctl.c */ 1080/* ioctl.c */
1051extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1081extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1159} 1189}
1160 1190
1161 1191
1192static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1193 ext4_group_t block_group)
1194{
1195 return block_group >> sbi->s_log_groups_per_flex;
1196}
1197
1198static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1199{
1200 return 1 << sbi->s_log_groups_per_flex;
1201}
1202
1162#define ext4_std_error(sb, errno) \ 1203#define ext4_std_error(sb, errno) \
1163do { \ 1204do { \
1164 if ((errno)) \ 1205 if ((errno)) \
1165 __ext4_std_error((sb), __FUNCTION__, (errno)); \ 1206 __ext4_std_error((sb), __func__, (errno)); \
1166} while (0) 1207} while (0)
1167 1208
1168/* 1209/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1191 ext4_lblk_t iblock, 1232 ext4_lblk_t iblock,
1192 unsigned long max_blocks, struct buffer_head *bh_result, 1233 unsigned long max_blocks, struct buffer_head *bh_result,
1193 int create, int extend_disksize); 1234 int create, int extend_disksize);
1194extern void ext4_ext_truncate(struct inode *, struct page *); 1235extern void ext4_ext_truncate(struct inode *);
1195extern void ext4_ext_init(struct super_block *); 1236extern void ext4_ext_init(struct super_block *);
1196extern void ext4_ext_release(struct super_block *); 1237extern void ext4_ext_release(struct super_block *);
1197extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1238extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1199extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1240extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1200 sector_t block, unsigned long max_blocks, 1241 sector_t block, unsigned long max_blocks,
1201 struct buffer_head *bh, int create, 1242 struct buffer_head *bh, int create,
1202 int extend_disksize); 1243 int extend_disksize, int flag);
1203#endif /* __KERNEL__ */ 1244#endif /* __KERNEL__ */
1204 1245
1205#endif /* _EXT4_H */ 1246#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
212 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 212 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
213} 213}
214 214
215extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
215extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 216extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
216extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 217extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
217extern int ext4_extent_tree_init(handle_t *, struct inode *); 218extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
79}; 79};
80 80
81/* 81/*
82 * third extended file system inode data in memory 82 * fourth extended file system inode data in memory
83 */ 83 */
84struct ext4_inode_info { 84struct ext4_inode_info {
85 __le32 i_data[15]; /* unconverted */ 85 __le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
150 */ 150 */
151 struct rw_semaphore i_data_sem; 151 struct rw_semaphore i_data_sem;
152 struct inode vfs_inode; 152 struct inode vfs_inode;
153 struct jbd2_inode jinode;
153 154
154 unsigned long i_ext_generation; 155 unsigned long i_ext_generation;
155 struct ext4_ext_cache i_cached_extent; 156 struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
162 /* mballoc */ 163 /* mballoc */
163 struct list_head i_prealloc_list; 164 struct list_head i_prealloc_list;
164 spinlock_t i_prealloc_lock; 165 spinlock_t i_prealloc_lock;
166
167 /* allocation reservation info for delalloc */
168 unsigned long i_reserved_data_blocks;
169 unsigned long i_reserved_meta_blocks;
170 unsigned long i_allocated_meta_blocks;
171 unsigned short i_delalloc_reserved_flag;
172 spinlock_t i_block_reservation_lock;
165}; 173};
166 174
167#endif /* _EXT4_I */ 175#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
142 handle_t *handle, struct buffer_head *bh); 142 handle_t *handle, struct buffer_head *bh);
143 143
144#define ext4_journal_get_undo_access(handle, bh) \ 144#define ext4_journal_get_undo_access(handle, bh) \
145 __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) 145 __ext4_journal_get_undo_access(__func__, (handle), (bh))
146#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
147 __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, (handle), (bh))
148#define ext4_journal_revoke(handle, blocknr, bh) \ 148#define ext4_journal_revoke(handle, blocknr, bh) \
149 __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) 149 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
150#define ext4_journal_get_create_access(handle, bh) \ 150#define ext4_journal_get_create_access(handle, bh) \
151 __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) 151 __ext4_journal_get_create_access(__func__, (handle), (bh))
152#define ext4_journal_dirty_metadata(handle, bh) \ 152#define ext4_journal_dirty_metadata(handle, bh) \
153 __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) 153 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
154#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
155 __ext4_journal_forget(__FUNCTION__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156
157int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
158 156
159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 157handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
160int __ext4_journal_stop(const char *where, handle_t *handle); 158int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
165} 163}
166 164
167#define ext4_journal_stop(handle) \ 165#define ext4_journal_stop(handle) \
168 __ext4_journal_stop(__FUNCTION__, (handle)) 166 __ext4_journal_stop(__func__, (handle))
169 167
170static inline handle_t *ext4_journal_current_handle(void) 168static inline handle_t *ext4_journal_current_handle(void)
171{ 169{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
192 return jbd2_journal_force_commit(journal); 190 return jbd2_journal_force_commit(journal);
193} 191}
194 192
193static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
194{
195 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
196}
197
195/* super.c */ 198/* super.c */
196int ext4_force_commit(struct super_block *sb); 199int ext4_force_commit(struct super_block *sb);
197 200
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
25#include <linux/rbtree.h> 25#include <linux/rbtree.h>
26 26
27/* 27/*
28 * third extended-fs super-block data in memory 28 * fourth extended-fs super-block data in memory
29 */ 29 */
30struct ext4_sb_info { 30struct ext4_sb_info {
31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */ 31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
143 143
144 /* locality groups */ 144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups; 145 struct ext4_locality_group *s_locality_groups;
146
147 unsigned int s_log_groups_per_flex;
148 struct flex_groups *s_flex_groups;
146}; 149};
147 150
148#endif /* _EXT4_SB */ 151#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93} 93}
94 94
95static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) 95static int ext4_ext_journal_restart(handle_t *handle, int needed)
96{ 96{
97 int err; 97 int err;
98 98
99 if (handle->h_buffer_credits > needed) 99 if (handle->h_buffer_credits > needed)
100 return handle; 100 return 0;
101 if (!ext4_journal_extend(handle, needed)) 101 err = ext4_journal_extend(handle, needed);
102 return handle; 102 if (err)
103 err = ext4_journal_restart(handle, needed); 103 return err;
104 104 return ext4_journal_restart(handle, needed);
105 return handle;
106} 105}
107 106
108/* 107/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
180 return bg_start + colour + block; 179 return bg_start + colour + block;
181} 180}
182 181
182/*
183 * Allocation for a meta data block
184 */
183static ext4_fsblk_t 185static ext4_fsblk_t
184ext4_ext_new_block(handle_t *handle, struct inode *inode, 186ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
185 struct ext4_ext_path *path, 187 struct ext4_ext_path *path,
186 struct ext4_extent *ex, int *err) 188 struct ext4_extent *ex, int *err)
187{ 189{
188 ext4_fsblk_t goal, newblock; 190 ext4_fsblk_t goal, newblock;
189 191
190 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 192 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
191 newblock = ext4_new_block(handle, inode, goal, err); 193 newblock = ext4_new_meta_block(handle, inode, goal, err);
192 return newblock; 194 return newblock;
193} 195}
194 196
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
246 return size; 248 return size;
247} 249}
248 250
251/*
252 * Calculate the number of metadata blocks needed
253 * to allocate @blocks
254 * Worse case is one block per extent
255 */
256int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
257{
258 int lcap, icap, rcap, leafs, idxs, num;
259 int newextents = blocks;
260
261 rcap = ext4_ext_space_root_idx(inode);
262 lcap = ext4_ext_space_block(inode);
263 icap = ext4_ext_space_block_idx(inode);
264
265 /* number of new leaf blocks needed */
266 num = leafs = (newextents + lcap - 1) / lcap;
267
268 /*
269 * Worse case, we need separate index block(s)
270 * to link all new leaf blocks
271 */
272 idxs = (leafs + icap - 1) / icap;
273 do {
274 num += idxs;
275 idxs = (idxs + icap - 1) / icap;
276 } while (idxs > rcap);
277
278 return num;
279}
280
249static int 281static int
250ext4_ext_max_entries(struct inode *inode, int depth) 282ext4_ext_max_entries(struct inode *inode, int depth)
251{ 283{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
524 alloc = 1; 556 alloc = 1;
525 } 557 }
526 path[0].p_hdr = eh; 558 path[0].p_hdr = eh;
559 path[0].p_bh = NULL;
527 560
528 i = depth; 561 i = depth;
529 /* walk through the tree */ 562 /* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
552 } 585 }
553 586
554 path[ppos].p_depth = i; 587 path[ppos].p_depth = i;
555 path[ppos].p_hdr = eh;
556 path[ppos].p_ext = NULL; 588 path[ppos].p_ext = NULL;
557 path[ppos].p_idx = NULL; 589 path[ppos].p_idx = NULL;
558 590
559 /* find extent */ 591 /* find extent */
560 ext4_ext_binsearch(inode, path + ppos, block); 592 ext4_ext_binsearch(inode, path + ppos, block);
593 /* if not an empty leaf */
594 if (path[ppos].p_ext)
595 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
561 596
562 ext4_ext_show_path(inode, path); 597 ext4_ext_show_path(inode, path);
563 598
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
688 /* allocate all needed blocks */ 723 /* allocate all needed blocks */
689 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 724 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
690 for (a = 0; a < depth - at; a++) { 725 for (a = 0; a < depth - at; a++) {
691 newblock = ext4_ext_new_block(handle, inode, path, newext, &err); 726 newblock = ext4_ext_new_meta_block(handle, inode, path,
727 newext, &err);
692 if (newblock == 0) 728 if (newblock == 0)
693 goto cleanup; 729 goto cleanup;
694 ablocks[a] = newblock; 730 ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
884 ext4_fsblk_t newblock; 920 ext4_fsblk_t newblock;
885 int err = 0; 921 int err = 0;
886 922
887 newblock = ext4_ext_new_block(handle, inode, path, newext, &err); 923 newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
888 if (newblock == 0) 924 if (newblock == 0)
889 return err; 925 return err;
890 926
@@ -981,6 +1017,8 @@ repeat:
981 /* if we found index with free entry, then use that 1017 /* if we found index with free entry, then use that
982 * entry: create all needed subtree and add new leaf */ 1018 * entry: create all needed subtree and add new leaf */
983 err = ext4_ext_split(handle, inode, path, newext, i); 1019 err = ext4_ext_split(handle, inode, path, newext, i);
1020 if (err)
1021 goto out;
984 1022
985 /* refill path */ 1023 /* refill path */
986 ext4_ext_drop_refs(path); 1024 ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1883 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 1921 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1884#endif 1922#endif
1885 1923
1886 handle = ext4_ext_journal_restart(handle, credits); 1924 err = ext4_ext_journal_restart(handle, credits);
1887 if (IS_ERR(handle)) { 1925 if (err)
1888 err = PTR_ERR(handle);
1889 goto out; 1926 goto out;
1890 }
1891 1927
1892 err = ext4_ext_get_access(handle, inode, path + depth); 1928 err = ext4_ext_get_access(handle, inode, path + depth);
1893 if (err) 1929 if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2529 int err = 0, depth, ret; 2565 int err = 0, depth, ret;
2530 unsigned long allocated = 0; 2566 unsigned long allocated = 0;
2531 struct ext4_allocation_request ar; 2567 struct ext4_allocation_request ar;
2568 loff_t disksize;
2532 2569
2533 __clear_bit(BH_New, &bh_result->b_state); 2570 __clear_bit(BH_New, &bh_result->b_state);
2534 ext_debug("blocks %u/%lu requested for inode %u\n", 2571 ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2616 */ 2653 */
2617 if (allocated > max_blocks) 2654 if (allocated > max_blocks)
2618 allocated = max_blocks; 2655 allocated = max_blocks;
2619 /* mark the buffer unwritten */ 2656 set_buffer_unwritten(bh_result);
2620 __set_bit(BH_Unwritten, &bh_result->b_state);
2621 goto out2; 2657 goto out2;
2622 } 2658 }
2623 2659
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2716 goto out2; 2752 goto out2;
2717 } 2753 }
2718 2754
2719 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2720 EXT4_I(inode)->i_disksize = inode->i_size;
2721
2722 /* previous routine could use block we allocated */ 2755 /* previous routine could use block we allocated */
2723 newblock = ext_pblock(&newex); 2756 newblock = ext_pblock(&newex);
2724 allocated = ext4_ext_get_actual_len(&newex); 2757 allocated = ext4_ext_get_actual_len(&newex);
2725outnew: 2758outnew:
2726 __set_bit(BH_New, &bh_result->b_state); 2759 if (extend_disksize) {
2760 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2761 if (disksize > i_size_read(inode))
2762 disksize = i_size_read(inode);
2763 if (disksize > EXT4_I(inode)->i_disksize)
2764 EXT4_I(inode)->i_disksize = disksize;
2765 }
2766
2767 set_buffer_new(bh_result);
2727 2768
2728 /* Cache only when it is _not_ an uninitialized extent */ 2769 /* Cache only when it is _not_ an uninitialized extent */
2729 if (create != EXT4_CREATE_UNINITIALIZED_EXT) 2770 if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
2733 if (allocated > max_blocks) 2774 if (allocated > max_blocks)
2734 allocated = max_blocks; 2775 allocated = max_blocks;
2735 ext4_ext_show_leaf(inode, path); 2776 ext4_ext_show_leaf(inode, path);
2736 __set_bit(BH_Mapped, &bh_result->b_state); 2777 set_buffer_mapped(bh_result);
2737 bh_result->b_bdev = inode->i_sb->s_bdev; 2778 bh_result->b_bdev = inode->i_sb->s_bdev;
2738 bh_result->b_blocknr = newblock; 2779 bh_result->b_blocknr = newblock;
2739out2: 2780out2:
@@ -2744,7 +2785,7 @@ out2:
2744 return err ? err : allocated; 2785 return err ? err : allocated;
2745} 2786}
2746 2787
2747void ext4_ext_truncate(struct inode * inode, struct page *page) 2788void ext4_ext_truncate(struct inode *inode)
2748{ 2789{
2749 struct address_space *mapping = inode->i_mapping; 2790 struct address_space *mapping = inode->i_mapping;
2750 struct super_block *sb = inode->i_sb; 2791 struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2757 */ 2798 */
2758 err = ext4_writepage_trans_blocks(inode) + 3; 2799 err = ext4_writepage_trans_blocks(inode) + 3;
2759 handle = ext4_journal_start(inode, err); 2800 handle = ext4_journal_start(inode, err);
2760 if (IS_ERR(handle)) { 2801 if (IS_ERR(handle))
2761 if (page) {
2762 clear_highpage(page);
2763 flush_dcache_page(page);
2764 unlock_page(page);
2765 page_cache_release(page);
2766 }
2767 return; 2802 return;
2768 }
2769 2803
2770 if (page) 2804 if (inode->i_size & (sb->s_blocksize - 1))
2771 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 2805 ext4_block_truncate_page(handle, mapping, inode->i_size);
2806
2807 if (ext4_orphan_add(handle, inode))
2808 goto out_stop;
2772 2809
2773 down_write(&EXT4_I(inode)->i_data_sem); 2810 down_write(&EXT4_I(inode)->i_data_sem);
2774 ext4_ext_invalidate_cache(inode); 2811 ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2780 * Probably we need not scan at all, 2817 * Probably we need not scan at all,
2781 * because page truncation is enough. 2818 * because page truncation is enough.
2782 */ 2819 */
2783 if (ext4_orphan_add(handle, inode))
2784 goto out_stop;
2785 2820
2786 /* we have to know where to truncate from in crash case */ 2821 /* we have to know where to truncate from in crash case */
2787 EXT4_I(inode)->i_disksize = inode->i_size; 2822 EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2798 handle->h_sync = 1; 2833 handle->h_sync = 1;
2799 2834
2800out_stop: 2835out_stop:
2836 up_write(&EXT4_I(inode)->i_data_sem);
2801 /* 2837 /*
2802 * If this was a simple ftruncate() and the file will remain alive, 2838 * If this was a simple ftruncate() and the file will remain alive,
2803 * then we need to clear up the orphan record which we created above. 2839 * then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
2808 if (inode->i_nlink) 2844 if (inode->i_nlink)
2809 ext4_orphan_del(handle, inode); 2845 ext4_orphan_del(handle, inode);
2810 2846
2811 up_write(&EXT4_I(inode)->i_data_sem);
2812 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 2847 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2813 ext4_mark_inode_dirty(handle, inode); 2848 ext4_mark_inode_dirty(handle, inode);
2814 ext4_journal_stop(handle); 2849 ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
2911 } 2946 }
2912 ret = ext4_get_blocks_wrap(handle, inode, block, 2947 ret = ext4_get_blocks_wrap(handle, inode, block,
2913 max_blocks, &map_bh, 2948 max_blocks, &map_bh,
2914 EXT4_CREATE_UNINITIALIZED_EXT, 0); 2949 EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
2915 if (ret <= 0) { 2950 if (ret <= 0) {
2916#ifdef EXT4FS_DEBUG 2951#ifdef EXT4FS_DEBUG
2917 WARN_ON(ret <= 0); 2952 WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
123 return ret; 123 return ret;
124} 124}
125 125
126static struct vm_operations_struct ext4_file_vm_ops = {
127 .fault = filemap_fault,
128 .page_mkwrite = ext4_page_mkwrite,
129};
130
131static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
132{
133 struct address_space *mapping = file->f_mapping;
134
135 if (!mapping->a_ops->readpage)
136 return -ENOEXEC;
137 file_accessed(file);
138 vma->vm_ops = &ext4_file_vm_ops;
139 vma->vm_flags |= VM_CAN_NONLINEAR;
140 return 0;
141}
142
126const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
127 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
128 .read = do_sync_read, 145 .read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
133#ifdef CONFIG_COMPAT 150#ifdef CONFIG_COMPAT
134 .compat_ioctl = ext4_compat_ioctl, 151 .compat_ioctl = ext4_compat_ioctl,
135#endif 152#endif
136 .mmap = generic_file_mmap, 153 .mmap = ext4_file_mmap,
137 .open = generic_file_open, 154 .open = generic_file_open,
138 .release = ext4_release_file, 155 .release = ext4_release_file,
139 .fsync = ext4_sync_file, 156 .fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
144const struct inode_operations ext4_file_inode_operations = { 161const struct inode_operations ext4_file_inode_operations = {
145 .truncate = ext4_truncate, 162 .truncate = ext4_truncate,
146 .setattr = ext4_setattr, 163 .setattr = ext4_setattr,
164 .getattr = ext4_getattr,
147#ifdef CONFIG_EXT4DEV_FS_XATTR 165#ifdef CONFIG_EXT4DEV_FS_XATTR
148 .setxattr = generic_setxattr, 166 .setxattr = generic_setxattr,
149 .getxattr = generic_getxattr, 167 .getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h>
30#include "ext4.h" 31#include "ext4.h"
31#include "ext4_jbd2.h" 32#include "ext4_jbd2.h"
32 33
@@ -45,6 +46,7 @@
45int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{ 47{
47 struct inode *inode = dentry->d_inode; 48 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
48 int ret = 0; 50 int ret = 0;
49 51
50 J_ASSERT(ext4_journal_current_handle() == NULL); 52 J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
85 .nr_to_write = 0, /* sys_fsync did this */ 87 .nr_to_write = 0, /* sys_fsync did this */
86 }; 88 };
87 ret = sync_inode(inode, &wbc); 89 ret = sync_inode(inode, &wbc);
90 if (journal && (journal->j_flags & JBD2_BARRIER))
91 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
88 } 92 }
89out: 93out:
90 return ret; 94 return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
13 struct ext4_group_desc *gdp); 13 struct ext4_group_desc *gdp);
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, 14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp); 15 struct ext4_group_desc *gdp);
16struct buffer_head *read_block_bitmap(struct super_block *sb, 16struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
17 ext4_group_t block_group); 17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb, 18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh, 19 struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
157 struct ext4_super_block * es; 157 struct ext4_super_block * es;
158 struct ext4_sb_info *sbi; 158 struct ext4_sb_info *sbi;
159 int fatal = 0, err; 159 int fatal = 0, err;
160 ext4_group_t flex_group;
160 161
161 if (atomic_read(&inode->i_count) > 1) { 162 if (atomic_read(&inode->i_count) > 1) {
162 printk ("ext4_free_inode: inode has count=%d\n", 163 printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
232 if (is_directory) 233 if (is_directory)
233 percpu_counter_dec(&sbi->s_dirs_counter); 234 percpu_counter_dec(&sbi->s_dirs_counter);
234 235
236 if (sbi->s_log_groups_per_flex) {
237 flex_group = ext4_flex_group(sbi, block_group);
238 spin_lock(sb_bgl_lock(sbi, flex_group));
239 sbi->s_flex_groups[flex_group].free_inodes++;
240 spin_unlock(sb_bgl_lock(sbi, flex_group));
241 }
235 } 242 }
236 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 243 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
237 err = ext4_journal_dirty_metadata(handle, bh2); 244 err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
286 return ret; 293 return ret;
287} 294}
288 295
296#define free_block_ratio 10
297
298static int find_group_flex(struct super_block *sb, struct inode *parent,
299 ext4_group_t *best_group)
300{
301 struct ext4_sb_info *sbi = EXT4_SB(sb);
302 struct ext4_group_desc *desc;
303 struct buffer_head *bh;
304 struct flex_groups *flex_group = sbi->s_flex_groups;
305 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
306 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
307 ext4_group_t ngroups = sbi->s_groups_count;
308 int flex_size = ext4_flex_bg_size(sbi);
309 ext4_group_t best_flex = parent_fbg_group;
310 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
311 int flexbg_free_blocks;
312 int flex_freeb_ratio;
313 ext4_group_t n_fbg_groups;
314 ext4_group_t i;
315
316 n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
317 sbi->s_log_groups_per_flex;
318
319find_close_to_parent:
320 flexbg_free_blocks = flex_group[best_flex].free_blocks;
321 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
322 if (flex_group[best_flex].free_inodes &&
323 flex_freeb_ratio > free_block_ratio)
324 goto found_flexbg;
325
326 if (best_flex && best_flex == parent_fbg_group) {
327 best_flex--;
328 goto find_close_to_parent;
329 }
330
331 for (i = 0; i < n_fbg_groups; i++) {
332 if (i == parent_fbg_group || i == parent_fbg_group - 1)
333 continue;
334
335 flexbg_free_blocks = flex_group[i].free_blocks;
336 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
337
338 if (flex_freeb_ratio > free_block_ratio &&
339 flex_group[i].free_inodes) {
340 best_flex = i;
341 goto found_flexbg;
342 }
343
344 if (best_flex < 0 ||
345 (flex_group[i].free_blocks >
346 flex_group[best_flex].free_blocks &&
347 flex_group[i].free_inodes))
348 best_flex = i;
349 }
350
351 if (!flex_group[best_flex].free_inodes ||
352 !flex_group[best_flex].free_blocks)
353 return -1;
354
355found_flexbg:
356 for (i = best_flex * flex_size; i < ngroups &&
357 i < (best_flex + 1) * flex_size; i++) {
358 desc = ext4_get_group_desc(sb, i, &bh);
359 if (le16_to_cpu(desc->bg_free_inodes_count)) {
360 *best_group = i;
361 goto out;
362 }
363 }
364
365 return -1;
366out:
367 return 0;
368}
369
289/* 370/*
290 * Orlov's allocator for directories. 371 * Orlov's allocator for directories.
291 * 372 *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
501 struct inode *ret; 582 struct inode *ret;
502 ext4_group_t i; 583 ext4_group_t i;
503 int free = 0; 584 int free = 0;
585 ext4_group_t flex_group;
504 586
505 /* Cannot create files in a deleted directory */ 587 /* Cannot create files in a deleted directory */
506 if (!dir || !dir->i_nlink) 588 if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
514 596
515 sbi = EXT4_SB(sb); 597 sbi = EXT4_SB(sb);
516 es = sbi->s_es; 598 es = sbi->s_es;
599
600 if (sbi->s_log_groups_per_flex) {
601 ret2 = find_group_flex(sb, dir, &group);
602 goto got_group;
603 }
604
517 if (S_ISDIR(mode)) { 605 if (S_ISDIR(mode)) {
518 if (test_opt (sb, OLDALLOC)) 606 if (test_opt (sb, OLDALLOC))
519 ret2 = find_group_dir(sb, dir, &group); 607 ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
522 } else 610 } else
523 ret2 = find_group_other(sb, dir, &group); 611 ret2 = find_group_other(sb, dir, &group);
524 612
613got_group:
525 err = -ENOSPC; 614 err = -ENOSPC;
526 if (ret2 == -1) 615 if (ret2 == -1)
527 goto out; 616 goto out;
@@ -600,7 +689,7 @@ got:
600 /* We may have to initialize the block bitmap if it isn't already */ 689 /* We may have to initialize the block bitmap if it isn't already */
601 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 690 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
602 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 691 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
603 struct buffer_head *block_bh = read_block_bitmap(sb, group); 692 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
604 693
605 BUFFER_TRACE(block_bh, "get block bitmap access"); 694 BUFFER_TRACE(block_bh, "get block bitmap access");
606 err = ext4_journal_get_write_access(handle, block_bh); 695 err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
676 percpu_counter_inc(&sbi->s_dirs_counter); 765 percpu_counter_inc(&sbi->s_dirs_counter);
677 sb->s_dirt = 1; 766 sb->s_dirt = 1;
678 767
768 if (sbi->s_log_groups_per_flex) {
769 flex_group = ext4_flex_group(sbi, group);
770 spin_lock(sb_bgl_lock(sbi, flex_group));
771 sbi->s_flex_groups[flex_group].free_inodes--;
772 spin_unlock(sb_bgl_lock(sbi, flex_group));
773 }
774
679 inode->i_uid = current->fsuid; 775 inode->i_uid = current->fsuid;
680 if (test_opt (sb, GRPID)) 776 if (test_opt (sb, GRPID))
681 inode->i_gid = dir->i_gid; 777 inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
740 goto fail_free_drop; 836 goto fail_free_drop;
741 837
742 if (test_opt(sb, EXTENTS)) { 838 if (test_opt(sb, EXTENTS)) {
743 /* set extent flag only for diretory, file and normal symlink*/ 839 /* set extent flag only for directory, file and normal symlink*/
744 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 840 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
745 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 841 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
746 ext4_ext_tree_init(handle, inode); 842 ext4_ext_tree_init(handle, inode);
747 err = ext4_update_incompat_feature(handle, sb,
748 EXT4_FEATURE_INCOMPAT_EXTENTS);
749 if (err)
750 goto fail_free_drop;
751 } 843 }
752 } 844 }
753 845
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
817 if (IS_ERR(inode)) 909 if (IS_ERR(inode))
818 goto iget_failed; 910 goto iget_failed;
819 911
912 /*
913 * If the orphans has i_nlinks > 0 then it should be able to be
914 * truncated, otherwise it won't be removed from the orphan list
915 * during processing and an infinite loop will result.
916 */
917 if (inode->i_nlink && !ext4_can_truncate(inode))
918 goto bad_orphan;
919
820 if (NEXT_ORPHAN(inode) > max_ino) 920 if (NEXT_ORPHAN(inode) > max_ino)
821 goto bad_orphan; 921 goto bad_orphan;
822 brelse(bitmap_bh); 922 brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
838 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", 938 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
839 NEXT_ORPHAN(inode)); 939 NEXT_ORPHAN(inode));
840 printk(KERN_NOTICE "max_ino=%lu\n", max_ino); 940 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
941 printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
841 /* Avoid freeing blocks if we got a bad deleted inode */ 942 /* Avoid freeing blocks if we got a bad deleted inode */
842 if (inode->i_nlink == 0) 943 if (inode->i_nlink == 0)
843 inode->i_blocks = 0; 944 inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h>
35#include <linux/mpage.h> 36#include <linux/mpage.h>
36#include <linux/uio.h> 37#include <linux/uio.h>
37#include <linux/bio.h> 38#include <linux/bio.h>
38#include "ext4_jbd2.h" 39#include "ext4_jbd2.h"
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h"
43
44static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size)
46{
47 return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
48 new_size);
49}
50
51static void ext4_invalidatepage(struct page *page, unsigned long offset);
41 52
42/* 53/*
43 * Test whether an inode is a fast symlink. 54 * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
181{ 192{
182 handle_t *handle; 193 handle_t *handle;
183 194
195 if (ext4_should_order_data(inode))
196 ext4_begin_ordered_truncate(inode, 0);
184 truncate_inode_pages(&inode->i_data, 0); 197 truncate_inode_pages(&inode->i_data, 0);
185 198
186 if (is_bad_inode(inode)) 199 if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
508 * direct blocks 521 * direct blocks
509 */ 522 */
510static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 523static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
511 ext4_fsblk_t goal, int indirect_blks, int blks, 524 ext4_lblk_t iblock, ext4_fsblk_t goal,
512 ext4_fsblk_t new_blocks[4], int *err) 525 int indirect_blks, int blks,
526 ext4_fsblk_t new_blocks[4], int *err)
513{ 527{
514 int target, i; 528 int target, i;
515 unsigned long count = 0; 529 unsigned long count = 0, blk_allocated = 0;
516 int index = 0; 530 int index = 0;
517 ext4_fsblk_t current_block = 0; 531 ext4_fsblk_t current_block = 0;
518 int ret = 0; 532 int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
525 * the first direct block of this branch. That's the 539 * the first direct block of this branch. That's the
526 * minimum number of blocks need to allocate(required) 540 * minimum number of blocks need to allocate(required)
527 */ 541 */
528 target = blks + indirect_blks; 542 /* first we try to allocate the indirect blocks */
529 543 target = indirect_blks;
530 while (1) { 544 while (target > 0) {
531 count = target; 545 count = target;
532 /* allocating blocks for indirect blocks and direct blocks */ 546 /* allocating blocks for indirect blocks and direct blocks */
533 current_block = ext4_new_blocks(handle,inode,goal,&count,err); 547 current_block = ext4_new_meta_blocks(handle, inode,
548 goal, &count, err);
534 if (*err) 549 if (*err)
535 goto failed_out; 550 goto failed_out;
536 551
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
540 new_blocks[index++] = current_block++; 555 new_blocks[index++] = current_block++;
541 count--; 556 count--;
542 } 557 }
543 558 if (count > 0) {
544 if (count > 0) 559 /*
560 * save the new block number
561 * for the first direct block
562 */
563 new_blocks[index] = current_block;
564 printk(KERN_INFO "%s returned more blocks than "
565 "requested\n", __func__);
566 WARN_ON(1);
545 break; 567 break;
568 }
546 } 569 }
547 570
548 /* save the new block number for the first direct block */ 571 target = blks - count ;
549 new_blocks[index] = current_block; 572 blk_allocated = count;
550 573 if (!target)
574 goto allocated;
575 /* Now allocate data blocks */
576 count = target;
577 /* allocating blocks for data blocks */
578 current_block = ext4_new_blocks(handle, inode, iblock,
579 goal, &count, err);
580 if (*err && (target == blks)) {
581 /*
582 * if the allocation failed and we didn't allocate
583 * any blocks before
584 */
585 goto failed_out;
586 }
587 if (!*err) {
588 if (target == blks) {
589 /*
590 * save the new block number
591 * for the first direct block
592 */
593 new_blocks[index] = current_block;
594 }
595 blk_allocated += count;
596 }
597allocated:
551 /* total number of blocks allocated for direct blocks */ 598 /* total number of blocks allocated for direct blocks */
552 ret = count; 599 ret = blk_allocated;
553 *err = 0; 600 *err = 0;
554 return ret; 601 return ret;
555failed_out: 602failed_out:
@@ -584,8 +631,9 @@ failed_out:
584 * as described above and return 0. 631 * as described above and return 0.
585 */ 632 */
586static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 633static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
587 int indirect_blks, int *blks, ext4_fsblk_t goal, 634 ext4_lblk_t iblock, int indirect_blks,
588 ext4_lblk_t *offsets, Indirect *branch) 635 int *blks, ext4_fsblk_t goal,
636 ext4_lblk_t *offsets, Indirect *branch)
589{ 637{
590 int blocksize = inode->i_sb->s_blocksize; 638 int blocksize = inode->i_sb->s_blocksize;
591 int i, n = 0; 639 int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
595 ext4_fsblk_t new_blocks[4]; 643 ext4_fsblk_t new_blocks[4];
596 ext4_fsblk_t current_block; 644 ext4_fsblk_t current_block;
597 645
598 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, 646 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
599 *blks, new_blocks, &err); 647 *blks, new_blocks, &err);
600 if (err) 648 if (err)
601 return err; 649 return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
799 struct ext4_inode_info *ei = EXT4_I(inode); 847 struct ext4_inode_info *ei = EXT4_I(inode);
800 int count = 0; 848 int count = 0;
801 ext4_fsblk_t first_block = 0; 849 ext4_fsblk_t first_block = 0;
850 loff_t disksize;
802 851
803 852
804 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 853 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
855 /* 904 /*
856 * Block out ext4_truncate while we alter the tree 905 * Block out ext4_truncate while we alter the tree
857 */ 906 */
858 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, 907 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
859 offsets + (partial - chain), partial); 908 &count, goal,
909 offsets + (partial - chain), partial);
860 910
861 /* 911 /*
862 * The ext4_splice_branch call will free and forget any buffers 912 * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
873 * protect it if you're about to implement concurrent 923 * protect it if you're about to implement concurrent
874 * ext4_get_block() -bzzz 924 * ext4_get_block() -bzzz
875 */ 925 */
876 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 926 if (!err && extend_disksize) {
877 ei->i_disksize = inode->i_size; 927 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
928 if (disksize > i_size_read(inode))
929 disksize = i_size_read(inode);
930 if (disksize > ei->i_disksize)
931 ei->i_disksize = disksize;
932 }
878 if (err) 933 if (err)
879 goto cleanup; 934 goto cleanup;
880 935
@@ -934,7 +989,7 @@ out:
934 */ 989 */
935int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 990int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
936 unsigned long max_blocks, struct buffer_head *bh, 991 unsigned long max_blocks, struct buffer_head *bh,
937 int create, int extend_disksize) 992 int create, int extend_disksize, int flag)
938{ 993{
939 int retval; 994 int retval;
940 995
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
975 * with create == 1 flag. 1030 * with create == 1 flag.
976 */ 1031 */
977 down_write((&EXT4_I(inode)->i_data_sem)); 1032 down_write((&EXT4_I(inode)->i_data_sem));
1033
1034 /*
1035 * if the caller is from delayed allocation writeout path
1036 * we have already reserved fs blocks for allocation
1037 * let the underlying get_block() function know to
1038 * avoid double accounting
1039 */
1040 if (flag)
1041 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
978 /* 1042 /*
979 * We need to check for EXT4 here because migrate 1043 * We need to check for EXT4 here because migrate
980 * could have changed the inode type in between 1044 * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
996 ~EXT4_EXT_MIGRATE; 1060 ~EXT4_EXT_MIGRATE;
997 } 1061 }
998 } 1062 }
1063
1064 if (flag) {
1065 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1066 /*
1067 * Update reserved blocks/metadata blocks
1068 * after successful block allocation
1069 * which were deferred till now
1070 */
1071 if ((retval > 0) && buffer_delay(bh))
1072 ext4_da_release_space(inode, retval, 0);
1073 }
1074
999 up_write((&EXT4_I(inode)->i_data_sem)); 1075 up_write((&EXT4_I(inode)->i_data_sem));
1000 return retval; 1076 return retval;
1001} 1077}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
1021 } 1097 }
1022 1098
1023 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1099 ret = ext4_get_blocks_wrap(handle, inode, iblock,
1024 max_blocks, bh_result, create, 0); 1100 max_blocks, bh_result, create, 0, 0);
1025 if (ret > 0) { 1101 if (ret > 0) {
1026 bh_result->b_size = (ret << inode->i_blkbits); 1102 bh_result->b_size = (ret << inode->i_blkbits);
1027 ret = 0; 1103 ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1047 dummy.b_blocknr = -1000; 1123 dummy.b_blocknr = -1000;
1048 buffer_trace_init(&dummy.b_history); 1124 buffer_trace_init(&dummy.b_history);
1049 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1125 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1050 &dummy, create, 1); 1126 &dummy, create, 1, 0);
1051 /* 1127 /*
1052 * ext4_get_blocks_handle() returns number of blocks 1128 * ext4_get_blocks_handle() returns number of blocks
1053 * mapped. 0 in case of a HOLE. 1129 * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1203 to = from + len; 1279 to = from + len;
1204 1280
1205retry: 1281retry:
1206 page = __grab_cache_page(mapping, index);
1207 if (!page)
1208 return -ENOMEM;
1209 *pagep = page;
1210
1211 handle = ext4_journal_start(inode, needed_blocks); 1282 handle = ext4_journal_start(inode, needed_blocks);
1212 if (IS_ERR(handle)) { 1283 if (IS_ERR(handle)) {
1213 unlock_page(page);
1214 page_cache_release(page);
1215 ret = PTR_ERR(handle); 1284 ret = PTR_ERR(handle);
1216 goto out; 1285 goto out;
1217 } 1286 }
1218 1287
1288 page = __grab_cache_page(mapping, index);
1289 if (!page) {
1290 ext4_journal_stop(handle);
1291 ret = -ENOMEM;
1292 goto out;
1293 }
1294 *pagep = page;
1295
1219 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1296 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1220 ext4_get_block); 1297 ext4_get_block);
1221 1298
@@ -1225,8 +1302,8 @@ retry:
1225 } 1302 }
1226 1303
1227 if (ret) { 1304 if (ret) {
1228 ext4_journal_stop(handle);
1229 unlock_page(page); 1305 unlock_page(page);
1306 ext4_journal_stop(handle);
1230 page_cache_release(page); 1307 page_cache_release(page);
1231 } 1308 }
1232 1309
@@ -1236,15 +1313,6 @@ out:
1236 return ret; 1313 return ret;
1237} 1314}
1238 1315
1239int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1240{
1241 int err = jbd2_journal_dirty_data(handle, bh);
1242 if (err)
1243 ext4_journal_abort_handle(__func__, __func__,
1244 bh, handle, err);
1245 return err;
1246}
1247
1248/* For write_end() in data=journal mode */ 1316/* For write_end() in data=journal mode */
1249static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1317static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1250{ 1318{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1255} 1323}
1256 1324
1257/* 1325/*
1258 * Generic write_end handler for ordered and writeback ext4 journal modes.
1259 * We can't use generic_write_end, because that unlocks the page and we need to
1260 * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
1261 * after block_write_end.
1262 */
1263static int ext4_generic_write_end(struct file *file,
1264 struct address_space *mapping,
1265 loff_t pos, unsigned len, unsigned copied,
1266 struct page *page, void *fsdata)
1267{
1268 struct inode *inode = file->f_mapping->host;
1269
1270 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1271
1272 if (pos+copied > inode->i_size) {
1273 i_size_write(inode, pos+copied);
1274 mark_inode_dirty(inode);
1275 }
1276
1277 return copied;
1278}
1279
1280/*
1281 * We need to pick up the new inode size which generic_commit_write gave us 1326 * We need to pick up the new inode size which generic_commit_write gave us
1282 * `file' can be NULL - eg, when called from page_symlink(). 1327 * `file' can be NULL - eg, when called from page_symlink().
1283 * 1328 *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
1290 struct page *page, void *fsdata) 1335 struct page *page, void *fsdata)
1291{ 1336{
1292 handle_t *handle = ext4_journal_current_handle(); 1337 handle_t *handle = ext4_journal_current_handle();
1293 struct inode *inode = file->f_mapping->host; 1338 struct inode *inode = mapping->host;
1294 unsigned from, to; 1339 unsigned from, to;
1295 int ret = 0, ret2; 1340 int ret = 0, ret2;
1296 1341
1297 from = pos & (PAGE_CACHE_SIZE - 1); 1342 from = pos & (PAGE_CACHE_SIZE - 1);
1298 to = from + len; 1343 to = from + len;
1299 1344
1300 ret = walk_page_buffers(handle, page_buffers(page), 1345 ret = ext4_jbd2_file_inode(handle, inode);
1301 from, to, NULL, ext4_journal_dirty_data);
1302 1346
1303 if (ret == 0) { 1347 if (ret == 0) {
1304 /* 1348 /*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
1311 new_i_size = pos + copied; 1355 new_i_size = pos + copied;
1312 if (new_i_size > EXT4_I(inode)->i_disksize) 1356 if (new_i_size > EXT4_I(inode)->i_disksize)
1313 EXT4_I(inode)->i_disksize = new_i_size; 1357 EXT4_I(inode)->i_disksize = new_i_size;
1314 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1358 ret2 = generic_write_end(file, mapping, pos, len, copied,
1315 page, fsdata); 1359 page, fsdata);
1316 copied = ret2; 1360 copied = ret2;
1317 if (ret2 < 0) 1361 if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
1320 ret2 = ext4_journal_stop(handle); 1364 ret2 = ext4_journal_stop(handle);
1321 if (!ret) 1365 if (!ret)
1322 ret = ret2; 1366 ret = ret2;
1323 unlock_page(page);
1324 page_cache_release(page);
1325 1367
1326 return ret ? ret : copied; 1368 return ret ? ret : copied;
1327} 1369}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
1332 struct page *page, void *fsdata) 1374 struct page *page, void *fsdata)
1333{ 1375{
1334 handle_t *handle = ext4_journal_current_handle(); 1376 handle_t *handle = ext4_journal_current_handle();
1335 struct inode *inode = file->f_mapping->host; 1377 struct inode *inode = mapping->host;
1336 int ret = 0, ret2; 1378 int ret = 0, ret2;
1337 loff_t new_i_size; 1379 loff_t new_i_size;
1338 1380
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
1340 if (new_i_size > EXT4_I(inode)->i_disksize) 1382 if (new_i_size > EXT4_I(inode)->i_disksize)
1341 EXT4_I(inode)->i_disksize = new_i_size; 1383 EXT4_I(inode)->i_disksize = new_i_size;
1342 1384
1343 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1385 ret2 = generic_write_end(file, mapping, pos, len, copied,
1344 page, fsdata); 1386 page, fsdata);
1345 copied = ret2; 1387 copied = ret2;
1346 if (ret2 < 0) 1388 if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
1349 ret2 = ext4_journal_stop(handle); 1391 ret2 = ext4_journal_stop(handle);
1350 if (!ret) 1392 if (!ret)
1351 ret = ret2; 1393 ret = ret2;
1352 unlock_page(page);
1353 page_cache_release(page);
1354 1394
1355 return ret ? ret : copied; 1395 return ret ? ret : copied;
1356} 1396}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
1389 ret = ret2; 1429 ret = ret2;
1390 } 1430 }
1391 1431
1432 unlock_page(page);
1392 ret2 = ext4_journal_stop(handle); 1433 ret2 = ext4_journal_stop(handle);
1393 if (!ret) 1434 if (!ret)
1394 ret = ret2; 1435 ret = ret2;
1395 unlock_page(page);
1396 page_cache_release(page); 1436 page_cache_release(page);
1397 1437
1398 return ret ? ret : copied; 1438 return ret ? ret : copied;
1399} 1439}
1440/*
1441 * Calculate the number of metadata blocks that need to be reserved
1442 * to allocate @blocks for a non-extent-based file
1443 */
1444static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1445{
1446 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1447 int ind_blks, dind_blks, tind_blks;
1448
1449 /* number of new indirect blocks needed */
1450 ind_blks = (blocks + icap - 1) / icap;
1451
1452 dind_blks = (ind_blks + icap - 1) / icap;
1453
1454 tind_blks = 1;
1455
1456 return ind_blks + dind_blks + tind_blks;
1457}
1458
1459/*
1460 * Calculate the number of metadata blocks need to reserve
1461 * Calculate the number of metadata blocks that need to be reserved
1462 */
1463static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1464{
1465 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1466 return ext4_ext_calc_metadata_amount(inode, blocks);
1467
1468 return ext4_indirect_calc_metadata_amount(inode, blocks);
1469}
1470
1471static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1472{
1473 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1474 unsigned long md_needed, mdblocks, total = 0;
1475
1476 /*
1477 * recalculate the amount of metadata blocks to reserve
1478 * in order to allocate nrblocks
1479 * worse case is one extent per block
1480 */
1481 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1482 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1483 mdblocks = ext4_calc_metadata_amount(inode, total);
1484 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
1485
1486 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1487 total = md_needed + nrblocks;
1488
1489 if (ext4_has_free_blocks(sbi, total) < total) {
1490 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1491 return -ENOSPC;
1492 }
1493
1494 /* reduce fs free blocks counter */
1495 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1496
1497 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1498 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1499
1500 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1501 return 0; /* success */
1502}
1503
1504void ext4_da_release_space(struct inode *inode, int used, int to_free)
1505{
1506 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1507 int total, mdb, mdb_free, release;
1508
1509 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1510 /* recalculate the number of metablocks still need to be reserved */
1511 total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
1512 mdb = ext4_calc_metadata_amount(inode, total);
1513
1514 /* figure out how many metablocks to release */
1515 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1516 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1517
1518 /* Account for allocated meta_blocks */
1519 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1520
1521 release = to_free + mdb_free;
1522
1523 /* update fs free blocks counter for truncate case */
1524 percpu_counter_add(&sbi->s_freeblocks_counter, release);
1525
1526 /* update per-inode reservations */
1527 BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
1528 EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
1529
1530 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1531 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1532 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1533 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1534}
1535
1536static void ext4_da_page_release_reservation(struct page *page,
1537 unsigned long offset)
1538{
1539 int to_release = 0;
1540 struct buffer_head *head, *bh;
1541 unsigned int curr_off = 0;
1542
1543 head = page_buffers(page);
1544 bh = head;
1545 do {
1546 unsigned int next_off = curr_off + bh->b_size;
1547
1548 if ((offset <= curr_off) && (buffer_delay(bh))) {
1549 to_release++;
1550 clear_buffer_delay(bh);
1551 }
1552 curr_off = next_off;
1553 } while ((bh = bh->b_this_page) != head);
1554 ext4_da_release_space(page->mapping->host, 0, to_release);
1555}
1556
1557/*
1558 * Delayed allocation stuff
1559 */
1560
1561struct mpage_da_data {
1562 struct inode *inode;
1563 struct buffer_head lbh; /* extent of blocks */
1564 unsigned long first_page, next_page; /* extent of pages */
1565 get_block_t *get_block;
1566 struct writeback_control *wbc;
1567};
1568
1569/*
1570 * mpage_da_submit_io - walks through extent of pages and try to write
1571 * them with __mpage_writepage()
1572 *
1573 * @mpd->inode: inode
1574 * @mpd->first_page: first page of the extent
1575 * @mpd->next_page: page after the last page of the extent
1576 * @mpd->get_block: the filesystem's block mapper function
1577 *
1578 * By the time mpage_da_submit_io() is called we expect all blocks
1579 * to be allocated. this may be wrong if allocation failed.
1580 *
1581 * As pages are already locked by write_cache_pages(), we can't use it
1582 */
1583static int mpage_da_submit_io(struct mpage_da_data *mpd)
1584{
1585 struct address_space *mapping = mpd->inode->i_mapping;
1586 struct mpage_data mpd_pp = {
1587 .bio = NULL,
1588 .last_block_in_bio = 0,
1589 .get_block = mpd->get_block,
1590 .use_writepage = 1,
1591 };
1592 int ret = 0, err, nr_pages, i;
1593 unsigned long index, end;
1594 struct pagevec pvec;
1595
1596 BUG_ON(mpd->next_page <= mpd->first_page);
1597
1598 pagevec_init(&pvec, 0);
1599 index = mpd->first_page;
1600 end = mpd->next_page - 1;
1601
1602 while (index <= end) {
1603 /* XXX: optimize tail */
1604 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1605 if (nr_pages == 0)
1606 break;
1607 for (i = 0; i < nr_pages; i++) {
1608 struct page *page = pvec.pages[i];
1609
1610 index = page->index;
1611 if (index > end)
1612 break;
1613 index++;
1614
1615 err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
1616
1617 /*
1618 * In error case, we have to continue because
1619 * remaining pages are still locked
1620 * XXX: unlock and re-dirty them?
1621 */
1622 if (ret == 0)
1623 ret = err;
1624 }
1625 pagevec_release(&pvec);
1626 }
1627 if (mpd_pp.bio)
1628 mpage_bio_submit(WRITE, mpd_pp.bio);
1629
1630 return ret;
1631}
1632
1633/*
1634 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1635 *
1636 * @mpd->inode - inode to walk through
1637 * @exbh->b_blocknr - first block on a disk
1638 * @exbh->b_size - amount of space in bytes
1639 * @logical - first logical block to start assignment with
1640 *
1641 * the function goes through all passed space and put actual disk
1642 * block numbers into buffer heads, dropping BH_Delay
1643 */
1644static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1645 struct buffer_head *exbh)
1646{
1647 struct inode *inode = mpd->inode;
1648 struct address_space *mapping = inode->i_mapping;
1649 int blocks = exbh->b_size >> inode->i_blkbits;
1650 sector_t pblock = exbh->b_blocknr, cur_logical;
1651 struct buffer_head *head, *bh;
1652 unsigned long index, end;
1653 struct pagevec pvec;
1654 int nr_pages, i;
1655
1656 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1657 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1658 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1659
1660 pagevec_init(&pvec, 0);
1661
1662 while (index <= end) {
1663 /* XXX: optimize tail */
1664 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1665 if (nr_pages == 0)
1666 break;
1667 for (i = 0; i < nr_pages; i++) {
1668 struct page *page = pvec.pages[i];
1669
1670 index = page->index;
1671 if (index > end)
1672 break;
1673 index++;
1674
1675 BUG_ON(!PageLocked(page));
1676 BUG_ON(PageWriteback(page));
1677 BUG_ON(!page_has_buffers(page));
1678
1679 bh = page_buffers(page);
1680 head = bh;
1681
1682 /* skip blocks out of the range */
1683 do {
1684 if (cur_logical >= logical)
1685 break;
1686 cur_logical++;
1687 } while ((bh = bh->b_this_page) != head);
1688
1689 do {
1690 if (cur_logical >= logical + blocks)
1691 break;
1692 if (buffer_delay(bh)) {
1693 bh->b_blocknr = pblock;
1694 clear_buffer_delay(bh);
1695 } else if (buffer_mapped(bh))
1696 BUG_ON(bh->b_blocknr != pblock);
1697
1698 cur_logical++;
1699 pblock++;
1700 } while ((bh = bh->b_this_page) != head);
1701 }
1702 pagevec_release(&pvec);
1703 }
1704}
1705
1706
1707/*
1708 * __unmap_underlying_blocks - just a helper function to unmap
1709 * set of blocks described by @bh
1710 */
1711static inline void __unmap_underlying_blocks(struct inode *inode,
1712 struct buffer_head *bh)
1713{
1714 struct block_device *bdev = inode->i_sb->s_bdev;
1715 int blocks, i;
1716
1717 blocks = bh->b_size >> inode->i_blkbits;
1718 for (i = 0; i < blocks; i++)
1719 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1720}
1721
1722/*
1723 * mpage_da_map_blocks - go through given space
1724 *
1725 * @mpd->lbh - bh describing space
1726 * @mpd->get_block - the filesystem's block mapper function
1727 *
1728 * The function skips space we know is already mapped to disk blocks.
1729 *
1730 * The function ignores errors ->get_block() returns, thus real
1731 * error handling is postponed to __mpage_writepage()
1732 */
1733static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1734{
1735 struct buffer_head *lbh = &mpd->lbh;
1736 int err = 0, remain = lbh->b_size;
1737 sector_t next = lbh->b_blocknr;
1738 struct buffer_head new;
1739
1740 /*
1741 * We consider only non-mapped and non-allocated blocks
1742 */
1743 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1744 return;
1745
1746 while (remain) {
1747 new.b_state = lbh->b_state;
1748 new.b_blocknr = 0;
1749 new.b_size = remain;
1750 err = mpd->get_block(mpd->inode, next, &new, 1);
1751 if (err) {
1752 /*
1753 * Rather than implement own error handling
1754 * here, we just leave remaining blocks
1755 * unallocated and try again with ->writepage()
1756 */
1757 break;
1758 }
1759 BUG_ON(new.b_size == 0);
1760
1761 if (buffer_new(&new))
1762 __unmap_underlying_blocks(mpd->inode, &new);
1763
1764 /*
1765 * If blocks are delayed marked, we need to
1766 * put actual blocknr and drop delayed bit
1767 */
1768 if (buffer_delay(lbh))
1769 mpage_put_bnr_to_bhs(mpd, next, &new);
1770
1771 /* go for the remaining blocks */
1772 next += new.b_size >> mpd->inode->i_blkbits;
1773 remain -= new.b_size;
1774 }
1775}
1776
1777#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
1778
1779/*
1780 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1781 *
1782 * @mpd->lbh - extent of blocks
1783 * @logical - logical number of the block in the file
1784 * @bh - bh of the block (used to access block's state)
1785 *
1786 * the function is used to collect contig. blocks in same state
1787 */
1788static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1789 sector_t logical, struct buffer_head *bh)
1790{
1791 struct buffer_head *lbh = &mpd->lbh;
1792 sector_t next;
1793
1794 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
1795
1796 /*
1797 * First block in the extent
1798 */
1799 if (lbh->b_size == 0) {
1800 lbh->b_blocknr = logical;
1801 lbh->b_size = bh->b_size;
1802 lbh->b_state = bh->b_state & BH_FLAGS;
1803 return;
1804 }
1805
1806 /*
1807 * Can we merge the block to our big extent?
1808 */
1809 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1810 lbh->b_size += bh->b_size;
1811 return;
1812 }
1813
1814 /*
1815 * We couldn't merge the block to our extent, so we
1816 * need to flush current extent and start new one
1817 */
1818 mpage_da_map_blocks(mpd);
1819
1820 /*
1821 * Now start a new extent
1822 */
1823 lbh->b_size = bh->b_size;
1824 lbh->b_state = bh->b_state & BH_FLAGS;
1825 lbh->b_blocknr = logical;
1826}
1827
1828/*
1829 * __mpage_da_writepage - finds extent of pages and blocks
1830 *
1831 * @page: page to consider
1832 * @wbc: not used, we just follow rules
1833 * @data: context
1834 *
1835 * The function finds extents of pages and scan them for all blocks.
1836 */
1837static int __mpage_da_writepage(struct page *page,
1838 struct writeback_control *wbc, void *data)
1839{
1840 struct mpage_da_data *mpd = data;
1841 struct inode *inode = mpd->inode;
1842 struct buffer_head *bh, *head, fake;
1843 sector_t logical;
1844
1845 /*
1846 * Can we merge this page to current extent?
1847 */
1848 if (mpd->next_page != page->index) {
1849 /*
1850 * Nope, we can't. So, we map non-allocated blocks
1851 * and start IO on them using __mpage_writepage()
1852 */
1853 if (mpd->next_page != mpd->first_page) {
1854 mpage_da_map_blocks(mpd);
1855 mpage_da_submit_io(mpd);
1856 }
1857
1858 /*
1859 * Start next extent of pages ...
1860 */
1861 mpd->first_page = page->index;
1862
1863 /*
1864 * ... and blocks
1865 */
1866 mpd->lbh.b_size = 0;
1867 mpd->lbh.b_state = 0;
1868 mpd->lbh.b_blocknr = 0;
1869 }
1870
1871 mpd->next_page = page->index + 1;
1872 logical = (sector_t) page->index <<
1873 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1874
1875 if (!page_has_buffers(page)) {
1876 /*
1877 * There are no attached buffer heads yet (mmap?)
1878 * we treat the page as full of dirty blocks
1879 */
1880 bh = &fake;
1881 bh->b_size = PAGE_CACHE_SIZE;
1882 bh->b_state = 0;
1883 set_buffer_dirty(bh);
1884 set_buffer_uptodate(bh);
1885 mpage_add_bh_to_extent(mpd, logical, bh);
1886 } else {
1887 /*
1888 * Page with regular buffer heads, just add all dirty ones
1889 */
1890 head = page_buffers(page);
1891 bh = head;
1892 do {
1893 BUG_ON(buffer_locked(bh));
1894 if (buffer_dirty(bh))
1895 mpage_add_bh_to_extent(mpd, logical, bh);
1896 logical++;
1897 } while ((bh = bh->b_this_page) != head);
1898 }
1899
1900 return 0;
1901}
1902
1903/*
1904 * mpage_da_writepages - walk the list of dirty pages of the given
1905 * address space, allocates non-allocated blocks, maps newly-allocated
1906 * blocks to existing bhs and issues IO on them
1907 *
1908 * @mapping: address space structure to write
1909 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1910 * @get_block: the filesystem's block mapper function.
1911 *
1912 * This is a library function, which implements the writepages()
1913 * address_space_operation.
1914 *
1915 * In order to avoid duplication of logic that deals with partial pages,
1916 * multiple bio per page, etc, we find non-allocated blocks, allocate
1917 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1918 *
1919 * It's important that we call __mpage_writepage() only once for each
1920 * involved page, otherwise we'd have to implement more complicated logic
1921 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1922 *
1923 * See comments to mpage_writepages()
1924 */
1925static int mpage_da_writepages(struct address_space *mapping,
1926 struct writeback_control *wbc,
1927 get_block_t get_block)
1928{
1929 struct mpage_da_data mpd;
1930 int ret;
1931
1932 if (!get_block)
1933 return generic_writepages(mapping, wbc);
1934
1935 mpd.wbc = wbc;
1936 mpd.inode = mapping->host;
1937 mpd.lbh.b_size = 0;
1938 mpd.lbh.b_state = 0;
1939 mpd.lbh.b_blocknr = 0;
1940 mpd.first_page = 0;
1941 mpd.next_page = 0;
1942 mpd.get_block = get_block;
1943
1944 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
1945
1946 /*
1947 * Handle last extent of pages
1948 */
1949 if (mpd.next_page != mpd.first_page) {
1950 mpage_da_map_blocks(&mpd);
1951 mpage_da_submit_io(&mpd);
1952 }
1953
1954 return ret;
1955}
1956
1957/*
1958 * this is a special callback for ->write_begin() only
1959 * its intention is to return mapped block or reserve space
1960 */
1961static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1962 struct buffer_head *bh_result, int create)
1963{
1964 int ret = 0;
1965
1966 BUG_ON(create == 0);
1967 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1968
1969 /*
1970 * first, we need to know whether the block is allocated already
1971 * preallocated blocks are unmapped but should be treated
1972 * the same as allocated blocks.
1973 */
1974 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
1975 if ((ret == 0) && !buffer_delay(bh_result)) {
1976 /* the block isn't (pre)allocated yet, let's reserve space */
1977 /*
1978 * XXX: __block_prepare_write() unmaps passed block,
1979 * is it OK?
1980 */
1981 ret = ext4_da_reserve_space(inode, 1);
1982 if (ret)
1983 /* not enough space to reserve */
1984 return ret;
1985
1986 map_bh(bh_result, inode->i_sb, 0);
1987 set_buffer_new(bh_result);
1988 set_buffer_delay(bh_result);
1989 } else if (ret > 0) {
1990 bh_result->b_size = (ret << inode->i_blkbits);
1991 ret = 0;
1992 }
1993
1994 return ret;
1995}
1996#define EXT4_DELALLOC_RSVED 1
1997static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1998 struct buffer_head *bh_result, int create)
1999{
2000 int ret;
2001 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2002 loff_t disksize = EXT4_I(inode)->i_disksize;
2003 handle_t *handle = NULL;
2004
2005 handle = ext4_journal_current_handle();
2006 if (!handle) {
2007 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2008 bh_result, 0, 0, 0);
2009 BUG_ON(!ret);
2010 } else {
2011 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2012 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2013 }
2014
2015 if (ret > 0) {
2016 bh_result->b_size = (ret << inode->i_blkbits);
2017
2018 /*
2019 * Update on-disk size along with block allocation
2020 * we don't use 'extend_disksize' as size may change
2021 * within already allocated block -bzzz
2022 */
2023 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2024 if (disksize > i_size_read(inode))
2025 disksize = i_size_read(inode);
2026 if (disksize > EXT4_I(inode)->i_disksize) {
2027 /*
2028 * XXX: replace with spinlock if seen contended -bzzz
2029 */
2030 down_write(&EXT4_I(inode)->i_data_sem);
2031 if (disksize > EXT4_I(inode)->i_disksize)
2032 EXT4_I(inode)->i_disksize = disksize;
2033 up_write(&EXT4_I(inode)->i_data_sem);
2034
2035 if (EXT4_I(inode)->i_disksize == disksize) {
2036 ret = ext4_mark_inode_dirty(handle, inode);
2037 return ret;
2038 }
2039 }
2040 ret = 0;
2041 }
2042 return ret;
2043}
2044
2045static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2046{
2047 /*
2048 * unmapped buffer is possible for holes.
2049 * delay buffer is possible with delayed allocation
2050 */
2051 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
2052}
2053
2054static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2055 struct buffer_head *bh_result, int create)
2056{
2057 int ret = 0;
2058 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2059
2060 /*
2061 * we don't want to do block allocation in writepage
2062 * so call get_block_wrap with create = 0
2063 */
2064 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
2065 bh_result, 0, 0, 0);
2066 if (ret > 0) {
2067 bh_result->b_size = (ret << inode->i_blkbits);
2068 ret = 0;
2069 }
2070 return ret;
2071}
2072
2073/*
2074 * get called via ext4_da_writepages after taking page lock (have journal handle)
2075 * get called via journal_submit_inode_data_buffers (no journal handle)
2076 * get called via shrink_page_list via pdflush (no journal handle)
2077 * or grab_page_cache when doing write_begin (have journal handle)
2078 */
2079static int ext4_da_writepage(struct page *page,
2080 struct writeback_control *wbc)
2081{
2082 int ret = 0;
2083 loff_t size;
2084 unsigned long len;
2085 struct buffer_head *page_bufs;
2086 struct inode *inode = page->mapping->host;
2087
2088 size = i_size_read(inode);
2089 if (page->index == size >> PAGE_CACHE_SHIFT)
2090 len = size & ~PAGE_CACHE_MASK;
2091 else
2092 len = PAGE_CACHE_SIZE;
2093
2094 if (page_has_buffers(page)) {
2095 page_bufs = page_buffers(page);
2096 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2097 ext4_bh_unmapped_or_delay)) {
2098 /*
2099 * We don't want to do block allocation
2100 * So redirty the page and return
2101 * We may reach here when we do a journal commit
2102 * via journal_submit_inode_data_buffers.
2103 * If we don't have mapping block we just ignore
2104 * them. We can also reach here via shrink_page_list
2105 */
2106 redirty_page_for_writepage(wbc, page);
2107 unlock_page(page);
2108 return 0;
2109 }
2110 } else {
2111 /*
2112 * The test for page_has_buffers() is subtle:
2113 * We know the page is dirty but it lost buffers. That means
2114 * that at some moment in time after write_begin()/write_end()
2115 * has been called all buffers have been clean and thus they
2116 * must have been written at least once. So they are all
2117 * mapped and we can happily proceed with mapping them
2118 * and writing the page.
2119 *
2120 * Try to initialize the buffer_heads and check whether
2121 * all are mapped and non delay. We don't want to
2122 * do block allocation here.
2123 */
2124 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2125 ext4_normal_get_block_write);
2126 if (!ret) {
2127 page_bufs = page_buffers(page);
2128 /* check whether all are mapped and non delay */
2129 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2130 ext4_bh_unmapped_or_delay)) {
2131 redirty_page_for_writepage(wbc, page);
2132 unlock_page(page);
2133 return 0;
2134 }
2135 } else {
2136 /*
2137 * We can't do block allocation here
2138 * so just redity the page and unlock
2139 * and return
2140 */
2141 redirty_page_for_writepage(wbc, page);
2142 unlock_page(page);
2143 return 0;
2144 }
2145 }
2146
2147 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2148 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
2149 else
2150 ret = block_write_full_page(page,
2151 ext4_normal_get_block_write,
2152 wbc);
2153
2154 return ret;
2155}
2156
2157/*
2158 * For now just follow the DIO way to estimate the max credits
2159 * needed to write out EXT4_MAX_WRITEBACK_PAGES.
2160 * TODO: calculate the max credits needed for
2161 * extent-based files; currently the DIO credits are based on
2162 * the indirect-block mapping scheme.
2163 *
2164 * Probably should have a generic way to calculate credits
2165 * for DIO, writepages, and truncate
2166 */
2167#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
2168#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
2169
2170static int ext4_da_writepages(struct address_space *mapping,
2171 struct writeback_control *wbc)
2172{
2173 struct inode *inode = mapping->host;
2174 handle_t *handle = NULL;
2175 int needed_blocks;
2176 int ret = 0;
2177 long to_write;
2178 loff_t range_start = 0;
2179
2180 /*
2181 * No pages to write? This is mainly a kludge to avoid starting
2182 * a transaction for special inodes like journal inode on last iput()
2183 * because that could violate lock ordering on umount
2184 */
2185 if (!mapping->nrpages)
2186 return 0;
2187
2188 /*
2189 * Estimate the worse case needed credits to write out
2190 * EXT4_MAX_BUF_BLOCKS pages
2191 */
2192 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
2193
2194 to_write = wbc->nr_to_write;
2195 if (!wbc->range_cyclic) {
2196 /*
2197 * If range_cyclic is not set force range_cont
2198 * and save the old writeback_index
2199 */
2200 wbc->range_cont = 1;
2201 range_start = wbc->range_start;
2202 }
2203
2204 while (!ret && to_write) {
2205 /* start a new transaction*/
2206 handle = ext4_journal_start(inode, needed_blocks);
2207 if (IS_ERR(handle)) {
2208 ret = PTR_ERR(handle);
2209 goto out_writepages;
2210 }
2211 if (ext4_should_order_data(inode)) {
2212 /*
2213 * With ordered mode we need to add
2214 * the inode to the journal handle
2215 * when we do block allocation.
2216 */
2217 ret = ext4_jbd2_file_inode(handle, inode);
2218 if (ret) {
2219 ext4_journal_stop(handle);
2220 goto out_writepages;
2221 }
2222
2223 }
2224 /*
2225 * set the max dirty pages could be write at a time
2226 * to fit into the reserved transaction credits
2227 */
2228 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2229 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2230
2231 to_write -= wbc->nr_to_write;
2232 ret = mpage_da_writepages(mapping, wbc,
2233 ext4_da_get_block_write);
2234 ext4_journal_stop(handle);
2235 if (wbc->nr_to_write) {
2236 /*
2237 * There is no more writeout needed
2238 * or we requested for a noblocking writeout
2239 * and we found the device congested
2240 */
2241 to_write += wbc->nr_to_write;
2242 break;
2243 }
2244 wbc->nr_to_write = to_write;
2245 }
2246
2247out_writepages:
2248 wbc->nr_to_write = to_write;
2249 if (range_start)
2250 wbc->range_start = range_start;
2251 return ret;
2252}
2253
2254static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2255 loff_t pos, unsigned len, unsigned flags,
2256 struct page **pagep, void **fsdata)
2257{
2258 int ret, retries = 0;
2259 struct page *page;
2260 pgoff_t index;
2261 unsigned from, to;
2262 struct inode *inode = mapping->host;
2263 handle_t *handle;
2264
2265 index = pos >> PAGE_CACHE_SHIFT;
2266 from = pos & (PAGE_CACHE_SIZE - 1);
2267 to = from + len;
2268
2269retry:
2270 /*
2271 * With delayed allocation, we don't log the i_disksize update
2272 * if there is delayed block allocation. But we still need
2273 * to journalling the i_disksize update if writes to the end
2274 * of file which has an already mapped buffer.
2275 */
2276 handle = ext4_journal_start(inode, 1);
2277 if (IS_ERR(handle)) {
2278 ret = PTR_ERR(handle);
2279 goto out;
2280 }
2281
2282 page = __grab_cache_page(mapping, index);
2283 if (!page)
2284 return -ENOMEM;
2285 *pagep = page;
2286
2287 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2288 ext4_da_get_block_prep);
2289 if (ret < 0) {
2290 unlock_page(page);
2291 ext4_journal_stop(handle);
2292 page_cache_release(page);
2293 }
2294
2295 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2296 goto retry;
2297out:
2298 return ret;
2299}
2300
2301/*
2302 * Check if we should update i_disksize
2303 * when write to the end of file but not require block allocation
2304 */
2305static int ext4_da_should_update_i_disksize(struct page *page,
2306 unsigned long offset)
2307{
2308 struct buffer_head *bh;
2309 struct inode *inode = page->mapping->host;
2310 unsigned int idx;
2311 int i;
2312
2313 bh = page_buffers(page);
2314 idx = offset >> inode->i_blkbits;
2315
2316 for (i=0; i < idx; i++)
2317 bh = bh->b_this_page;
2318
2319 if (!buffer_mapped(bh) || (buffer_delay(bh)))
2320 return 0;
2321 return 1;
2322}
2323
2324static int ext4_da_write_end(struct file *file,
2325 struct address_space *mapping,
2326 loff_t pos, unsigned len, unsigned copied,
2327 struct page *page, void *fsdata)
2328{
2329 struct inode *inode = mapping->host;
2330 int ret = 0, ret2;
2331 handle_t *handle = ext4_journal_current_handle();
2332 loff_t new_i_size;
2333 unsigned long start, end;
2334
2335 start = pos & (PAGE_CACHE_SIZE - 1);
2336 end = start + copied -1;
2337
2338 /*
2339 * generic_write_end() will run mark_inode_dirty() if i_size
2340 * changes. So let's piggyback the i_disksize mark_inode_dirty
2341 * into that.
2342 */
2343
2344 new_i_size = pos + copied;
2345 if (new_i_size > EXT4_I(inode)->i_disksize) {
2346 if (ext4_da_should_update_i_disksize(page, end)) {
2347 down_write(&EXT4_I(inode)->i_data_sem);
2348 if (new_i_size > EXT4_I(inode)->i_disksize) {
2349 /*
2350 * Updating i_disksize when extending file
2351 * without needing block allocation
2352 */
2353 if (ext4_should_order_data(inode))
2354 ret = ext4_jbd2_file_inode(handle,
2355 inode);
2356
2357 EXT4_I(inode)->i_disksize = new_i_size;
2358 }
2359 up_write(&EXT4_I(inode)->i_data_sem);
2360 }
2361 }
2362 ret2 = generic_write_end(file, mapping, pos, len, copied,
2363 page, fsdata);
2364 copied = ret2;
2365 if (ret2 < 0)
2366 ret = ret2;
2367 ret2 = ext4_journal_stop(handle);
2368 if (!ret)
2369 ret = ret2;
2370
2371 return ret ? ret : copied;
2372}
2373
/*
 * ->invalidatepage for delayed-allocation mappings: release the block
 * reservations attached to the page's buffers (if it has any), then fall
 * through to the regular ext4 invalidation.  The page must be locked.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	BUG_ON(!PageLocked(page));

	/* Drop reserved blocks */
	if (page_has_buffers(page))
		ext4_da_page_release_reservation(page, offset);

	ext4_invalidatepage(page, offset);
}
2390
1400 2391
1401/* 2392/*
1402 * bmap() is special. It gets used by applications such as lilo and by 2393 * bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1418 journal_t *journal; 2409 journal_t *journal;
1419 int err; 2410 int err;
1420 2411
2412 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2413 test_opt(inode->i_sb, DELALLOC)) {
2414 /*
2415 * With delalloc we want to sync the file
2416 * so that we can make sure we allocate
2417 * blocks for file
2418 */
2419 filemap_write_and_wait(mapping);
2420 }
2421
1421 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2422 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1422 /* 2423 /*
1423 * This is a REALLY heavyweight approach, but the use of 2424 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
1462 return 0; 2463 return 0;
1463} 2464}
1464 2465
1465static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1466{
1467 if (buffer_mapped(bh))
1468 return ext4_journal_dirty_data(handle, bh);
1469 return 0;
1470}
1471
1472/* 2466/*
1473 * Note that we always start a transaction even if we're not journalling 2467 * Note that we don't need to start a transaction unless we're journaling data
1474 * data. This is to preserve ordering: any hole instantiation within 2468 * because we should have holes filled from ext4_page_mkwrite(). We even don't
1475 * __block_write_full_page -> ext4_get_block() should be journalled 2469 * need to file the inode to the transaction's list in ordered mode because if
1476 * along with the data so we don't crash and then get metadata which 2470 * we are writing back data added by write(), the inode is already there and if
1477 * refers to old data. 2471 * we are writing back data modified via mmap(), noone guarantees in which
2472 * transaction the data will hit the disk. In case we are journaling data, we
2473 * cannot start transaction directly because transaction start ranks above page
2474 * lock so we have to do some magic.
1478 * 2475 *
1479 * In all journalling modes block_write_full_page() will start the I/O. 2476 * In all journaling modes block_write_full_page() will start the I/O.
1480 * 2477 *
1481 * Problem: 2478 * Problem:
1482 * 2479 *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1518 * disastrous. Any write() or metadata operation will sync the fs for 2515 * disastrous. Any write() or metadata operation will sync the fs for
1519 * us. 2516 * us.
1520 * 2517 *
1521 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1522 * we don't need to open a transaction here.
1523 */ 2518 */
1524static int ext4_ordered_writepage(struct page *page, 2519static int __ext4_normal_writepage(struct page *page,
1525 struct writeback_control *wbc) 2520 struct writeback_control *wbc)
1526{ 2521{
1527 struct inode *inode = page->mapping->host; 2522 struct inode *inode = page->mapping->host;
1528 struct buffer_head *page_bufs;
1529 handle_t *handle = NULL;
1530 int ret = 0;
1531 int err;
1532
1533 J_ASSERT(PageLocked(page));
1534
1535 /*
1536 * We give up here if we're reentered, because it might be for a
1537 * different filesystem.
1538 */
1539 if (ext4_journal_current_handle())
1540 goto out_fail;
1541 2523
1542 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2524 if (test_opt(inode->i_sb, NOBH))
2525 return nobh_writepage(page,
2526 ext4_normal_get_block_write, wbc);
2527 else
2528 return block_write_full_page(page,
2529 ext4_normal_get_block_write,
2530 wbc);
2531}
1543 2532
1544 if (IS_ERR(handle)) { 2533static int ext4_normal_writepage(struct page *page,
1545 ret = PTR_ERR(handle); 2534 struct writeback_control *wbc)
1546 goto out_fail; 2535{
1547 } 2536 struct inode *inode = page->mapping->host;
2537 loff_t size = i_size_read(inode);
2538 loff_t len;
1548 2539
1549 if (!page_has_buffers(page)) { 2540 J_ASSERT(PageLocked(page));
1550 create_empty_buffers(page, inode->i_sb->s_blocksize, 2541 if (page->index == size >> PAGE_CACHE_SHIFT)
1551 (1 << BH_Dirty)|(1 << BH_Uptodate)); 2542 len = size & ~PAGE_CACHE_MASK;
2543 else
2544 len = PAGE_CACHE_SIZE;
2545
2546 if (page_has_buffers(page)) {
2547 /* if page has buffers it should all be mapped
2548 * and allocated. If there are not buffers attached
2549 * to the page we know the page is dirty but it lost
2550 * buffers. That means that at some moment in time
2551 * after write_begin() / write_end() has been called
2552 * all buffers have been clean and thus they must have been
2553 * written at least once. So they are all mapped and we can
2554 * happily proceed with mapping them and writing the page.
2555 */
2556 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2557 ext4_bh_unmapped_or_delay));
1552 } 2558 }
1553 page_bufs = page_buffers(page);
1554 walk_page_buffers(handle, page_bufs, 0,
1555 PAGE_CACHE_SIZE, NULL, bget_one);
1556
1557 ret = block_write_full_page(page, ext4_get_block, wbc);
1558 2559
1559 /* 2560 if (!ext4_journal_current_handle())
1560 * The page can become unlocked at any point now, and 2561 return __ext4_normal_writepage(page, wbc);
1561 * truncate can then come in and change things. So we
1562 * can't touch *page from now on. But *page_bufs is
1563 * safe due to elevated refcount.
1564 */
1565 2562
1566 /*
1567 * And attach them to the current transaction. But only if
1568 * block_write_full_page() succeeded. Otherwise they are unmapped,
1569 * and generally junk.
1570 */
1571 if (ret == 0) {
1572 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1573 NULL, jbd2_journal_dirty_data_fn);
1574 if (!ret)
1575 ret = err;
1576 }
1577 walk_page_buffers(handle, page_bufs, 0,
1578 PAGE_CACHE_SIZE, NULL, bput_one);
1579 err = ext4_journal_stop(handle);
1580 if (!ret)
1581 ret = err;
1582 return ret;
1583
1584out_fail:
1585 redirty_page_for_writepage(wbc, page); 2563 redirty_page_for_writepage(wbc, page);
1586 unlock_page(page); 2564 unlock_page(page);
1587 return ret; 2565 return 0;
1588} 2566}
1589 2567
1590static int ext4_writeback_writepage(struct page *page, 2568static int __ext4_journalled_writepage(struct page *page,
1591 struct writeback_control *wbc) 2569 struct writeback_control *wbc)
1592{ 2570{
1593 struct inode *inode = page->mapping->host; 2571 struct address_space *mapping = page->mapping;
2572 struct inode *inode = mapping->host;
2573 struct buffer_head *page_bufs;
1594 handle_t *handle = NULL; 2574 handle_t *handle = NULL;
1595 int ret = 0; 2575 int ret = 0;
1596 int err; 2576 int err;
1597 2577
1598 if (ext4_journal_current_handle()) 2578 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1599 goto out_fail; 2579 ext4_normal_get_block_write);
2580 if (ret != 0)
2581 goto out_unlock;
2582
2583 page_bufs = page_buffers(page);
2584 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
2585 bget_one);
2586 /* As soon as we unlock the page, it can go away, but we have
2587 * references to buffers so we are safe */
2588 unlock_page(page);
1600 2589
1601 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2590 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1602 if (IS_ERR(handle)) { 2591 if (IS_ERR(handle)) {
1603 ret = PTR_ERR(handle); 2592 ret = PTR_ERR(handle);
1604 goto out_fail; 2593 goto out;
1605 } 2594 }
1606 2595
1607 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2596 ret = walk_page_buffers(handle, page_bufs, 0,
1608 ret = nobh_writepage(page, ext4_get_block, wbc); 2597 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1609 else
1610 ret = block_write_full_page(page, ext4_get_block, wbc);
1611 2598
2599 err = walk_page_buffers(handle, page_bufs, 0,
2600 PAGE_CACHE_SIZE, NULL, write_end_fn);
2601 if (ret == 0)
2602 ret = err;
1612 err = ext4_journal_stop(handle); 2603 err = ext4_journal_stop(handle);
1613 if (!ret) 2604 if (!ret)
1614 ret = err; 2605 ret = err;
1615 return ret;
1616 2606
1617out_fail: 2607 walk_page_buffers(handle, page_bufs, 0,
1618 redirty_page_for_writepage(wbc, page); 2608 PAGE_CACHE_SIZE, NULL, bput_one);
2609 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2610 goto out;
2611
2612out_unlock:
1619 unlock_page(page); 2613 unlock_page(page);
2614out:
1620 return ret; 2615 return ret;
1621} 2616}
1622 2617
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
1624 struct writeback_control *wbc) 2619 struct writeback_control *wbc)
1625{ 2620{
1626 struct inode *inode = page->mapping->host; 2621 struct inode *inode = page->mapping->host;
1627 handle_t *handle = NULL; 2622 loff_t size = i_size_read(inode);
1628 int ret = 0; 2623 loff_t len;
1629 int err;
1630 2624
1631 if (ext4_journal_current_handle()) 2625 J_ASSERT(PageLocked(page));
1632 goto no_write; 2626 if (page->index == size >> PAGE_CACHE_SHIFT)
2627 len = size & ~PAGE_CACHE_MASK;
2628 else
2629 len = PAGE_CACHE_SIZE;
2630
2631 if (page_has_buffers(page)) {
2632 /* if page has buffers it should all be mapped
2633 * and allocated. If there are not buffers attached
2634 * to the page we know the page is dirty but it lost
2635 * buffers. That means that at some moment in time
2636 * after write_begin() / write_end() has been called
2637 * all buffers have been clean and thus they must have been
2638 * written at least once. So they are all mapped and we can
2639 * happily proceed with mapping them and writing the page.
2640 */
2641 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2642 ext4_bh_unmapped_or_delay));
2643 }
1633 2644
1634 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2645 if (ext4_journal_current_handle())
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1637 goto no_write; 2646 goto no_write;
1638 }
1639 2647
1640 if (!page_has_buffers(page) || PageChecked(page)) { 2648 if (PageChecked(page)) {
1641 /* 2649 /*
1642 * It's mmapped pagecache. Add buffers and journal it. There 2650 * It's mmapped pagecache. Add buffers and journal it. There
1643 * doesn't seem much point in redirtying the page here. 2651 * doesn't seem much point in redirtying the page here.
1644 */ 2652 */
1645 ClearPageChecked(page); 2653 ClearPageChecked(page);
1646 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2654 return __ext4_journalled_writepage(page, wbc);
1647 ext4_get_block);
1648 if (ret != 0) {
1649 ext4_journal_stop(handle);
1650 goto out_unlock;
1651 }
1652 ret = walk_page_buffers(handle, page_buffers(page), 0,
1653 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1654
1655 err = walk_page_buffers(handle, page_buffers(page), 0,
1656 PAGE_CACHE_SIZE, NULL, write_end_fn);
1657 if (ret == 0)
1658 ret = err;
1659 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1660 unlock_page(page);
1661 } else { 2655 } else {
1662 /* 2656 /*
1663 * It may be a page full of checkpoint-mode buffers. We don't 2657 * It may be a page full of checkpoint-mode buffers. We don't
1664 * really know unless we go poke around in the buffer_heads. 2658 * really know unless we go poke around in the buffer_heads.
1665 * But block_write_full_page will do the right thing. 2659 * But block_write_full_page will do the right thing.
1666 */ 2660 */
1667 ret = block_write_full_page(page, ext4_get_block, wbc); 2661 return block_write_full_page(page,
2662 ext4_normal_get_block_write,
2663 wbc);
1668 } 2664 }
1669 err = ext4_journal_stop(handle);
1670 if (!ret)
1671 ret = err;
1672out:
1673 return ret;
1674
1675no_write: 2665no_write:
1676 redirty_page_for_writepage(wbc, page); 2666 redirty_page_for_writepage(wbc, page);
1677out_unlock:
1678 unlock_page(page); 2667 unlock_page(page);
1679 goto out; 2668 return 0;
1680} 2669}
1681 2670
1682static int ext4_readpage(struct file *file, struct page *page) 2671static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
1819static const struct address_space_operations ext4_ordered_aops = { 2808static const struct address_space_operations ext4_ordered_aops = {
1820 .readpage = ext4_readpage, 2809 .readpage = ext4_readpage,
1821 .readpages = ext4_readpages, 2810 .readpages = ext4_readpages,
1822 .writepage = ext4_ordered_writepage, 2811 .writepage = ext4_normal_writepage,
1823 .sync_page = block_sync_page, 2812 .sync_page = block_sync_page,
1824 .write_begin = ext4_write_begin, 2813 .write_begin = ext4_write_begin,
1825 .write_end = ext4_ordered_write_end, 2814 .write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
1833static const struct address_space_operations ext4_writeback_aops = { 2822static const struct address_space_operations ext4_writeback_aops = {
1834 .readpage = ext4_readpage, 2823 .readpage = ext4_readpage,
1835 .readpages = ext4_readpages, 2824 .readpages = ext4_readpages,
1836 .writepage = ext4_writeback_writepage, 2825 .writepage = ext4_normal_writepage,
1837 .sync_page = block_sync_page, 2826 .sync_page = block_sync_page,
1838 .write_begin = ext4_write_begin, 2827 .write_begin = ext4_write_begin,
1839 .write_end = ext4_writeback_write_end, 2828 .write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
1857 .releasepage = ext4_releasepage, 2846 .releasepage = ext4_releasepage,
1858}; 2847};
1859 2848
2849static const struct address_space_operations ext4_da_aops = {
2850 .readpage = ext4_readpage,
2851 .readpages = ext4_readpages,
2852 .writepage = ext4_da_writepage,
2853 .writepages = ext4_da_writepages,
2854 .sync_page = block_sync_page,
2855 .write_begin = ext4_da_write_begin,
2856 .write_end = ext4_da_write_end,
2857 .bmap = ext4_bmap,
2858 .invalidatepage = ext4_da_invalidatepage,
2859 .releasepage = ext4_releasepage,
2860 .direct_IO = ext4_direct_IO,
2861 .migratepage = buffer_migrate_page,
2862};
2863
1860void ext4_set_aops(struct inode *inode) 2864void ext4_set_aops(struct inode *inode)
1861{ 2865{
1862 if (ext4_should_order_data(inode)) 2866 if (ext4_should_order_data(inode) &&
2867 test_opt(inode->i_sb, DELALLOC))
2868 inode->i_mapping->a_ops = &ext4_da_aops;
2869 else if (ext4_should_order_data(inode))
1863 inode->i_mapping->a_ops = &ext4_ordered_aops; 2870 inode->i_mapping->a_ops = &ext4_ordered_aops;
2871 else if (ext4_should_writeback_data(inode) &&
2872 test_opt(inode->i_sb, DELALLOC))
2873 inode->i_mapping->a_ops = &ext4_da_aops;
1864 else if (ext4_should_writeback_data(inode)) 2874 else if (ext4_should_writeback_data(inode))
1865 inode->i_mapping->a_ops = &ext4_writeback_aops; 2875 inode->i_mapping->a_ops = &ext4_writeback_aops;
1866 else 2876 else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
1873 * This required during truncate. We need to physically zero the tail end 2883 * This required during truncate. We need to physically zero the tail end
1874 * of that block so it doesn't yield old data if the file is later grown. 2884 * of that block so it doesn't yield old data if the file is later grown.
1875 */ 2885 */
1876int ext4_block_truncate_page(handle_t *handle, struct page *page, 2886int ext4_block_truncate_page(handle_t *handle,
1877 struct address_space *mapping, loff_t from) 2887 struct address_space *mapping, loff_t from)
1878{ 2888{
1879 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2889 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1882 ext4_lblk_t iblock; 2892 ext4_lblk_t iblock;
1883 struct inode *inode = mapping->host; 2893 struct inode *inode = mapping->host;
1884 struct buffer_head *bh; 2894 struct buffer_head *bh;
2895 struct page *page;
1885 int err = 0; 2896 int err = 0;
1886 2897
2898 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
2899 if (!page)
2900 return -EINVAL;
2901
1887 blocksize = inode->i_sb->s_blocksize; 2902 blocksize = inode->i_sb->s_blocksize;
1888 length = blocksize - (offset & (blocksize - 1)); 2903 length = blocksize - (offset & (blocksize - 1));
1889 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2904 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1956 err = ext4_journal_dirty_metadata(handle, bh); 2971 err = ext4_journal_dirty_metadata(handle, bh);
1957 } else { 2972 } else {
1958 if (ext4_should_order_data(inode)) 2973 if (ext4_should_order_data(inode))
1959 err = ext4_journal_dirty_data(handle, bh); 2974 err = ext4_jbd2_file_inode(handle, inode);
1960 mark_buffer_dirty(bh); 2975 mark_buffer_dirty(bh);
1961 } 2976 }
1962 2977
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
2179 3194
2180 if (this_bh) { 3195 if (this_bh) {
2181 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3196 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2182 ext4_journal_dirty_metadata(handle, this_bh); 3197
3198 /*
3199 * The buffer head should have an attached journal head at this
3200 * point. However, if the data is corrupted and an indirect
3201 * block pointed to itself, it would have been detached when
3202 * the block was cleared. Check for this instead of OOPSing.
3203 */
3204 if (bh2jh(this_bh))
3205 ext4_journal_dirty_metadata(handle, this_bh);
3206 else
3207 ext4_error(inode->i_sb, __func__,
3208 "circular indirect block detected, "
3209 "inode=%lu, block=%llu",
3210 inode->i_ino,
3211 (unsigned long long) this_bh->b_blocknr);
2183 } 3212 }
2184} 3213}
2185 3214
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2305 } 3334 }
2306} 3335}
2307 3336
3337int ext4_can_truncate(struct inode *inode)
3338{
3339 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3340 return 0;
3341 if (S_ISREG(inode->i_mode))
3342 return 1;
3343 if (S_ISDIR(inode->i_mode))
3344 return 1;
3345 if (S_ISLNK(inode->i_mode))
3346 return !ext4_inode_is_fast_symlink(inode);
3347 return 0;
3348}
3349
2308/* 3350/*
2309 * ext4_truncate() 3351 * ext4_truncate()
2310 * 3352 *
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
2347 int n; 3389 int n;
2348 ext4_lblk_t last_block; 3390 ext4_lblk_t last_block;
2349 unsigned blocksize = inode->i_sb->s_blocksize; 3391 unsigned blocksize = inode->i_sb->s_blocksize;
2350 struct page *page;
2351 3392
2352 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 3393 if (!ext4_can_truncate(inode))
2353 S_ISLNK(inode->i_mode)))
2354 return;
2355 if (ext4_inode_is_fast_symlink(inode))
2356 return;
2357 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2358 return; 3394 return;
2359 3395
2360 /*
2361 * We have to lock the EOF page here, because lock_page() nests
2362 * outside jbd2_journal_start().
2363 */
2364 if ((inode->i_size & (blocksize - 1)) == 0) {
2365 /* Block boundary? Nothing to do */
2366 page = NULL;
2367 } else {
2368 page = grab_cache_page(mapping,
2369 inode->i_size >> PAGE_CACHE_SHIFT);
2370 if (!page)
2371 return;
2372 }
2373
2374 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3396 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2375 ext4_ext_truncate(inode, page); 3397 ext4_ext_truncate(inode);
2376 return; 3398 return;
2377 } 3399 }
2378 3400
2379 handle = start_transaction(inode); 3401 handle = start_transaction(inode);
2380 if (IS_ERR(handle)) { 3402 if (IS_ERR(handle))
2381 if (page) {
2382 clear_highpage(page);
2383 flush_dcache_page(page);
2384 unlock_page(page);
2385 page_cache_release(page);
2386 }
2387 return; /* AKPM: return what? */ 3403 return; /* AKPM: return what? */
2388 }
2389 3404
2390 last_block = (inode->i_size + blocksize-1) 3405 last_block = (inode->i_size + blocksize-1)
2391 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3406 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2392 3407
2393 if (page) 3408 if (inode->i_size & (blocksize - 1))
2394 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 3409 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
3410 goto out_stop;
2395 3411
2396 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3412 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2397 if (n == 0) 3413 if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
2410 goto out_stop; 3426 goto out_stop;
2411 3427
2412 /* 3428 /*
3429 * From here we block out all ext4_get_block() callers who want to
3430 * modify the block allocation tree.
3431 */
3432 down_write(&ei->i_data_sem);
3433 /*
2413 * The orphan list entry will now protect us from any crash which 3434 * The orphan list entry will now protect us from any crash which
2414 * occurs before the truncate completes, so it is now safe to propagate 3435 * occurs before the truncate completes, so it is now safe to propagate
2415 * the new, shorter inode size (held for now in i_size) into the 3436 * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
2418 */ 3439 */
2419 ei->i_disksize = inode->i_size; 3440 ei->i_disksize = inode->i_size;
2420 3441
2421 /*
2422 * From here we block out all ext4_get_block() callers who want to
2423 * modify the block allocation tree.
2424 */
2425 down_write(&ei->i_data_sem);
2426
2427 if (n == 1) { /* direct blocks */ 3442 if (n == 1) { /* direct blocks */
2428 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3443 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2429 i_data + EXT4_NDIR_BLOCKS); 3444 i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
3107 * be freed, so we have a strong guarantee that no future commit will 4122 * be freed, so we have a strong guarantee that no future commit will
3108 * leave these blocks visible to the user.) 4123 * leave these blocks visible to the user.)
3109 * 4124 *
3110 * Called with inode->sem down. 4125 * Another thing we have to assure is that if we are in ordered mode
4126 * and inode is still attached to the committing transaction, we must
4127 * start writeout of all the dirty pages which are being truncated.
4128 * This way we are sure that all the data written in the previous
4129 * transaction are already on disk (truncate waits for pages under
4130 * writeback).
4131 *
4132 * Called with inode->i_mutex down.
3111 */ 4133 */
3112int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4134int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3113{ 4135{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3173 if (!error) 4195 if (!error)
3174 error = rc; 4196 error = rc;
3175 ext4_journal_stop(handle); 4197 ext4_journal_stop(handle);
4198
4199 if (ext4_should_order_data(inode)) {
4200 error = ext4_begin_ordered_truncate(inode,
4201 attr->ia_size);
4202 if (error) {
4203 /* Do as much error cleanup as possible */
4204 handle = ext4_journal_start(inode, 3);
4205 if (IS_ERR(handle)) {
4206 ext4_orphan_del(NULL, inode);
4207 goto err_out;
4208 }
4209 ext4_orphan_del(handle, inode);
4210 ext4_journal_stop(handle);
4211 goto err_out;
4212 }
4213 }
3176 } 4214 }
3177 4215
3178 rc = inode_setattr(inode, attr); 4216 rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
3193 return error; 4231 return error;
3194} 4232}
3195 4233
4234int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4235 struct kstat *stat)
4236{
4237 struct inode *inode;
4238 unsigned long delalloc_blocks;
4239
4240 inode = dentry->d_inode;
4241 generic_fillattr(inode, stat);
4242
4243 /*
4244 * We can't update i_blocks if the block allocation is delayed
4245 * otherwise in the case of system crash before the real block
4246 * allocation is done, we will have i_blocks inconsistent with
4247 * on-disk file blocks.
4248 * We always keep i_blocks updated together with real
4249 * allocation. But to not confuse with user, stat
4250 * will return the blocks that include the delayed allocation
4251 * blocks for this file.
4252 */
4253 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4254 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4255 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4256
4257 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4258 return 0;
4259}
3196 4260
3197/* 4261/*
3198 * How many blocks doth make a writepage()? 4262 * How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
3506 4570
3507 return err; 4571 return err;
3508} 4572}
4573
4574static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4575{
4576 return !buffer_mapped(bh);
4577}
4578
4579int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4580{
4581 loff_t size;
4582 unsigned long len;
4583 int ret = -EINVAL;
4584 struct file *file = vma->vm_file;
4585 struct inode *inode = file->f_path.dentry->d_inode;
4586 struct address_space *mapping = inode->i_mapping;
4587
4588 /*
4589 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4590 * get i_mutex because we are already holding mmap_sem.
4591 */
4592 down_read(&inode->i_alloc_sem);
4593 size = i_size_read(inode);
4594 if (page->mapping != mapping || size <= page_offset(page)
4595 || !PageUptodate(page)) {
4596 /* page got truncated from under us? */
4597 goto out_unlock;
4598 }
4599 ret = 0;
4600 if (PageMappedToDisk(page))
4601 goto out_unlock;
4602
4603 if (page->index == size >> PAGE_CACHE_SHIFT)
4604 len = size & ~PAGE_CACHE_MASK;
4605 else
4606 len = PAGE_CACHE_SIZE;
4607
4608 if (page_has_buffers(page)) {
4609 /* return if we have all the buffers mapped */
4610 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4611 ext4_bh_unmapped))
4612 goto out_unlock;
4613 }
4614 /*
4615 * OK, we need to fill the hole... Do write_begin write_end
4616 * to do block allocation/reservation.We are not holding
4617 * inode.i__mutex here. That allow * parallel write_begin,
4618 * write_end call. lock_page prevent this from happening
4619 * on the same page though
4620 */
4621 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4622 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4623 if (ret < 0)
4624 goto out_unlock;
4625 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4626 len, len, page, NULL);
4627 if (ret < 0)
4628 goto out_unlock;
4629 ret = 0;
4630out_unlock:
4631 up_read(&inode->i_alloc_sem);
4632 return ret;
4633}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
381 381
382static inline int mb_find_next_zero_bit(void *addr, int max, int start) 382static inline int mb_find_next_zero_bit(void *addr, int max, int start)
383{ 383{
384 int fix = 0; 384 int fix = 0, ret, tmpmax;
385 addr = mb_correct_addr_and_bit(&fix, addr); 385 addr = mb_correct_addr_and_bit(&fix, addr);
386 max += fix; 386 tmpmax = max + fix;
387 start += fix; 387 start += fix;
388 388
389 return ext4_find_next_zero_bit(addr, max, start) - fix; 389 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
390 if (ret > max)
391 return max;
392 return ret;
390} 393}
391 394
392static inline int mb_find_next_bit(void *addr, int max, int start) 395static inline int mb_find_next_bit(void *addr, int max, int start)
393{ 396{
394 int fix = 0; 397 int fix = 0, ret, tmpmax;
395 addr = mb_correct_addr_and_bit(&fix, addr); 398 addr = mb_correct_addr_and_bit(&fix, addr);
396 max += fix; 399 tmpmax = max + fix;
397 start += fix; 400 start += fix;
398 401
399 return ext4_find_next_bit(addr, max, start) - fix; 402 ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
403 if (ret > max)
404 return max;
405 return ret;
400} 406}
401 407
402static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 408static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
803 if (!buffer_uptodate(bh[i])) 809 if (!buffer_uptodate(bh[i]))
804 goto out; 810 goto out;
805 811
812 err = 0;
806 first_block = page->index * blocks_per_page; 813 first_block = page->index * blocks_per_page;
807 for (i = 0; i < blocks_per_page; i++) { 814 for (i = 0; i < blocks_per_page; i++) {
808 int group; 815 int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
883 int pnum; 890 int pnum;
884 int poff; 891 int poff;
885 struct page *page; 892 struct page *page;
893 int ret;
886 894
887 mb_debug("load group %lu\n", group); 895 mb_debug("load group %lu\n", group);
888 896
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
914 if (page) { 922 if (page) {
915 BUG_ON(page->mapping != inode->i_mapping); 923 BUG_ON(page->mapping != inode->i_mapping);
916 if (!PageUptodate(page)) { 924 if (!PageUptodate(page)) {
917 ext4_mb_init_cache(page, NULL); 925 ret = ext4_mb_init_cache(page, NULL);
926 if (ret) {
927 unlock_page(page);
928 goto err;
929 }
918 mb_cmp_bitmaps(e4b, page_address(page) + 930 mb_cmp_bitmaps(e4b, page_address(page) +
919 (poff * sb->s_blocksize)); 931 (poff * sb->s_blocksize));
920 } 932 }
921 unlock_page(page); 933 unlock_page(page);
922 } 934 }
923 } 935 }
924 if (page == NULL || !PageUptodate(page)) 936 if (page == NULL || !PageUptodate(page)) {
937 ret = -EIO;
925 goto err; 938 goto err;
939 }
926 e4b->bd_bitmap_page = page; 940 e4b->bd_bitmap_page = page;
927 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 941 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
928 mark_page_accessed(page); 942 mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
938 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 952 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
939 if (page) { 953 if (page) {
940 BUG_ON(page->mapping != inode->i_mapping); 954 BUG_ON(page->mapping != inode->i_mapping);
941 if (!PageUptodate(page)) 955 if (!PageUptodate(page)) {
942 ext4_mb_init_cache(page, e4b->bd_bitmap); 956 ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
943 957 if (ret) {
958 unlock_page(page);
959 goto err;
960 }
961 }
944 unlock_page(page); 962 unlock_page(page);
945 } 963 }
946 } 964 }
947 if (page == NULL || !PageUptodate(page)) 965 if (page == NULL || !PageUptodate(page)) {
966 ret = -EIO;
948 goto err; 967 goto err;
968 }
949 e4b->bd_buddy_page = page; 969 e4b->bd_buddy_page = page;
950 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 970 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
951 mark_page_accessed(page); 971 mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
962 page_cache_release(e4b->bd_buddy_page); 982 page_cache_release(e4b->bd_buddy_page);
963 e4b->bd_buddy = NULL; 983 e4b->bd_buddy = NULL;
964 e4b->bd_bitmap = NULL; 984 e4b->bd_bitmap = NULL;
965 return -EIO; 985 return ret;
966} 986}
967 987
968static void ext4_mb_release_desc(struct ext4_buddy *e4b) 988static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 } 1051 }
1032} 1052}
1033 1053
1034static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1054static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1035 int first, int count) 1055 int first, int count)
1036{ 1056{
1037 int block = 0; 1057 int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1071 blocknr += block; 1091 blocknr += block;
1072 blocknr += 1092 blocknr +=
1073 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1093 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1074 1094 ext4_unlock_group(sb, e4b->bd_group);
1075 ext4_error(sb, __func__, "double-free of inode" 1095 ext4_error(sb, __func__, "double-free of inode"
1076 " %lu's block %llu(bit %u in group %lu)\n", 1096 " %lu's block %llu(bit %u in group %lu)\n",
1077 inode ? inode->i_ino : 0, blocknr, block, 1097 inode ? inode->i_ino : 0, blocknr, block,
1078 e4b->bd_group); 1098 e4b->bd_group);
1099 ext4_lock_group(sb, e4b->bd_group);
1079 } 1100 }
1080 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1101 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1081 e4b->bd_info->bb_counters[order]++; 1102 e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1113 } while (1); 1134 } while (1);
1114 } 1135 }
1115 mb_check_buddy(e4b); 1136 mb_check_buddy(e4b);
1116
1117 return 0;
1118} 1137}
1119 1138
1120static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1139static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1730 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 1749 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1731 spin_unlock(&sbi->s_md_lock); 1750 spin_unlock(&sbi->s_md_lock);
1732 } 1751 }
1733
1734 /* searching for the right group start from the goal value specified */
1735 group = ac->ac_g_ex.fe_group;
1736
1737 /* Let's just scan groups to find more-less suitable blocks */ 1752 /* Let's just scan groups to find more-less suitable blocks */
1738 cr = ac->ac_2order ? 0 : 1; 1753 cr = ac->ac_2order ? 0 : 1;
1739 /* 1754 /*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1743repeat: 1758repeat:
1744 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 1759 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
1745 ac->ac_criteria = cr; 1760 ac->ac_criteria = cr;
1761 /*
1762 * searching for the right group start
1763 * from the goal value specified
1764 */
1765 group = ac->ac_g_ex.fe_group;
1766
1746 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 1767 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
1747 struct ext4_group_info *grp; 1768 struct ext4_group_info *grp;
1748 struct ext4_group_desc *desc; 1769 struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
1963 int rc; 1984 int rc;
1964 int size; 1985 int size;
1965 1986
1987 if (unlikely(sbi->s_mb_history == NULL))
1988 return -ENOMEM;
1966 s = kmalloc(sizeof(*s), GFP_KERNEL); 1989 s = kmalloc(sizeof(*s), GFP_KERNEL);
1967 if (s == NULL) 1990 if (s == NULL)
1968 return -ENOMEM; 1991 return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
2165 sbi->s_mb_history_cur = 0; 2188 sbi->s_mb_history_cur = 0;
2166 spin_lock_init(&sbi->s_mb_history_lock); 2189 spin_lock_init(&sbi->s_mb_history_lock);
2167 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2190 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2168 sbi->s_mb_history = kmalloc(i, GFP_KERNEL); 2191 sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
2169 if (likely(sbi->s_mb_history != NULL))
2170 memset(sbi->s_mb_history, 0, i);
2171 /* if we can't allocate history, then we simple won't use it */ 2192 /* if we can't allocate history, then we simple won't use it */
2172} 2193}
2173 2194
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
2215#define ext4_mb_history_init(sb) 2236#define ext4_mb_history_init(sb)
2216#endif 2237#endif
2217 2238
2239
2240/* Create and initialize ext4_group_info data for the given group. */
2241int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2242 struct ext4_group_desc *desc)
2243{
2244 int i, len;
2245 int metalen = 0;
2246 struct ext4_sb_info *sbi = EXT4_SB(sb);
2247 struct ext4_group_info **meta_group_info;
2248
2249 /*
2250 * First check if this group is the first of a reserved block.
2251 * If it's true, we have to allocate a new table of pointers
2252 * to ext4_group_info structures
2253 */
2254 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2255 metalen = sizeof(*meta_group_info) <<
2256 EXT4_DESC_PER_BLOCK_BITS(sb);
2257 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2258 if (meta_group_info == NULL) {
2259 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2260 "buddy group\n");
2261 goto exit_meta_group_info;
2262 }
2263 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2264 meta_group_info;
2265 }
2266
2267 /*
2268 * calculate needed size. if change bb_counters size,
2269 * don't forget about ext4_mb_generate_buddy()
2270 */
2271 len = offsetof(typeof(**meta_group_info),
2272 bb_counters[sb->s_blocksize_bits + 2]);
2273
2274 meta_group_info =
2275 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2276 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2277
2278 meta_group_info[i] = kzalloc(len, GFP_KERNEL);
2279 if (meta_group_info[i] == NULL) {
2280 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2281 goto exit_group_info;
2282 }
2283 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2284 &(meta_group_info[i]->bb_state));
2285
2286 /*
2287 * initialize bb_free to be able to skip
2288 * empty groups without initialization
2289 */
2290 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2291 meta_group_info[i]->bb_free =
2292 ext4_free_blocks_after_init(sb, group, desc);
2293 } else {
2294 meta_group_info[i]->bb_free =
2295 le16_to_cpu(desc->bg_free_blocks_count);
2296 }
2297
2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2299
2300#ifdef DOUBLE_CHECK
2301 {
2302 struct buffer_head *bh;
2303 meta_group_info[i]->bb_bitmap =
2304 kmalloc(sb->s_blocksize, GFP_KERNEL);
2305 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2306 bh = ext4_read_block_bitmap(sb, group);
2307 BUG_ON(bh == NULL);
2308 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2309 sb->s_blocksize);
2310 put_bh(bh);
2311 }
2312#endif
2313
2314 return 0;
2315
2316exit_group_info:
2317 /* If a meta_group_info table has been allocated, release it now */
2318 if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
2319 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2320exit_meta_group_info:
2321 return -ENOMEM;
2322} /* ext4_mb_add_groupinfo */
2323
2324/*
2325 * Add a group to the existing groups.
2326 * This function is used for online resize
2327 */
2328int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2329 struct ext4_group_desc *desc)
2330{
2331 struct ext4_sb_info *sbi = EXT4_SB(sb);
2332 struct inode *inode = sbi->s_buddy_cache;
2333 int blocks_per_page;
2334 int block;
2335 int pnum;
2336 struct page *page;
2337 int err;
2338
2339 /* Add group based on group descriptor*/
2340 err = ext4_mb_add_groupinfo(sb, group, desc);
2341 if (err)
2342 return err;
2343
2344 /*
2345 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
2346 * datas) are set not up to date so that they will be re-initilaized
2347 * during the next call to ext4_mb_load_buddy
2348 */
2349
2350 /* Set buddy page as not up to date */
2351 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2352 block = group * 2;
2353 pnum = block / blocks_per_page;
2354 page = find_get_page(inode->i_mapping, pnum);
2355 if (page != NULL) {
2356 ClearPageUptodate(page);
2357 page_cache_release(page);
2358 }
2359
2360 /* Set bitmap page as not up to date */
2361 block++;
2362 pnum = block / blocks_per_page;
2363 page = find_get_page(inode->i_mapping, pnum);
2364 if (page != NULL) {
2365 ClearPageUptodate(page);
2366 page_cache_release(page);
2367 }
2368
2369 return 0;
2370}
2371
2372/*
2373 * Update an existing group.
2374 * This function is used for online resize
2375 */
2376void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2377{
2378 grp->bb_free += add;
2379}
2380
2218static int ext4_mb_init_backend(struct super_block *sb) 2381static int ext4_mb_init_backend(struct super_block *sb)
2219{ 2382{
2220 ext4_group_t i; 2383 ext4_group_t i;
2221 int j, len, metalen; 2384 int metalen;
2222 struct ext4_sb_info *sbi = EXT4_SB(sb); 2385 struct ext4_sb_info *sbi = EXT4_SB(sb);
2223 int num_meta_group_infos = 2386 struct ext4_super_block *es = sbi->s_es;
2224 (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> 2387 int num_meta_group_infos;
2225 EXT4_DESC_PER_BLOCK_BITS(sb); 2388 int num_meta_group_infos_max;
2389 int array_size;
2226 struct ext4_group_info **meta_group_info; 2390 struct ext4_group_info **meta_group_info;
2391 struct ext4_group_desc *desc;
2392
2393 /* This is the number of blocks used by GDT */
2394 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
2395 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2396
2397 /*
2398 * This is the total number of blocks used by GDT including
2399 * the number of reserved blocks for GDT.
2400 * The s_group_info array is allocated with this value
2401 * to allow a clean online resize without a complex
2402 * manipulation of pointer.
2403 * The drawback is the unused memory when no resize
2404 * occurs but it's very low in terms of pages
2405 * (see comments below)
2406 * Need to handle this properly when META_BG resizing is allowed
2407 */
2408 num_meta_group_infos_max = num_meta_group_infos +
2409 le16_to_cpu(es->s_reserved_gdt_blocks);
2227 2410
2411 /*
2412 * array_size is the size of s_group_info array. We round it
2413 * to the next power of two because this approximation is done
2414 * internally by kmalloc so we can have some more memory
2415 * for free here (e.g. may be used for META_BG resize).
2416 */
2417 array_size = 1;
2418 while (array_size < sizeof(*sbi->s_group_info) *
2419 num_meta_group_infos_max)
2420 array_size = array_size << 1;
2228 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2421 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2229 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2422 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2230 * So a two level scheme suffices for now. */ 2423 * So a two level scheme suffices for now. */
2231 sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * 2424 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
2232 num_meta_group_infos, GFP_KERNEL);
2233 if (sbi->s_group_info == NULL) { 2425 if (sbi->s_group_info == NULL) {
2234 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2426 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2235 return -ENOMEM; 2427 return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
2256 sbi->s_group_info[i] = meta_group_info; 2448 sbi->s_group_info[i] = meta_group_info;
2257 } 2449 }
2258 2450
2259 /*
2260 * calculate needed size. if change bb_counters size,
2261 * don't forget about ext4_mb_generate_buddy()
2262 */
2263 len = sizeof(struct ext4_group_info);
2264 len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
2265 for (i = 0; i < sbi->s_groups_count; i++) { 2451 for (i = 0; i < sbi->s_groups_count; i++) {
2266 struct ext4_group_desc *desc;
2267
2268 meta_group_info =
2269 sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2270 j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
2271
2272 meta_group_info[j] = kzalloc(len, GFP_KERNEL);
2273 if (meta_group_info[j] == NULL) {
2274 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2275 goto err_freebuddy;
2276 }
2277 desc = ext4_get_group_desc(sb, i, NULL); 2452 desc = ext4_get_group_desc(sb, i, NULL);
2278 if (desc == NULL) { 2453 if (desc == NULL) {
2279 printk(KERN_ERR 2454 printk(KERN_ERR
2280 "EXT4-fs: can't read descriptor %lu\n", i); 2455 "EXT4-fs: can't read descriptor %lu\n", i);
2281 i++;
2282 goto err_freebuddy; 2456 goto err_freebuddy;
2283 } 2457 }
2284 memset(meta_group_info[j], 0, len); 2458 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2285 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2459 goto err_freebuddy;
2286 &(meta_group_info[j]->bb_state));
2287
2288 /*
2289 * initialize bb_free to be able to skip
2290 * empty groups without initialization
2291 */
2292 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2293 meta_group_info[j]->bb_free =
2294 ext4_free_blocks_after_init(sb, i, desc);
2295 } else {
2296 meta_group_info[j]->bb_free =
2297 le16_to_cpu(desc->bg_free_blocks_count);
2298 }
2299
2300 INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
2301
2302#ifdef DOUBLE_CHECK
2303 {
2304 struct buffer_head *bh;
2305 meta_group_info[j]->bb_bitmap =
2306 kmalloc(sb->s_blocksize, GFP_KERNEL);
2307 BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
2308 bh = read_block_bitmap(sb, i);
2309 BUG_ON(bh == NULL);
2310 memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
2311 sb->s_blocksize);
2312 put_bh(bh);
2313 }
2314#endif
2315
2316 } 2460 }
2317 2461
2318 return 0; 2462 return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2336 unsigned i; 2480 unsigned i;
2337 unsigned offset; 2481 unsigned offset;
2338 unsigned max; 2482 unsigned max;
2483 int ret;
2339 2484
2340 if (!test_opt(sb, MBALLOC)) 2485 if (!test_opt(sb, MBALLOC))
2341 return 0; 2486 return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2370 } while (i <= sb->s_blocksize_bits + 1); 2515 } while (i <= sb->s_blocksize_bits + 1);
2371 2516
2372 /* init file for buddy data */ 2517 /* init file for buddy data */
2373 i = ext4_mb_init_backend(sb); 2518 ret = ext4_mb_init_backend(sb);
2374 if (i) { 2519 if (ret != 0) {
2375 clear_opt(sbi->s_mount_opt, MBALLOC); 2520 clear_opt(sbi->s_mount_opt, MBALLOC);
2376 kfree(sbi->s_mb_offsets); 2521 kfree(sbi->s_mb_offsets);
2377 kfree(sbi->s_mb_maxs); 2522 kfree(sbi->s_mb_maxs);
2378 return i; 2523 return ret;
2379 } 2524 }
2380 2525
2381 spin_lock_init(&sbi->s_md_lock); 2526 spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2548 ext4_lock_group(sb, md->group); 2693 ext4_lock_group(sb, md->group);
2549 for (i = 0; i < md->num; i++) { 2694 for (i = 0; i < md->num; i++) {
2550 mb_debug(" %u", md->blocks[i]); 2695 mb_debug(" %u", md->blocks[i]);
2551 err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); 2696 mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
2552 BUG_ON(err != 0);
2553 } 2697 }
2554 mb_debug("\n"); 2698 mb_debug("\n");
2555 ext4_unlock_group(sb, md->group); 2699 ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2575 2719
2576 2720
2577 2721
2578#define MB_PROC_VALUE_READ(name) \ 2722#define MB_PROC_FOPS(name) \
2579static int ext4_mb_read_##name(char *page, char **start, \ 2723static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2580 off_t off, int count, int *eof, void *data) \
2581{ \ 2724{ \
2582 struct ext4_sb_info *sbi = data; \ 2725 struct ext4_sb_info *sbi = m->private; \
2583 int len; \ 2726 \
2584 *eof = 1; \ 2727 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2585 if (off != 0) \ 2728 return 0; \
2586 return 0; \ 2729} \
2587 len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ 2730 \
2588 *start = page; \ 2731static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2589 return len; \ 2732{ \
2590} 2733 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2591 2734} \
2592#define MB_PROC_VALUE_WRITE(name) \ 2735 \
2593static int ext4_mb_write_##name(struct file *file, \ 2736static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2594 const char __user *buf, unsigned long cnt, void *data) \ 2737 const char __user *buf, size_t cnt, loff_t *ppos) \
2595{ \ 2738{ \
2596 struct ext4_sb_info *sbi = data; \ 2739 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2597 char str[32]; \ 2740 char str[32]; \
2598 long value; \ 2741 long value; \
2599 if (cnt >= sizeof(str)) \ 2742 if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
2605 return -ERANGE; \ 2748 return -ERANGE; \
2606 sbi->s_mb_##name = value; \ 2749 sbi->s_mb_##name = value; \
2607 return cnt; \ 2750 return cnt; \
2608} 2751} \
2752 \
2753static const struct file_operations ext4_mb_##name##_proc_fops = { \
2754 .owner = THIS_MODULE, \
2755 .open = ext4_mb_##name##_proc_open, \
2756 .read = seq_read, \
2757 .llseek = seq_lseek, \
2758 .release = single_release, \
2759 .write = ext4_mb_##name##_proc_write, \
2760};
2609 2761
2610MB_PROC_VALUE_READ(stats); 2762MB_PROC_FOPS(stats);
2611MB_PROC_VALUE_WRITE(stats); 2763MB_PROC_FOPS(max_to_scan);
2612MB_PROC_VALUE_READ(max_to_scan); 2764MB_PROC_FOPS(min_to_scan);
2613MB_PROC_VALUE_WRITE(max_to_scan); 2765MB_PROC_FOPS(order2_reqs);
2614MB_PROC_VALUE_READ(min_to_scan); 2766MB_PROC_FOPS(stream_request);
2615MB_PROC_VALUE_WRITE(min_to_scan); 2767MB_PROC_FOPS(group_prealloc);
2616MB_PROC_VALUE_READ(order2_reqs);
2617MB_PROC_VALUE_WRITE(order2_reqs);
2618MB_PROC_VALUE_READ(stream_request);
2619MB_PROC_VALUE_WRITE(stream_request);
2620MB_PROC_VALUE_READ(group_prealloc);
2621MB_PROC_VALUE_WRITE(group_prealloc);
2622 2768
2623#define MB_PROC_HANDLER(name, var) \ 2769#define MB_PROC_HANDLER(name, var) \
2624do { \ 2770do { \
2625 proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ 2771 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2772 &ext4_mb_##var##_proc_fops, sbi); \
2626 if (proc == NULL) { \ 2773 if (proc == NULL) { \
2627 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ 2774 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2628 goto err_out; \ 2775 goto err_out; \
2629 } \ 2776 } \
2630 proc->data = sbi; \
2631 proc->read_proc = ext4_mb_read_##var ; \
2632 proc->write_proc = ext4_mb_write_##var; \
2633} while (0) 2777} while (0)
2634 2778
2635static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2779static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2639 struct proc_dir_entry *proc; 2783 struct proc_dir_entry *proc;
2640 char devname[64]; 2784 char devname[64];
2641 2785
2786 if (proc_root_ext4 == NULL) {
2787 sbi->s_mb_proc = NULL;
2788 return -EINVAL;
2789 }
2642 bdevname(sb->s_bdev, devname); 2790 bdevname(sb->s_bdev, devname);
2643 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); 2791 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2644 2792
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2747 2895
2748 2896
2749 err = -EIO; 2897 err = -EIO;
2750 bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2898 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2751 if (!bitmap_bh) 2899 if (!bitmap_bh)
2752 goto out_err; 2900 goto out_err;
2753 2901
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2816 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2964 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2817 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2965 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2818 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2966 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2819 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2967
2968 /*
2969 * free blocks account has already be reduced/reserved
2970 * at write_begin() time for delayed allocation
2971 * do not double accounting
2972 */
2973 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2974 percpu_counter_sub(&sbi->s_freeblocks_counter,
2975 ac->ac_b_ex.fe_len);
2976
2977 if (sbi->s_log_groups_per_flex) {
2978 ext4_group_t flex_group = ext4_flex_group(sbi,
2979 ac->ac_b_ex.fe_group);
2980 spin_lock(sb_bgl_lock(sbi, flex_group));
2981 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
2982 spin_unlock(sb_bgl_lock(sbi, flex_group));
2983 }
2820 2984
2821 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 2985 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2822 if (err) 2986 if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3473 if (bit >= end) 3637 if (bit >= end)
3474 break; 3638 break;
3475 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3639 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3476 if (next > end)
3477 next = end;
3478 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3640 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3479 le32_to_cpu(sbi->s_es->s_first_data_block); 3641 le32_to_cpu(sbi->s_es->s_first_data_block);
3480 mb_debug(" free preallocated %u/%u in group %u\n", 3642 mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3569 if (list_empty(&grp->bb_prealloc_list)) 3731 if (list_empty(&grp->bb_prealloc_list))
3570 return 0; 3732 return 0;
3571 3733
3572 bitmap_bh = read_block_bitmap(sb, group); 3734 bitmap_bh = ext4_read_block_bitmap(sb, group);
3573 if (bitmap_bh == NULL) { 3735 if (bitmap_bh == NULL) {
3574 /* error handling here */ 3736 /* error handling here */
3575 ext4_mb_release_desc(&e4b); 3737 ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
3743 err = ext4_mb_load_buddy(sb, group, &e4b); 3905 err = ext4_mb_load_buddy(sb, group, &e4b);
3744 BUG_ON(err != 0); /* error handling here */ 3906 BUG_ON(err != 0); /* error handling here */
3745 3907
3746 bitmap_bh = read_block_bitmap(sb, group); 3908 bitmap_bh = ext4_read_block_bitmap(sb, group);
3747 if (bitmap_bh == NULL) { 3909 if (bitmap_bh == NULL) {
3748 /* error handling here */ 3910 /* error handling here */
3749 ext4_mb_release_desc(&e4b); 3911 ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4011 sbi = EXT4_SB(sb); 4173 sbi = EXT4_SB(sb);
4012 4174
4013 if (!test_opt(sb, MBALLOC)) { 4175 if (!test_opt(sb, MBALLOC)) {
4014 block = ext4_new_blocks_old(handle, ar->inode, ar->goal, 4176 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4015 &(ar->len), errp); 4177 &(ar->len), errp);
4016 return block; 4178 return block;
4017 } 4179 }
4180 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4181 /*
4182 * With delalloc we already reserved the blocks
4183 */
4184 ar->len = ext4_has_free_blocks(sbi, ar->len);
4185 }
4186
4187 if (ar->len == 0) {
4188 *errp = -ENOSPC;
4189 return 0;
4190 }
4018 4191
4019 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4192 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4020 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4193 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4026 } 4199 }
4027 inquota = ar->len; 4200 inquota = ar->len;
4028 4201
4202 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4203 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4204
4029 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4205 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4030 if (!ac) { 4206 if (!ac) {
4207 ar->len = 0;
4031 *errp = -ENOMEM; 4208 *errp = -ENOMEM;
4032 return 0; 4209 goto out1;
4033 } 4210 }
4034 4211
4035 ext4_mb_poll_new_transaction(sb, handle); 4212 ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4037 *errp = ext4_mb_initialize_context(ac, ar); 4214 *errp = ext4_mb_initialize_context(ac, ar);
4038 if (*errp) { 4215 if (*errp) {
4039 ar->len = 0; 4216 ar->len = 0;
4040 goto out; 4217 goto out2;
4041 } 4218 }
4042 4219
4043 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4220 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4044 if (!ext4_mb_use_preallocated(ac)) { 4221 if (!ext4_mb_use_preallocated(ac)) {
4045
4046 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4222 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4047 ext4_mb_normalize_request(ac, ar); 4223 ext4_mb_normalize_request(ac, ar);
4048repeat: 4224repeat:
@@ -4085,11 +4261,12 @@ repeat:
4085 4261
4086 ext4_mb_release_context(ac); 4262 ext4_mb_release_context(ac);
4087 4263
4088out: 4264out2:
4265 kmem_cache_free(ext4_ac_cachep, ac);
4266out1:
4089 if (ar->len < inquota) 4267 if (ar->len < inquota)
4090 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4268 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4091 4269
4092 kmem_cache_free(ext4_ac_cachep, ac);
4093 return block; 4270 return block;
4094} 4271}
4095static void ext4_mb_poll_new_transaction(struct super_block *sb, 4272static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
4242 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4419 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4243 count -= overflow; 4420 count -= overflow;
4244 } 4421 }
4245 bitmap_bh = read_block_bitmap(sb, block_group); 4422 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4246 if (!bitmap_bh) 4423 if (!bitmap_bh)
4247 goto error_return; 4424 goto error_return;
4248 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4425 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
4309 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4486 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
4310 } else { 4487 } else {
4311 ext4_lock_group(sb, block_group); 4488 ext4_lock_group(sb, block_group);
4312 err = mb_free_blocks(inode, &e4b, bit, count); 4489 mb_free_blocks(inode, &e4b, bit, count);
4313 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4490 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4314 ext4_unlock_group(sb, block_group); 4491 ext4_unlock_group(sb, block_group);
4315 BUG_ON(err != 0);
4316 } 4492 }
4317 4493
4318 spin_lock(sb_bgl_lock(sbi, block_group)); 4494 spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
4321 spin_unlock(sb_bgl_lock(sbi, block_group)); 4497 spin_unlock(sb_bgl_lock(sbi, block_group));
4322 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4498 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4323 4499
4500 if (sbi->s_log_groups_per_flex) {
4501 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4502 spin_lock(sb_bgl_lock(sbi, flex_group));
4503 sbi->s_flex_groups[flex_group].free_blocks += count;
4504 spin_unlock(sb_bgl_lock(sbi, flex_group));
4505 }
4506
4324 ext4_mb_release_desc(&e4b); 4507 ext4_mb_release_desc(&e4b);
4325 4508
4326 *freed += count; 4509 *freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 183 struct inode *inode);
184 184
185/* 185/*
186 * p is at least 6 bytes before the end of page
187 */
188static inline struct ext4_dir_entry_2 *
189ext4_next_entry(struct ext4_dir_entry_2 *p)
190{
191 return (struct ext4_dir_entry_2 *)((char *)p +
192 ext4_rec_len_from_disk(p->rec_len));
193}
194
195/*
186 * Future: use high four bits of block for coalesce-on-delete flags 196 * Future: use high four bits of block for coalesce-on-delete flags
187 * Mask them off for now. 197 * Mask them off for now.
188 */ 198 */
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
231{ 241{
232 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
233 EXT4_DIR_REC_LEN(2) - infosize; 243 EXT4_DIR_REC_LEN(2) - infosize;
234 return 0? 20: entry_space / sizeof(struct dx_entry); 244 return entry_space / sizeof(struct dx_entry);
235} 245}
236 246
237static inline unsigned dx_node_limit (struct inode *dir) 247static inline unsigned dx_node_limit (struct inode *dir)
238{ 248{
239 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
240 return 0? 22: entry_space / sizeof(struct dx_entry); 250 return entry_space / sizeof(struct dx_entry);
241} 251}
242 252
243/* 253/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
554 564
555 565
556/* 566/*
557 * p is at least 6 bytes before the end of page
558 */
559static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
560{
561 return (struct ext4_dir_entry_2 *)((char *)p +
562 ext4_rec_len_from_disk(p->rec_len));
563}
564
565/*
566 * This function fills a red-black tree with information from a 567 * This function fills a red-black tree with information from a
567 * directory block. It returns the number directory entries loaded 568 * directory block. It returns the number directory entries loaded
568 * into the tree. If there is an error it is returned in err. 569 * into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
993 de = (struct ext4_dir_entry_2 *) bh->b_data; 994 de = (struct ext4_dir_entry_2 *) bh->b_data;
994 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 995 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
995 EXT4_DIR_REC_LEN(0)); 996 EXT4_DIR_REC_LEN(0));
996 for (; de < top; de = ext4_next_entry(de)) 997 for (; de < top; de = ext4_next_entry(de)) {
997 if (ext4_match (namelen, name, de)) { 998 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
998 if (!ext4_check_dir_entry("ext4_find_entry", 999 + ((char *) de - bh->b_data);
999 dir, de, bh, 1000
1000 (block<<EXT4_BLOCK_SIZE_BITS(sb)) 1001 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
1001 +((char *)de - bh->b_data))) { 1002 brelse(bh);
1002 brelse (bh);
1003 *err = ERR_BAD_DX_DIR; 1003 *err = ERR_BAD_DX_DIR;
1004 goto errout; 1004 goto errout;
1005 } 1005 }
1006 *res_dir = de; 1006
1007 dx_release (frames); 1007 if (ext4_match(namelen, name, de)) {
1008 return bh; 1008 *res_dir = de;
1009 dx_release(frames);
1010 return bh;
1011 }
1009 } 1012 }
1010 brelse (bh); 1013 brelse (bh);
1011 /* Check to see if we should continue to search */ 1014 /* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
866 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 866 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
867 867
868 /* 868 /*
869 * We can allocate memory for mb_alloc based on the new group
870 * descriptor
871 */
872 if (test_opt(sb, MBALLOC)) {
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
874 if (err)
875 goto exit_journal;
876 }
877 /*
869 * Make the new blocks and inodes valid next. We do this before 878 * Make the new blocks and inodes valid next. We do this before
870 * increasing the group count so that once the group is enabled, 879 * increasing the group count so that once the group is enabled,
871 * all of its blocks and inodes are already valid. 880 * all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
957 handle_t *handle; 966 handle_t *handle;
958 int err; 967 int err;
959 unsigned long freed_blocks; 968 unsigned long freed_blocks;
969 ext4_group_t group;
970 struct ext4_group_info *grp;
960 971
961 /* We don't need to worry about locking wrt other resizers just 972 /* We don't need to worry about locking wrt other resizers just
962 * yet: we're going to revalidate es->s_blocks_count after 973 * yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
988 } 999 }
989 1000
990 /* Handle the remaining blocks in the last group only. */ 1001 /* Handle the remaining blocks in the last group only. */
991 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); 1002 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
992 1003
993 if (last == 0) { 1004 if (last == 0) {
994 ext4_warning(sb, __func__, 1005 ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1060 o_blocks_count + add); 1071 o_blocks_count + add);
1061 if ((err = ext4_journal_stop(handle))) 1072 if ((err = ext4_journal_stop(handle)))
1062 goto exit_put; 1073 goto exit_put;
1074
1075 /*
1076 * Mark mballoc pages as not up to date so that they will be updated
1077 * next time they are loaded by ext4_mb_load_buddy.
1078 */
1079 if (test_opt(sb, MBALLOC)) {
1080 struct ext4_sb_info *sbi = EXT4_SB(sb);
1081 struct inode *inode = sbi->s_buddy_cache;
1082 int blocks_per_page;
1083 int block;
1084 int pnum;
1085 struct page *page;
1086
1087 /* Set buddy page as not up to date */
1088 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1089 block = group * 2;
1090 pnum = block / blocks_per_page;
1091 page = find_get_page(inode->i_mapping, pnum);
1092 if (page != NULL) {
1093 ClearPageUptodate(page);
1094 page_cache_release(page);
1095 }
1096
1097 /* Set bitmap page as not up to date */
1098 block++;
1099 pnum = block / blocks_per_page;
1100 page = find_get_page(inode->i_mapping, pnum);
1101 if (page != NULL) {
1102 ClearPageUptodate(page);
1103 page_cache_release(page);
1104 }
1105
1106 /* Get the info on the last group */
1107 grp = ext4_get_group_info(sb, group);
1108
1109 /* Update free blocks in group info */
1110 ext4_mb_update_group_info(grp, add);
1111 }
1112
1063 if (test_opt(sb, DEBUG)) 1113 if (test_opt(sb, DEBUG))
1064 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1114 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1065 ext4_blocks_count(es)); 1115 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cb96f127c366..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
506 ext4_ext_release(sb); 506 ext4_ext_release(sb);
507 ext4_xattr_put_super(sb); 507 ext4_xattr_put_super(sb);
508 jbd2_journal_destroy(sbi->s_journal); 508 jbd2_journal_destroy(sbi->s_journal);
509 sbi->s_journal = NULL;
509 if (!(sb->s_flags & MS_RDONLY)) { 510 if (!(sb->s_flags & MS_RDONLY)) {
510 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 511 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
511 es->s_state = cpu_to_le16(sbi->s_mount_state); 512 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
517 for (i = 0; i < sbi->s_gdb_count; i++) 518 for (i = 0; i < sbi->s_gdb_count; i++)
518 brelse(sbi->s_group_desc[i]); 519 brelse(sbi->s_group_desc[i]);
519 kfree(sbi->s_group_desc); 520 kfree(sbi->s_group_desc);
521 kfree(sbi->s_flex_groups);
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 522 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 523 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 524 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
571 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 573 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
572 INIT_LIST_HEAD(&ei->i_prealloc_list); 574 INIT_LIST_HEAD(&ei->i_prealloc_list);
573 spin_lock_init(&ei->i_prealloc_lock); 575 spin_lock_init(&ei->i_prealloc_lock);
576 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
577 ei->i_reserved_data_blocks = 0;
578 ei->i_reserved_meta_blocks = 0;
579 ei->i_allocated_meta_blocks = 0;
580 ei->i_delalloc_reserved_flag = 0;
581 spin_lock_init(&(ei->i_block_reservation_lock));
574 return &ei->vfs_inode; 582 return &ei->vfs_inode;
575} 583}
576 584
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
635 EXT4_I(inode)->i_block_alloc_info = NULL; 643 EXT4_I(inode)->i_block_alloc_info = NULL;
636 if (unlikely(rsv)) 644 if (unlikely(rsv))
637 kfree(rsv); 645 kfree(rsv);
646 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
647 &EXT4_I(inode)->jinode);
638} 648}
639 649
640static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) 650static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
671 unsigned long def_mount_opts; 681 unsigned long def_mount_opts;
672 struct super_block *sb = vfs->mnt_sb; 682 struct super_block *sb = vfs->mnt_sb;
673 struct ext4_sb_info *sbi = EXT4_SB(sb); 683 struct ext4_sb_info *sbi = EXT4_SB(sb);
674 journal_t *journal = sbi->s_journal;
675 struct ext4_super_block *es = sbi->s_es; 684 struct ext4_super_block *es = sbi->s_es;
676 685
677 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 686 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
747 seq_puts(seq, ",nomballoc"); 756 seq_puts(seq, ",nomballoc");
748 if (test_opt(sb, I_VERSION)) 757 if (test_opt(sb, I_VERSION))
749 seq_puts(seq, ",i_version"); 758 seq_puts(seq, ",i_version");
759 if (!test_opt(sb, DELALLOC))
760 seq_puts(seq, ",nodelalloc");
761
750 762
751 if (sbi->s_stripe) 763 if (sbi->s_stripe)
752 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 764 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
894 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 906 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
895 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 907 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
896 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 908 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
897 Opt_mballoc, Opt_nomballoc, Opt_stripe, 909 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
898}; 910};
899 911
900static match_table_t tokens = { 912static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
953 {Opt_nomballoc, "nomballoc"}, 965 {Opt_nomballoc, "nomballoc"},
954 {Opt_stripe, "stripe=%u"}, 966 {Opt_stripe, "stripe=%u"},
955 {Opt_resize, "resize"}, 967 {Opt_resize, "resize"},
968 {Opt_delalloc, "delalloc"},
969 {Opt_nodelalloc, "nodelalloc"},
956 {Opt_err, NULL}, 970 {Opt_err, NULL},
957}; 971};
958 972
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
990 int qtype, qfmt; 1004 int qtype, qfmt;
991 char *qname; 1005 char *qname;
992#endif 1006#endif
1007 ext4_fsblk_t last_block;
993 1008
994 if (!options) 1009 if (!options)
995 return 1; 1010 return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
1309 clear_opt(sbi->s_mount_opt, NOBH); 1324 clear_opt(sbi->s_mount_opt, NOBH);
1310 break; 1325 break;
1311 case Opt_extents: 1326 case Opt_extents:
1327 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1328 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1329 ext4_warning(sb, __func__,
1330 "extents feature not enabled "
1331 "on this filesystem, use tune2fs\n");
1332 return 0;
1333 }
1312 set_opt (sbi->s_mount_opt, EXTENTS); 1334 set_opt (sbi->s_mount_opt, EXTENTS);
1313 break; 1335 break;
1314 case Opt_noextents: 1336 case Opt_noextents:
1337 /*
1338 * When e2fsprogs support resizing an already existing
1339 * ext3 file system to greater than 2**32 we need to
1340 * add support to block allocator to handle growing
1341 * already existing block mapped inode so that blocks
1342 * allocated for them fall within 2**32
1343 */
1344 last_block = ext4_blocks_count(sbi->s_es) - 1;
1345 if (last_block > 0xffffffffULL) {
1346 printk(KERN_ERR "EXT4-fs: Filesystem too "
1347 "large to mount with "
1348 "-o noextents options\n");
1349 return 0;
1350 }
1315 clear_opt (sbi->s_mount_opt, EXTENTS); 1351 clear_opt (sbi->s_mount_opt, EXTENTS);
1316 break; 1352 break;
1317 case Opt_i_version: 1353 case Opt_i_version:
1318 set_opt(sbi->s_mount_opt, I_VERSION); 1354 set_opt(sbi->s_mount_opt, I_VERSION);
1319 sb->s_flags |= MS_I_VERSION; 1355 sb->s_flags |= MS_I_VERSION;
1320 break; 1356 break;
1357 case Opt_nodelalloc:
1358 clear_opt(sbi->s_mount_opt, DELALLOC);
1359 break;
1321 case Opt_mballoc: 1360 case Opt_mballoc:
1322 set_opt(sbi->s_mount_opt, MBALLOC); 1361 set_opt(sbi->s_mount_opt, MBALLOC);
1323 break; 1362 break;
@@ -1331,6 +1370,9 @@ set_qf_format:
1331 return 0; 1370 return 0;
1332 sbi->s_stripe = option; 1371 sbi->s_stripe = option;
1333 break; 1372 break;
1373 case Opt_delalloc:
1374 set_opt(sbi->s_mount_opt, DELALLOC);
1375 break;
1334 default: 1376 default:
1335 printk (KERN_ERR 1377 printk (KERN_ERR
1336 "EXT4-fs: Unrecognized mount option \"%s\" " 1378 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1443 return res; 1485 return res;
1444} 1486}
1445 1487
1488static int ext4_fill_flex_info(struct super_block *sb)
1489{
1490 struct ext4_sb_info *sbi = EXT4_SB(sb);
1491 struct ext4_group_desc *gdp = NULL;
1492 struct buffer_head *bh;
1493 ext4_group_t flex_group_count;
1494 ext4_group_t flex_group;
1495 int groups_per_flex = 0;
1496 __u64 block_bitmap = 0;
1497 int i;
1498
1499 if (!sbi->s_es->s_log_groups_per_flex) {
1500 sbi->s_log_groups_per_flex = 0;
1501 return 1;
1502 }
1503
1504 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1505 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1506
1507 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
1508 groups_per_flex;
1509 sbi->s_flex_groups = kmalloc(flex_group_count *
1510 sizeof(struct flex_groups), GFP_KERNEL);
1511 if (sbi->s_flex_groups == NULL) {
1512 printk(KERN_ERR "EXT4-fs: not enough memory\n");
1513 goto failed;
1514 }
1515 memset(sbi->s_flex_groups, 0, flex_group_count *
1516 sizeof(struct flex_groups));
1517
1518 gdp = ext4_get_group_desc(sb, 1, &bh);
1519 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1520
1521 for (i = 0; i < sbi->s_groups_count; i++) {
1522 gdp = ext4_get_group_desc(sb, i, &bh);
1523
1524 flex_group = ext4_flex_group(sbi, i);
1525 sbi->s_flex_groups[flex_group].free_inodes +=
1526 le16_to_cpu(gdp->bg_free_inodes_count);
1527 sbi->s_flex_groups[flex_group].free_blocks +=
1528 le16_to_cpu(gdp->bg_free_blocks_count);
1529 }
1530
1531 return 1;
1532failed:
1533 return 0;
1534}
1535
1446__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, 1536__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1447 struct ext4_group_desc *gdp) 1537 struct ext4_group_desc *gdp)
1448{ 1538{
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1810} 1900}
1811 1901
1812static int ext4_fill_super (struct super_block *sb, void *data, int silent) 1902static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1813 __releases(kernel_sem) 1903 __releases(kernel_lock)
1814 __acquires(kernel_sem) 1904 __acquires(kernel_lock)
1815 1905
1816{ 1906{
1817 struct buffer_head * bh; 1907 struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1851 goto out_fail; 1941 goto out_fail;
1852 } 1942 }
1853 1943
1854 if (!sb_set_blocksize(sb, blocksize)) {
1855 printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
1856 goto out_fail;
1857 }
1858
1859 /* 1944 /*
1860 * The ext4 superblock will not be buffer aligned for other than 1kB 1945 * The ext4 superblock will not be buffer aligned for other than 1kB
1861 * block sizes. We need to calculate the offset from buffer start. 1946 * block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1919 2004
1920 /* 2005 /*
1921 * turn on extents feature by default in ext4 filesystem 2006 * turn on extents feature by default in ext4 filesystem
1922 * User -o noextents to turn it off 2007 * only if feature flag already set by mkfs or tune2fs.
2008 * Use -o noextents to turn it off
1923 */ 2009 */
1924 set_opt(sbi->s_mount_opt, EXTENTS); 2010 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2011 set_opt(sbi->s_mount_opt, EXTENTS);
2012 else
2013 ext4_warning(sb, __func__,
2014 "extents feature not enabled on this filesystem, "
2015 "use tune2fs.\n");
1925 /* 2016 /*
1926 * turn on mballoc feature by default in ext4 filesystem 2017 * turn on mballoc code by default in ext4 filesystem
1927 * User -o nomballoc to turn it off 2018 * Use -o nomballoc to turn it off
1928 */ 2019 */
1929 set_opt(sbi->s_mount_opt, MBALLOC); 2020 set_opt(sbi->s_mount_opt, MBALLOC);
1930 2021
2022 /*
2023 * enable delayed allocation by default
2024 * Use -o nodelalloc to turn it off
2025 */
2026 set_opt(sbi->s_mount_opt, DELALLOC);
2027
2028
1931 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, 2029 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1932 NULL, 0)) 2030 NULL, 0))
1933 goto failed_mount; 2031 goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2138 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2236 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
2139 goto failed_mount2; 2237 goto failed_mount2;
2140 } 2238 }
2239 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2240 if (!ext4_fill_flex_info(sb)) {
2241 printk(KERN_ERR
2242 "EXT4-fs: unable to initialize "
2243 "flex_bg meta info!\n");
2244 goto failed_mount2;
2245 }
2246
2141 sbi->s_gdb_count = db_count; 2247 sbi->s_gdb_count = db_count;
2142 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2248 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2143 spin_lock_init(&sbi->s_next_gen_lock); 2249 spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2358 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2464 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
2359 "writeback"); 2465 "writeback");
2360 2466
2467 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2468 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
2469 "requested data journaling mode\n");
2470 clear_opt(sbi->s_mount_opt, DELALLOC);
2471 } else if (test_opt(sb, DELALLOC))
2472 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2473
2361 ext4_ext_init(sb); 2474 ext4_ext_init(sb);
2362 ext4_mb_init(sb, needs_recovery); 2475 ext4_mb_init(sb, needs_recovery);
2363 2476
@@ -2372,6 +2485,7 @@ cantfind_ext4:
2372 2485
2373failed_mount4: 2486failed_mount4:
2374 jbd2_journal_destroy(sbi->s_journal); 2487 jbd2_journal_destroy(sbi->s_journal);
2488 sbi->s_journal = NULL;
2375failed_mount3: 2489failed_mount3:
2376 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2490 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2377 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2491 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3325 err = ext4_journal_dirty_metadata(handle, bh); 3439 err = ext4_journal_dirty_metadata(handle, bh);
3326 else { 3440 else {
3327 /* Always do at least ordered writes for quotas */ 3441 /* Always do at least ordered writes for quotas */
3328 err = ext4_journal_dirty_data(handle, bh); 3442 err = ext4_jbd2_file_inode(handle, inode);
3329 mark_buffer_dirty(bh); 3443 mark_buffer_dirty(bh);
3330 } 3444 }
3331 brelse(bh); 3445 brelse(bh);
@@ -3337,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3337 blk++; 3451 blk++;
3338 } 3452 }
3339out: 3453out:
3340 if (len == towrite) 3454 if (len == towrite) {
3455 mutex_unlock(&inode->i_mutex);
3341 return err; 3456 return err;
3457 }
3342 if (inode->i_size < off+len-towrite) { 3458 if (inode->i_size < off+len-towrite) {
3343 i_size_write(inode, off+len-towrite); 3459 i_size_write(inode, off+len-towrite);
3344 EXT4_I(inode)->i_disksize = inode->i_size; 3460 EXT4_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
810 /* We need to allocate a new block */ 810 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 811 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 812 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_block(handle, inode, 813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
814 goal, &error); 814 goal, &error);
815 if (error) 815 if (error)
816 goto cleanup; 816 goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
13#include "ext4.h" 13#include "ext4.h"
14#include "xattr.h" 14#include "xattr.h"
15 15
16#define XATTR_TRUSTED_PREFIX "trusted."
17
18static size_t 16static size_t
19ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 17ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len) 18 const char *name, size_t name_len)
21{ 19{
22 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; 20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
23 const size_t total_len = prefix_len + name_len + 1; 21 const size_t total_len = prefix_len + name_len + 1;
24 22
25 if (!capable(CAP_SYS_ADMIN)) 23 if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
12#include "ext4.h" 12#include "ext4.h"
13#include "xattr.h" 13#include "xattr.h"
14 14
15#define XATTR_USER_PREFIX "user."
16
17static size_t 15static size_t
18ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
19 const char *name, size_t name_len) 17 const char *name, size_t name_len)
20{ 18{
21 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; 19 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
22 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
23 21
24 if (!test_opt(inode->i_sb, XATTR_USER)) 22 if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
61 61
62static inline struct fat_cache *fat_cache_alloc(struct inode *inode) 62static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
63{ 63{
64 return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); 64 return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
65} 65}
66 66
67static inline void fat_cache_free(struct fat_cache *cache) 67static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
472 loff_t cpos; 472 loff_t cpos;
473 int ret = 0; 473 int ret = 0;
474 474
475 lock_kernel(); 475 lock_super(sb);
476 476
477 cpos = filp->f_pos; 477 cpos = filp->f_pos;
478 /* Fake . and .. for the root directory. */ 478 /* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
654 if (unicode) 654 if (unicode)
655 __putname(unicode); 655 __putname(unicode);
656out: 656out:
657 unlock_kernel(); 657 unlock_super(sb);
658 return ret; 658 return ret;
659} 659}
660 660
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047e..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/msdos_fs.h> 13#include <linux/msdos_fs.h>
14#include <linux/smp_lock.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/writeback.h> 15#include <linux/writeback.h>
17#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
242 241
243 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 242 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
244 243
245 lock_kernel();
246 fat_free(inode, nr_clusters); 244 fat_free(inode, nr_clusters);
247 unlock_kernel();
248 fat_flush_inodes(inode->i_sb, inode, NULL); 245 fat_flush_inodes(inode->i_sb, inode, NULL);
249} 246}
250 247
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
310 int error = 0; 307 int error = 0;
311 unsigned int ia_valid; 308 unsigned int ia_valid;
312 309
313 lock_kernel();
314
315 /* 310 /*
316 * Expand the file. Since inode_setattr() updates ->i_size 311 * Expand the file. Since inode_setattr() updates ->i_size
317 * before calling the ->truncate(), but FAT needs to fill the 312 * before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
366 361
367 error = inode_setattr(inode, attr); 362 error = inode_setattr(inode, attr);
368out: 363out:
369 unlock_kernel();
370 return error; 364 return error;
371} 365}
372EXPORT_SYMBOL_GPL(fat_setattr); 366EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
440 440
441static void fat_clear_inode(struct inode *inode) 441static void fat_clear_inode(struct inode *inode)
442{ 442{
443 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 443 struct super_block *sb = inode->i_sb;
444 struct msdos_sb_info *sbi = MSDOS_SB(sb);
444 445
445 lock_kernel();
446 spin_lock(&sbi->inode_hash_lock); 446 spin_lock(&sbi->inode_hash_lock);
447 fat_cache_inval_inode(inode); 447 fat_cache_inval_inode(inode);
448 hlist_del_init(&MSDOS_I(inode)->i_fat_hash); 448 hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
449 spin_unlock(&sbi->inode_hash_lock); 449 spin_unlock(&sbi->inode_hash_lock);
450 unlock_kernel();
451} 450}
452 451
453static void fat_write_super(struct super_block *sb) 452static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
485static struct inode *fat_alloc_inode(struct super_block *sb) 484static struct inode *fat_alloc_inode(struct super_block *sb)
486{ 485{
487 struct msdos_inode_info *ei; 486 struct msdos_inode_info *ei;
488 ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); 487 ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
489 if (!ei) 488 if (!ei)
490 return NULL; 489 return NULL;
491 return &ei->vfs_inode; 490 return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
567 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) 566 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
568 return 0; 567 return 0;
569 568
570 lock_kernel(); 569 lock_super(sb);
571 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); 570 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
572 if (!bh) { 571 if (!bh) {
573 printk(KERN_ERR "FAT: unable to read inode block " 572 printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
579 if (i_pos != MSDOS_I(inode)->i_pos) { 578 if (i_pos != MSDOS_I(inode)->i_pos) {
580 spin_unlock(&sbi->inode_hash_lock); 579 spin_unlock(&sbi->inode_hash_lock);
581 brelse(bh); 580 brelse(bh);
582 unlock_kernel(); 581 unlock_super(sb);
583 goto retry; 582 goto retry;
584 } 583 }
585 584
@@ -606,7 +605,7 @@ retry:
606 err = sync_dirty_buffer(bh); 605 err = sync_dirty_buffer(bh);
607 brelse(bh); 606 brelse(bh);
608out: 607out:
609 unlock_kernel(); 608 unlock_super(sb);
610 return err; 609 return err;
611} 610}
612 611
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
736 735
737static struct dentry *fat_get_parent(struct dentry *child) 736static struct dentry *fat_get_parent(struct dentry *child)
738{ 737{
738 struct super_block *sb = child->d_sb;
739 struct buffer_head *bh; 739 struct buffer_head *bh;
740 struct msdos_dir_entry *de; 740 struct msdos_dir_entry *de;
741 loff_t i_pos; 741 loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
743 struct inode *inode; 743 struct inode *inode;
744 int err; 744 int err;
745 745
746 lock_kernel(); 746 lock_super(sb);
747 747
748 err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); 748 err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
749 if (err) { 749 if (err) {
750 parent = ERR_PTR(err); 750 parent = ERR_PTR(err);
751 goto out; 751 goto out;
752 } 752 }
753 inode = fat_build_inode(child->d_sb, de, i_pos); 753 inode = fat_build_inode(sb, de, i_pos);
754 brelse(bh); 754 brelse(bh);
755 if (IS_ERR(inode)) { 755 if (IS_ERR(inode)) {
756 parent = ERR_CAST(inode); 756 parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
762 parent = ERR_PTR(-ENOMEM); 762 parent = ERR_PTR(-ENOMEM);
763 } 763 }
764out: 764out:
765 unlock_kernel(); 765 unlock_super(sb);
766 766
767 return parent; 767 return parent;
768} 768}
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1172 long error; 1172 long error;
1173 char buf[50]; 1173 char buf[50];
1174 1174
1175 /*
1176 * GFP_KERNEL is ok here, because while we do hold the
1177 * supeblock lock, memory pressure can't call back into
1178 * the filesystem, since we're only just about to mount
1179 * it and have no inodes etc active!
1180 */
1175 sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); 1181 sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
1176 if (!sbi) 1182 if (!sbi)
1177 return -ENOMEM; 1183 return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
12#include <linux/fdtable.h> 12#include <linux/fdtable.h>
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/smp_lock.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
227 if (error) 226 if (error)
228 return error; 227 return error;
229 228
230 lock_kernel();
231 if ((arg ^ filp->f_flags) & FASYNC) { 229 if ((arg ^ filp->f_flags) & FASYNC) {
232 if (filp->f_op && filp->f_op->fasync) { 230 if (filp->f_op && filp->f_op->fasync) {
233 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); 231 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
238 236
239 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); 237 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
240 out: 238 out:
241 unlock_kernel();
242 return error; 239 return error;
243} 240}
244 241
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae45f77765c0..25adfc3c693a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so 424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
425 * that it can be located for waiting on in __writeback_single_inode(). 425 * that it can be located for waiting on in __writeback_single_inode().
426 * 426 *
427 * Called under inode_lock.
428 *
429 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 427 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
430 * This function assumes that the blockdev superblock's inodes are backed by 428 * This function assumes that the blockdev superblock's inodes are backed by
431 * a variety of queues, so all inodes are searched. For other superblocks, 429 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -441,11 +439,12 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
441 * on the writer throttling path, and we get decent balancing between many 439 * on the writer throttling path, and we get decent balancing between many
442 * throttled threads: we don't want them all piling up on inode_sync_wait. 440 * throttled threads: we don't want them all piling up on inode_sync_wait.
443 */ 441 */
444static void 442void generic_sync_sb_inodes(struct super_block *sb,
445sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) 443 struct writeback_control *wbc)
446{ 444{
447 const unsigned long start = jiffies; /* livelock avoidance */ 445 const unsigned long start = jiffies; /* livelock avoidance */
448 446
447 spin_lock(&inode_lock);
449 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 448 if (!wbc->for_kupdate || list_empty(&sb->s_io))
450 queue_io(sb, wbc->older_than_this); 449 queue_io(sb, wbc->older_than_this);
451 450
@@ -524,8 +523,16 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
524 if (!list_empty(&sb->s_more_io)) 523 if (!list_empty(&sb->s_more_io))
525 wbc->more_io = 1; 524 wbc->more_io = 1;
526 } 525 }
526 spin_unlock(&inode_lock);
527 return; /* Leave any unwritten inodes on s_io */ 527 return; /* Leave any unwritten inodes on s_io */
528} 528}
529EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
530
531static void sync_sb_inodes(struct super_block *sb,
532 struct writeback_control *wbc)
533{
534 generic_sync_sb_inodes(sb, wbc);
535}
529 536
530/* 537/*
531 * Start writeback of dirty pagecache data against all unlocked inodes. 538 * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -565,11 +572,8 @@ restart:
565 * be unmounted by the time it is released. 572 * be unmounted by the time it is released.
566 */ 573 */
567 if (down_read_trylock(&sb->s_umount)) { 574 if (down_read_trylock(&sb->s_umount)) {
568 if (sb->s_root) { 575 if (sb->s_root)
569 spin_lock(&inode_lock);
570 sync_sb_inodes(sb, wbc); 576 sync_sb_inodes(sb, wbc);
571 spin_unlock(&inode_lock);
572 }
573 up_read(&sb->s_umount); 577 up_read(&sb->s_umount);
574 } 578 }
575 spin_lock(&sb_lock); 579 spin_lock(&sb_lock);
@@ -607,9 +611,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
607 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 611 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
608 nr_dirty + nr_unstable; 612 nr_dirty + nr_unstable;
609 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ 613 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
610 spin_lock(&inode_lock);
611 sync_sb_inodes(sb, &wbc); 614 sync_sb_inodes(sb, &wbc);
612 spin_unlock(&inode_lock);
613} 615}
614 616
615/* 617/*
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfbb..ab2f57e3fb87 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
14 GFS is perfect consistency -- changes made to the filesystem on one 14 GFS is perfect consistency -- changes made to the filesystem on one
15 machine show up immediately on all other machines in the cluster. 15 machine show up immediately on all other machines in the cluster.
16 16
17 To use the GFS2 filesystem, you will need to enable one or more of 17 To use the GFS2 filesystem in a cluster, you will need to enable
18 the below locking modules. Documentation and utilities for GFS2 can 18 the locking module below. Documentation and utilities for GFS2 can
19 be found here: http://sources.redhat.com/cluster 19 be found here: http://sources.redhat.com/cluster
20 20
21config GFS2_FS_LOCKING_NOLOCK 21 The "nolock" lock module is now built in to GFS2 by default.
22 tristate "GFS2 \"nolock\" locking module"
23 depends on GFS2_FS
24 help
25 Single node locking module for GFS2.
26
27 Use this module if you want to use GFS2 on a single node without
28 its clustering features. You can still take advantage of the
29 large file support, and upgrade to running a full cluster later on
30 if required.
31
32 If you will only be using GFS2 in cluster mode, you do not need this
33 module.
34 22
35config GFS2_FS_LOCKING_DLM 23config GFS2_FS_LOCKING_DLM
36 tristate "GFS2 DLM locking module" 24 tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a07..ec65851ec80a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
7 7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ 8obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10 9
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b56..ef606e3a5cf4 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
16}; 16};
17 17
18enum { 18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0, 19 NO_FORCE = 0,
25 FORCE = 1, 20 FORCE = 1,
26}; 21};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5d..13391e546616 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list; 45 struct hlist_head hb_list;
46}; 46};
47 47
48struct glock_iter { 48struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 49 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 50 struct gfs2_sbd *sdp; /* incore superblock */
51 struct gfs2_glock *gl; /* current glock struct */ 51 struct gfs2_glock *gl; /* current glock struct */
52 struct seq_file *seq; /* sequence file for debugfs */ 52 char string[512]; /* scratch space */
53 char string[512]; /* scratch space */
54}; 53};
55 54
56typedef void (*glock_examiner) (struct gfs2_glock * gl); 55typedef void (*glock_examiner) (struct gfs2_glock * gl);
57 56
58static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); 57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
59static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); 58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void gfs2_glock_drop_th(struct gfs2_glock *gl); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62static void run_queue(struct gfs2_glock *gl);
63 61
64static DECLARE_RWSEM(gfs2_umount_flush_sem); 62static DECLARE_RWSEM(gfs2_umount_flush_sem);
65static struct dentry *gfs2_root; 63static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
123#endif 121#endif
124 122
125/** 123/**
126 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
127 * @actual: the current state of the lock
128 * @requested: the lock state that was requested by the caller
129 * @flags: the modifier flags passed in by the caller
130 *
131 * Returns: 1 if the locks are compatible, 0 otherwise
132 */
133
134static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
135 int flags)
136{
137 if (actual == requested)
138 return 1;
139
140 if (flags & GL_EXACT)
141 return 0;
142
143 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
144 return 1;
145
146 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
147 return 1;
148
149 return 0;
150}
151
152/**
153 * gl_hash() - Turn glock number into hash bucket number 124 * gl_hash() - Turn glock number into hash bucket number
154 * @lock: The glock number 125 * @lock: The glock number
155 * 126 *
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
182 struct gfs2_sbd *sdp = gl->gl_sbd; 153 struct gfs2_sbd *sdp = gl->gl_sbd;
183 struct inode *aspace = gl->gl_aspace; 154 struct inode *aspace = gl->gl_aspace;
184 155
185 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 156 if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
186 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); 157 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
187 158
188 if (aspace) 159 if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
211int gfs2_glock_put(struct gfs2_glock *gl) 182int gfs2_glock_put(struct gfs2_glock *gl)
212{ 183{
213 int rv = 0; 184 int rv = 0;
214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 185
216 write_lock(gl_lock_addr(gl->gl_hash)); 186 write_lock(gl_lock_addr(gl->gl_hash));
217 if (atomic_dec_and_test(&gl->gl_ref)) { 187 if (atomic_dec_and_test(&gl->gl_ref)) {
218 hlist_del(&gl->gl_list); 188 hlist_del(&gl->gl_list);
219 write_unlock(gl_lock_addr(gl->gl_hash)); 189 write_unlock(gl_lock_addr(gl->gl_hash));
220 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); 190 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
221 gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); 191 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
222 gfs2_assert(sdp, list_empty(&gl->gl_holders)); 192 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
223 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
224 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
225 glock_free(gl); 193 glock_free(gl);
226 rv = 1; 194 rv = 1;
227 goto out; 195 goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
281 return gl; 249 return gl;
282} 250}
283 251
252/**
253 * may_grant - check if its ok to grant a new lock
254 * @gl: The glock
255 * @gh: The lock request which we wish to grant
256 *
257 * Returns: true if its ok to grant the lock
258 */
259
260static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
261{
262 const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
263 if ((gh->gh_state == LM_ST_EXCLUSIVE ||
264 gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
265 return 0;
266 if (gl->gl_state == gh->gh_state)
267 return 1;
268 if (gh->gh_flags & GL_EXACT)
269 return 0;
270 if (gl->gl_state == LM_ST_EXCLUSIVE) {
271 if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
272 return 1;
273 if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
274 return 1;
275 }
276 if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
277 return 1;
278 return 0;
279}
280
281static void gfs2_holder_wake(struct gfs2_holder *gh)
282{
283 clear_bit(HIF_WAIT, &gh->gh_iflags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
286}
287
288/**
289 * do_promote - promote as many requests as possible on the current queue
290 * @gl: The glock
291 *
292 * Returns: true if there is a blocked holder at the head of the list
293 */
294
295static int do_promote(struct gfs2_glock *gl)
296{
297 const struct gfs2_glock_operations *glops = gl->gl_ops;
298 struct gfs2_holder *gh, *tmp;
299 int ret;
300
301restart:
302 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
303 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
304 continue;
305 if (may_grant(gl, gh)) {
306 if (gh->gh_list.prev == &gl->gl_holders &&
307 glops->go_lock) {
308 spin_unlock(&gl->gl_spin);
309 /* FIXME: eliminate this eventually */
310 ret = glops->go_lock(gh);
311 spin_lock(&gl->gl_spin);
312 if (ret) {
313 gh->gh_error = ret;
314 list_del_init(&gh->gh_list);
315 gfs2_holder_wake(gh);
316 goto restart;
317 }
318 set_bit(HIF_HOLDER, &gh->gh_iflags);
319 gfs2_holder_wake(gh);
320 goto restart;
321 }
322 set_bit(HIF_HOLDER, &gh->gh_iflags);
323 gfs2_holder_wake(gh);
324 continue;
325 }
326 if (gh->gh_list.prev == &gl->gl_holders)
327 return 1;
328 break;
329 }
330 return 0;
331}
332
333/**
334 * do_error - Something unexpected has happened during a lock request
335 *
336 */
337
338static inline void do_error(struct gfs2_glock *gl, const int ret)
339{
340 struct gfs2_holder *gh, *tmp;
341
342 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
343 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
344 continue;
345 if (ret & LM_OUT_ERROR)
346 gh->gh_error = -EIO;
347 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
348 gh->gh_error = GLR_TRYFAILED;
349 else
350 continue;
351 list_del_init(&gh->gh_list);
352 gfs2_holder_wake(gh);
353 }
354}
355
356/**
357 * find_first_waiter - find the first gh that's waiting for the glock
358 * @gl: the glock
359 */
360
361static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
362{
363 struct gfs2_holder *gh;
364
365 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
366 if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
367 return gh;
368 }
369 return NULL;
370}
371
372/**
373 * state_change - record that the glock is now in a different state
374 * @gl: the glock
375 * @new_state the new state
376 *
377 */
378
379static void state_change(struct gfs2_glock *gl, unsigned int new_state)
380{
381 int held1, held2;
382
383 held1 = (gl->gl_state != LM_ST_UNLOCKED);
384 held2 = (new_state != LM_ST_UNLOCKED);
385
386 if (held1 != held2) {
387 if (held2)
388 gfs2_glock_hold(gl);
389 else
390 gfs2_glock_put(gl);
391 }
392
393 gl->gl_state = new_state;
394 gl->gl_tchange = jiffies;
395}
396
397static void gfs2_demote_wake(struct gfs2_glock *gl)
398{
399 gl->gl_demote_state = LM_ST_EXCLUSIVE;
400 clear_bit(GLF_DEMOTE, &gl->gl_flags);
401 smp_mb__after_clear_bit();
402 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
403}
404
405/**
406 * finish_xmote - The DLM has replied to one of our lock requests
407 * @gl: The glock
408 * @ret: The status from the DLM
409 *
410 */
411
412static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
413{
414 const struct gfs2_glock_operations *glops = gl->gl_ops;
415 struct gfs2_holder *gh;
416 unsigned state = ret & LM_OUT_ST_MASK;
417
418 spin_lock(&gl->gl_spin);
419 state_change(gl, state);
420 gh = find_first_waiter(gl);
421
422 /* Demote to UN request arrived during demote to SH or DF */
423 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
424 state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
425 gl->gl_target = LM_ST_UNLOCKED;
426
427 /* Check for state != intended state */
428 if (unlikely(state != gl->gl_target)) {
429 if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
430 /* move to back of queue and try next entry */
431 if (ret & LM_OUT_CANCELED) {
432 if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
433 list_move_tail(&gh->gh_list, &gl->gl_holders);
434 gh = find_first_waiter(gl);
435 gl->gl_target = gh->gh_state;
436 goto retry;
437 }
438 /* Some error or failed "try lock" - report it */
439 if ((ret & LM_OUT_ERROR) ||
440 (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
441 gl->gl_target = gl->gl_state;
442 do_error(gl, ret);
443 goto out;
444 }
445 }
446 switch(state) {
447 /* Unlocked due to conversion deadlock, try again */
448 case LM_ST_UNLOCKED:
449retry:
450 do_xmote(gl, gh, gl->gl_target);
451 break;
452 /* Conversion fails, unlock and try again */
453 case LM_ST_SHARED:
454 case LM_ST_DEFERRED:
455 do_xmote(gl, gh, LM_ST_UNLOCKED);
456 break;
457 default: /* Everything else */
458 printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
459 GLOCK_BUG_ON(gl, 1);
460 }
461 spin_unlock(&gl->gl_spin);
462 gfs2_glock_put(gl);
463 return;
464 }
465
466 /* Fast path - we got what we asked for */
467 if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
468 gfs2_demote_wake(gl);
469 if (state != LM_ST_UNLOCKED) {
470 if (glops->go_xmote_bh) {
471 int rv;
472 spin_unlock(&gl->gl_spin);
473 rv = glops->go_xmote_bh(gl, gh);
474 if (rv == -EAGAIN)
475 return;
476 spin_lock(&gl->gl_spin);
477 if (rv) {
478 do_error(gl, rv);
479 goto out;
480 }
481 }
482 do_promote(gl);
483 }
484out:
485 clear_bit(GLF_LOCK, &gl->gl_flags);
486 spin_unlock(&gl->gl_spin);
487 gfs2_glock_put(gl);
488}
489
490static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
491 unsigned int cur_state, unsigned int req_state,
492 unsigned int flags)
493{
494 int ret = LM_OUT_ERROR;
495
496 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
497 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
498
499 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
500 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
501 req_state, flags);
502 return ret;
503}
504
505/**
506 * do_xmote - Calls the DLM to change the state of a lock
507 * @gl: The lock state
508 * @gh: The holder (only for promotes)
509 * @target: The target lock state
510 *
511 */
512
513static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
514{
515 const struct gfs2_glock_operations *glops = gl->gl_ops;
516 struct gfs2_sbd *sdp = gl->gl_sbd;
517 unsigned int lck_flags = gh ? gh->gh_flags : 0;
518 int ret;
519
520 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
521 LM_FLAG_PRIORITY);
522 BUG_ON(gl->gl_state == target);
523 BUG_ON(gl->gl_state == gl->gl_target);
524 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
525 glops->go_inval) {
526 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
527 do_error(gl, 0); /* Fail queued try locks */
528 }
529 spin_unlock(&gl->gl_spin);
530 if (glops->go_xmote_th)
531 glops->go_xmote_th(gl);
532 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
533 glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
534 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
535
536 gfs2_glock_hold(gl);
537 if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
538 gl->gl_state == LM_ST_DEFERRED) &&
539 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
540 lck_flags |= LM_FLAG_TRY_1CB;
541 ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
542
543 if (!(ret & LM_OUT_ASYNC)) {
544 finish_xmote(gl, ret);
545 gfs2_glock_hold(gl);
546 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
547 gfs2_glock_put(gl);
548 } else {
549 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
550 }
551 spin_lock(&gl->gl_spin);
552}
553
554/**
555 * find_first_holder - find the first "holder" gh
556 * @gl: the glock
557 */
558
559static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
560{
561 struct gfs2_holder *gh;
562
563 if (!list_empty(&gl->gl_holders)) {
564 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
565 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
566 return gh;
567 }
568 return NULL;
569}
570
571/**
572 * run_queue - do all outstanding tasks related to a glock
573 * @gl: The glock in question
574 * @nonblock: True if we must not block in run_queue
575 *
576 */
577
578static void run_queue(struct gfs2_glock *gl, const int nonblock)
579{
580 struct gfs2_holder *gh = NULL;
581
582 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
583 return;
584
585 GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
586
587 if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
588 gl->gl_demote_state != gl->gl_state) {
589 if (find_first_holder(gl))
590 goto out;
591 if (nonblock)
592 goto out_sched;
593 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
594 GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
595 gl->gl_target = gl->gl_demote_state;
596 } else {
597 if (test_bit(GLF_DEMOTE, &gl->gl_flags))
598 gfs2_demote_wake(gl);
599 if (do_promote(gl) == 0)
600 goto out;
601 gh = find_first_waiter(gl);
602 gl->gl_target = gh->gh_state;
603 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
604 do_error(gl, 0); /* Fail queued try locks */
605 }
606 do_xmote(gl, gh, gl->gl_target);
607 return;
608
609out_sched:
610 gfs2_glock_hold(gl);
611 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
612 gfs2_glock_put(gl);
613out:
614 clear_bit(GLF_LOCK, &gl->gl_flags);
615}
616
284static void glock_work_func(struct work_struct *work) 617static void glock_work_func(struct work_struct *work)
285{ 618{
619 unsigned long delay = 0;
286 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); 620 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
287 621
622 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
623 finish_xmote(gl, gl->gl_reply);
288 spin_lock(&gl->gl_spin); 624 spin_lock(&gl->gl_spin);
289 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) 625 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
290 set_bit(GLF_DEMOTE, &gl->gl_flags); 626 gl->gl_state != LM_ST_UNLOCKED &&
291 run_queue(gl); 627 gl->gl_demote_state != LM_ST_EXCLUSIVE) {
628 unsigned long holdtime, now = jiffies;
629 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
630 if (time_before(now, holdtime))
631 delay = holdtime - now;
632 set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
633 }
634 run_queue(gl, 0);
292 spin_unlock(&gl->gl_spin); 635 spin_unlock(&gl->gl_spin);
293 gfs2_glock_put(gl); 636 if (!delay ||
637 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
638 gfs2_glock_put(gl);
294} 639}
295 640
296static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, 641static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
297 void **lockp) 642 void **lockp)
298{ 643{
299 int error = -EIO; 644 int error = -EIO;
645 if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
646 return 0;
300 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 647 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
301 error = sdp->sd_lockstruct.ls_ops->lm_get_lock( 648 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
302 sdp->sd_lockstruct.ls_lockspace, name, lockp); 649 sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
342 gl->gl_name = name; 689 gl->gl_name = name;
343 atomic_set(&gl->gl_ref, 1); 690 atomic_set(&gl->gl_ref, 1);
344 gl->gl_state = LM_ST_UNLOCKED; 691 gl->gl_state = LM_ST_UNLOCKED;
692 gl->gl_target = LM_ST_UNLOCKED;
345 gl->gl_demote_state = LM_ST_EXCLUSIVE; 693 gl->gl_demote_state = LM_ST_EXCLUSIVE;
346 gl->gl_hash = hash; 694 gl->gl_hash = hash;
347 gl->gl_owner_pid = NULL;
348 gl->gl_ip = 0;
349 gl->gl_ops = glops; 695 gl->gl_ops = glops;
350 gl->gl_req_gh = NULL;
351 gl->gl_stamp = jiffies; 696 gl->gl_stamp = jiffies;
352 gl->gl_tchange = jiffies; 697 gl->gl_tchange = jiffies;
353 gl->gl_object = NULL; 698 gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
447 gh->gh_ip = 0; 792 gh->gh_ip = 0;
448} 793}
449 794
450static void gfs2_holder_wake(struct gfs2_holder *gh)
451{
452 clear_bit(HIF_WAIT, &gh->gh_iflags);
453 smp_mb__after_clear_bit();
454 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
455}
456
457static int just_schedule(void *word) 795static int just_schedule(void *word)
458{ 796{
459 schedule(); 797 schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
466 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); 804 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
467} 805}
468 806
469static void gfs2_demote_wake(struct gfs2_glock *gl)
470{
471 gl->gl_demote_state = LM_ST_EXCLUSIVE;
472 clear_bit(GLF_DEMOTE, &gl->gl_flags);
473 smp_mb__after_clear_bit();
474 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
475}
476
477static void wait_on_demote(struct gfs2_glock *gl) 807static void wait_on_demote(struct gfs2_glock *gl)
478{ 808{
479 might_sleep(); 809 might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
481} 811}
482 812
483/** 813/**
484 * rq_mutex - process a mutex request in the queue
485 * @gh: the glock holder
486 *
487 * Returns: 1 if the queue is blocked
488 */
489
490static int rq_mutex(struct gfs2_holder *gh)
491{
492 struct gfs2_glock *gl = gh->gh_gl;
493
494 list_del_init(&gh->gh_list);
495 /* gh->gh_error never examined. */
496 set_bit(GLF_LOCK, &gl->gl_flags);
497 clear_bit(HIF_WAIT, &gh->gh_iflags);
498 smp_mb();
499 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
500
501 return 1;
502}
503
504/**
505 * rq_promote - process a promote request in the queue
506 * @gh: the glock holder
507 *
508 * Acquire a new inter-node lock, or change a lock state to more restrictive.
509 *
510 * Returns: 1 if the queue is blocked
511 */
512
513static int rq_promote(struct gfs2_holder *gh)
514{
515 struct gfs2_glock *gl = gh->gh_gl;
516
517 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
518 if (list_empty(&gl->gl_holders)) {
519 gl->gl_req_gh = gh;
520 set_bit(GLF_LOCK, &gl->gl_flags);
521 spin_unlock(&gl->gl_spin);
522 gfs2_glock_xmote_th(gh->gh_gl, gh);
523 spin_lock(&gl->gl_spin);
524 }
525 return 1;
526 }
527
528 if (list_empty(&gl->gl_holders)) {
529 set_bit(HIF_FIRST, &gh->gh_iflags);
530 set_bit(GLF_LOCK, &gl->gl_flags);
531 } else {
532 struct gfs2_holder *next_gh;
533 if (gh->gh_state == LM_ST_EXCLUSIVE)
534 return 1;
535 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
536 gh_list);
537 if (next_gh->gh_state == LM_ST_EXCLUSIVE)
538 return 1;
539 }
540
541 list_move_tail(&gh->gh_list, &gl->gl_holders);
542 gh->gh_error = 0;
543 set_bit(HIF_HOLDER, &gh->gh_iflags);
544
545 gfs2_holder_wake(gh);
546
547 return 0;
548}
549
550/**
551 * rq_demote - process a demote request in the queue
552 * @gh: the glock holder
553 *
554 * Returns: 1 if the queue is blocked
555 */
556
557static int rq_demote(struct gfs2_glock *gl)
558{
559 if (!list_empty(&gl->gl_holders))
560 return 1;
561
562 if (gl->gl_state == gl->gl_demote_state ||
563 gl->gl_state == LM_ST_UNLOCKED) {
564 gfs2_demote_wake(gl);
565 return 0;
566 }
567
568 set_bit(GLF_LOCK, &gl->gl_flags);
569 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
570
571 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
572 gl->gl_state != LM_ST_EXCLUSIVE) {
573 spin_unlock(&gl->gl_spin);
574 gfs2_glock_drop_th(gl);
575 } else {
576 spin_unlock(&gl->gl_spin);
577 gfs2_glock_xmote_th(gl, NULL);
578 }
579
580 spin_lock(&gl->gl_spin);
581 clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
582
583 return 0;
584}
585
586/**
587 * run_queue - process holder structures on a glock
588 * @gl: the glock
589 *
590 */
591static void run_queue(struct gfs2_glock *gl)
592{
593 struct gfs2_holder *gh;
594 int blocked = 1;
595
596 for (;;) {
597 if (test_bit(GLF_LOCK, &gl->gl_flags))
598 break;
599
600 if (!list_empty(&gl->gl_waiters1)) {
601 gh = list_entry(gl->gl_waiters1.next,
602 struct gfs2_holder, gh_list);
603 blocked = rq_mutex(gh);
604 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
605 blocked = rq_demote(gl);
606 if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
607 !blocked) {
608 set_bit(GLF_DEMOTE, &gl->gl_flags);
609 gl->gl_demote_state = LM_ST_UNLOCKED;
610 }
611 clear_bit(GLF_WAITERS2, &gl->gl_flags);
612 } else if (!list_empty(&gl->gl_waiters3)) {
613 gh = list_entry(gl->gl_waiters3.next,
614 struct gfs2_holder, gh_list);
615 blocked = rq_promote(gh);
616 } else
617 break;
618
619 if (blocked)
620 break;
621 }
622}
623
624/**
625 * gfs2_glmutex_lock - acquire a local lock on a glock
626 * @gl: the glock
627 *
628 * Gives caller exclusive access to manipulate a glock structure.
629 */
630
631static void gfs2_glmutex_lock(struct gfs2_glock *gl)
632{
633 spin_lock(&gl->gl_spin);
634 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
635 struct gfs2_holder gh;
636
637 gfs2_holder_init(gl, 0, 0, &gh);
638 set_bit(HIF_WAIT, &gh.gh_iflags);
639 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
640 spin_unlock(&gl->gl_spin);
641 wait_on_holder(&gh);
642 gfs2_holder_uninit(&gh);
643 } else {
644 gl->gl_owner_pid = get_pid(task_pid(current));
645 gl->gl_ip = (unsigned long)__builtin_return_address(0);
646 spin_unlock(&gl->gl_spin);
647 }
648}
649
650/**
651 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
652 * @gl: the glock
653 *
654 * Returns: 1 if the glock is acquired
655 */
656
657static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
658{
659 int acquired = 1;
660
661 spin_lock(&gl->gl_spin);
662 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
663 acquired = 0;
664 } else {
665 gl->gl_owner_pid = get_pid(task_pid(current));
666 gl->gl_ip = (unsigned long)__builtin_return_address(0);
667 }
668 spin_unlock(&gl->gl_spin);
669
670 return acquired;
671}
672
673/**
674 * gfs2_glmutex_unlock - release a local lock on a glock
675 * @gl: the glock
676 *
677 */
678
679static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
680{
681 struct pid *pid;
682
683 spin_lock(&gl->gl_spin);
684 clear_bit(GLF_LOCK, &gl->gl_flags);
685 pid = gl->gl_owner_pid;
686 gl->gl_owner_pid = NULL;
687 gl->gl_ip = 0;
688 run_queue(gl);
689 spin_unlock(&gl->gl_spin);
690
691 put_pid(pid);
692}
693
694/**
695 * handle_callback - process a demote request 814 * handle_callback - process a demote request
696 * @gl: the glock 815 * @gl: the glock
697 * @state: the state the caller wants us to change to 816 * @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
705{ 824{
706 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; 825 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
707 826
708 spin_lock(&gl->gl_spin);
709 set_bit(bit, &gl->gl_flags); 827 set_bit(bit, &gl->gl_flags);
710 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { 828 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
711 gl->gl_demote_state = state; 829 gl->gl_demote_state = state;
712 gl->gl_demote_time = jiffies; 830 gl->gl_demote_time = jiffies;
713 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && 831 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
714 gl->gl_object) { 832 gl->gl_object)
715 gfs2_glock_schedule_for_reclaim(gl); 833 gfs2_glock_schedule_for_reclaim(gl);
716 spin_unlock(&gl->gl_spin);
717 return;
718 }
719 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 834 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
720 gl->gl_demote_state != state) { 835 gl->gl_demote_state != state) {
721 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) 836 gl->gl_demote_state = LM_ST_UNLOCKED;
722 set_bit(GLF_WAITERS2, &gl->gl_flags);
723 else
724 gl->gl_demote_state = LM_ST_UNLOCKED;
725 }
726 spin_unlock(&gl->gl_spin);
727}
728
729/**
730 * state_change - record that the glock is now in a different state
731 * @gl: the glock
732 * @new_state the new state
733 *
734 */
735
736static void state_change(struct gfs2_glock *gl, unsigned int new_state)
737{
738 int held1, held2;
739
740 held1 = (gl->gl_state != LM_ST_UNLOCKED);
741 held2 = (new_state != LM_ST_UNLOCKED);
742
743 if (held1 != held2) {
744 if (held2)
745 gfs2_glock_hold(gl);
746 else
747 gfs2_glock_put(gl);
748 } 837 }
749
750 gl->gl_state = new_state;
751 gl->gl_tchange = jiffies;
752} 838}
753 839
754/** 840/**
755 * drop_bh - Called after a lock module unlock completes 841 * gfs2_glock_wait - wait on a glock acquisition
756 * @gl: the glock
757 * @ret: the return status
758 *
759 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
760 * Doesn't drop the reference on the glock the top half took out
761 *
762 */
763
764static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
765{
766 struct gfs2_sbd *sdp = gl->gl_sbd;
767 struct gfs2_holder *gh = gl->gl_req_gh;
768
769 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
770 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
771 gfs2_assert_warn(sdp, !ret);
772
773 state_change(gl, LM_ST_UNLOCKED);
774
775 if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
776 spin_lock(&gl->gl_spin);
777 gh->gh_error = 0;
778 spin_unlock(&gl->gl_spin);
779 gfs2_glock_xmote_th(gl, gl->gl_req_gh);
780 gfs2_glock_put(gl);
781 return;
782 }
783
784 spin_lock(&gl->gl_spin);
785 gfs2_demote_wake(gl);
786 clear_bit(GLF_LOCK, &gl->gl_flags);
787 spin_unlock(&gl->gl_spin);
788 gfs2_glock_put(gl);
789}
790
791/**
792 * xmote_bh - Called after the lock module is done acquiring a lock
793 * @gl: The glock in question
794 * @ret: the int returned from the lock module
795 *
796 */
797
798static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
799{
800 struct gfs2_sbd *sdp = gl->gl_sbd;
801 const struct gfs2_glock_operations *glops = gl->gl_ops;
802 struct gfs2_holder *gh = gl->gl_req_gh;
803 int op_done = 1;
804
805 if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
806 drop_bh(gl, ret);
807 return;
808 }
809
810 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
811 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
812 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
813
814 state_change(gl, ret & LM_OUT_ST_MASK);
815
816 /* Deal with each possible exit condition */
817
818 if (!gh) {
819 gl->gl_stamp = jiffies;
820 if (ret & LM_OUT_CANCELED) {
821 op_done = 0;
822 } else {
823 spin_lock(&gl->gl_spin);
824 if (gl->gl_state != gl->gl_demote_state) {
825 spin_unlock(&gl->gl_spin);
826 gfs2_glock_drop_th(gl);
827 gfs2_glock_put(gl);
828 return;
829 }
830 gfs2_demote_wake(gl);
831 spin_unlock(&gl->gl_spin);
832 }
833 } else {
834 spin_lock(&gl->gl_spin);
835 if (ret & LM_OUT_CONV_DEADLK) {
836 gh->gh_error = 0;
837 set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
838 spin_unlock(&gl->gl_spin);
839 gfs2_glock_drop_th(gl);
840 gfs2_glock_put(gl);
841 return;
842 }
843 list_del_init(&gh->gh_list);
844 gh->gh_error = -EIO;
845 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
846 goto out;
847 gh->gh_error = GLR_CANCELED;
848 if (ret & LM_OUT_CANCELED)
849 goto out;
850 if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
851 list_add_tail(&gh->gh_list, &gl->gl_holders);
852 gh->gh_error = 0;
853 set_bit(HIF_HOLDER, &gh->gh_iflags);
854 set_bit(HIF_FIRST, &gh->gh_iflags);
855 op_done = 0;
856 goto out;
857 }
858 gh->gh_error = GLR_TRYFAILED;
859 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
860 goto out;
861 gh->gh_error = -EINVAL;
862 if (gfs2_assert_withdraw(sdp, 0) == -1)
863 fs_err(sdp, "ret = 0x%.8X\n", ret);
864out:
865 spin_unlock(&gl->gl_spin);
866 }
867
868 if (glops->go_xmote_bh)
869 glops->go_xmote_bh(gl);
870
871 if (op_done) {
872 spin_lock(&gl->gl_spin);
873 gl->gl_req_gh = NULL;
874 clear_bit(GLF_LOCK, &gl->gl_flags);
875 spin_unlock(&gl->gl_spin);
876 }
877
878 gfs2_glock_put(gl);
879
880 if (gh)
881 gfs2_holder_wake(gh);
882}
883
884static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
885 unsigned int cur_state, unsigned int req_state,
886 unsigned int flags)
887{
888 int ret = 0;
889 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
890 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
891 req_state, flags);
892 return ret;
893}
894
895/**
896 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
897 * @gl: The glock in question
898 * @state: the requested state
899 * @flags: modifier flags to the lock call
900 *
901 */
902
903static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
904{
905 struct gfs2_sbd *sdp = gl->gl_sbd;
906 int flags = gh ? gh->gh_flags : 0;
907 unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
908 const struct gfs2_glock_operations *glops = gl->gl_ops;
909 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
910 LM_FLAG_NOEXP | LM_FLAG_ANY |
911 LM_FLAG_PRIORITY);
912 unsigned int lck_ret;
913
914 if (glops->go_xmote_th)
915 glops->go_xmote_th(gl);
916 if (state == LM_ST_DEFERRED && glops->go_inval)
917 glops->go_inval(gl, DIO_METADATA);
918
919 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
920 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
921 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
922 gfs2_assert_warn(sdp, state != gl->gl_state);
923
924 gfs2_glock_hold(gl);
925
926 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
927
928 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
929 return;
930
931 if (lck_ret & LM_OUT_ASYNC)
932 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
933 else
934 xmote_bh(gl, lck_ret);
935}
936
937static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
938 unsigned int cur_state)
939{
940 int ret = 0;
941 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
942 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
943 return ret;
944}
945
946/**
947 * gfs2_glock_drop_th - call into the lock module to unlock a lock
948 * @gl: the glock
949 *
950 */
951
952static void gfs2_glock_drop_th(struct gfs2_glock *gl)
953{
954 struct gfs2_sbd *sdp = gl->gl_sbd;
955 const struct gfs2_glock_operations *glops = gl->gl_ops;
956 unsigned int ret;
957
958 if (glops->go_xmote_th)
959 glops->go_xmote_th(gl);
960 if (glops->go_inval)
961 glops->go_inval(gl, DIO_METADATA);
962
963 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
964 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
965 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
966
967 gfs2_glock_hold(gl);
968
969 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
970
971 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
972 return;
973
974 if (!ret)
975 drop_bh(gl, ret);
976 else
977 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
978}
979
980/**
981 * do_cancels - cancel requests for locks stuck waiting on an expire flag
982 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
983 *
984 * Don't cancel GL_NOCANCEL requests.
985 */
986
987static void do_cancels(struct gfs2_holder *gh)
988{
989 struct gfs2_glock *gl = gh->gh_gl;
990 struct gfs2_sbd *sdp = gl->gl_sbd;
991
992 spin_lock(&gl->gl_spin);
993
994 while (gl->gl_req_gh != gh &&
995 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
996 !list_empty(&gh->gh_list)) {
997 if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
998 spin_unlock(&gl->gl_spin);
999 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1000 sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
1001 msleep(100);
1002 spin_lock(&gl->gl_spin);
1003 } else {
1004 spin_unlock(&gl->gl_spin);
1005 msleep(100);
1006 spin_lock(&gl->gl_spin);
1007 }
1008 }
1009
1010 spin_unlock(&gl->gl_spin);
1011}
1012
1013/**
1014 * glock_wait_internal - wait on a glock acquisition
1015 * @gh: the glock holder 842 * @gh: the glock holder
1016 * 843 *
1017 * Returns: 0 on success 844 * Returns: 0 on success
1018 */ 845 */
1019 846
1020static int glock_wait_internal(struct gfs2_holder *gh) 847int gfs2_glock_wait(struct gfs2_holder *gh)
1021{ 848{
1022 struct gfs2_glock *gl = gh->gh_gl;
1023 struct gfs2_sbd *sdp = gl->gl_sbd;
1024 const struct gfs2_glock_operations *glops = gl->gl_ops;
1025
1026 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1027 return -EIO;
1028
1029 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1030 spin_lock(&gl->gl_spin);
1031 if (gl->gl_req_gh != gh &&
1032 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1033 !list_empty(&gh->gh_list)) {
1034 list_del_init(&gh->gh_list);
1035 gh->gh_error = GLR_TRYFAILED;
1036 run_queue(gl);
1037 spin_unlock(&gl->gl_spin);
1038 return gh->gh_error;
1039 }
1040 spin_unlock(&gl->gl_spin);
1041 }
1042
1043 if (gh->gh_flags & LM_FLAG_PRIORITY)
1044 do_cancels(gh);
1045
1046 wait_on_holder(gh); 849 wait_on_holder(gh);
1047 if (gh->gh_error)
1048 return gh->gh_error;
1049
1050 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1051 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
1052 gh->gh_flags));
1053
1054 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1055 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1056
1057 if (glops->go_lock) {
1058 gh->gh_error = glops->go_lock(gh);
1059 if (gh->gh_error) {
1060 spin_lock(&gl->gl_spin);
1061 list_del_init(&gh->gh_list);
1062 spin_unlock(&gl->gl_spin);
1063 }
1064 }
1065
1066 spin_lock(&gl->gl_spin);
1067 gl->gl_req_gh = NULL;
1068 clear_bit(GLF_LOCK, &gl->gl_flags);
1069 run_queue(gl);
1070 spin_unlock(&gl->gl_spin);
1071 }
1072
1073 return gh->gh_error; 850 return gh->gh_error;
1074} 851}
1075 852
1076static inline struct gfs2_holder * 853void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
1077find_holder_by_owner(struct list_head *head, struct pid *pid)
1078{
1079 struct gfs2_holder *gh;
1080
1081 list_for_each_entry(gh, head, gh_list) {
1082 if (gh->gh_owner_pid == pid)
1083 return gh;
1084 }
1085
1086 return NULL;
1087}
1088
1089static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
1090{ 854{
1091 va_list args; 855 va_list args;
1092 856
1093 va_start(args, fmt); 857 va_start(args, fmt);
1094 if (gi) { 858 if (seq) {
859 struct gfs2_glock_iter *gi = seq->private;
1095 vsprintf(gi->string, fmt, args); 860 vsprintf(gi->string, fmt, args);
1096 seq_printf(gi->seq, gi->string); 861 seq_printf(seq, gi->string);
1097 } 862 } else {
1098 else 863 printk(KERN_ERR " ");
1099 vprintk(fmt, args); 864 vprintk(fmt, args);
865 }
1100 va_end(args); 866 va_end(args);
1101} 867}
1102 868
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
1104 * add_to_queue - Add a holder to the wait queue (but look for recursion) 870 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1105 * @gh: the holder structure to add 871 * @gh: the holder structure to add
1106 * 872 *
873 * Eventually we should move the recursive locking trap to a
874 * debugging option or something like that. This is the fast
875 * path and needs to have the minimum number of distractions.
876 *
1107 */ 877 */
1108 878
1109static void add_to_queue(struct gfs2_holder *gh) 879static inline void add_to_queue(struct gfs2_holder *gh)
1110{ 880{
1111 struct gfs2_glock *gl = gh->gh_gl; 881 struct gfs2_glock *gl = gh->gh_gl;
1112 struct gfs2_holder *existing; 882 struct gfs2_sbd *sdp = gl->gl_sbd;
883 struct list_head *insert_pt = NULL;
884 struct gfs2_holder *gh2;
885 int try_lock = 0;
1113 886
1114 BUG_ON(gh->gh_owner_pid == NULL); 887 BUG_ON(gh->gh_owner_pid == NULL);
1115 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 888 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1116 BUG(); 889 BUG();
1117 890
1118 if (!(gh->gh_flags & GL_FLOCK)) { 891 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1119 existing = find_holder_by_owner(&gl->gl_holders, 892 if (test_bit(GLF_LOCK, &gl->gl_flags))
1120 gh->gh_owner_pid); 893 try_lock = 1;
1121 if (existing) { 894 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
1122 print_symbol(KERN_WARNING "original: %s\n", 895 goto fail;
1123 existing->gh_ip); 896 }
1124 printk(KERN_INFO "pid : %d\n", 897
1125 pid_nr(existing->gh_owner_pid)); 898 list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
1126 printk(KERN_INFO "lock type : %d lock state : %d\n", 899 if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
1127 existing->gh_gl->gl_name.ln_type, 900 (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
1128 existing->gh_gl->gl_state); 901 goto trap_recursive;
1129 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 902 if (try_lock &&
1130 printk(KERN_INFO "pid : %d\n", 903 !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
1131 pid_nr(gh->gh_owner_pid)); 904 !may_grant(gl, gh)) {
1132 printk(KERN_INFO "lock type : %d lock state : %d\n", 905fail:
1133 gl->gl_name.ln_type, gl->gl_state); 906 gh->gh_error = GLR_TRYFAILED;
1134 BUG(); 907 gfs2_holder_wake(gh);
1135 } 908 return;
1136
1137 existing = find_holder_by_owner(&gl->gl_waiters3,
1138 gh->gh_owner_pid);
1139 if (existing) {
1140 print_symbol(KERN_WARNING "original: %s\n",
1141 existing->gh_ip);
1142 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1143 BUG();
1144 } 909 }
910 if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
911 continue;
912 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
913 insert_pt = &gh2->gh_list;
914 }
915 if (likely(insert_pt == NULL)) {
916 list_add_tail(&gh->gh_list, &gl->gl_holders);
917 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
918 goto do_cancel;
919 return;
920 }
921 list_add_tail(&gh->gh_list, insert_pt);
922do_cancel:
923 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
924 if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
925 spin_unlock(&gl->gl_spin);
926 if (sdp->sd_lockstruct.ls_ops->lm_cancel)
927 sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
928 spin_lock(&gl->gl_spin);
1145 } 929 }
930 return;
1146 931
1147 if (gh->gh_flags & LM_FLAG_PRIORITY) 932trap_recursive:
1148 list_add(&gh->gh_list, &gl->gl_waiters3); 933 print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
1149 else 934 printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
1150 list_add_tail(&gh->gh_list, &gl->gl_waiters3); 935 printk(KERN_ERR "lock type: %d req lock state : %d\n",
936 gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
937 print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
938 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
939 printk(KERN_ERR "lock type: %d req lock state : %d\n",
940 gh->gh_gl->gl_name.ln_type, gh->gh_state);
941 __dump_glock(NULL, gl);
942 BUG();
1151} 943}
1152 944
1153/** 945/**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1165 struct gfs2_sbd *sdp = gl->gl_sbd; 957 struct gfs2_sbd *sdp = gl->gl_sbd;
1166 int error = 0; 958 int error = 0;
1167 959
1168restart: 960 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1169 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1170 set_bit(HIF_ABORTED, &gh->gh_iflags);
1171 return -EIO; 961 return -EIO;
1172 }
1173 962
1174 spin_lock(&gl->gl_spin); 963 spin_lock(&gl->gl_spin);
1175 add_to_queue(gh); 964 add_to_queue(gh);
1176 run_queue(gl); 965 run_queue(gl, 1);
1177 spin_unlock(&gl->gl_spin); 966 spin_unlock(&gl->gl_spin);
1178 967
1179 if (!(gh->gh_flags & GL_ASYNC)) { 968 if (!(gh->gh_flags & GL_ASYNC))
1180 error = glock_wait_internal(gh); 969 error = gfs2_glock_wait(gh);
1181 if (error == GLR_CANCELED) {
1182 msleep(100);
1183 goto restart;
1184 }
1185 }
1186 970
1187 return error; 971 return error;
1188} 972}
@@ -1196,48 +980,7 @@ restart:
1196 980
1197int gfs2_glock_poll(struct gfs2_holder *gh) 981int gfs2_glock_poll(struct gfs2_holder *gh)
1198{ 982{
1199 struct gfs2_glock *gl = gh->gh_gl; 983 return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
1200 int ready = 0;
1201
1202 spin_lock(&gl->gl_spin);
1203
1204 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1205 ready = 1;
1206 else if (list_empty(&gh->gh_list)) {
1207 if (gh->gh_error == GLR_CANCELED) {
1208 spin_unlock(&gl->gl_spin);
1209 msleep(100);
1210 if (gfs2_glock_nq(gh))
1211 return 1;
1212 return 0;
1213 } else
1214 ready = 1;
1215 }
1216
1217 spin_unlock(&gl->gl_spin);
1218
1219 return ready;
1220}
1221
1222/**
1223 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1224 * @gh: the holder structure
1225 *
1226 * Returns: 0, GLR_TRYFAILED, or errno on failure
1227 */
1228
1229int gfs2_glock_wait(struct gfs2_holder *gh)
1230{
1231 int error;
1232
1233 error = glock_wait_internal(gh);
1234 if (error == GLR_CANCELED) {
1235 msleep(100);
1236 gh->gh_flags &= ~GL_ASYNC;
1237 error = gfs2_glock_nq(gh);
1238 }
1239
1240 return error;
1241} 984}
1242 985
1243/** 986/**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1251 struct gfs2_glock *gl = gh->gh_gl; 994 struct gfs2_glock *gl = gh->gh_gl;
1252 const struct gfs2_glock_operations *glops = gl->gl_ops; 995 const struct gfs2_glock_operations *glops = gl->gl_ops;
1253 unsigned delay = 0; 996 unsigned delay = 0;
997 int fast_path = 0;
1254 998
999 spin_lock(&gl->gl_spin);
1255 if (gh->gh_flags & GL_NOCACHE) 1000 if (gh->gh_flags & GL_NOCACHE)
1256 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1001 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1257 1002
1258 gfs2_glmutex_lock(gl);
1259
1260 spin_lock(&gl->gl_spin);
1261 list_del_init(&gh->gh_list); 1003 list_del_init(&gh->gh_list);
1262 1004 if (find_first_holder(gl) == NULL) {
1263 if (list_empty(&gl->gl_holders)) {
1264 if (glops->go_unlock) { 1005 if (glops->go_unlock) {
1006 GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
1265 spin_unlock(&gl->gl_spin); 1007 spin_unlock(&gl->gl_spin);
1266 glops->go_unlock(gh); 1008 glops->go_unlock(gh);
1267 spin_lock(&gl->gl_spin); 1009 spin_lock(&gl->gl_spin);
1010 clear_bit(GLF_LOCK, &gl->gl_flags);
1268 } 1011 }
1269 gl->gl_stamp = jiffies; 1012 gl->gl_stamp = jiffies;
1013 if (list_empty(&gl->gl_holders) &&
1014 !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1015 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1016 fast_path = 1;
1270 } 1017 }
1271
1272 clear_bit(GLF_LOCK, &gl->gl_flags);
1273 spin_unlock(&gl->gl_spin); 1018 spin_unlock(&gl->gl_spin);
1019 if (likely(fast_path))
1020 return;
1274 1021
1275 gfs2_glock_hold(gl); 1022 gfs2_glock_hold(gl);
1276 if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 1023 if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1454static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) 1201static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
1455{ 1202{
1456 int error = -EIO; 1203 int error = -EIO;
1204 if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
1205 return 0;
1457 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1206 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1458 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); 1207 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
1459 return error; 1208 return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
1469{ 1218{
1470 int error; 1219 int error;
1471 1220
1472 gfs2_glmutex_lock(gl);
1473
1474 if (!atomic_read(&gl->gl_lvb_count)) { 1221 if (!atomic_read(&gl->gl_lvb_count)) {
1475 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); 1222 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1476 if (error) { 1223 if (error)
1477 gfs2_glmutex_unlock(gl);
1478 return error; 1224 return error;
1479 }
1480 gfs2_glock_hold(gl); 1225 gfs2_glock_hold(gl);
1481 } 1226 }
1482 atomic_inc(&gl->gl_lvb_count); 1227 atomic_inc(&gl->gl_lvb_count);
1483 1228
1484 gfs2_glmutex_unlock(gl);
1485
1486 return 0; 1229 return 0;
1487} 1230}
1488 1231
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
1497 struct gfs2_sbd *sdp = gl->gl_sbd; 1240 struct gfs2_sbd *sdp = gl->gl_sbd;
1498 1241
1499 gfs2_glock_hold(gl); 1242 gfs2_glock_hold(gl);
1500 gfs2_glmutex_lock(gl);
1501
1502 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); 1243 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1503 if (atomic_dec_and_test(&gl->gl_lvb_count)) { 1244 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1504 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1245 if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
1505 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); 1246 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
1506 gl->gl_lvb = NULL; 1247 gl->gl_lvb = NULL;
1507 gfs2_glock_put(gl); 1248 gfs2_glock_put(gl);
1508 } 1249 }
1509
1510 gfs2_glmutex_unlock(gl);
1511 gfs2_glock_put(gl); 1250 gfs2_glock_put(gl);
1512} 1251}
1513 1252
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1527 if (time_before(now, holdtime)) 1266 if (time_before(now, holdtime))
1528 delay = holdtime - now; 1267 delay = holdtime - now;
1529 1268
1269 spin_lock(&gl->gl_spin);
1530 handle_callback(gl, state, 1, delay); 1270 handle_callback(gl, state, 1, delay);
1271 spin_unlock(&gl->gl_spin);
1531 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 1272 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1532 gfs2_glock_put(gl); 1273 gfs2_glock_put(gl);
1533} 1274}
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1568 gl = gfs2_glock_find(sdp, &async->lc_name); 1309 gl = gfs2_glock_find(sdp, &async->lc_name);
1569 if (gfs2_assert_warn(sdp, gl)) 1310 if (gfs2_assert_warn(sdp, gl))
1570 return; 1311 return;
1571 xmote_bh(gl, async->lc_ret); 1312 gl->gl_reply = async->lc_ret;
1313 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1572 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1314 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1573 gfs2_glock_put(gl); 1315 gfs2_glock_put(gl);
1574 up_read(&gfs2_umount_flush_sem); 1316 up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1581 wake_up_process(sdp->sd_recoverd_process); 1323 wake_up_process(sdp->sd_recoverd_process);
1582 return; 1324 return;
1583 1325
1584 case LM_CB_DROPLOCKS:
1585 gfs2_gl_hash_clear(sdp, NO_WAIT);
1586 gfs2_quota_scan(sdp);
1587 return;
1588
1589 default: 1326 default:
1590 gfs2_assert_warn(sdp, 0); 1327 gfs2_assert_warn(sdp, 0);
1591 return; 1328 return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1646void gfs2_reclaim_glock(struct gfs2_sbd *sdp) 1383void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1647{ 1384{
1648 struct gfs2_glock *gl; 1385 struct gfs2_glock *gl;
1386 int done_callback = 0;
1649 1387
1650 spin_lock(&sdp->sd_reclaim_lock); 1388 spin_lock(&sdp->sd_reclaim_lock);
1651 if (list_empty(&sdp->sd_reclaim_list)) { 1389 if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1660 atomic_dec(&sdp->sd_reclaim_count); 1398 atomic_dec(&sdp->sd_reclaim_count);
1661 atomic_inc(&sdp->sd_reclaimed); 1399 atomic_inc(&sdp->sd_reclaimed);
1662 1400
1663 if (gfs2_glmutex_trylock(gl)) { 1401 spin_lock(&gl->gl_spin);
1664 if (list_empty(&gl->gl_holders) && 1402 if (find_first_holder(gl) == NULL &&
1665 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1403 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
1666 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1404 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1667 gfs2_glmutex_unlock(gl); 1405 done_callback = 1;
1668 } 1406 }
1669 1407 spin_unlock(&gl->gl_spin);
1670 gfs2_glock_put(gl); 1408 if (!done_callback ||
1409 queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1410 gfs2_glock_put(gl);
1671} 1411}
1672 1412
1673/** 1413/**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
1724{ 1464{
1725 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) 1465 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
1726 return; 1466 return;
1467 if (test_bit(GLF_LOCK, &gl->gl_flags))
1468 return;
1727 1469
1728 if (gfs2_glmutex_trylock(gl)) { 1470 spin_lock(&gl->gl_spin);
1729 if (list_empty(&gl->gl_holders) && 1471 if (find_first_holder(gl) == NULL &&
1730 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1472 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1731 goto out_schedule; 1473 gfs2_glock_schedule_for_reclaim(gl);
1732 gfs2_glmutex_unlock(gl); 1474 spin_unlock(&gl->gl_spin);
1733 }
1734 return;
1735
1736out_schedule:
1737 gfs2_glmutex_unlock(gl);
1738 gfs2_glock_schedule_for_reclaim(gl);
1739} 1475}
1740 1476
1741/** 1477/**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
1760 spin_unlock(&sdp->sd_reclaim_lock); 1496 spin_unlock(&sdp->sd_reclaim_lock);
1761 } 1497 }
1762 1498
1763 if (gfs2_glmutex_trylock(gl)) { 1499 spin_lock(&gl->gl_spin);
1764 if (list_empty(&gl->gl_holders) && 1500 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
1765 gl->gl_state != LM_ST_UNLOCKED) 1501 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1766 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1502 spin_unlock(&gl->gl_spin);
1767 gfs2_glmutex_unlock(gl); 1503 gfs2_glock_hold(gl);
1768 } 1504 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1505 gfs2_glock_put(gl);
1769} 1506}
1770 1507
1771/** 1508/**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
1773 * @sdp: the filesystem 1510 * @sdp: the filesystem
1774 * @wait: wait until it's all gone 1511 * @wait: wait until it's all gone
1775 * 1512 *
1776 * Called when unmounting the filesystem, or when inter-node lock manager 1513 * Called when unmounting the filesystem.
1777 * requests DROPLOCKS because it is running out of capacity.
1778 */ 1514 */
1779 1515
1780void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) 1516void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1781{ 1517{
1782 unsigned long t; 1518 unsigned long t;
1783 unsigned int x; 1519 unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1792 cont = 1; 1528 cont = 1;
1793 } 1529 }
1794 1530
1795 if (!wait || !cont) 1531 if (!cont)
1796 break; 1532 break;
1797 1533
1798 if (time_after_eq(jiffies, 1534 if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1810 } 1546 }
1811} 1547}
1812 1548
1813/* 1549static const char *state2str(unsigned state)
1814 * Diagnostic routines to help debug distributed deadlock
1815 */
1816
1817static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
1818 unsigned long address)
1819{ 1550{
1820 char buffer[KSYM_SYMBOL_LEN]; 1551 switch(state) {
1821 1552 case LM_ST_UNLOCKED:
1822 sprint_symbol(buffer, address); 1553 return "UN";
1823 print_dbg(gi, fmt, buffer); 1554 case LM_ST_SHARED:
1555 return "SH";
1556 case LM_ST_DEFERRED:
1557 return "DF";
1558 case LM_ST_EXCLUSIVE:
1559 return "EX";
1560 }
1561 return "??";
1562}
1563
1564static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1565{
1566 char *p = buf;
1567 if (flags & LM_FLAG_TRY)
1568 *p++ = 't';
1569 if (flags & LM_FLAG_TRY_1CB)
1570 *p++ = 'T';
1571 if (flags & LM_FLAG_NOEXP)
1572 *p++ = 'e';
1573 if (flags & LM_FLAG_ANY)
1574 *p++ = 'a';
1575 if (flags & LM_FLAG_PRIORITY)
1576 *p++ = 'p';
1577 if (flags & GL_ASYNC)
1578 *p++ = 'a';
1579 if (flags & GL_EXACT)
1580 *p++ = 'E';
1581 if (flags & GL_ATIME)
1582 *p++ = 'a';
1583 if (flags & GL_NOCACHE)
1584 *p++ = 'c';
1585 if (test_bit(HIF_HOLDER, &iflags))
1586 *p++ = 'H';
1587 if (test_bit(HIF_WAIT, &iflags))
1588 *p++ = 'W';
1589 if (test_bit(HIF_FIRST, &iflags))
1590 *p++ = 'F';
1591 *p = 0;
1592 return buf;
1824} 1593}
1825 1594
1826/** 1595/**
1827 * dump_holder - print information about a glock holder 1596 * dump_holder - print information about a glock holder
1828 * @str: a string naming the type of holder 1597 * @seq: the seq_file struct
1829 * @gh: the glock holder 1598 * @gh: the glock holder
1830 * 1599 *
1831 * Returns: 0 on success, -ENOBUFS when we run out of space 1600 * Returns: 0 on success, -ENOBUFS when we run out of space
1832 */ 1601 */
1833 1602
1834static int dump_holder(struct glock_iter *gi, char *str, 1603static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1835 struct gfs2_holder *gh)
1836{ 1604{
1837 unsigned int x; 1605 struct task_struct *gh_owner = NULL;
1838 struct task_struct *gh_owner; 1606 char buffer[KSYM_SYMBOL_LEN];
1607 char flags_buf[32];
1839 1608
1840 print_dbg(gi, " %s\n", str); 1609 sprint_symbol(buffer, gh->gh_ip);
1841 if (gh->gh_owner_pid) { 1610 if (gh->gh_owner_pid)
1842 print_dbg(gi, " owner = %ld ",
1843 (long)pid_nr(gh->gh_owner_pid));
1844 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1611 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1845 if (gh_owner) 1612 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
1846 print_dbg(gi, "(%s)\n", gh_owner->comm); 1613 state2str(gh->gh_state),
1847 else 1614 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1848 print_dbg(gi, "(ended)\n"); 1615 gh->gh_error,
1849 } else 1616 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1850 print_dbg(gi, " owner = -1\n"); 1617 gh_owner ? gh_owner->comm : "(ended)", buffer);
1851 print_dbg(gi, " gh_state = %u\n", gh->gh_state);
1852 print_dbg(gi, " gh_flags =");
1853 for (x = 0; x < 32; x++)
1854 if (gh->gh_flags & (1 << x))
1855 print_dbg(gi, " %u", x);
1856 print_dbg(gi, " \n");
1857 print_dbg(gi, " error = %d\n", gh->gh_error);
1858 print_dbg(gi, " gh_iflags =");
1859 for (x = 0; x < 32; x++)
1860 if (test_bit(x, &gh->gh_iflags))
1861 print_dbg(gi, " %u", x);
1862 print_dbg(gi, " \n");
1863 gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
1864
1865 return 0; 1618 return 0;
1866} 1619}
1867 1620
1868/** 1621static const char *gflags2str(char *buf, const unsigned long *gflags)
1869 * dump_inode - print information about an inode 1622{
1870 * @ip: the inode 1623 char *p = buf;
1871 * 1624 if (test_bit(GLF_LOCK, gflags))
1872 * Returns: 0 on success, -ENOBUFS when we run out of space 1625 *p++ = 'l';
1873 */ 1626 if (test_bit(GLF_STICKY, gflags))
1874 1627 *p++ = 's';
1875static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) 1628 if (test_bit(GLF_DEMOTE, gflags))
1876{ 1629 *p++ = 'D';
1877 unsigned int x; 1630 if (test_bit(GLF_PENDING_DEMOTE, gflags))
1878 1631 *p++ = 'd';
1879 print_dbg(gi, " Inode:\n"); 1632 if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
1880 print_dbg(gi, " num = %llu/%llu\n", 1633 *p++ = 'p';
1881 (unsigned long long)ip->i_no_formal_ino, 1634 if (test_bit(GLF_DIRTY, gflags))
1882 (unsigned long long)ip->i_no_addr); 1635 *p++ = 'y';
1883 print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); 1636 if (test_bit(GLF_LFLUSH, gflags))
1884 print_dbg(gi, " i_flags ="); 1637 *p++ = 'f';
1885 for (x = 0; x < 32; x++) 1638 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
1886 if (test_bit(x, &ip->i_flags)) 1639 *p++ = 'i';
1887 print_dbg(gi, " %u", x); 1640 if (test_bit(GLF_REPLY_PENDING, gflags))
1888 print_dbg(gi, " \n"); 1641 *p++ = 'r';
1889 return 0; 1642 *p = 0;
1643 return buf;
1890} 1644}
1891 1645
1892/** 1646/**
1893 * dump_glock - print information about a glock 1647 * __dump_glock - print information about a glock
1648 * @seq: The seq_file struct
1894 * @gl: the glock 1649 * @gl: the glock
1895 * @count: where we are in the buffer 1650 *
1651 * The file format is as follows:
1652 * One line per object, capital letters are used to indicate objects
1653 * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
1654 * other objects are indented by a single space and follow the glock to
1655 * which they are related. Fields are indicated by lower case letters
1656 * followed by a colon and the field value, except for strings which are in
1657 * [] so that its possible to see if they are composed of spaces for
1658 * example. The field's are n = number (id of the object), f = flags,
1659 * t = type, s = state, r = refcount, e = error, p = pid.
1896 * 1660 *
1897 * Returns: 0 on success, -ENOBUFS when we run out of space 1661 * Returns: 0 on success, -ENOBUFS when we run out of space
1898 */ 1662 */
1899 1663
1900static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) 1664static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1901{ 1665{
1902 struct gfs2_holder *gh; 1666 const struct gfs2_glock_operations *glops = gl->gl_ops;
1903 unsigned int x; 1667 unsigned long long dtime;
1904 int error = -ENOBUFS; 1668 const struct gfs2_holder *gh;
1905 struct task_struct *gl_owner; 1669 char gflags_buf[32];
1670 int error = 0;
1906 1671
1907 spin_lock(&gl->gl_spin); 1672 dtime = jiffies - gl->gl_demote_time;
1673 dtime *= 1000000/HZ; /* demote time in uSec */
1674 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1675 dtime = 0;
1676 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
1677 state2str(gl->gl_state),
1678 gl->gl_name.ln_type,
1679 (unsigned long long)gl->gl_name.ln_number,
1680 gflags2str(gflags_buf, &gl->gl_flags),
1681 state2str(gl->gl_target),
1682 state2str(gl->gl_demote_state), dtime,
1683 atomic_read(&gl->gl_lvb_count),
1684 atomic_read(&gl->gl_ail_count),
1685 atomic_read(&gl->gl_ref));
1908 1686
1909 print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
1910 (unsigned long long)gl->gl_name.ln_number);
1911 print_dbg(gi, " gl_flags =");
1912 for (x = 0; x < 32; x++) {
1913 if (test_bit(x, &gl->gl_flags))
1914 print_dbg(gi, " %u", x);
1915 }
1916 if (!test_bit(GLF_LOCK, &gl->gl_flags))
1917 print_dbg(gi, " (unlocked)");
1918 print_dbg(gi, " \n");
1919 print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
1920 print_dbg(gi, " gl_state = %u\n", gl->gl_state);
1921 if (gl->gl_owner_pid) {
1922 gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
1923 if (gl_owner)
1924 print_dbg(gi, " gl_owner = pid %d (%s)\n",
1925 pid_nr(gl->gl_owner_pid), gl_owner->comm);
1926 else
1927 print_dbg(gi, " gl_owner = %d (ended)\n",
1928 pid_nr(gl->gl_owner_pid));
1929 } else
1930 print_dbg(gi, " gl_owner = -1\n");
1931 print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
1932 print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
1933 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1934 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1935 print_dbg(gi, " reclaim = %s\n",
1936 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
1937 if (gl->gl_aspace)
1938 print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
1939 gl->gl_aspace->i_mapping->nrpages);
1940 else
1941 print_dbg(gi, " aspace = no\n");
1942 print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
1943 if (gl->gl_req_gh) {
1944 error = dump_holder(gi, "Request", gl->gl_req_gh);
1945 if (error)
1946 goto out;
1947 }
1948 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1687 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1949 error = dump_holder(gi, "Holder", gh); 1688 error = dump_holder(seq, gh);
1950 if (error) 1689 if (error)
1951 goto out; 1690 goto out;
1952 } 1691 }
1953 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { 1692 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1954 error = dump_holder(gi, "Waiter1", gh); 1693 error = glops->go_dump(seq, gl);
1955 if (error)
1956 goto out;
1957 }
1958 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
1959 error = dump_holder(gi, "Waiter3", gh);
1960 if (error)
1961 goto out;
1962 }
1963 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
1964 print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
1965 gl->gl_demote_state, (unsigned long long)
1966 (jiffies - gl->gl_demote_time)*(1000000/HZ));
1967 }
1968 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
1969 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
1970 list_empty(&gl->gl_holders)) {
1971 error = dump_inode(gi, gl->gl_object);
1972 if (error)
1973 goto out;
1974 } else {
1975 error = -ENOBUFS;
1976 print_dbg(gi, " Inode: busy\n");
1977 }
1978 }
1979
1980 error = 0;
1981
1982out: 1694out:
1983 spin_unlock(&gl->gl_spin);
1984 return error; 1695 return error;
1985} 1696}
1986 1697
1698static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1699{
1700 int ret;
1701 spin_lock(&gl->gl_spin);
1702 ret = __dump_glock(seq, gl);
1703 spin_unlock(&gl->gl_spin);
1704 return ret;
1705}
1706
1987/** 1707/**
1988 * gfs2_dump_lockstate - print out the current lockstate 1708 * gfs2_dump_lockstate - print out the current lockstate
1989 * @sdp: the filesystem 1709 * @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
2086module_param(scand_secs, uint, S_IRUGO|S_IWUSR); 1806module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
2087MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); 1807MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
2088 1808
2089static int gfs2_glock_iter_next(struct glock_iter *gi) 1809static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
2090{ 1810{
2091 struct gfs2_glock *gl; 1811 struct gfs2_glock *gl;
2092 1812
@@ -2104,7 +1824,7 @@ restart:
2104 gfs2_glock_put(gl); 1824 gfs2_glock_put(gl);
2105 if (gl && gi->gl == NULL) 1825 if (gl && gi->gl == NULL)
2106 gi->hash++; 1826 gi->hash++;
2107 while(gi->gl == NULL) { 1827 while (gi->gl == NULL) {
2108 if (gi->hash >= GFS2_GL_HASH_SIZE) 1828 if (gi->hash >= GFS2_GL_HASH_SIZE)
2109 return 1; 1829 return 1;
2110 read_lock(gl_lock_addr(gi->hash)); 1830 read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
2122 return 0; 1842 return 0;
2123} 1843}
2124 1844
2125static void gfs2_glock_iter_free(struct glock_iter *gi) 1845static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
2126{ 1846{
2127 if (gi->gl) 1847 if (gi->gl)
2128 gfs2_glock_put(gi->gl); 1848 gfs2_glock_put(gi->gl);
2129 kfree(gi);
2130}
2131
2132static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
2133{
2134 struct glock_iter *gi;
2135
2136 gi = kmalloc(sizeof (*gi), GFP_KERNEL);
2137 if (!gi)
2138 return NULL;
2139
2140 gi->sdp = sdp;
2141 gi->hash = 0;
2142 gi->seq = NULL;
2143 gi->gl = NULL; 1849 gi->gl = NULL;
2144 memset(gi->string, 0, sizeof(gi->string));
2145
2146 if (gfs2_glock_iter_next(gi)) {
2147 gfs2_glock_iter_free(gi);
2148 return NULL;
2149 }
2150
2151 return gi;
2152} 1850}
2153 1851
2154static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos) 1852static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
2155{ 1853{
2156 struct glock_iter *gi; 1854 struct gfs2_glock_iter *gi = seq->private;
2157 loff_t n = *pos; 1855 loff_t n = *pos;
2158 1856
2159 gi = gfs2_glock_iter_init(file->private); 1857 gi->hash = 0;
2160 if (!gi)
2161 return NULL;
2162 1858
2163 while(n--) { 1859 do {
2164 if (gfs2_glock_iter_next(gi)) { 1860 if (gfs2_glock_iter_next(gi)) {
2165 gfs2_glock_iter_free(gi); 1861 gfs2_glock_iter_free(gi);
2166 return NULL; 1862 return NULL;
2167 } 1863 }
2168 } 1864 } while (n--);
2169 1865
2170 return gi; 1866 return gi->gl;
2171} 1867}
2172 1868
2173static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, 1869static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
2174 loff_t *pos) 1870 loff_t *pos)
2175{ 1871{
2176 struct glock_iter *gi = iter_ptr; 1872 struct gfs2_glock_iter *gi = seq->private;
2177 1873
2178 (*pos)++; 1874 (*pos)++;
2179 1875
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
2182 return NULL; 1878 return NULL;
2183 } 1879 }
2184 1880
2185 return gi; 1881 return gi->gl;
2186} 1882}
2187 1883
2188static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) 1884static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
2189{ 1885{
2190 struct glock_iter *gi = iter_ptr; 1886 struct gfs2_glock_iter *gi = seq->private;
2191 if (gi) 1887 gfs2_glock_iter_free(gi);
2192 gfs2_glock_iter_free(gi);
2193} 1888}
2194 1889
2195static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) 1890static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
2196{ 1891{
2197 struct glock_iter *gi = iter_ptr; 1892 return dump_glock(seq, iter_ptr);
2198
2199 gi->seq = file;
2200 dump_glock(gi, gi->gl);
2201
2202 return 0;
2203} 1893}
2204 1894
2205static const struct seq_operations gfs2_glock_seq_ops = { 1895static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
2211 1901
2212static int gfs2_debugfs_open(struct inode *inode, struct file *file) 1902static int gfs2_debugfs_open(struct inode *inode, struct file *file)
2213{ 1903{
2214 struct seq_file *seq; 1904 int ret = seq_open_private(file, &gfs2_glock_seq_ops,
2215 int ret; 1905 sizeof(struct gfs2_glock_iter));
2216 1906 if (ret == 0) {
2217 ret = seq_open(file, &gfs2_glock_seq_ops); 1907 struct seq_file *seq = file->private_data;
2218 if (ret) 1908 struct gfs2_glock_iter *gi = seq->private;
2219 return ret; 1909 gi->sdp = inode->i_private;
2220 1910 }
2221 seq = file->private_data; 1911 return ret;
2222 seq->private = inode->i_private;
2223
2224 return 0;
2225} 1912}
2226 1913
2227static const struct file_operations gfs2_debug_fops = { 1914static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
2229 .open = gfs2_debugfs_open, 1916 .open = gfs2_debugfs_open,
2230 .read = seq_read, 1917 .read = seq_read,
2231 .llseek = seq_lseek, 1918 .llseek = seq_lseek,
2232 .release = seq_release 1919 .release = seq_release_private,
2233}; 1920};
2234 1921
2235int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) 1922int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f8150..971d92af70fc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200 27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 28#define GL_NOCACHE 0x00000400
29#define GL_FLOCK 0x00000800
30#define GL_NOCANCEL 0x00001000
31 29
32#define GLR_TRYFAILED 13 30#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34 31
35static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 32static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{ 33{
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
41 spin_lock(&gl->gl_spin); 38 spin_lock(&gl->gl_spin);
42 pid = task_pid(current); 39 pid = task_pid(current);
43 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 40 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
41 if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
42 break;
44 if (gh->gh_owner_pid == pid) 43 if (gh->gh_owner_pid == pid)
45 goto out; 44 goto out;
46 } 45 }
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
70{ 69{
71 int ret; 70 int ret;
72 spin_lock(&gl->gl_spin); 71 spin_lock(&gl->gl_spin);
73 ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3); 72 ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
74 spin_unlock(&gl->gl_spin); 73 spin_unlock(&gl->gl_spin);
75 return ret; 74 return ret;
76} 75}
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
98int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 97int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
99void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 98void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
100void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 99void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
100void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
101 101
102/** 102/**
103 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 103 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
130void gfs2_lvb_unhold(struct gfs2_glock *gl); 130void gfs2_lvb_unhold(struct gfs2_glock *gl);
131 131
132void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 132void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
133
134void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); 133void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
135void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 134void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
136void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); 135void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
137 136
138int __init gfs2_glock_init(void); 137int __init gfs2_glock_init(void);
139void gfs2_glock_exit(void); 138void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda4..c6c318c2a0f6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h> 15#include <linux/lm_interface.h>
16#include <linux/bio.h>
16 17
17#include "gfs2.h" 18#include "gfs2.h"
18#include "incore.h" 19#include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
172} 173}
173 174
174/** 175/**
175 * inode_go_xmote_bh - After promoting/demoting a glock
176 * @gl: the glock
177 *
178 */
179
180static void inode_go_xmote_bh(struct gfs2_glock *gl)
181{
182 struct gfs2_holder *gh = gl->gl_req_gh;
183 struct buffer_head *bh;
184 int error;
185
186 if (gl->gl_state != LM_ST_UNLOCKED &&
187 (!gh || !(gh->gh_flags & GL_SKIP))) {
188 error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
189 if (!error)
190 brelse(bh);
191 }
192}
193
194/**
195 * inode_go_inval - prepare a inode glock to be released 176 * inode_go_inval - prepare a inode glock to be released
196 * @gl: the glock 177 * @gl: the glock
197 * @flags: 178 * @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
267} 248}
268 249
269/** 250/**
251 * inode_go_dump - print information about an inode
252 * @seq: The iterator
253 * @ip: the inode
254 *
255 * Returns: 0 on success, -ENOBUFS when we run out of space
256 */
257
258static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
259{
260 const struct gfs2_inode *ip = gl->gl_object;
261 if (ip == NULL)
262 return 0;
263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
264 (unsigned long long)ip->i_no_formal_ino,
265 (unsigned long long)ip->i_no_addr,
266 IF2DT(ip->i_inode.i_mode), ip->i_flags);
267 return 0;
268}
269
270/**
270 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock 271 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
271 * @gl: the glock 272 * @gl: the glock
272 * 273 *
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
306} 307}
307 308
308/** 309/**
310 * rgrp_go_dump - print out an rgrp
311 * @seq: The iterator
312 * @gl: The glock in question
313 *
314 */
315
316static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
317{
318 const struct gfs2_rgrpd *rgd = gl->gl_object;
319 if (rgd == NULL)
320 return 0;
321 gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
322 return 0;
323}
324
325/**
309 * trans_go_sync - promote/demote the transaction glock 326 * trans_go_sync - promote/demote the transaction glock
310 * @gl: the glock 327 * @gl: the glock
311 * @state: the requested state 328 * @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
330 * 347 *
331 */ 348 */
332 349
333static void trans_go_xmote_bh(struct gfs2_glock *gl) 350static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
334{ 351{
335 struct gfs2_sbd *sdp = gl->gl_sbd; 352 struct gfs2_sbd *sdp = gl->gl_sbd;
336 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); 353 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
338 struct gfs2_log_header_host head; 355 struct gfs2_log_header_host head;
339 int error; 356 int error;
340 357
341 if (gl->gl_state != LM_ST_UNLOCKED && 358 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
342 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
343 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 359 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
344 360
345 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 361 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
354 gfs2_log_pointers_init(sdp, head.lh_blkno); 370 gfs2_log_pointers_init(sdp, head.lh_blkno);
355 } 371 }
356 } 372 }
373 return 0;
357} 374}
358 375
359/** 376/**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
375 392
376const struct gfs2_glock_operations gfs2_inode_glops = { 393const struct gfs2_glock_operations gfs2_inode_glops = {
377 .go_xmote_th = inode_go_sync, 394 .go_xmote_th = inode_go_sync,
378 .go_xmote_bh = inode_go_xmote_bh,
379 .go_inval = inode_go_inval, 395 .go_inval = inode_go_inval,
380 .go_demote_ok = inode_go_demote_ok, 396 .go_demote_ok = inode_go_demote_ok,
381 .go_lock = inode_go_lock, 397 .go_lock = inode_go_lock,
398 .go_dump = inode_go_dump,
382 .go_type = LM_TYPE_INODE, 399 .go_type = LM_TYPE_INODE,
383 .go_min_hold_time = HZ / 10, 400 .go_min_hold_time = HZ / 5,
384}; 401};
385 402
386const struct gfs2_glock_operations gfs2_rgrp_glops = { 403const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
389 .go_demote_ok = rgrp_go_demote_ok, 406 .go_demote_ok = rgrp_go_demote_ok,
390 .go_lock = rgrp_go_lock, 407 .go_lock = rgrp_go_lock,
391 .go_unlock = rgrp_go_unlock, 408 .go_unlock = rgrp_go_unlock,
409 .go_dump = rgrp_go_dump,
392 .go_type = LM_TYPE_RGRP, 410 .go_type = LM_TYPE_RGRP,
393 .go_min_hold_time = HZ / 10, 411 .go_min_hold_time = HZ / 5,
394}; 412};
395 413
396const struct gfs2_glock_operations gfs2_trans_glops = { 414const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41da..448697a5c462 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
77struct gfs2_rgrpd { 77struct gfs2_rgrpd {
78 struct list_head rd_list; /* Link with superblock */ 78 struct list_head rd_list; /* Link with superblock */
79 struct list_head rd_list_mru; 79 struct list_head rd_list_mru;
80 struct list_head rd_recent; /* Recently used rgrps */
81 struct gfs2_glock *rd_gl; /* Glock for this rgrp */ 80 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
82 u64 rd_addr; /* grp block disk address */ 81 u64 rd_addr; /* grp block disk address */
83 u64 rd_data0; /* first data location */ 82 u64 rd_data0; /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
128 127
129struct gfs2_glock_operations { 128struct gfs2_glock_operations {
130 void (*go_xmote_th) (struct gfs2_glock *gl); 129 void (*go_xmote_th) (struct gfs2_glock *gl);
131 void (*go_xmote_bh) (struct gfs2_glock *gl); 130 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
132 void (*go_inval) (struct gfs2_glock *gl, int flags); 131 void (*go_inval) (struct gfs2_glock *gl, int flags);
133 int (*go_demote_ok) (struct gfs2_glock *gl); 132 int (*go_demote_ok) (struct gfs2_glock *gl);
134 int (*go_lock) (struct gfs2_holder *gh); 133 int (*go_lock) (struct gfs2_holder *gh);
135 void (*go_unlock) (struct gfs2_holder *gh); 134 void (*go_unlock) (struct gfs2_holder *gh);
135 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
136 const int go_type; 136 const int go_type;
137 const unsigned long go_min_hold_time; 137 const unsigned long go_min_hold_time;
138}; 138};
139 139
140enum { 140enum {
141 /* States */ 141 /* States */
142 HIF_HOLDER = 6, 142 HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
143 HIF_FIRST = 7, 143 HIF_FIRST = 7,
144 HIF_ABORTED = 9,
145 HIF_WAIT = 10, 144 HIF_WAIT = 10,
146}; 145};
147 146
@@ -154,20 +153,20 @@ struct gfs2_holder {
154 unsigned gh_flags; 153 unsigned gh_flags;
155 154
156 int gh_error; 155 int gh_error;
157 unsigned long gh_iflags; 156 unsigned long gh_iflags; /* HIF_... */
158 unsigned long gh_ip; 157 unsigned long gh_ip;
159}; 158};
160 159
161enum { 160enum {
162 GLF_LOCK = 1, 161 GLF_LOCK = 1,
163 GLF_STICKY = 2, 162 GLF_STICKY = 2,
164 GLF_DEMOTE = 3, 163 GLF_DEMOTE = 3,
165 GLF_PENDING_DEMOTE = 4, 164 GLF_PENDING_DEMOTE = 4,
166 GLF_DIRTY = 5, 165 GLF_DEMOTE_IN_PROGRESS = 5,
167 GLF_DEMOTE_IN_PROGRESS = 6, 166 GLF_DIRTY = 6,
168 GLF_LFLUSH = 7, 167 GLF_LFLUSH = 7,
169 GLF_WAITERS2 = 8, 168 GLF_INVALIDATE_IN_PROGRESS = 8,
170 GLF_CONV_DEADLK = 9, 169 GLF_REPLY_PENDING = 9,
171}; 170};
172 171
173struct gfs2_glock { 172struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
179 spinlock_t gl_spin; 178 spinlock_t gl_spin;
180 179
181 unsigned int gl_state; 180 unsigned int gl_state;
181 unsigned int gl_target;
182 unsigned int gl_reply;
182 unsigned int gl_hash; 183 unsigned int gl_hash;
183 unsigned int gl_demote_state; /* state requested by remote node */ 184 unsigned int gl_demote_state; /* state requested by remote node */
184 unsigned long gl_demote_time; /* time of first demote request */ 185 unsigned long gl_demote_time; /* time of first demote request */
185 struct pid *gl_owner_pid;
186 unsigned long gl_ip;
187 struct list_head gl_holders; 186 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters3; /* HIF_PROMOTE */
190 187
191 const struct gfs2_glock_operations *gl_ops; 188 const struct gfs2_glock_operations *gl_ops;
192
193 struct gfs2_holder *gl_req_gh;
194
195 void *gl_lock; 189 void *gl_lock;
196 char *gl_lvb; 190 char *gl_lvb;
197 atomic_t gl_lvb_count; 191 atomic_t gl_lvb_count;
@@ -427,7 +421,6 @@ struct gfs2_tune {
427 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 421 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
428 unsigned int gt_atime_quantum; /* Min secs between atime updates */ 422 unsigned int gt_atime_quantum; /* Min secs between atime updates */
429 unsigned int gt_new_files_jdata; 423 unsigned int gt_new_files_jdata;
430 unsigned int gt_new_files_directio;
431 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
432 unsigned int gt_stall_secs; /* Detects trouble! */ 425 unsigned int gt_stall_secs; /* Detects trouble! */
433 unsigned int gt_complain_secs; 426 unsigned int gt_complain_secs;
@@ -534,7 +527,6 @@ struct gfs2_sbd {
534 struct mutex sd_rindex_mutex; 527 struct mutex sd_rindex_mutex;
535 struct list_head sd_rindex_list; 528 struct list_head sd_rindex_list;
536 struct list_head sd_rindex_mru_list; 529 struct list_head sd_rindex_mru_list;
537 struct list_head sd_rindex_recent_list;
538 struct gfs2_rgrpd *sd_rindex_forward; 530 struct gfs2_rgrpd *sd_rindex_forward;
539 unsigned int sd_rgrps; 531 unsigned int sd_rgrps;
540 532
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e41..6da0ab355b8a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
504 } 504 }
505 505
506 if (!is_root) { 506 if (!is_root) {
507 error = permission(dir, MAY_EXEC, NULL); 507 error = gfs2_permission(dir, MAY_EXEC);
508 if (error) 508 if (error)
509 goto out; 509 goto out;
510 } 510 }
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
667{ 667{
668 int error; 668 int error;
669 669
670 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); 670 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
671 if (error) 671 if (error)
672 return error; 672 return error;
673 673
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
789 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || 789 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
790 gfs2_tune_get(sdp, gt_new_files_jdata)) 790 gfs2_tune_get(sdp, gt_new_files_jdata))
791 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); 791 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
792 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
793 gfs2_tune_get(sdp, gt_new_files_directio))
794 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
795 } else if (S_ISDIR(mode)) { 792 } else if (S_ISDIR(mode)) {
796 di->di_flags |= cpu_to_be32(dip->i_di.di_flags & 793 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
797 GFS2_DIF_INHERIT_DIRECTIO);
798 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
799 GFS2_DIF_INHERIT_JDATA); 794 GFS2_DIF_INHERIT_JDATA);
800 } 795 }
801 796
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1134 if (IS_APPEND(&dip->i_inode)) 1129 if (IS_APPEND(&dip->i_inode))
1135 return -EPERM; 1130 return -EPERM;
1136 1131
1137 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); 1132 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
1138 if (error) 1133 if (error)
1139 return error; 1134 return error;
1140 1135
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38f..6074c2506f75 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
72} 72}
73 73
74 74
75void gfs2_inode_attr_in(struct gfs2_inode *ip);
76void gfs2_set_iop(struct inode *inode); 75void gfs2_set_iop(struct inode *inode);
77struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 76struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
78 u64 no_addr, u64 no_formal_ino, 77 u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
91 struct gfs2_inode *ip); 90 struct gfs2_inode *ip);
92int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
93 const struct gfs2_inode *ip); 92 const struct gfs2_inode *ip);
93int gfs2_permission(struct inode *inode, int mask);
94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); 94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
96int gfs2_glock_nq_atime(struct gfs2_holder *gh); 96int gfs2_glock_nq_atime(struct gfs2_holder *gh);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee728783..523243a13a21 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
23 const struct lm_lockops *lw_ops; 23 const struct lm_lockops *lw_ops;
24}; 24};
25 25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, void *cb_data,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj);
31
26/* List of registered low-level locking protocols. A file system selects one 32/* List of registered low-level locking protocols. A file system selects one
27 of them by name at mount time, e.g. lock_nolock, lock_dlm. */ 33 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
28 34
35static const struct lm_lockops nolock_ops = {
36 .lm_proto_name = "lock_nolock",
37 .lm_mount = nolock_mount,
38};
39
40static struct lmh_wrapper nolock_proto = {
41 .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
42 .lw_ops = &nolock_ops,
43};
44
29static LIST_HEAD(lmh_list); 45static LIST_HEAD(lmh_list);
30static DEFINE_MUTEX(lmh_lock); 46static DEFINE_MUTEX(lmh_lock);
31 47
48static int nolock_mount(char *table_name, char *host_data,
49 lm_callback_t cb, void *cb_data,
50 unsigned int min_lvb_size, int flags,
51 struct lm_lockstruct *lockstruct,
52 struct kobject *fskobj)
53{
54 char *c;
55 unsigned int jid;
56
57 c = strstr(host_data, "jid=");
58 if (!c)
59 jid = 0;
60 else {
61 c += 4;
62 sscanf(c, "%u", &jid);
63 }
64
65 lockstruct->ls_jid = jid;
66 lockstruct->ls_first = 1;
67 lockstruct->ls_lvb_size = min_lvb_size;
68 lockstruct->ls_ops = &nolock_ops;
69 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
70
71 return 0;
72}
73
32/** 74/**
33 * gfs2_register_lockproto - Register a low-level locking protocol 75 * gfs2_register_lockproto - Register a low-level locking protocol
34 * @proto: the protocol definition 76 * @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
116 int try = 0; 158 int try = 0;
117 int error, found; 159 int error, found;
118 160
161
119retry: 162retry:
120 mutex_lock(&lmh_lock); 163 mutex_lock(&lmh_lock);
121 164
165 if (list_empty(&nolock_proto.lw_list))
166 list_add(&nolock_proto.lw_list, &lmh_list);
167
122 found = 0; 168 found = 0;
123 list_for_each_entry(lw, &lmh_list, lw_list) { 169 list_for_each_entry(lw, &lmh_list, lw_list) {
124 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { 170 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
139 goto out; 185 goto out;
140 } 186 }
141 187
142 if (!try_module_get(lw->lw_ops->lm_owner)) { 188 if (lw->lw_ops->lm_owner &&
189 !try_module_get(lw->lw_ops->lm_owner)) {
143 try = 0; 190 try = 0;
144 mutex_unlock(&lmh_lock); 191 mutex_unlock(&lmh_lock);
145 msleep(1000); 192 msleep(1000);
@@ -158,7 +205,8 @@ out:
158void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) 205void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
159{ 206{
160 mutex_lock(&lmh_lock); 207 mutex_lock(&lmh_lock);
161 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); 208 if (lockstruct->ls_ops->lm_unmount)
209 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
162 if (lockstruct->ls_ops->lm_owner) 210 if (lockstruct->ls_ops->lm_owner)
163 module_put(lockstruct->ls_ops->lm_owner); 211 module_put(lockstruct->ls_ops->lm_owner);
164 mutex_unlock(&lmh_lock); 212 mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec87..2482c9047505 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
11 11
12static char junk_lvb[GDLM_LVB_SIZE]; 12static char junk_lvb[GDLM_LVB_SIZE];
13 13
14static void queue_complete(struct gdlm_lock *lp) 14
15/* convert dlm lock-mode to gfs lock-state */
16
17static s16 gdlm_make_lmstate(s16 dlmmode)
15{ 18{
16 struct gdlm_ls *ls = lp->ls; 19 switch (dlmmode) {
20 case DLM_LOCK_IV:
21 case DLM_LOCK_NL:
22 return LM_ST_UNLOCKED;
23 case DLM_LOCK_EX:
24 return LM_ST_EXCLUSIVE;
25 case DLM_LOCK_CW:
26 return LM_ST_DEFERRED;
27 case DLM_LOCK_PR:
28 return LM_ST_SHARED;
29 }
30 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
31 return -1;
32}
17 33
18 clear_bit(LFL_ACTIVE, &lp->flags); 34/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
35 thread gets to it. */
36
37static void queue_submit(struct gdlm_lock *lp)
38{
39 struct gdlm_ls *ls = lp->ls;
19 40
20 spin_lock(&ls->async_lock); 41 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete); 42 list_add_tail(&lp->delay_list, &ls->submit);
22 spin_unlock(&ls->async_lock); 43 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait); 44 wake_up(&ls->thread_wait);
24} 45}
25 46
26static inline void gdlm_ast(void *astarg) 47static void wake_up_ast(struct gdlm_lock *lp)
27{ 48{
28 queue_complete(astarg); 49 clear_bit(LFL_AST_WAIT, &lp->flags);
50 smp_mb__after_clear_bit();
51 wake_up_bit(&lp->flags, LFL_AST_WAIT);
29} 52}
30 53
31static inline void gdlm_bast(void *astarg, int mode) 54static void gdlm_delete_lp(struct gdlm_lock *lp)
32{ 55{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls; 56 struct gdlm_ls *ls = lp->ls;
35 57
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock); 58 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) { 59 if (!list_empty(&lp->delay_list))
45 list_add_tail(&lp->blist, &ls->blocking); 60 list_del_init(&lp->delay_list);
46 lp->bast_mode = mode; 61 ls->all_locks_count--;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock); 62 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait); 63
64 kfree(lp);
51} 65}
52 66
53void gdlm_queue_delayed(struct gdlm_lock *lp) 67static void gdlm_queue_delayed(struct gdlm_lock *lp)
54{ 68{
55 struct gdlm_ls *ls = lp->ls; 69 struct gdlm_ls *ls = lp->ls;
56 70
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
59 spin_unlock(&ls->async_lock); 73 spin_unlock(&ls->async_lock);
60} 74}
61 75
76static void process_complete(struct gdlm_lock *lp)
77{
78 struct gdlm_ls *ls = lp->ls;
79 struct lm_async_cb acb;
80
81 memset(&acb, 0, sizeof(acb));
82
83 if (lp->lksb.sb_status == -DLM_ECANCEL) {
84 log_info("complete dlm cancel %x,%llx flags %lx",
85 lp->lockname.ln_type,
86 (unsigned long long)lp->lockname.ln_number,
87 lp->flags);
88
89 lp->req = lp->cur;
90 acb.lc_ret |= LM_OUT_CANCELED;
91 if (lp->cur == DLM_LOCK_IV)
92 lp->lksb.sb_lkid = 0;
93 goto out;
94 }
95
96 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
97 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
98 log_info("unlock sb_status %d %x,%llx flags %lx",
99 lp->lksb.sb_status, lp->lockname.ln_type,
100 (unsigned long long)lp->lockname.ln_number,
101 lp->flags);
102 return;
103 }
104
105 lp->cur = DLM_LOCK_IV;
106 lp->req = DLM_LOCK_IV;
107 lp->lksb.sb_lkid = 0;
108
109 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
110 gdlm_delete_lp(lp);
111 return;
112 }
113 goto out;
114 }
115
116 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
117 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
118
119 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
120 if (lp->req == DLM_LOCK_PR)
121 lp->req = DLM_LOCK_CW;
122 else if (lp->req == DLM_LOCK_CW)
123 lp->req = DLM_LOCK_PR;
124 }
125
126 /*
127 * A canceled lock request. The lock was just taken off the delayed
128 * list and was never even submitted to dlm.
129 */
130
131 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
132 log_info("complete internal cancel %x,%llx",
133 lp->lockname.ln_type,
134 (unsigned long long)lp->lockname.ln_number);
135 lp->req = lp->cur;
136 acb.lc_ret |= LM_OUT_CANCELED;
137 goto out;
138 }
139
140 /*
141 * An error occurred.
142 */
143
144 if (lp->lksb.sb_status) {
145 /* a "normal" error */
146 if ((lp->lksb.sb_status == -EAGAIN) &&
147 (lp->lkf & DLM_LKF_NOQUEUE)) {
148 lp->req = lp->cur;
149 if (lp->cur == DLM_LOCK_IV)
150 lp->lksb.sb_lkid = 0;
151 goto out;
152 }
153
154 /* this could only happen with cancels I think */
155 log_info("ast sb_status %d %x,%llx flags %lx",
156 lp->lksb.sb_status, lp->lockname.ln_type,
157 (unsigned long long)lp->lockname.ln_number,
158 lp->flags);
159 return;
160 }
161
162 /*
163 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
164 */
165
166 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
167 wake_up_ast(lp);
168 return;
169 }
170
171 /*
172 * A lock has been demoted to NL because it initially completed during
173 * BLOCK_LOCKS. Now it must be requested in the originally requested
174 * mode.
175 */
176
177 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
178 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
179 lp->lockname.ln_type,
180 (unsigned long long)lp->lockname.ln_number);
181 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
182 lp->lockname.ln_type,
183 (unsigned long long)lp->lockname.ln_number);
184
185 lp->cur = DLM_LOCK_NL;
186 lp->req = lp->prev_req;
187 lp->prev_req = DLM_LOCK_IV;
188 lp->lkf &= ~DLM_LKF_CONVDEADLK;
189
190 set_bit(LFL_NOCACHE, &lp->flags);
191
192 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
193 !test_bit(LFL_NOBLOCK, &lp->flags))
194 gdlm_queue_delayed(lp);
195 else
196 queue_submit(lp);
197 return;
198 }
199
200 /*
201 * A request is granted during dlm recovery. It may be granted
202 * because the locks of a failed node were cleared. In that case,
203 * there may be inconsistent data beneath this lock and we must wait
204 * for recovery to complete to use it. When gfs recovery is done this
205 * granted lock will be converted to NL and then reacquired in this
206 * granted state.
207 */
208
209 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
210 !test_bit(LFL_NOBLOCK, &lp->flags) &&
211 lp->req != DLM_LOCK_NL) {
212
213 lp->cur = lp->req;
214 lp->prev_req = lp->req;
215 lp->req = DLM_LOCK_NL;
216 lp->lkf |= DLM_LKF_CONVERT;
217 lp->lkf &= ~DLM_LKF_CONVDEADLK;
218
219 log_debug("rereq %x,%llx id %x %d,%d",
220 lp->lockname.ln_type,
221 (unsigned long long)lp->lockname.ln_number,
222 lp->lksb.sb_lkid, lp->cur, lp->req);
223
224 set_bit(LFL_REREQUEST, &lp->flags);
225 queue_submit(lp);
226 return;
227 }
228
229 /*
230 * DLM demoted the lock to NL before it was granted so GFS must be
231 * told it cannot cache data for this lock.
232 */
233
234 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
235 set_bit(LFL_NOCACHE, &lp->flags);
236
237out:
238 /*
239 * This is an internal lock_dlm lock
240 */
241
242 if (test_bit(LFL_INLOCK, &lp->flags)) {
243 clear_bit(LFL_NOBLOCK, &lp->flags);
244 lp->cur = lp->req;
245 wake_up_ast(lp);
246 return;
247 }
248
249 /*
250 * Normal completion of a lock request. Tell GFS it now has the lock.
251 */
252
253 clear_bit(LFL_NOBLOCK, &lp->flags);
254 lp->cur = lp->req;
255
256 acb.lc_name = lp->lockname;
257 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
258
259 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
260}
261
262static void gdlm_ast(void *astarg)
263{
264 struct gdlm_lock *lp = astarg;
265 clear_bit(LFL_ACTIVE, &lp->flags);
266 process_complete(lp);
267}
268
269static void process_blocking(struct gdlm_lock *lp, int bast_mode)
270{
271 struct gdlm_ls *ls = lp->ls;
272 unsigned int cb = 0;
273
274 switch (gdlm_make_lmstate(bast_mode)) {
275 case LM_ST_EXCLUSIVE:
276 cb = LM_CB_NEED_E;
277 break;
278 case LM_ST_DEFERRED:
279 cb = LM_CB_NEED_D;
280 break;
281 case LM_ST_SHARED:
282 cb = LM_CB_NEED_S;
283 break;
284 default:
285 gdlm_assert(0, "unknown bast mode %u", bast_mode);
286 }
287
288 ls->fscb(ls->sdp, cb, &lp->lockname);
289}
290
291
292static void gdlm_bast(void *astarg, int mode)
293{
294 struct gdlm_lock *lp = astarg;
295
296 if (!mode) {
297 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
298 lp->lockname.ln_type,
299 (unsigned long long)lp->lockname.ln_number);
300 return;
301 }
302
303 process_blocking(lp, mode);
304}
305
62/* convert gfs lock-state to dlm lock-mode */ 306/* convert gfs lock-state to dlm lock-mode */
63 307
64static s16 make_mode(s16 lmstate) 308static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
77 return -1; 321 return -1;
78} 322}
79 323
80/* convert dlm lock-mode to gfs lock-state */
81
82s16 gdlm_make_lmstate(s16 dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98 324
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and 325/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ 326 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
134 360
135 if (lp->lksb.sb_lkid != 0) { 361 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT; 362 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
141 !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
142 !(lkf & DLM_LKF_NOQUEUE) &&
143 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
144 lkf |= DLM_LKF_CONVDEADLK;
145 } 363 }
146 364
147 if (lp->lvb) 365 if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
173 make_strname(name, &lp->strname); 391 make_strname(name, &lp->strname);
174 lp->ls = ls; 392 lp->ls = ls;
175 lp->cur = DLM_LOCK_IV; 393 lp->cur = DLM_LOCK_IV;
176 lp->lvb = NULL;
177 lp->hold_null = NULL;
178 INIT_LIST_HEAD(&lp->clist);
179 INIT_LIST_HEAD(&lp->blist);
180 INIT_LIST_HEAD(&lp->delay_list); 394 INIT_LIST_HEAD(&lp->delay_list);
181 395
182 spin_lock(&ls->async_lock); 396 spin_lock(&ls->async_lock);
183 list_add(&lp->all_list, &ls->all_locks);
184 ls->all_locks_count++; 397 ls->all_locks_count++;
185 spin_unlock(&ls->async_lock); 398 spin_unlock(&ls->async_lock);
186 399
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
188 return 0; 401 return 0;
189} 402}
190 403
191void gdlm_delete_lp(struct gdlm_lock *lp)
192{
193 struct gdlm_ls *ls = lp->ls;
194
195 spin_lock(&ls->async_lock);
196 if (!list_empty(&lp->clist))
197 list_del_init(&lp->clist);
198 if (!list_empty(&lp->blist))
199 list_del_init(&lp->blist);
200 if (!list_empty(&lp->delay_list))
201 list_del_init(&lp->delay_list);
202 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
203 (unsigned long long)lp->lockname.ln_number);
204 list_del_init(&lp->all_list);
205 ls->all_locks_count--;
206 spin_unlock(&ls->async_lock);
207
208 kfree(lp);
209}
210
211int gdlm_get_lock(void *lockspace, struct lm_lockname *name, 404int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
212 void **lockp) 405 void **lockp)
213{ 406{
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
261 454
262 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { 455 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
263 lp->lksb.sb_status = -EAGAIN; 456 lp->lksb.sb_status = -EAGAIN;
264 queue_complete(lp); 457 gdlm_ast(lp);
265 error = 0; 458 error = 0;
266 } 459 }
267 460
@@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
308{ 501{
309 struct gdlm_lock *lp = lock; 502 struct gdlm_lock *lp = lock;
310 503
504 if (req_state == LM_ST_UNLOCKED)
505 return gdlm_unlock(lock, cur_state);
506
507 if (req_state == LM_ST_UNLOCKED)
508 return gdlm_unlock(lock, cur_state);
509
311 clear_bit(LFL_DLM_CANCEL, &lp->flags); 510 clear_bit(LFL_DLM_CANCEL, &lp->flags);
312 if (flags & LM_FLAG_NOEXP) 511 if (flags & LM_FLAG_NOEXP)
313 set_bit(LFL_NOBLOCK, &lp->flags); 512 set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
351 if (delay_list) { 550 if (delay_list) {
352 set_bit(LFL_CANCEL, &lp->flags); 551 set_bit(LFL_CANCEL, &lp->flags);
353 set_bit(LFL_ACTIVE, &lp->flags); 552 set_bit(LFL_ACTIVE, &lp->flags);
354 queue_complete(lp); 553 gdlm_ast(lp);
355 return; 554 return;
356 } 555 }
357 556
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
507 wake_up(&ls->thread_wait); 706 wake_up(&ls->thread_wait);
508} 707}
509 708
510int gdlm_release_all_locks(struct gdlm_ls *ls)
511{
512 struct gdlm_lock *lp, *safe;
513 int count = 0;
514
515 spin_lock(&ls->async_lock);
516 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
517 list_del_init(&lp->all_list);
518
519 if (lp->lvb && lp->lvb != junk_lvb)
520 kfree(lp->lvb);
521 kfree(lp);
522 count++;
523 }
524 spin_unlock(&ls->async_lock);
525
526 return count;
527}
528
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54e..3c98e7c6f93b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
72 int recover_jid_done; 72 int recover_jid_done;
73 int recover_jid_status; 73 int recover_jid_status;
74 spinlock_t async_lock; 74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed; 75 struct list_head delayed;
78 struct list_head submit; 76 struct list_head submit;
79 struct list_head all_locks;
80 u32 all_locks_count; 77 u32 all_locks_count;
81 wait_queue_head_t wait_control; 78 wait_queue_head_t wait_control;
82 struct task_struct *thread1; 79 struct task_struct *thread;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait; 80 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88}; 81};
89 82
90enum { 83enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
117 u32 lkf; /* dlm flags DLM_LKF_ */ 110 u32 lkf; /* dlm flags DLM_LKF_ */
118 unsigned long flags; /* lock_dlm flags LFL_ */ 111 unsigned long flags; /* lock_dlm flags LFL_ */
119 112
120 int bast_mode; /* protected by async_lock */
121
122 struct list_head clist; /* complete */
123 struct list_head blist; /* blocking */
124 struct list_head delay_list; /* delayed */ 113 struct list_head delay_list; /* delayed */
125 struct list_head all_list; /* all locks for the fs */
126 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ 114 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
127}; 115};
128 116
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
159 147
160/* lock.c */ 148/* lock.c */
161 149
162s16 gdlm_make_lmstate(s16);
163void gdlm_queue_delayed(struct gdlm_lock *);
164void gdlm_submit_delayed(struct gdlm_ls *); 150void gdlm_submit_delayed(struct gdlm_ls *);
165int gdlm_release_all_locks(struct gdlm_ls *);
166void gdlm_delete_lp(struct gdlm_lock *);
167unsigned int gdlm_do_lock(struct gdlm_lock *); 151unsigned int gdlm_do_lock(struct gdlm_lock *);
168 152
169int gdlm_get_lock(void *, struct lm_lockname *, void **); 153int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b50..09d78c216f48 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
22 if (!ls) 22 if (!ls)
23 return NULL; 23 return NULL;
24 24
25 ls->drop_locks_count = GDLM_DROP_COUNT;
26 ls->drop_locks_period = GDLM_DROP_PERIOD;
27 ls->fscb = cb; 25 ls->fscb = cb;
28 ls->sdp = sdp; 26 ls->sdp = sdp;
29 ls->fsflags = flags; 27 ls->fsflags = flags;
30 spin_lock_init(&ls->async_lock); 28 spin_lock_init(&ls->async_lock);
31 INIT_LIST_HEAD(&ls->complete);
32 INIT_LIST_HEAD(&ls->blocking);
33 INIT_LIST_HEAD(&ls->delayed); 29 INIT_LIST_HEAD(&ls->delayed);
34 INIT_LIST_HEAD(&ls->submit); 30 INIT_LIST_HEAD(&ls->submit);
35 INIT_LIST_HEAD(&ls->all_locks);
36 init_waitqueue_head(&ls->thread_wait); 31 init_waitqueue_head(&ls->thread_wait);
37 init_waitqueue_head(&ls->wait_control); 32 init_waitqueue_head(&ls->wait_control);
38 ls->thread1 = NULL;
39 ls->thread2 = NULL;
40 ls->drop_time = jiffies;
41 ls->jid = -1; 33 ls->jid = -1;
42 34
43 strncpy(buf, table_name, 256); 35 strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
180static void gdlm_unmount(void *lockspace) 172static void gdlm_unmount(void *lockspace)
181{ 173{
182 struct gdlm_ls *ls = lockspace; 174 struct gdlm_ls *ls = lockspace;
183 int rv;
184 175
185 log_debug("unmount flags %lx", ls->flags); 176 log_debug("unmount flags %lx", ls->flags);
186 177
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
194 gdlm_kobject_release(ls); 185 gdlm_kobject_release(ls);
195 dlm_release_lockspace(ls->dlm_lockspace, 2); 186 dlm_release_lockspace(ls->dlm_lockspace, 2);
196 gdlm_release_threads(ls); 187 gdlm_release_threads(ls);
197 rv = gdlm_release_all_locks(ls); 188 BUG_ON(ls->all_locks_count);
198 if (rv)
199 log_info("gdlm_unmount: %d stray locks freed", rv);
200out: 189out:
201 kfree(ls); 190 kfree(ls);
202} 191}
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
232 221
233 dlm_release_lockspace(ls->dlm_lockspace, 2); 222 dlm_release_lockspace(ls->dlm_lockspace, 2);
234 gdlm_release_threads(ls); 223 gdlm_release_threads(ls);
235 gdlm_release_all_locks(ls);
236 gdlm_kobject_release(ls); 224 gdlm_kobject_release(ls);
237} 225}
238 226
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9ee..4ec571c3d8a9 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
114 return sprintf(buf, "%d\n", ls->recover_jid_status); 114 return sprintf(buf, "%d\n", ls->recover_jid_status);
115} 115}
116 116
117static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
118{
119 return sprintf(buf, "%d\n", ls->drop_locks_count);
120}
121
122static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
123{
124 ls->drop_locks_count = simple_strtol(buf, NULL, 0);
125 return len;
126}
127
128struct gdlm_attr { 117struct gdlm_attr {
129 struct attribute attr; 118 struct attribute attr;
130 ssize_t (*show)(struct gdlm_ls *, char *); 119 ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
144GDLM_ATTR(recover, 0644, recover_show, recover_store); 133GDLM_ATTR(recover, 0644, recover_show, recover_store);
145GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); 134GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
146GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); 135GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
147GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
148 136
149static struct attribute *gdlm_attrs[] = { 137static struct attribute *gdlm_attrs[] = {
150 &gdlm_attr_proto_name.attr, 138 &gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
157 &gdlm_attr_recover.attr, 145 &gdlm_attr_recover.attr,
158 &gdlm_attr_recover_done.attr, 146 &gdlm_attr_recover_done.attr,
159 &gdlm_attr_recover_status.attr, 147 &gdlm_attr_recover_status.attr,
160 &gdlm_attr_drop_count.attr,
161 NULL, 148 NULL,
162}; 149};
163 150
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28ab..38823efd698c 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
9 9
10#include "lock_dlm.h" 10#include "lock_dlm.h"
11 11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm 12static inline int no_work(struct gdlm_ls *ls)
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
42 }
43
44 ls->fscb(ls->sdp, cb, &lp->lockname);
45}
46
47static void wake_up_ast(struct gdlm_lock *lp)
48{
49 clear_bit(LFL_AST_WAIT, &lp->flags);
50 smp_mb__after_clear_bit();
51 wake_up_bit(&lp->flags, LFL_AST_WAIT);
52}
53
54static void process_complete(struct gdlm_lock *lp)
55{
56 struct gdlm_ls *ls = lp->ls;
57 struct lm_async_cb acb;
58 s16 prev_mode = lp->cur;
59
60 memset(&acb, 0, sizeof(acb));
61
62 if (lp->lksb.sb_status == -DLM_ECANCEL) {
63 log_info("complete dlm cancel %x,%llx flags %lx",
64 lp->lockname.ln_type,
65 (unsigned long long)lp->lockname.ln_number,
66 lp->flags);
67
68 lp->req = lp->cur;
69 acb.lc_ret |= LM_OUT_CANCELED;
70 if (lp->cur == DLM_LOCK_IV)
71 lp->lksb.sb_lkid = 0;
72 goto out;
73 }
74
75 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
76 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
77 log_info("unlock sb_status %d %x,%llx flags %lx",
78 lp->lksb.sb_status, lp->lockname.ln_type,
79 (unsigned long long)lp->lockname.ln_number,
80 lp->flags);
81 return;
82 }
83
84 lp->cur = DLM_LOCK_IV;
85 lp->req = DLM_LOCK_IV;
86 lp->lksb.sb_lkid = 0;
87
88 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
89 gdlm_delete_lp(lp);
90 return;
91 }
92 goto out;
93 }
94
95 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
96 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
97
98 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
99 if (lp->req == DLM_LOCK_PR)
100 lp->req = DLM_LOCK_CW;
101 else if (lp->req == DLM_LOCK_CW)
102 lp->req = DLM_LOCK_PR;
103 }
104
105 /*
106 * A canceled lock request. The lock was just taken off the delayed
107 * list and was never even submitted to dlm.
108 */
109
110 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
111 log_info("complete internal cancel %x,%llx",
112 lp->lockname.ln_type,
113 (unsigned long long)lp->lockname.ln_number);
114 lp->req = lp->cur;
115 acb.lc_ret |= LM_OUT_CANCELED;
116 goto out;
117 }
118
119 /*
120 * An error occurred.
121 */
122
123 if (lp->lksb.sb_status) {
124 /* a "normal" error */
125 if ((lp->lksb.sb_status == -EAGAIN) &&
126 (lp->lkf & DLM_LKF_NOQUEUE)) {
127 lp->req = lp->cur;
128 if (lp->cur == DLM_LOCK_IV)
129 lp->lksb.sb_lkid = 0;
130 goto out;
131 }
132
133 /* this could only happen with cancels I think */
134 log_info("ast sb_status %d %x,%llx flags %lx",
135 lp->lksb.sb_status, lp->lockname.ln_type,
136 (unsigned long long)lp->lockname.ln_number,
137 lp->flags);
138 if (lp->lksb.sb_status == -EDEADLOCK &&
139 lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
140 lp->req = lp->cur;
141 acb.lc_ret |= LM_OUT_CONV_DEADLK;
142 if (lp->cur == DLM_LOCK_IV)
143 lp->lksb.sb_lkid = 0;
144 goto out;
145 } else
146 return;
147 }
148
149 /*
150 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
151 */
152
153 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
154 wake_up_ast(lp);
155 return;
156 }
157
158 /*
159 * A lock has been demoted to NL because it initially completed during
160 * BLOCK_LOCKS. Now it must be requested in the originally requested
161 * mode.
162 */
163
164 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
165 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
166 lp->lockname.ln_type,
167 (unsigned long long)lp->lockname.ln_number);
168 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
169 lp->lockname.ln_type,
170 (unsigned long long)lp->lockname.ln_number);
171
172 lp->cur = DLM_LOCK_NL;
173 lp->req = lp->prev_req;
174 lp->prev_req = DLM_LOCK_IV;
175 lp->lkf &= ~DLM_LKF_CONVDEADLK;
176
177 set_bit(LFL_NOCACHE, &lp->flags);
178
179 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
180 !test_bit(LFL_NOBLOCK, &lp->flags))
181 gdlm_queue_delayed(lp);
182 else
183 queue_submit(lp);
184 return;
185 }
186
187 /*
188 * A request is granted during dlm recovery. It may be granted
189 * because the locks of a failed node were cleared. In that case,
190 * there may be inconsistent data beneath this lock and we must wait
191 * for recovery to complete to use it. When gfs recovery is done this
192 * granted lock will be converted to NL and then reacquired in this
193 * granted state.
194 */
195
196 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
197 !test_bit(LFL_NOBLOCK, &lp->flags) &&
198 lp->req != DLM_LOCK_NL) {
199
200 lp->cur = lp->req;
201 lp->prev_req = lp->req;
202 lp->req = DLM_LOCK_NL;
203 lp->lkf |= DLM_LKF_CONVERT;
204 lp->lkf &= ~DLM_LKF_CONVDEADLK;
205
206 log_debug("rereq %x,%llx id %x %d,%d",
207 lp->lockname.ln_type,
208 (unsigned long long)lp->lockname.ln_number,
209 lp->lksb.sb_lkid, lp->cur, lp->req);
210
211 set_bit(LFL_REREQUEST, &lp->flags);
212 queue_submit(lp);
213 return;
214 }
215
216 /*
217 * DLM demoted the lock to NL before it was granted so GFS must be
218 * told it cannot cache data for this lock.
219 */
220
221 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
222 set_bit(LFL_NOCACHE, &lp->flags);
223
224out:
225 /*
226 * This is an internal lock_dlm lock
227 */
228
229 if (test_bit(LFL_INLOCK, &lp->flags)) {
230 clear_bit(LFL_NOBLOCK, &lp->flags);
231 lp->cur = lp->req;
232 wake_up_ast(lp);
233 return;
234 }
235
236 /*
237 * Normal completion of a lock request. Tell GFS it now has the lock.
238 */
239
240 clear_bit(LFL_NOBLOCK, &lp->flags);
241 lp->cur = lp->req;
242
243 acb.lc_name = lp->lockname;
244 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
245
246 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
247 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
248 acb.lc_ret |= LM_OUT_CACHEABLE;
249
250 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
251}
252
253static inline int no_work(struct gdlm_ls *ls, int blocking)
254{ 13{
255 int ret; 14 int ret;
256 15
257 spin_lock(&ls->async_lock); 16 spin_lock(&ls->async_lock);
258 ret = list_empty(&ls->complete) && list_empty(&ls->submit); 17 ret = list_empty(&ls->submit);
259 if (ret && blocking)
260 ret = list_empty(&ls->blocking);
261 spin_unlock(&ls->async_lock); 18 spin_unlock(&ls->async_lock);
262 19
263 return ret; 20 return ret;
264} 21}
265 22
266static inline int check_drop(struct gdlm_ls *ls) 23static int gdlm_thread(void *data)
267{
268 if (!ls->drop_locks_count)
269 return 0;
270
271 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
272 ls->drop_time = jiffies;
273 if (ls->all_locks_count >= ls->drop_locks_count)
274 return 1;
275 }
276 return 0;
277}
278
279static int gdlm_thread(void *data, int blist)
280{ 24{
281 struct gdlm_ls *ls = (struct gdlm_ls *) data; 25 struct gdlm_ls *ls = (struct gdlm_ls *) data;
282 struct gdlm_lock *lp = NULL; 26 struct gdlm_lock *lp = NULL;
283 uint8_t complete, blocking, submit, drop;
284
285 /* Only thread1 is allowed to do blocking callbacks since gfs
286 may wait for a completion callback within a blocking cb. */
287 27
288 while (!kthread_should_stop()) { 28 while (!kthread_should_stop()) {
289 wait_event_interruptible(ls->thread_wait, 29 wait_event_interruptible(ls->thread_wait,
290 !no_work(ls, blist) || kthread_should_stop()); 30 !no_work(ls) || kthread_should_stop());
291
292 complete = blocking = submit = drop = 0;
293 31
294 spin_lock(&ls->async_lock); 32 spin_lock(&ls->async_lock);
295 33
296 if (blist && !list_empty(&ls->blocking)) { 34 if (!list_empty(&ls->submit)) {
297 lp = list_entry(ls->blocking.next, struct gdlm_lock,
298 blist);
299 list_del_init(&lp->blist);
300 blocking = lp->bast_mode;
301 lp->bast_mode = 0;
302 } else if (!list_empty(&ls->complete)) {
303 lp = list_entry(ls->complete.next, struct gdlm_lock,
304 clist);
305 list_del_init(&lp->clist);
306 complete = 1;
307 } else if (!list_empty(&ls->submit)) {
308 lp = list_entry(ls->submit.next, struct gdlm_lock, 35 lp = list_entry(ls->submit.next, struct gdlm_lock,
309 delay_list); 36 delay_list);
310 list_del_init(&lp->delay_list); 37 list_del_init(&lp->delay_list);
311 submit = 1; 38 spin_unlock(&ls->async_lock);
39 gdlm_do_lock(lp);
40 spin_lock(&ls->async_lock);
312 } 41 }
313
314 drop = check_drop(ls);
315 spin_unlock(&ls->async_lock); 42 spin_unlock(&ls->async_lock);
316
317 if (complete)
318 process_complete(lp);
319
320 else if (blocking)
321 process_blocking(lp, blocking);
322
323 else if (submit)
324 gdlm_do_lock(lp);
325
326 if (drop)
327 ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
328
329 schedule();
330 } 43 }
331 44
332 return 0; 45 return 0;
333} 46}
334 47
335static int gdlm_thread1(void *data)
336{
337 return gdlm_thread(data, 1);
338}
339
340static int gdlm_thread2(void *data)
341{
342 return gdlm_thread(data, 0);
343}
344
345int gdlm_init_threads(struct gdlm_ls *ls) 48int gdlm_init_threads(struct gdlm_ls *ls)
346{ 49{
347 struct task_struct *p; 50 struct task_struct *p;
348 int error; 51 int error;
349 52
350 p = kthread_run(gdlm_thread1, ls, "lock_dlm1"); 53 p = kthread_run(gdlm_thread, ls, "lock_dlm");
351 error = IS_ERR(p);
352 if (error) {
353 log_error("can't start lock_dlm1 thread %d", error);
354 return error;
355 }
356 ls->thread1 = p;
357
358 p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
359 error = IS_ERR(p); 54 error = IS_ERR(p);
360 if (error) { 55 if (error) {
361 log_error("can't start lock_dlm2 thread %d", error); 56 log_error("can't start lock_dlm thread %d", error);
362 kthread_stop(ls->thread1);
363 return error; 57 return error;
364 } 58 }
365 ls->thread2 = p; 59 ls->thread = p;
366 60
367 return 0; 61 return 0;
368} 62}
369 63
370void gdlm_release_threads(struct gdlm_ls *ls) 64void gdlm_release_threads(struct gdlm_ls *ls)
371{ 65{
372 kthread_stop(ls->thread1); 66 kthread_stop(ls->thread);
373 kthread_stop(ls->thread2);
374} 67}
375 68
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a8..000000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d94..000000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/init.h>
13#include <linux/types.h>
14#include <linux/fs.h>
15#include <linux/lm_interface.h>
16
17struct nolock_lockspace {
18 unsigned int nl_lvb_size;
19};
20
21static const struct lm_lockops nolock_ops;
22
23static int nolock_mount(char *table_name, char *host_data,
24 lm_callback_t cb, void *cb_data,
25 unsigned int min_lvb_size, int flags,
26 struct lm_lockstruct *lockstruct,
27 struct kobject *fskobj)
28{
29 char *c;
30 unsigned int jid;
31 struct nolock_lockspace *nl;
32
33 c = strstr(host_data, "jid=");
34 if (!c)
35 jid = 0;
36 else {
37 c += 4;
38 sscanf(c, "%u", &jid);
39 }
40
41 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
42 if (!nl)
43 return -ENOMEM;
44
45 nl->nl_lvb_size = min_lvb_size;
46
47 lockstruct->ls_jid = jid;
48 lockstruct->ls_first = 1;
49 lockstruct->ls_lvb_size = min_lvb_size;
50 lockstruct->ls_lockspace = nl;
51 lockstruct->ls_ops = &nolock_ops;
52 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
53
54 return 0;
55}
56
57static void nolock_others_may_mount(void *lockspace)
58{
59}
60
61static void nolock_unmount(void *lockspace)
62{
63 struct nolock_lockspace *nl = lockspace;
64 kfree(nl);
65}
66
67static void nolock_withdraw(void *lockspace)
68{
69}
70
71/**
72 * nolock_get_lock - get a lm_lock_t given a descripton of the lock
73 * @lockspace: the lockspace the lock lives in
74 * @name: the name of the lock
75 * @lockp: return the lm_lock_t here
76 *
77 * Returns: 0 on success, -EXXX on failure
78 */
79
80static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
81 void **lockp)
82{
83 *lockp = lockspace;
84 return 0;
85}
86
87/**
88 * nolock_put_lock - get rid of a lock structure
89 * @lock: the lock to throw away
90 *
91 */
92
93static void nolock_put_lock(void *lock)
94{
95}
96
97/**
98 * nolock_lock - acquire a lock
99 * @lock: the lock to manipulate
100 * @cur_state: the current state
101 * @req_state: the requested state
102 * @flags: modifier flags
103 *
104 * Returns: A bitmap of LM_OUT_*
105 */
106
107static unsigned int nolock_lock(void *lock, unsigned int cur_state,
108 unsigned int req_state, unsigned int flags)
109{
110 return req_state | LM_OUT_CACHEABLE;
111}
112
113/**
114 * nolock_unlock - unlock a lock
115 * @lock: the lock to manipulate
116 * @cur_state: the current state
117 *
118 * Returns: 0
119 */
120
121static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
122{
123 return 0;
124}
125
126static void nolock_cancel(void *lock)
127{
128}
129
130/**
131 * nolock_hold_lvb - hold on to a lock value block
132 * @lock: the lock the LVB is associated with
133 * @lvbp: return the lm_lvb_t here
134 *
135 * Returns: 0 on success, -EXXX on failure
136 */
137
138static int nolock_hold_lvb(void *lock, char **lvbp)
139{
140 struct nolock_lockspace *nl = lock;
141 int error = 0;
142
143 *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
144 if (!*lvbp)
145 error = -ENOMEM;
146
147 return error;
148}
149
150/**
151 * nolock_unhold_lvb - release a LVB
152 * @lock: the lock the LVB is associated with
153 * @lvb: the lock value block
154 *
155 */
156
157static void nolock_unhold_lvb(void *lock, char *lvb)
158{
159 kfree(lvb);
160}
161
162static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
163 struct file *file, struct file_lock *fl)
164{
165 posix_test_lock(file, fl);
166
167 return 0;
168}
169
170static int nolock_plock(void *lockspace, struct lm_lockname *name,
171 struct file *file, int cmd, struct file_lock *fl)
172{
173 int error;
174 error = posix_lock_file_wait(file, fl);
175 return error;
176}
177
178static int nolock_punlock(void *lockspace, struct lm_lockname *name,
179 struct file *file, struct file_lock *fl)
180{
181 int error;
182 error = posix_lock_file_wait(file, fl);
183 return error;
184}
185
186static void nolock_recovery_done(void *lockspace, unsigned int jid,
187 unsigned int message)
188{
189}
190
191static const struct lm_lockops nolock_ops = {
192 .lm_proto_name = "lock_nolock",
193 .lm_mount = nolock_mount,
194 .lm_others_may_mount = nolock_others_may_mount,
195 .lm_unmount = nolock_unmount,
196 .lm_withdraw = nolock_withdraw,
197 .lm_get_lock = nolock_get_lock,
198 .lm_put_lock = nolock_put_lock,
199 .lm_lock = nolock_lock,
200 .lm_unlock = nolock_unlock,
201 .lm_cancel = nolock_cancel,
202 .lm_hold_lvb = nolock_hold_lvb,
203 .lm_unhold_lvb = nolock_unhold_lvb,
204 .lm_plock_get = nolock_plock_get,
205 .lm_plock = nolock_plock,
206 .lm_punlock = nolock_punlock,
207 .lm_recovery_done = nolock_recovery_done,
208 .lm_owner = THIS_MODULE,
209};
210
211static int __init init_nolock(void)
212{
213 int error;
214
215 error = gfs2_register_lockproto(&nolock_ops);
216 if (error) {
217 printk(KERN_WARNING
218 "lock_nolock: can't register protocol: %d\n", error);
219 return error;
220 }
221
222 printk(KERN_INFO
223 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
224 return 0;
225}
226
227static void __exit exit_nolock(void)
228{
229 gfs2_unregister_lockproto(&nolock_ops);
230}
231
232module_init(init_nolock);
233module_exit(exit_nolock);
234
235MODULE_DESCRIPTION("GFS Nolock Locking Module");
236MODULE_AUTHOR("Red Hat, Inc.");
237MODULE_LICENSE("GPL");
238
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836d..6c6af9f5e3ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
87 */ 87 */
88 88
89static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 89static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
90__releases(&sdp->sd_log_lock)
91__acquires(&sdp->sd_log_lock)
90{ 92{
91 struct gfs2_bufdata *bd, *s; 93 struct gfs2_bufdata *bd, *s;
92 struct buffer_head *bh; 94 struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 771152816508..7c64510ccfd2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23static inline void gfs2_log_lock(struct gfs2_sbd *sdp) 23static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
24__acquires(&sdp->sd_log_lock)
24{ 25{
25 spin_lock(&sdp->sd_log_lock); 26 spin_lock(&sdp->sd_log_lock);
26} 27}
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
32 */ 33 */
33 34
34static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) 35static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
36__releases(&sdp->sd_log_lock)
35{ 37{
36 spin_unlock(&sdp->sd_log_lock); 38 spin_unlock(&sdp->sd_log_lock);
37} 39}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd50..bcc668d0fadd 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
40 INIT_HLIST_NODE(&gl->gl_list); 40 INIT_HLIST_NODE(&gl->gl_list);
41 spin_lock_init(&gl->gl_spin); 41 spin_lock_init(&gl->gl_spin);
42 INIT_LIST_HEAD(&gl->gl_holders); 42 INIT_LIST_HEAD(&gl->gl_holders);
43 INIT_LIST_HEAD(&gl->gl_waiters1);
44 INIT_LIST_HEAD(&gl->gl_waiters3);
45 gl->gl_lvb = NULL; 43 gl->gl_lvb = NULL;
46 atomic_set(&gl->gl_lvb_count, 0); 44 atomic_set(&gl->gl_lvb_count, 0);
47 INIT_LIST_HEAD(&gl->gl_reclaim); 45 INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f82..09853620c951 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
129} 129}
130 130
131/** 131/**
132 * getbuf - Get a buffer with a given address space 132 * gfs2_getbuf - Get a buffer with a given address space
133 * @gl: the glock 133 * @gl: the glock
134 * @blkno: the block number (filesystem scope) 134 * @blkno: the block number (filesystem scope)
135 * @create: 1 if the buffer should be created 135 * @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
137 * Returns: the buffer 137 * Returns: the buffer
138 */ 138 */
139 139
140static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) 140struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
141{ 141{
142 struct address_space *mapping = gl->gl_aspace->i_mapping; 142 struct address_space *mapping = gl->gl_aspace->i_mapping;
143 struct gfs2_sbd *sdp = gl->gl_sbd; 143 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
205struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) 205struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
206{ 206{
207 struct buffer_head *bh; 207 struct buffer_head *bh;
208 bh = getbuf(gl, blkno, CREATE); 208 bh = gfs2_getbuf(gl, blkno, CREATE);
209 meta_prep_new(bh); 209 meta_prep_new(bh);
210 return bh; 210 return bh;
211} 211}
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
223int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, 223int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
224 struct buffer_head **bhp) 224 struct buffer_head **bhp)
225{ 225{
226 *bhp = getbuf(gl, blkno, CREATE); 226 *bhp = gfs2_getbuf(gl, blkno, CREATE);
227 if (!buffer_uptodate(*bhp)) { 227 if (!buffer_uptodate(*bhp)) {
228 ll_rw_block(READ_META, 1, bhp); 228 ll_rw_block(READ_META, 1, bhp);
229 if (flags & DIO_WAIT) { 229 if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
346 struct buffer_head *bh; 346 struct buffer_head *bh;
347 347
348 while (blen) { 348 while (blen) {
349 bh = getbuf(ip->i_gl, bstart, NO_CREATE); 349 bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
350 if (bh) { 350 if (bh) {
351 lock_buffer(bh); 351 lock_buffer(bh);
352 gfs2_log_lock(sdp); 352 gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
421 if (extlen > max_ra) 421 if (extlen > max_ra)
422 extlen = max_ra; 422 extlen = max_ra;
423 423
424 first_bh = getbuf(gl, dblock, CREATE); 424 first_bh = gfs2_getbuf(gl, dblock, CREATE);
425 425
426 if (buffer_uptodate(first_bh)) 426 if (buffer_uptodate(first_bh))
427 goto out; 427 goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
432 extlen--; 432 extlen--;
433 433
434 while (extlen) { 434 while (extlen) {
435 bh = getbuf(gl, dblock, CREATE); 435 bh = gfs2_getbuf(gl, dblock, CREATE);
436 436
437 if (!buffer_uptodate(bh) && !buffer_locked(bh)) 437 if (!buffer_uptodate(bh) && !buffer_locked(bh))
438 ll_rw_block(READA, 1, &bh); 438 ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe1..b1a5f3674d43 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, 47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
48 int flags, struct buffer_head **bhp); 48 int flags, struct buffer_head **bhp);
49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); 49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
50 51
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, 52void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta); 53 int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb2..e64a1b04117a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
499 * @file: The file to read 499 * @file: The file to read
500 * @page: The page of the file 500 * @page: The page of the file
501 * 501 *
502 * This deals with the locking required. We use a trylock in order to 502 * This deals with the locking required. We have to unlock and
503 * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE 503 * relock the page in order to get the locking in the right
504 * in the event that we are unable to get the lock. 504 * order.
505 */ 505 */
506 506
507static int gfs2_readpage(struct file *file, struct page *page) 507static int gfs2_readpage(struct file *file, struct page *page)
508{ 508{
509 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 509 struct address_space *mapping = page->mapping;
510 struct gfs2_holder *gh; 510 struct gfs2_inode *ip = GFS2_I(mapping->host);
511 struct gfs2_holder gh;
511 int error; 512 int error;
512 513
513 gh = gfs2_glock_is_locked_by_me(ip->i_gl); 514 unlock_page(page);
514 if (!gh) { 515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
515 gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); 516 error = gfs2_glock_nq_atime(&gh);
516 if (!gh) 517 if (unlikely(error))
517 return -ENOBUFS; 518 goto out;
518 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); 519 error = AOP_TRUNCATED_PAGE;
520 lock_page(page);
521 if (page->mapping == mapping && !PageUptodate(page))
522 error = __gfs2_readpage(file, page);
523 else
519 unlock_page(page); 524 unlock_page(page);
520 error = gfs2_glock_nq_atime(gh); 525 gfs2_glock_dq(&gh);
521 if (likely(error != 0))
522 goto out;
523 return AOP_TRUNCATED_PAGE;
524 }
525 error = __gfs2_readpage(file, page);
526 gfs2_glock_dq(gh);
527out: 526out:
528 gfs2_holder_uninit(gh); 527 gfs2_holder_uninit(&gh);
529 kfree(gh); 528 if (error && error != AOP_TRUNCATED_PAGE)
529 lock_page(page);
530 return error; 530 return error;
531} 531}
532 532
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..e9a366d4411c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
15#include <linux/uio.h> 15#include <linux/uio.h>
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/mount.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
20#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
62 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 63 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
63 &i_gh); 64 &i_gh);
64 if (!error) { 65 if (!error) {
65 error = remote_llseek(file, offset, origin); 66 error = generic_file_llseek_unlocked(file, offset, origin);
66 gfs2_glock_dq_uninit(&i_gh); 67 gfs2_glock_dq_uninit(&i_gh);
67 } 68 }
68 } else 69 } else
69 error = remote_llseek(file, offset, origin); 70 error = generic_file_llseek_unlocked(file, offset, origin);
70 71
71 return error; 72 return error;
72} 73}
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
133 [7] = GFS2_DIF_NOATIME, 134 [7] = GFS2_DIF_NOATIME,
134 [12] = GFS2_DIF_EXHASH, 135 [12] = GFS2_DIF_EXHASH,
135 [14] = GFS2_DIF_INHERIT_JDATA, 136 [14] = GFS2_DIF_INHERIT_JDATA,
136 [20] = GFS2_DIF_INHERIT_DIRECTIO,
137}; 137};
138 138
139static const u32 gfs2_to_fsflags[32] = { 139static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
142 [gfs2fl_AppendOnly] = FS_APPEND_FL, 142 [gfs2fl_AppendOnly] = FS_APPEND_FL,
143 [gfs2fl_NoAtime] = FS_NOATIME_FL, 143 [gfs2fl_NoAtime] = FS_NOATIME_FL,
144 [gfs2fl_ExHash] = FS_INDEX_FL, 144 [gfs2fl_ExHash] = FS_INDEX_FL,
145 [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
146 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, 145 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
147}; 146};
148 147
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
160 return error; 159 return error;
161 160
162 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); 161 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
163 if (!S_ISDIR(inode->i_mode)) { 162 if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
164 if (ip->i_di.di_flags & GFS2_DIF_JDATA) 163 fsflags |= FS_JOURNAL_DATA_FL;
165 fsflags |= FS_JOURNAL_DATA_FL;
166 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
167 fsflags |= FS_DIRECTIO_FL;
168 }
169 if (put_user(fsflags, ptr)) 164 if (put_user(fsflags, ptr))
170 error = -EFAULT; 165 error = -EFAULT;
171 166
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
194 189
195/* Flags that can be set by user space */ 190/* Flags that can be set by user space */
196#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ 191#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
197 GFS2_DIF_DIRECTIO| \
198 GFS2_DIF_IMMUTABLE| \ 192 GFS2_DIF_IMMUTABLE| \
199 GFS2_DIF_APPENDONLY| \ 193 GFS2_DIF_APPENDONLY| \
200 GFS2_DIF_NOATIME| \ 194 GFS2_DIF_NOATIME| \
201 GFS2_DIF_SYNC| \ 195 GFS2_DIF_SYNC| \
202 GFS2_DIF_SYSTEM| \ 196 GFS2_DIF_SYSTEM| \
203 GFS2_DIF_INHERIT_DIRECTIO| \
204 GFS2_DIF_INHERIT_JDATA) 197 GFS2_DIF_INHERIT_JDATA)
205 198
206/** 199/**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
220 int error; 213 int error;
221 u32 new_flags, flags; 214 u32 new_flags, flags;
222 215
223 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 216 error = mnt_want_write(filp->f_path.mnt);
224 if (error) 217 if (error)
225 return error; 218 return error;
226 219
220 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
221 if (error)
222 goto out_drop_write;
223
227 flags = ip->i_di.di_flags; 224 flags = ip->i_di.di_flags;
228 new_flags = (flags & ~mask) | (reqflags & mask); 225 new_flags = (flags & ~mask) | (reqflags & mask);
229 if ((new_flags ^ flags) == 0) 226 if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
242 !capable(CAP_LINUX_IMMUTABLE)) 239 !capable(CAP_LINUX_IMMUTABLE))
243 goto out; 240 goto out;
244 if (!IS_IMMUTABLE(inode)) { 241 if (!IS_IMMUTABLE(inode)) {
245 error = permission(inode, MAY_WRITE, NULL); 242 error = gfs2_permission(inode, MAY_WRITE);
246 if (error) 243 if (error)
247 goto out; 244 goto out;
248 } 245 }
@@ -272,6 +269,8 @@ out_trans_end:
272 gfs2_trans_end(sdp); 269 gfs2_trans_end(sdp);
273out: 270out:
274 gfs2_glock_dq_uninit(&gh); 271 gfs2_glock_dq_uninit(&gh);
272out_drop_write:
273 mnt_drop_write(filp->f_path.mnt);
275 return error; 274 return error;
276} 275}
277 276
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
285 if (!S_ISDIR(inode->i_mode)) { 284 if (!S_ISDIR(inode->i_mode)) {
286 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 285 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
287 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); 286 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
288 if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
289 gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
290 return do_gfs2_set_flags(filp, gfsflags, ~0); 287 return do_gfs2_set_flags(filp, gfsflags, ~0);
291 } 288 }
292 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); 289 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
487 goto fail_gunlock; 484 goto fail_gunlock;
488 } 485 }
489 486
490 /* Listen to the Direct I/O flag */
491
492 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
493 file->f_flags |= O_DIRECT;
494
495 gfs2_glock_dq_uninit(&i_gh); 487 gfs2_glock_dq_uninit(&i_gh);
496 } 488 }
497 489
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
669 int error = 0; 661 int error = 0;
670 662
671 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 663 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
672 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE 664 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
673 | GL_FLOCK;
674 665
675 mutex_lock(&fp->f_fl_mutex); 666 mutex_lock(&fp->f_fl_mutex);
676 667
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
683 gfs2_glock_dq_wait(fl_gh); 674 gfs2_glock_dq_wait(fl_gh);
684 gfs2_holder_reinit(state, flags, fl_gh); 675 gfs2_holder_reinit(state, flags, fl_gh);
685 } else { 676 } else {
686 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), 677 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
687 ip->i_no_addr, &gfs2_flock_glops, 678 &gfs2_flock_glops, CREATE, &gl);
688 CREATE, &gl);
689 if (error) 679 if (error)
690 goto out; 680 goto out;
691 gfs2_holder_init(gl, state, flags, fl_gh); 681 gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d1..b4d1d6490633 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
64 mutex_init(&sdp->sd_rindex_mutex); 64 mutex_init(&sdp->sd_rindex_mutex);
65 INIT_LIST_HEAD(&sdp->sd_rindex_list); 65 INIT_LIST_HEAD(&sdp->sd_rindex_list);
66 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); 66 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
67 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
68 67
69 INIT_LIST_HEAD(&sdp->sd_jindex_list); 68 INIT_LIST_HEAD(&sdp->sd_jindex_list);
70 spin_lock_init(&sdp->sd_jindex_spin); 69 spin_lock_init(&sdp->sd_jindex_spin);
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
364 363
365static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) 364static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
366{ 365{
366 if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
367 return;
367 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 368 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
368 sdp->sd_lockstruct.ls_ops->lm_others_may_mount( 369 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
369 sdp->sd_lockstruct.ls_lockspace); 370 sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
741 goto out; 742 goto out;
742 } 743 }
743 744
744 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || 745 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
745 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
746 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= 746 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
747 GFS2_MIN_LVB_SIZE)) { 747 GFS2_MIN_LVB_SIZE)) {
748 gfs2_unmount_lockproto(&sdp->sd_lockstruct); 748 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
873fail_locking: 873fail_locking:
874 init_locking(sdp, &mount_gh, UNDO); 874 init_locking(sdp, &mount_gh, UNDO);
875fail_lm: 875fail_lm:
876 gfs2_gl_hash_clear(sdp, WAIT); 876 gfs2_gl_hash_clear(sdp);
877 gfs2_lm_unmount(sdp); 877 gfs2_lm_unmount(sdp);
878 while (invalidate_inodes(sb)) 878 while (invalidate_inodes(sb))
879 yield(); 879 yield();
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c0029..1e252dfc5294 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
163 if (error) 163 if (error)
164 goto out; 164 goto out;
165 165
166 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); 166 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
167 if (error) 167 if (error)
168 goto out_gunlock; 168 goto out_gunlock;
169 169
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
669 } 669 }
670 } 670 }
671 } else { 671 } else {
672 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); 672 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
673 if (error) 673 if (error)
674 goto out_gunlock; 674 goto out_gunlock;
675 675
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
704 /* Check out the dir to be renamed */ 704 /* Check out the dir to be renamed */
705 705
706 if (dir_rename) { 706 if (dir_rename) {
707 error = permission(odentry->d_inode, MAY_WRITE, NULL); 707 error = gfs2_permission(odentry->d_inode, MAY_WRITE);
708 if (error) 708 if (error)
709 goto out_gunlock; 709 goto out_gunlock;
710 } 710 }
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
891 * Returns: errno 891 * Returns: errno
892 */ 892 */
893 893
894static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 894int gfs2_permission(struct inode *inode, int mask)
895{ 895{
896 struct gfs2_inode *ip = GFS2_I(inode); 896 struct gfs2_inode *ip = GFS2_I(inode);
897 struct gfs2_holder i_gh; 897 struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
905 unlock = 1; 905 unlock = 1;
906 } 906 }
907 907
908 error = generic_permission(inode, mask, gfs2_check_acl); 908 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
909 error = -EACCES;
910 else
911 error = generic_permission(inode, mask, gfs2_check_acl);
909 if (unlock) 912 if (unlock)
910 gfs2_glock_dq_uninit(&i_gh); 913 gfs2_glock_dq_uninit(&i_gh);
911 914
912 return error; 915 return error;
913} 916}
914 917
918static int gfs2_iop_permission(struct inode *inode, int mask,
919 struct nameidata *nd)
920{
921 return gfs2_permission(inode, mask);
922}
923
915static int setattr_size(struct inode *inode, struct iattr *attr) 924static int setattr_size(struct inode *inode, struct iattr *attr)
916{ 925{
917 struct gfs2_inode *ip = GFS2_I(inode); 926 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1141} 1150}
1142 1151
1143const struct inode_operations gfs2_file_iops = { 1152const struct inode_operations gfs2_file_iops = {
1144 .permission = gfs2_permission, 1153 .permission = gfs2_iop_permission,
1145 .setattr = gfs2_setattr, 1154 .setattr = gfs2_setattr,
1146 .getattr = gfs2_getattr, 1155 .getattr = gfs2_getattr,
1147 .setxattr = gfs2_setxattr, 1156 .setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
1160 .rmdir = gfs2_rmdir, 1169 .rmdir = gfs2_rmdir,
1161 .mknod = gfs2_mknod, 1170 .mknod = gfs2_mknod,
1162 .rename = gfs2_rename, 1171 .rename = gfs2_rename,
1163 .permission = gfs2_permission, 1172 .permission = gfs2_iop_permission,
1164 .setattr = gfs2_setattr, 1173 .setattr = gfs2_setattr,
1165 .getattr = gfs2_getattr, 1174 .getattr = gfs2_getattr,
1166 .setxattr = gfs2_setxattr, 1175 .setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
1172const struct inode_operations gfs2_symlink_iops = { 1181const struct inode_operations gfs2_symlink_iops = {
1173 .readlink = gfs2_readlink, 1182 .readlink = gfs2_readlink,
1174 .follow_link = gfs2_follow_link, 1183 .follow_link = gfs2_follow_link,
1175 .permission = gfs2_permission, 1184 .permission = gfs2_iop_permission,
1176 .setattr = gfs2_setattr, 1185 .setattr = gfs2_setattr,
1177 .getattr = gfs2_getattr, 1186 .getattr = gfs2_getattr,
1178 .setxattr = gfs2_setxattr, 1187 .setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb89..f66ea0f7a356 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
126 gfs2_clear_rgrpd(sdp); 126 gfs2_clear_rgrpd(sdp);
127 gfs2_jindex_free(sdp); 127 gfs2_jindex_free(sdp);
128 /* Take apart glock structures and buffer lists */ 128 /* Take apart glock structures and buffer lists */
129 gfs2_gl_hash_clear(sdp, WAIT); 129 gfs2_gl_hash_clear(sdp);
130 /* Unmount the locking protocol */ 130 /* Unmount the locking protocol */
131 gfs2_lm_unmount(sdp); 131 gfs2_lm_unmount(sdp);
132 132
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
155static int gfs2_sync_fs(struct super_block *sb, int wait) 155static int gfs2_sync_fs(struct super_block *sb, int wait)
156{ 156{
157 sb->s_dirt = 0; 157 sb->s_dirt = 0;
158 if (wait) 158 if (wait && sb->s_fs_info)
159 gfs2_log_flush(sb->s_fs_info, NULL); 159 gfs2_log_flush(sb->s_fs_info, NULL);
160 return 0; 160 return 0;
161} 161}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59a..3e073f5144fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
904 do_sync = 0; 904 do_sync = 0;
905 else { 905 else {
906 value *= gfs2_jindex_size(sdp) * num; 906 value *= gfs2_jindex_size(sdp) * num;
907 do_div(value, den); 907 value = div_s64(value, den);
908 value += (s64)be64_to_cpu(qd->qd_qb.qb_value); 908 value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
909 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) 909 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
910 do_sync = 0; 910 do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c5..d5e91f4f6a0b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
428static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, 428static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
429 unsigned int message) 429 unsigned int message)
430{ 430{
431 if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
432 return;
433
431 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 434 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
432 sdp->sd_lockstruct.ls_ops->lm_recovery_done( 435 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
433 sdp->sd_lockstruct.ls_lockspace, jid, message); 436 sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
505 508
506 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 509 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
507 LM_FLAG_NOEXP | LM_FLAG_PRIORITY | 510 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
508 GL_NOCANCEL | GL_NOCACHE, &t_gh); 511 GL_NOCACHE, &t_gh);
509 if (error) 512 if (error)
510 goto fail_gunlock_ji; 513 goto fail_gunlock_ji;
511 514
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742b..2d90fb253505 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
371 371
372 spin_lock(&sdp->sd_rindex_spin); 372 spin_lock(&sdp->sd_rindex_spin);
373 sdp->sd_rindex_forward = NULL; 373 sdp->sd_rindex_forward = NULL;
374 head = &sdp->sd_rindex_recent_list;
375 while (!list_empty(head)) {
376 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
377 list_del(&rgd->rd_recent);
378 }
379 spin_unlock(&sdp->sd_rindex_spin); 374 spin_unlock(&sdp->sd_rindex_spin);
380 375
381 head = &sdp->sd_rindex_list; 376 head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
945} 940}
946 941
947/** 942/**
948 * recent_rgrp_first - get first RG from "recent" list
949 * @sdp: The GFS2 superblock
950 * @rglast: address of the rgrp used last
951 *
952 * Returns: The first rgrp in the recent list
953 */
954
955static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
956 u64 rglast)
957{
958 struct gfs2_rgrpd *rgd;
959
960 spin_lock(&sdp->sd_rindex_spin);
961
962 if (rglast) {
963 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
964 if (rgrp_contains_block(rgd, rglast))
965 goto out;
966 }
967 }
968 rgd = NULL;
969 if (!list_empty(&sdp->sd_rindex_recent_list))
970 rgd = list_entry(sdp->sd_rindex_recent_list.next,
971 struct gfs2_rgrpd, rd_recent);
972out:
973 spin_unlock(&sdp->sd_rindex_spin);
974 return rgd;
975}
976
977/**
978 * recent_rgrp_next - get next RG from "recent" list 943 * recent_rgrp_next - get next RG from "recent" list
979 * @cur_rgd: current rgrp 944 * @cur_rgd: current rgrp
980 * @remove:
981 * 945 *
982 * Returns: The next rgrp in the recent list 946 * Returns: The next rgrp in the recent list
983 */ 947 */
984 948
985static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, 949static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
986 int remove)
987{ 950{
988 struct gfs2_sbd *sdp = cur_rgd->rd_sbd; 951 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
989 struct list_head *head; 952 struct list_head *head;
990 struct gfs2_rgrpd *rgd; 953 struct gfs2_rgrpd *rgd;
991 954
992 spin_lock(&sdp->sd_rindex_spin); 955 spin_lock(&sdp->sd_rindex_spin);
993 956 head = &sdp->sd_rindex_mru_list;
994 head = &sdp->sd_rindex_recent_list; 957 if (unlikely(cur_rgd->rd_list_mru.next == head)) {
995 958 spin_unlock(&sdp->sd_rindex_spin);
996 list_for_each_entry(rgd, head, rd_recent) { 959 return NULL;
997 if (rgd == cur_rgd) {
998 if (cur_rgd->rd_recent.next != head)
999 rgd = list_entry(cur_rgd->rd_recent.next,
1000 struct gfs2_rgrpd, rd_recent);
1001 else
1002 rgd = NULL;
1003
1004 if (remove)
1005 list_del(&cur_rgd->rd_recent);
1006
1007 goto out;
1008 }
1009 } 960 }
1010 961 rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
1011 rgd = NULL;
1012 if (!list_empty(head))
1013 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
1014
1015out:
1016 spin_unlock(&sdp->sd_rindex_spin); 962 spin_unlock(&sdp->sd_rindex_spin);
1017 return rgd; 963 return rgd;
1018} 964}
1019 965
1020/** 966/**
1021 * recent_rgrp_add - add an RG to tail of "recent" list
1022 * @new_rgd: The rgrp to add
1023 *
1024 */
1025
1026static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
1027{
1028 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
1029 struct gfs2_rgrpd *rgd;
1030 unsigned int count = 0;
1031 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
1032
1033 spin_lock(&sdp->sd_rindex_spin);
1034
1035 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
1036 if (rgd == new_rgd)
1037 goto out;
1038
1039 if (++count >= max)
1040 goto out;
1041 }
1042 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
1043
1044out:
1045 spin_unlock(&sdp->sd_rindex_spin);
1046}
1047
1048/**
1049 * forward_rgrp_get - get an rgrp to try next from full list 967 * forward_rgrp_get - get an rgrp to try next from full list
1050 * @sdp: The GFS2 superblock 968 * @sdp: The GFS2 superblock
1051 * 969 *
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1112 int loops = 0; 1030 int loops = 0;
1113 int error, rg_locked; 1031 int error, rg_locked;
1114 1032
1115 /* Try recently successful rgrps */ 1033 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1116
1117 rgd = recent_rgrp_first(sdp, ip->i_goal);
1118 1034
1119 while (rgd) { 1035 while (rgd) {
1120 rg_locked = 0; 1036 rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1052 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1053 if (inode)
1138 return inode; 1054 return inode;
1139 rgd = recent_rgrp_next(rgd, 1); 1055 /* fall through */
1140 break;
1141
1142 case GLR_TRYFAILED: 1056 case GLR_TRYFAILED:
1143 rgd = recent_rgrp_next(rgd, 0); 1057 rgd = recent_rgrp_next(rgd);
1144 break; 1058 break;
1145 1059
1146 default: 1060 default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1199 1113
1200out: 1114out:
1201 if (begin) { 1115 if (begin) {
1202 recent_rgrp_add(rgd); 1116 spin_lock(&sdp->sd_rindex_spin);
1117 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
1118 spin_unlock(&sdp->sd_rindex_spin);
1203 rgd = gfs2_rgrpd_get_next(rgd); 1119 rgd = gfs2_rgrpd_get_next(rgd);
1204 if (!rgd) 1120 if (!rgd)
1205 rgd = gfs2_rgrpd_get_first(sdp); 1121 rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f35..63a8a902d9db 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
65 gt->gt_quota_quantum = 60; 65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600; 66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0; 67 gt->gt_new_files_jdata = 0;
68 gt->gt_new_files_directio = 0;
69 gt->gt_max_readahead = 1 << 18; 68 gt->gt_max_readahead = 1 << 18;
70 gt->gt_stall_secs = 600; 69 gt->gt_stall_secs = 600;
71 gt->gt_complain_secs = 10; 70 gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
941 } 940 }
942 941
943 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, 942 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
944 LM_FLAG_PRIORITY | GL_NOCACHE, 943 GL_NOCACHE, t_gh);
945 t_gh);
946 944
947 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 945 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
948 error = gfs2_jdesc_check(jd); 946 error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd0..74846559fc3f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
110 return len; 110 return len;
111} 111}
112 112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, 113static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len) 114 size_t len)
127{ 115{
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
175GFS2_ATTR(id, 0444, id_show, NULL); 163GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL); 164GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); 165GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 166GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); 167GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); 168GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr, 173 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr, 174 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr, 175 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr, 176 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr, 177 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr, 178 &gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
426TUNE_ATTR(complain_secs, 0); 412TUNE_ATTR(complain_secs, 0);
427TUNE_ATTR(statfs_slow, 0); 413TUNE_ATTR(statfs_slow, 0);
428TUNE_ATTR(new_files_jdata, 0); 414TUNE_ATTR(new_files_jdata, 0);
429TUNE_ATTR(new_files_directio, 0);
430TUNE_ATTR(quota_simul_sync, 1); 415TUNE_ATTR(quota_simul_sync, 1);
431TUNE_ATTR(quota_cache_secs, 1); 416TUNE_ATTR(quota_cache_secs, 1);
432TUNE_ATTR(stall_secs, 1); 417TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
455 &tune_attr_quotad_secs.attr, 440 &tune_attr_quotad_secs.attr,
456 &tune_attr_quota_scale.attr, 441 &tune_attr_quota_scale.attr,
457 &tune_attr_new_files_jdata.attr, 442 &tune_attr_new_files_jdata.attr,
458 &tune_attr_new_files_directio.attr,
459 NULL, 443 NULL,
460}; 444};
461 445
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
688 688
689 J_ASSERT(transaction->t_state == T_FINISHED); 689 J_ASSERT(transaction->t_state == T_FINISHED);
690 J_ASSERT(transaction->t_buffers == NULL); 690 J_ASSERT(transaction->t_buffers == NULL);
691 J_ASSERT(transaction->t_sync_datalist == NULL);
692 J_ASSERT(transaction->t_forget == NULL); 691 J_ASSERT(transaction->t_forget == NULL);
693 J_ASSERT(transaction->t_iobuf_list == NULL); 692 J_ASSERT(transaction->t_iobuf_list == NULL);
694 J_ASSERT(transaction->t_shadow_list == NULL); 693 J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/crc32.h> 24#include <linux/crc32.h>
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
25 27
26/* 28/*
27 * Default IO end handler for temporary BJ_IO buffer_heads. 29 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37} 39}
38 40
39/* 41/*
40 * When an ext3-ordered file is truncated, it is possible that many pages are 42 * When an ext4 file is truncated, it is possible that some pages are not
41 * not sucessfully freed, because they are attached to a committing transaction. 43 * successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no 44 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable 45 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes 46 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
80} 82}
81 83
82/* 84/*
83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84 * held. For ranking reasons we must trylock. If we lose, schedule away and
85 * return 0. j_list_lock is dropped in this case.
86 */
87static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88{
89 if (!jbd_trylock_bh_state(bh)) {
90 spin_unlock(&journal->j_list_lock);
91 schedule();
92 return 0;
93 }
94 return 1;
95}
96
97/*
98 * Done it all: now submit the commit record. We should have 85 * Done it all: now submit the commit record. We should have
99 * cleaned up our previous buffers by now, so if we are in abort 86 * cleaned up our previous buffers by now, so if we are in abort
100 * mode we can now just skip the rest of the journal write 87 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
112 struct buffer_head *bh; 99 struct buffer_head *bh;
113 int ret; 100 int ret;
114 int barrier_done = 0; 101 int barrier_done = 0;
102 struct timespec now = current_kernel_time();
115 103
116 if (is_journal_aborted(journal)) 104 if (is_journal_aborted(journal))
117 return 0; 105 return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 114 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 115 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 116 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
117 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
118 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
129 119
130 if (JBD2_HAS_COMPAT_FEATURE(journal, 120 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) { 121 JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
197} 187}
198 188
199/* 189/*
200 * Wait for all submitted IO to complete. 190 * write the filemap data using writepage() address_space_operations.
191 * We don't do block allocation here even for delalloc. We don't
193 * use writepages() because with delayed allocation we may be doing
193 * block allocation in writepages().
201 */ 194 */
202static int journal_wait_on_locked_list(journal_t *journal, 195static int journal_submit_inode_data_buffers(struct address_space *mapping)
203 transaction_t *commit_transaction)
204{ 196{
205 int ret = 0; 197 int ret;
206 struct journal_head *jh; 198 struct writeback_control wbc = {
207 199 .sync_mode = WB_SYNC_ALL,
208 while (commit_transaction->t_locked_list) { 200 .nr_to_write = mapping->nrpages * 2,
209 struct buffer_head *bh; 201 .range_start = 0,
210 202 .range_end = i_size_read(mapping->host),
211 jh = commit_transaction->t_locked_list->b_tprev; 203 .for_writepages = 1,
212 bh = jh2bh(jh); 204 };
213 get_bh(bh); 205
214 if (buffer_locked(bh)) { 206 ret = generic_writepages(mapping, &wbc);
215 spin_unlock(&journal->j_list_lock);
216 wait_on_buffer(bh);
217 if (unlikely(!buffer_uptodate(bh)))
218 ret = -EIO;
219 spin_lock(&journal->j_list_lock);
220 }
221 if (!inverted_lock(journal, bh)) {
222 put_bh(bh);
223 spin_lock(&journal->j_list_lock);
224 continue;
225 }
226 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
227 __jbd2_journal_unfile_buffer(jh);
228 jbd_unlock_bh_state(bh);
229 jbd2_journal_remove_journal_head(bh);
230 put_bh(bh);
231 } else {
232 jbd_unlock_bh_state(bh);
233 }
234 put_bh(bh);
235 cond_resched_lock(&journal->j_list_lock);
236 }
237 return ret; 207 return ret;
238 } 208}
239 209
240static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 210/*
211 * Submit all the data buffers of inode associated with the transaction to
212 * disk.
213 *
214 * We are in a committing transaction. Therefore no new inode can be added to
215 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
216 * operate on from being released while we write out pages.
217 */
218static int journal_submit_data_buffers(journal_t *journal,
219 transaction_t *commit_transaction)
241{ 220{
242 int i; 221 struct jbd2_inode *jinode;
222 int err, ret = 0;
223 struct address_space *mapping;
243 224
244 for (i = 0; i < bufs; i++) { 225 spin_lock(&journal->j_list_lock);
245 wbuf[i]->b_end_io = end_buffer_write_sync; 226 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
246 /* We use-up our safety reference in submit_bh() */ 227 mapping = jinode->i_vfs_inode->i_mapping;
247 submit_bh(WRITE, wbuf[i]); 228 jinode->i_flags |= JI_COMMIT_RUNNING;
229 spin_unlock(&journal->j_list_lock);
230 /*
231 * submit the inode data buffers. We use writepage
232 * instead of writepages, because writepages can do
233 * block allocation with delalloc. We need to write
234 * only allocated blocks here.
235 */
236 err = journal_submit_inode_data_buffers(mapping);
237 if (!ret)
238 ret = err;
239 spin_lock(&journal->j_list_lock);
240 J_ASSERT(jinode->i_transaction == commit_transaction);
241 jinode->i_flags &= ~JI_COMMIT_RUNNING;
242 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
248 } 243 }
244 spin_unlock(&journal->j_list_lock);
245 return ret;
249} 246}
250 247
251/* 248/*
252 * Submit all the data buffers to disk 249 * Wait for data submitted for writeout, refile inodes to proper
250 * transaction if needed.
251 *
253 */ 252 */
254static void journal_submit_data_buffers(journal_t *journal, 253static int journal_finish_inode_data_buffers(journal_t *journal,
255 transaction_t *commit_transaction) 254 transaction_t *commit_transaction)
256{ 255{
257 struct journal_head *jh; 256 struct jbd2_inode *jinode, *next_i;
258 struct buffer_head *bh; 257 int err, ret = 0;
259 int locked;
260 int bufs = 0;
261 struct buffer_head **wbuf = journal->j_wbuf;
262 258
263 /* 259 /* For locking, see the comment in journal_submit_data_buffers() */
264 * Whenever we unlock the journal and sleep, things can get added
265 * onto ->t_sync_datalist, so we have to keep looping back to
266 * write_out_data until we *know* that the list is empty.
267 *
268 * Cleanup any flushed data buffers from the data list. Even in
269 * abort mode, we want to flush this out as soon as possible.
270 */
271write_out_data:
272 cond_resched();
273 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
262 jinode->i_flags |= JI_COMMIT_RUNNING;
263 spin_unlock(&journal->j_list_lock);
264 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
265 if (!ret)
266 ret = err;
267 spin_lock(&journal->j_list_lock);
268 jinode->i_flags &= ~JI_COMMIT_RUNNING;
269 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
270 }
274 271
275 while (commit_transaction->t_sync_datalist) { 272 /* Now refile inode to proper lists */
276 jh = commit_transaction->t_sync_datalist; 273 list_for_each_entry_safe(jinode, next_i,
277 bh = jh2bh(jh); 274 &commit_transaction->t_inode_list, i_list) {
278 locked = 0; 275 list_del(&jinode->i_list);
279 276 if (jinode->i_next_transaction) {
280 /* Get reference just to make sure buffer does not disappear 277 jinode->i_transaction = jinode->i_next_transaction;
281 * when we are forced to drop various locks */ 278 jinode->i_next_transaction = NULL;
282 get_bh(bh); 279 list_add(&jinode->i_list,
283 /* If the buffer is dirty, we need to submit IO and hence 280 &jinode->i_transaction->t_inode_list);
284 * we need the buffer lock. We try to lock the buffer without
285 * blocking. If we fail, we need to drop j_list_lock and do
286 * blocking lock_buffer().
287 */
288 if (buffer_dirty(bh)) {
289 if (test_set_buffer_locked(bh)) {
290 BUFFER_TRACE(bh, "needs blocking lock");
291 spin_unlock(&journal->j_list_lock);
292 /* Write out all data to prevent deadlocks */
293 journal_do_submit_data(wbuf, bufs);
294 bufs = 0;
295 lock_buffer(bh);
296 spin_lock(&journal->j_list_lock);
297 }
298 locked = 1;
299 }
300 /* We have to get bh_state lock. Again out of order, sigh. */
301 if (!inverted_lock(journal, bh)) {
302 jbd_lock_bh_state(bh);
303 spin_lock(&journal->j_list_lock);
304 }
305 /* Someone already cleaned up the buffer? */
306 if (!buffer_jbd(bh)
307 || jh->b_transaction != commit_transaction
308 || jh->b_jlist != BJ_SyncData) {
309 jbd_unlock_bh_state(bh);
310 if (locked)
311 unlock_buffer(bh);
312 BUFFER_TRACE(bh, "already cleaned up");
313 put_bh(bh);
314 continue;
315 }
316 if (locked && test_clear_buffer_dirty(bh)) {
317 BUFFER_TRACE(bh, "needs writeout, adding to array");
318 wbuf[bufs++] = bh;
319 __jbd2_journal_file_buffer(jh, commit_transaction,
320 BJ_Locked);
321 jbd_unlock_bh_state(bh);
322 if (bufs == journal->j_wbufsize) {
323 spin_unlock(&journal->j_list_lock);
324 journal_do_submit_data(wbuf, bufs);
325 bufs = 0;
326 goto write_out_data;
327 }
328 } else if (!locked && buffer_locked(bh)) {
329 __jbd2_journal_file_buffer(jh, commit_transaction,
330 BJ_Locked);
331 jbd_unlock_bh_state(bh);
332 put_bh(bh);
333 } else { 281 } else {
334 BUFFER_TRACE(bh, "writeout complete: unfile"); 282 jinode->i_transaction = NULL;
335 __jbd2_journal_unfile_buffer(jh);
336 jbd_unlock_bh_state(bh);
337 if (locked)
338 unlock_buffer(bh);
339 jbd2_journal_remove_journal_head(bh);
340 /* Once for our safety reference, once for
341 * jbd2_journal_remove_journal_head() */
342 put_bh(bh);
343 put_bh(bh);
344 }
345
346 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
347 spin_unlock(&journal->j_list_lock);
348 goto write_out_data;
349 } 283 }
350 } 284 }
351 spin_unlock(&journal->j_list_lock); 285 spin_unlock(&journal->j_list_lock);
352 journal_do_submit_data(wbuf, bufs); 286
287 return ret;
353} 288}
354 289
355static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 290static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
524 * Now start flushing things to disk, in the order they appear 459 * Now start flushing things to disk, in the order they appear
525 * on the transaction lists. Data blocks go first. 460 * on the transaction lists. Data blocks go first.
526 */ 461 */
527 err = 0; 462 err = journal_submit_data_buffers(journal, commit_transaction);
528 journal_submit_data_buffers(journal, commit_transaction);
529
530 /*
531 * Wait for all previously submitted IO to complete if commit
532 * record is to be written synchronously.
533 */
534 spin_lock(&journal->j_list_lock);
535 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
536 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
537 err = journal_wait_on_locked_list(journal,
538 commit_transaction);
539
540 spin_unlock(&journal->j_list_lock);
541
542 if (err) 463 if (err)
543 jbd2_journal_abort(journal, err); 464 jbd2_journal_abort(journal, err);
544 465
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
547 jbd_debug(3, "JBD: commit phase 2\n"); 468 jbd_debug(3, "JBD: commit phase 2\n");
548 469
549 /* 470 /*
550 * If we found any dirty or locked buffers, then we should have
551 * looped back up to the write_out_data label. If there weren't
552 * any then journal_clean_data_list should have wiped the list
553 * clean by now, so check that it is in fact empty.
554 */
555 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
556
557 jbd_debug (3, "JBD: commit phase 3\n");
558
559 /*
560 * Way to go: we have now written out all of the data for a 471 * Way to go: we have now written out all of the data for a
561 * transaction! Now comes the tricky part: we need to write out 472 * transaction! Now comes the tricky part: we need to write out
562 * metadata. Loop over the transaction's entire buffer list: 473 * metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
574 J_ASSERT(commit_transaction->t_nr_buffers <= 485 J_ASSERT(commit_transaction->t_nr_buffers <=
575 commit_transaction->t_outstanding_credits); 486 commit_transaction->t_outstanding_credits);
576 487
488 err = 0;
577 descriptor = NULL; 489 descriptor = NULL;
578 bufs = 0; 490 bufs = 0;
579 while (commit_transaction->t_buffers) { 491 while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
748 &cbh, crc32_sum); 660 &cbh, crc32_sum);
749 if (err) 661 if (err)
750 __jbd2_journal_abort_hard(journal); 662 __jbd2_journal_abort_hard(journal);
751
752 spin_lock(&journal->j_list_lock);
753 err = journal_wait_on_locked_list(journal,
754 commit_transaction);
755 spin_unlock(&journal->j_list_lock);
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758 } 663 }
759 664
665 /*
666 * This is the right place to wait for data buffers both for ASYNC
667 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
668 * the commit block went to disk (which happens above). If commit is
669 * SYNC, we need to wait for data buffers before we start writing
670 * commit block, which happens below in such setting.
671 */
672 err = journal_finish_inode_data_buffers(journal, commit_transaction);
673 if (err)
674 jbd2_journal_abort(journal, err);
675
760 /* Lo and behold: we have just managed to send a transaction to 676 /* Lo and behold: we have just managed to send a transaction to
761 the log. Before we can commit it, wait for the IO so far to 677 the log. Before we can commit it, wait for the IO so far to
762 complete. Control buffers being written are on the 678 complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
768 so we incur less scheduling load. 684 so we incur less scheduling load.
769 */ 685 */
770 686
771 jbd_debug(3, "JBD: commit phase 4\n"); 687 jbd_debug(3, "JBD: commit phase 3\n");
772 688
773 /* 689 /*
774 * akpm: these are BJ_IO, and j_list_lock is not needed. 690 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
827 743
828 J_ASSERT (commit_transaction->t_shadow_list == NULL); 744 J_ASSERT (commit_transaction->t_shadow_list == NULL);
829 745
830 jbd_debug(3, "JBD: commit phase 5\n"); 746 jbd_debug(3, "JBD: commit phase 4\n");
831 747
832 /* Here we wait for the revoke record and descriptor record buffers */ 748 /* Here we wait for the revoke record and descriptor record buffers */
833 wait_for_ctlbuf: 749 wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
854 /* AKPM: bforget here */ 770 /* AKPM: bforget here */
855 } 771 }
856 772
857 jbd_debug(3, "JBD: commit phase 6\n"); 773 jbd_debug(3, "JBD: commit phase 5\n");
858 774
859 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 775 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
860 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
874 transaction can be removed from any checkpoint list it was on 790 transaction can be removed from any checkpoint list it was on
875 before. */ 791 before. */
876 792
877 jbd_debug(3, "JBD: commit phase 7\n"); 793 jbd_debug(3, "JBD: commit phase 6\n");
878 794
879 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 795 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
880 J_ASSERT(commit_transaction->t_buffers == NULL); 796 J_ASSERT(commit_transaction->t_buffers == NULL);
881 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 797 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
882 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 798 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
997 913
998 /* Done with this transaction! */ 914 /* Done with this transaction! */
999 915
1000 jbd_debug(3, "JBD: commit phase 8\n"); 916 jbd_debug(3, "JBD: commit phase 7\n");
1001 917
1002 J_ASSERT(commit_transaction->t_state == T_COMMIT); 918 J_ASSERT(commit_transaction->t_state == T_COMMIT);
1003 919
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
50EXPORT_SYMBOL(jbd2_journal_get_write_access); 50EXPORT_SYMBOL(jbd2_journal_get_write_access);
51EXPORT_SYMBOL(jbd2_journal_get_create_access); 51EXPORT_SYMBOL(jbd2_journal_get_create_access);
52EXPORT_SYMBOL(jbd2_journal_get_undo_access); 52EXPORT_SYMBOL(jbd2_journal_get_undo_access);
53EXPORT_SYMBOL(jbd2_journal_dirty_data);
54EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 53EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
55EXPORT_SYMBOL(jbd2_journal_release_buffer); 54EXPORT_SYMBOL(jbd2_journal_release_buffer);
56EXPORT_SYMBOL(jbd2_journal_forget); 55EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
82EXPORT_SYMBOL(jbd2_journal_invalidatepage); 81EXPORT_SYMBOL(jbd2_journal_invalidatepage);
83EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 82EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
84EXPORT_SYMBOL(jbd2_journal_force_commit); 83EXPORT_SYMBOL(jbd2_journal_force_commit);
84EXPORT_SYMBOL(jbd2_journal_file_inode);
85EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
86EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
87EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
85 88
86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 89static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
87static void __journal_abort_soft (journal_t *journal, int errno); 90static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
2195} 2198}
2196 2199
2197/* 2200/*
2201 * Initialize jbd inode head
2202 */
2203void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2204{
2205 jinode->i_transaction = NULL;
2206 jinode->i_next_transaction = NULL;
2207 jinode->i_vfs_inode = inode;
2208 jinode->i_flags = 0;
2209 INIT_LIST_HEAD(&jinode->i_list);
2210}
2211
2212/*
2213 * Function to be called before we start removing inode from memory (i.e.,
2214 * clear_inode() is a fine place to be called from). It removes inode from
2215 * transaction's lists.
2216 */
2217void jbd2_journal_release_jbd_inode(journal_t *journal,
2218 struct jbd2_inode *jinode)
2219{
2220 int writeout = 0;
2221
2222 if (!journal)
2223 return;
2224restart:
2225 spin_lock(&journal->j_list_lock);
2226 /* Is commit writing out inode - we have to wait */
2227 if (jinode->i_flags & JI_COMMIT_RUNNING) {
2228 wait_queue_head_t *wq;
2229 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2230 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
2231 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2232 spin_unlock(&journal->j_list_lock);
2233 schedule();
2234 finish_wait(wq, &wait.wait);
2235 goto restart;
2236 }
2237
2238 /* Do we need to wait for data writeback? */
2239 if (journal->j_committing_transaction == jinode->i_transaction)
2240 writeout = 1;
2241 if (jinode->i_transaction) {
2242 list_del(&jinode->i_list);
2243 jinode->i_transaction = NULL;
2244 }
2245 spin_unlock(&journal->j_list_lock);
2246}
2247
2248/*
2198 * debugfs tunables 2249 * debugfs tunables
2199 */ 2250 */
2200#ifdef CONFIG_JBD2_DEBUG 2251#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
41 * new transaction and we can't block without protecting against other 41 * new transaction and we can't block without protecting against other
42 * processes trying to touch the journal while it is in transition. 42 * processes trying to touch the journal while it is in transition.
43 * 43 *
44 * Called under j_state_lock
45 */ 44 */
46 45
47static transaction_t * 46static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
52 transaction->t_tid = journal->j_transaction_sequence++; 51 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 52 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 53 spin_lock_init(&transaction->t_handle_lock);
54 INIT_LIST_HEAD(&transaction->t_inode_list);
55 55
56 /* Set up the commit timer for the new transaction. */ 56 /* Set up the commit timer for the new transaction. */
57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
943} 943}
944 944
945/** 945/**
946 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
947 * needs to be flushed before we can commit the
948 * current transaction.
949 * @handle: transaction
950 * @bh: bufferhead to mark
951 *
952 * The buffer is placed on the transaction's data list and is marked as
953 * belonging to the transaction.
954 *
955 * Returns error number or 0 on success.
956 *
957 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
958 * by kswapd.
959 */
960int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
961{
962 journal_t *journal = handle->h_transaction->t_journal;
963 int need_brelse = 0;
964 struct journal_head *jh;
965
966 if (is_handle_aborted(handle))
967 return 0;
968
969 jh = jbd2_journal_add_journal_head(bh);
970 JBUFFER_TRACE(jh, "entry");
971
972 /*
973 * The buffer could *already* be dirty. Writeout can start
974 * at any time.
975 */
976 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
977
978 /*
979 * What if the buffer is already part of a running transaction?
980 *
981 * There are two cases:
982 * 1) It is part of the current running transaction. Refile it,
983 * just in case we have allocated it as metadata, deallocated
984 * it, then reallocated it as data.
985 * 2) It is part of the previous, still-committing transaction.
986 * If all we want to do is to guarantee that the buffer will be
987 * written to disk before this new transaction commits, then
988 * being sure that the *previous* transaction has this same
989 * property is sufficient for us! Just leave it on its old
990 * transaction.
991 *
992 * In case (2), the buffer must not already exist as metadata
993 * --- that would violate write ordering (a transaction is free
994 * to write its data at any point, even before the previous
995 * committing transaction has committed). The caller must
996 * never, ever allow this to happen: there's nothing we can do
997 * about it in this layer.
998 */
999 jbd_lock_bh_state(bh);
1000 spin_lock(&journal->j_list_lock);
1001
1002 /* Now that we have bh_state locked, are we really still mapped? */
1003 if (!buffer_mapped(bh)) {
1004 JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
1005 goto no_journal;
1006 }
1007
1008 if (jh->b_transaction) {
1009 JBUFFER_TRACE(jh, "has transaction");
1010 if (jh->b_transaction != handle->h_transaction) {
1011 JBUFFER_TRACE(jh, "belongs to older transaction");
1012 J_ASSERT_JH(jh, jh->b_transaction ==
1013 journal->j_committing_transaction);
1014
1015 /* @@@ IS THIS TRUE ? */
1016 /*
1017 * Not any more. Scenario: someone does a write()
1018 * in data=journal mode. The buffer's transaction has
1019 * moved into commit. Then someone does another
1020 * write() to the file. We do the frozen data copyout
1021 * and set b_next_transaction to point to j_running_t.
1022 * And while we're in that state, someone does a
1023 * writepage() in an attempt to pageout the same area
1024 * of the file via a shared mapping. At present that
1025 * calls jbd2_journal_dirty_data(), and we get right here.
1026 * It may be too late to journal the data. Simply
1027 * falling through to the next test will suffice: the
1028 * data will be dirty and wil be checkpointed. The
1029 * ordering comments in the next comment block still
1030 * apply.
1031 */
1032 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1033
1034 /*
1035 * If we're journalling data, and this buffer was
1036 * subject to a write(), it could be metadata, forget
1037 * or shadow against the committing transaction. Now,
1038 * someone has dirtied the same darn page via a mapping
1039 * and it is being writepage()'d.
1040 * We *could* just steal the page from commit, with some
1041 * fancy locking there. Instead, we just skip it -
1042 * don't tie the page's buffers to the new transaction
1043 * at all.
1044 * Implication: if we crash before the writepage() data
1045 * is written into the filesystem, recovery will replay
1046 * the write() data.
1047 */
1048 if (jh->b_jlist != BJ_None &&
1049 jh->b_jlist != BJ_SyncData &&
1050 jh->b_jlist != BJ_Locked) {
1051 JBUFFER_TRACE(jh, "Not stealing");
1052 goto no_journal;
1053 }
1054
1055 /*
1056 * This buffer may be undergoing writeout in commit. We
1057 * can't return from here and let the caller dirty it
1058 * again because that can cause the write-out loop in
1059 * commit to never terminate.
1060 */
1061 if (buffer_dirty(bh)) {
1062 get_bh(bh);
1063 spin_unlock(&journal->j_list_lock);
1064 jbd_unlock_bh_state(bh);
1065 need_brelse = 1;
1066 sync_dirty_buffer(bh);
1067 jbd_lock_bh_state(bh);
1068 spin_lock(&journal->j_list_lock);
1069 /* Since we dropped the lock... */
1070 if (!buffer_mapped(bh)) {
1071 JBUFFER_TRACE(jh, "buffer got unmapped");
1072 goto no_journal;
1073 }
1074 /* The buffer may become locked again at any
1075 time if it is redirtied */
1076 }
1077
1078 /* journal_clean_data_list() may have got there first */
1079 if (jh->b_transaction != NULL) {
1080 JBUFFER_TRACE(jh, "unfile from commit");
1081 __jbd2_journal_temp_unlink_buffer(jh);
1082 /* It still points to the committing
1083 * transaction; move it to this one so
1084 * that the refile assert checks are
1085 * happy. */
1086 jh->b_transaction = handle->h_transaction;
1087 }
1088 /* The buffer will be refiled below */
1089
1090 }
1091 /*
1092 * Special case --- the buffer might actually have been
1093 * allocated and then immediately deallocated in the previous,
1094 * committing transaction, so might still be left on that
1095 * transaction's metadata lists.
1096 */
1097 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1098 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1099 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1100 __jbd2_journal_temp_unlink_buffer(jh);
1101 jh->b_transaction = handle->h_transaction;
1102 JBUFFER_TRACE(jh, "file as data");
1103 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1104 BJ_SyncData);
1105 }
1106 } else {
1107 JBUFFER_TRACE(jh, "not on a transaction");
1108 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1109 }
1110no_journal:
1111 spin_unlock(&journal->j_list_lock);
1112 jbd_unlock_bh_state(bh);
1113 if (need_brelse) {
1114 BUFFER_TRACE(bh, "brelse");
1115 __brelse(bh);
1116 }
1117 JBUFFER_TRACE(jh, "exit");
1118 jbd2_journal_put_journal_head(jh);
1119 return 0;
1120}
1121
1122/**
1123 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 946 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1124 * @handle: transaction to add buffer to. 947 * @handle: transaction to add buffer to.
1125 * @bh: buffer to mark 948 * @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1541 * Remove a buffer from the appropriate transaction list. 1364 * Remove a buffer from the appropriate transaction list.
1542 * 1365 *
1543 * Note that this function can *change* the value of 1366 * Note that this function can *change* the value of
1544 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, 1367 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
1545 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller 1368 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
1546 * is holding onto a copy of one of thee pointers, it could go bad. 1369 * of these pointers, it could go bad. Generally the caller needs to re-read
1547 * Generally the caller needs to re-read the pointer from the transaction_t. 1370 * the pointer from the transaction_t.
1548 * 1371 *
1549 * Called under j_list_lock. The journal may not be locked. 1372 * Called under j_list_lock. The journal may not be locked.
1550 */ 1373 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1566 switch (jh->b_jlist) { 1389 switch (jh->b_jlist) {
1567 case BJ_None: 1390 case BJ_None:
1568 return; 1391 return;
1569 case BJ_SyncData:
1570 list = &transaction->t_sync_datalist;
1571 break;
1572 case BJ_Metadata: 1392 case BJ_Metadata:
1573 transaction->t_nr_buffers--; 1393 transaction->t_nr_buffers--;
1574 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); 1394 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1589 case BJ_Reserved: 1409 case BJ_Reserved:
1590 list = &transaction->t_reserved_list; 1410 list = &transaction->t_reserved_list;
1591 break; 1411 break;
1592 case BJ_Locked:
1593 list = &transaction->t_locked_list;
1594 break;
1595 } 1412 }
1596 1413
1597 __blist_del_buffer(list, jh); 1414 __blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1634 goto out; 1451 goto out;
1635 1452
1636 spin_lock(&journal->j_list_lock); 1453 spin_lock(&journal->j_list_lock);
1637 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { 1454 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1638 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1639 /* A written-back ordered data buffer */
1640 JBUFFER_TRACE(jh, "release data");
1641 __jbd2_journal_unfile_buffer(jh);
1642 jbd2_journal_remove_journal_head(bh);
1643 __brelse(bh);
1644 }
1645 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1646 /* written-back checkpointed metadata buffer */ 1455 /* written-back checkpointed metadata buffer */
1647 if (jh->b_jlist == BJ_None) { 1456 if (jh->b_jlist == BJ_None) {
1648 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1457 JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
1656 return; 1465 return;
1657} 1466}
1658 1467
1468/*
1469 * jbd2_journal_try_to_free_buffers() could race with
1470 * jbd2_journal_commit_transaction(). The latter might still hold the
1471 * reference count to the buffers when inspecting them on
1472 * t_syncdata_list or t_locked_list.
1473 *
1474 * jbd2_journal_try_to_free_buffers() will call this function to
1475 * wait for the current transaction to finish syncing data buffers, before
1476 * trying to free that buffer.
1477 *
1478 * Takes journal->j_state_lock itself, so it must be called without it held.
1479 */
1480static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
1481{
1482 transaction_t *transaction;
1483 tid_t tid;
1484
1485 spin_lock(&journal->j_state_lock);
1486 transaction = journal->j_committing_transaction;
1487
1488 if (!transaction) {
1489 spin_unlock(&journal->j_state_lock);
1490 return;
1491 }
1492
1493 tid = transaction->t_tid;
1494 spin_unlock(&journal->j_state_lock);
1495 jbd2_log_wait_commit(journal, tid);
1496}
1659 1497
1660/** 1498/**
1661 * int jbd2_journal_try_to_free_buffers() - try to free page buffers. 1499 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1662 * @journal: journal for operation 1500 * @journal: journal for operation
1663 * @page: to try and free 1501 * @page: to try and free
1664 * @unused_gfp_mask: unused 1502 * @gfp_mask: we use the mask to detect how hard we should try to release
1503 * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
1504 * release the buffers.
1665 * 1505 *
1666 * 1506 *
1667 * For all the buffers on this page, 1507 * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
1690 * journal_try_to_free_buffer() is changing its state. But that 1530 * journal_try_to_free_buffer() is changing its state. But that
1691 * cannot happen because we never reallocate freed data as metadata 1531 * cannot happen because we never reallocate freed data as metadata
1692 * while the data is part of a transaction. Yes? 1532 * while the data is part of a transaction. Yes?
1533 *
1534 * Return 0 on failure, 1 on success
1693 */ 1535 */
1694int jbd2_journal_try_to_free_buffers(journal_t *journal, 1536int jbd2_journal_try_to_free_buffers(journal_t *journal,
1695 struct page *page, gfp_t unused_gfp_mask) 1537 struct page *page, gfp_t gfp_mask)
1696{ 1538{
1697 struct buffer_head *head; 1539 struct buffer_head *head;
1698 struct buffer_head *bh; 1540 struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
1708 /* 1550 /*
1709 * We take our own ref against the journal_head here to avoid 1551 * We take our own ref against the journal_head here to avoid
1710 * having to add tons of locking around each instance of 1552 * having to add tons of locking around each instance of
1711 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). 1553 * jbd2_journal_remove_journal_head() and
1554 * jbd2_journal_put_journal_head().
1712 */ 1555 */
1713 jh = jbd2_journal_grab_journal_head(bh); 1556 jh = jbd2_journal_grab_journal_head(bh);
1714 if (!jh) 1557 if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
1721 if (buffer_jbd(bh)) 1564 if (buffer_jbd(bh))
1722 goto busy; 1565 goto busy;
1723 } while ((bh = bh->b_this_page) != head); 1566 } while ((bh = bh->b_this_page) != head);
1567
1724 ret = try_to_free_buffers(page); 1568 ret = try_to_free_buffers(page);
1569
1570 /*
1571 * There are a number of places where jbd2_journal_try_to_free_buffers()
1572 * could race with jbd2_journal_commit_transaction(): the latter still
1573 * holds a reference to the buffers to free while processing them, so
1574 * try_to_free_buffers() fails to free those buffers. Some of the
1575 * callers of releasepage() require page buffers to be dropped, and otherwise
1576 * treat the failure to free as an error (such as generic_file_direct_IO()).
1577 *
1578 * So, if the caller of try_to_release_page() wants the synchronous
1579 * behaviour (i.e. make sure buffers are dropped upon return),
1580 * let's wait for the current transaction to finish flushing
1581 * dirty data buffers, then try to free those buffers again,
1582 * with the journal locked.
1583 */
1584 if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
1585 jbd2_journal_wait_for_transaction_sync_data(journal);
1586 ret = try_to_free_buffers(page);
1587 }
1588
1725busy: 1589busy:
1726 return ret; 1590 return ret;
1727} 1591}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1823 if (!buffer_jbd(bh)) 1687 if (!buffer_jbd(bh))
1824 goto zap_buffer_unlocked; 1688 goto zap_buffer_unlocked;
1825 1689
1690 /* OK, we have data buffer in journaled mode */
1826 spin_lock(&journal->j_state_lock); 1691 spin_lock(&journal->j_state_lock);
1827 jbd_lock_bh_state(bh); 1692 jbd_lock_bh_state(bh);
1828 spin_lock(&journal->j_list_lock); 1693 spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1886 } 1751 }
1887 } else if (transaction == journal->j_committing_transaction) { 1752 } else if (transaction == journal->j_committing_transaction) {
1888 JBUFFER_TRACE(jh, "on committing transaction"); 1753 JBUFFER_TRACE(jh, "on committing transaction");
1889 if (jh->b_jlist == BJ_Locked) {
1890 /*
1891 * The buffer is on the committing transaction's locked
1892 * list. We have the buffer locked, so I/O has
1893 * completed. So we can nail the buffer now.
1894 */
1895 may_free = __dispose_buffer(jh, transaction);
1896 goto zap_buffer;
1897 }
1898 /* 1754 /*
1899 * If it is committing, we simply cannot touch it. We 1755 * If it is committing, we simply cannot touch it. We
1900 * can remove its next_transaction pointer from the 1756 * can remove its next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2027 J_ASSERT_JH(jh, !jh->b_committed_data); 1883 J_ASSERT_JH(jh, !jh->b_committed_data);
2028 J_ASSERT_JH(jh, !jh->b_frozen_data); 1884 J_ASSERT_JH(jh, !jh->b_frozen_data);
2029 return; 1885 return;
2030 case BJ_SyncData:
2031 list = &transaction->t_sync_datalist;
2032 break;
2033 case BJ_Metadata: 1886 case BJ_Metadata:
2034 transaction->t_nr_buffers++; 1887 transaction->t_nr_buffers++;
2035 list = &transaction->t_buffers; 1888 list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2049 case BJ_Reserved: 1902 case BJ_Reserved:
2050 list = &transaction->t_reserved_list; 1903 list = &transaction->t_reserved_list;
2051 break; 1904 break;
2052 case BJ_Locked:
2053 list = &transaction->t_locked_list;
2054 break;
2055 } 1905 }
2056 1906
2057 __blist_add_buffer(list, jh); 1907 __blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2141 spin_unlock(&journal->j_list_lock); 1991 spin_unlock(&journal->j_list_lock);
2142 __brelse(bh); 1992 __brelse(bh);
2143} 1993}
1994
1995/*
1996 * File inode in the inode list of the handle's transaction
1997 */
1998int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
1999{
2000 transaction_t *transaction = handle->h_transaction;
2001 journal_t *journal = transaction->t_journal;
2002
2003 if (is_handle_aborted(handle))
2004 return -EIO;
2005
2006 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2007 transaction->t_tid);
2008
2009 /*
2010 * First check whether inode isn't already on the transaction's
2011 * lists without taking the lock. Note that this check is safe
2012 * without the lock as we cannot race with somebody removing inode
2013 * from the transaction. The reason is that we remove inode from the
2014 * transaction only in journal_release_jbd_inode() and when we commit
2015 * the transaction. We are guarded from the first case by holding
2016 * a reference to the inode. We are safe against the second case
2017 * because if jinode->i_transaction == transaction, commit code
2018 * cannot touch the transaction because we hold reference to it,
2019 * and if jinode->i_next_transaction == transaction, commit code
2020 * will only file the inode where we want it.
2021 */
2022 if (jinode->i_transaction == transaction ||
2023 jinode->i_next_transaction == transaction)
2024 return 0;
2025
2026 spin_lock(&journal->j_list_lock);
2027
2028 if (jinode->i_transaction == transaction ||
2029 jinode->i_next_transaction == transaction)
2030 goto done;
2031
2032 /* On some different transaction's list - should be
2033 * the committing one */
2034 if (jinode->i_transaction) {
2035 J_ASSERT(jinode->i_next_transaction == NULL);
2036 J_ASSERT(jinode->i_transaction ==
2037 journal->j_committing_transaction);
2038 jinode->i_next_transaction = transaction;
2039 goto done;
2040 }
2041 /* Not on any transaction list... */
2042 J_ASSERT(!jinode->i_next_transaction);
2043 jinode->i_transaction = transaction;
2044 list_add(&jinode->i_list, &transaction->t_inode_list);
2045done:
2046 spin_unlock(&journal->j_list_lock);
2047
2048 return 0;
2049}
2050
2051/*
2052 * This function must be called when inode is journaled in ordered mode
2053 * before truncation happens. It starts writeout of the truncated part in
2054 * case it is in the committing transaction so that we uphold ordered
2055 * mode consistency guarantees.
2056 */
2057int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
2058 loff_t new_size)
2059{
2060 journal_t *journal;
2061 transaction_t *commit_trans;
2062 int ret = 0;
2063
2064 if (!inode->i_transaction && !inode->i_next_transaction)
2065 goto out;
2066 journal = inode->i_transaction->t_journal;
2067 spin_lock(&journal->j_state_lock);
2068 commit_trans = journal->j_committing_transaction;
2069 spin_unlock(&journal->j_state_lock);
2070 if (inode->i_transaction == commit_trans) {
2071 ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
2072 new_size, LLONG_MAX);
2073 if (ret)
2074 jbd2_journal_abort(journal, ret);
2075 }
2076out:
2077 return ret;
2078}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86ee..6a73de84bcef 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/proc_fs.h> 23#include <linux/proc_fs.h>
24#include <linux/seq_file.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_filsys.h" 27#include "jfs_filsys.h"
@@ -30,29 +31,19 @@
30 31
31static struct proc_dir_entry *base; 32static struct proc_dir_entry *base;
32#ifdef CONFIG_JFS_DEBUG 33#ifdef CONFIG_JFS_DEBUG
33static int loglevel_read(char *page, char **start, off_t off, 34static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
34 int count, int *eof, void *data)
35{ 35{
36 int len; 36 seq_printf(m, "%d\n", jfsloglevel);
37 37 return 0;
38 len = sprintf(page, "%d\n", jfsloglevel); 38}
39
40 len -= off;
41 *start = page + off;
42
43 if (len > count)
44 len = count;
45 else
46 *eof = 1;
47
48 if (len < 0)
49 len = 0;
50 39
51 return len; 40static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
41{
42 return single_open(file, jfs_loglevel_proc_show, NULL);
52} 43}
53 44
54static int loglevel_write(struct file *file, const char __user *buffer, 45static ssize_t jfs_loglevel_proc_write(struct file *file,
55 unsigned long count, void *data) 46 const char __user *buffer, size_t count, loff_t *ppos)
56{ 47{
57 char c; 48 char c;
58 49
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
65 jfsloglevel = c - '0'; 56 jfsloglevel = c - '0';
66 return count; 57 return count;
67} 58}
59
60static const struct file_operations jfs_loglevel_proc_fops = {
61 .owner = THIS_MODULE,
62 .open = jfs_loglevel_proc_open,
63 .read = seq_read,
64 .llseek = seq_lseek,
65 .release = single_release,
66 .write = jfs_loglevel_proc_write,
67};
68#endif 68#endif
69 69
70static struct { 70static struct {
71 const char *name; 71 const char *name;
72 read_proc_t *read_fn; 72 const struct file_operations *proc_fops;
73 write_proc_t *write_fn;
74} Entries[] = { 73} Entries[] = {
75#ifdef CONFIG_JFS_STATISTICS 74#ifdef CONFIG_JFS_STATISTICS
76 { "lmstats", jfs_lmstats_read, }, 75 { "lmstats", &jfs_lmstats_proc_fops, },
77 { "txstats", jfs_txstats_read, }, 76 { "txstats", &jfs_txstats_proc_fops, },
78 { "xtstat", jfs_xtstat_read, }, 77 { "xtstat", &jfs_xtstat_proc_fops, },
79 { "mpstat", jfs_mpstat_read, }, 78 { "mpstat", &jfs_mpstat_proc_fops, },
80#endif 79#endif
81#ifdef CONFIG_JFS_DEBUG 80#ifdef CONFIG_JFS_DEBUG
82 { "TxAnchor", jfs_txanchor_read, }, 81 { "TxAnchor", &jfs_txanchor_proc_fops, },
83 { "loglevel", loglevel_read, loglevel_write } 82 { "loglevel", &jfs_loglevel_proc_fops }
84#endif 83#endif
85}; 84};
86#define NPROCENT ARRAY_SIZE(Entries) 85#define NPROCENT ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
93 return; 92 return;
94 base->owner = THIS_MODULE; 93 base->owner = THIS_MODULE;
95 94
96 for (i = 0; i < NPROCENT; i++) { 95 for (i = 0; i < NPROCENT; i++)
97 struct proc_dir_entry *p; 96 proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
98 if ((p = create_proc_entry(Entries[i].name, 0, base))) {
99 p->read_proc = Entries[i].read_fn;
100 p->write_proc = Entries[i].write_fn;
101 }
102 }
103} 97}
104 98
105void jfs_proc_clean(void) 99void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc0..eafd1300a00b 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
62 62
63extern int jfsloglevel; 63extern int jfsloglevel;
64 64
65extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); 65extern const struct file_operations jfs_txanchor_proc_fops;
66 66
67/* information message: e.g., configuration, major event */ 67/* information message: e.g., configuration, major event */
68#define jfs_info(fmt, arg...) do { \ 68#define jfs_info(fmt, arg...) do { \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
105 * ---------- 105 * ----------
106 */ 106 */
107#ifdef CONFIG_JFS_STATISTICS 107#ifdef CONFIG_JFS_STATISTICS
108extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *); 108extern const struct file_operations jfs_lmstats_proc_fops;
109extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *); 109extern const struct file_operations jfs_txstats_proc_fops;
110extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *); 110extern const struct file_operations jfs_mpstat_proc_fops;
111extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *); 111extern const struct file_operations jfs_xtstat_proc_fops;
112 112
113#define INCREMENT(x) ((x)++) 113#define INCREMENT(x) ((x)++)
114#define DECREMENT(x) ((x)--) 114#define DECREMENT(x) ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafeb..2545bb317235 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
243#define JFS_REMOVE 3 243#define JFS_REMOVE 3
244#define JFS_RENAME 4 244#define JFS_RENAME 4
245 245
246#define DIRENTSIZ(namlen) \
247 ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
248
249/* 246/*
250 * Maximum file offset for directories. 247 * Maximum file offset for directories.
251 */ 248 */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916beaf..d6363d8309d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1520 jfs_error(ip->i_sb, 1520 jfs_error(ip->i_sb,
1521 "diAlloc: can't find free bit " 1521 "diAlloc: can't find free bit "
1522 "in wmap"); 1522 "in wmap");
1523 return EIO; 1523 return -EIO;
1524 } 1524 }
1525 1525
1526 /* determine the inode number within the 1526 /* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95a..cd2ec2988b59 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h>
72#include "jfs_incore.h" 73#include "jfs_incore.h"
73#include "jfs_filsys.h" 74#include "jfs_filsys.h"
74#include "jfs_metapage.h" 75#include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
2503} 2504}
2504 2505
2505#ifdef CONFIG_JFS_STATISTICS 2506#ifdef CONFIG_JFS_STATISTICS
2506int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, 2507static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
2507 int *eof, void *data)
2508{ 2508{
2509 int len = 0; 2509 seq_printf(m,
2510 off_t begin;
2511
2512 len += sprintf(buffer,
2513 "JFS Logmgr stats\n" 2510 "JFS Logmgr stats\n"
2514 "================\n" 2511 "================\n"
2515 "commits = %d\n" 2512 "commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
2522 lmStat.pagedone, 2519 lmStat.pagedone,
2523 lmStat.full_page, 2520 lmStat.full_page,
2524 lmStat.partial_page); 2521 lmStat.partial_page);
2522 return 0;
2523}
2525 2524
2526 begin = offset; 2525static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
2527 *start = buffer + begin; 2526{
2528 len -= begin; 2527 return single_open(file, jfs_lmstats_proc_show, NULL);
2529
2530 if (len > length)
2531 len = length;
2532 else
2533 *eof = 1;
2534
2535 if (len < 0)
2536 len = 0;
2537
2538 return len;
2539} 2528}
2529
2530const struct file_operations jfs_lmstats_proc_fops = {
2531 .owner = THIS_MODULE,
2532 .open = jfs_lmstats_proc_open,
2533 .read = seq_read,
2534 .llseek = seq_lseek,
2535 .release = single_release,
2536};
2540#endif /* CONFIG_JFS_STATISTICS */ 2537#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fcd..854ff0ec574f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h>
22#include <linux/bio.h> 23#include <linux/bio.h>
23#include <linux/init.h> 24#include <linux/init.h>
24#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
25#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/seq_file.h>
26#include "jfs_incore.h" 28#include "jfs_incore.h"
27#include "jfs_superblock.h" 29#include "jfs_superblock.h"
28#include "jfs_filsys.h" 30#include "jfs_filsys.h"
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
804} 806}
805 807
806#ifdef CONFIG_JFS_STATISTICS 808#ifdef CONFIG_JFS_STATISTICS
807int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, 809static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
808 int *eof, void *data)
809{ 810{
810 int len = 0; 811 seq_printf(m,
811 off_t begin;
812
813 len += sprintf(buffer,
814 "JFS Metapage statistics\n" 812 "JFS Metapage statistics\n"
815 "=======================\n" 813 "=======================\n"
816 "page allocations = %d\n" 814 "page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
819 mpStat.pagealloc, 817 mpStat.pagealloc,
820 mpStat.pagefree, 818 mpStat.pagefree,
821 mpStat.lockwait); 819 mpStat.lockwait);
820 return 0;
821}
822 822
823 begin = offset; 823static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
824 *start = buffer + begin; 824{
825 len -= begin; 825 return single_open(file, jfs_mpstat_proc_show, NULL);
826
827 if (len > length)
828 len = length;
829 else
830 *eof = 1;
831
832 if (len < 0)
833 len = 0;
834
835 return len;
836} 826}
827
828const struct file_operations jfs_mpstat_proc_fops = {
829 .owner = THIS_MODULE,
830 .open = jfs_mpstat_proc_open,
831 .read = seq_read,
832 .llseek = seq_lseek,
833 .release = single_release,
834};
837#endif 835#endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b2..f26e4d03ada5 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/moduleparam.h> 50#include <linux/moduleparam.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/seq_file.h>
52#include "jfs_incore.h" 53#include "jfs_incore.h"
53#include "jfs_inode.h" 54#include "jfs_inode.h"
54#include "jfs_filsys.h" 55#include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
3009} 3010}
3010 3011
3011#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) 3012#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
3012int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, 3013static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
3013 int *eof, void *data)
3014{ 3014{
3015 int len = 0;
3016 off_t begin;
3017 char *freewait; 3015 char *freewait;
3018 char *freelockwait; 3016 char *freelockwait;
3019 char *lowlockwait; 3017 char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3025 lowlockwait = 3023 lowlockwait =
3026 waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; 3024 waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
3027 3025
3028 len += sprintf(buffer, 3026 seq_printf(m,
3029 "JFS TxAnchor\n" 3027 "JFS TxAnchor\n"
3030 "============\n" 3028 "============\n"
3031 "freetid = %d\n" 3029 "freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3044 TxAnchor.tlocksInUse, 3042 TxAnchor.tlocksInUse,
3045 jfs_tlocks_low, 3043 jfs_tlocks_low,
3046 list_empty(&TxAnchor.unlock_queue) ? "" : "not "); 3044 list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
3045 return 0;
3046}
3047 3047
3048 begin = offset; 3048static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
3049 *start = buffer + begin; 3049{
3050 len -= begin; 3050 return single_open(file, jfs_txanchor_proc_show, NULL);
3051
3052 if (len > length)
3053 len = length;
3054 else
3055 *eof = 1;
3056
3057 if (len < 0)
3058 len = 0;
3059
3060 return len;
3061} 3051}
3052
3053const struct file_operations jfs_txanchor_proc_fops = {
3054 .owner = THIS_MODULE,
3055 .open = jfs_txanchor_proc_open,
3056 .read = seq_read,
3057 .llseek = seq_lseek,
3058 .release = single_release,
3059};
3062#endif 3060#endif
3063 3061
3064#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) 3062#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
3065int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, 3063static int jfs_txstats_proc_show(struct seq_file *m, void *v)
3066 int *eof, void *data)
3067{ 3064{
3068 int len = 0; 3065 seq_printf(m,
3069 off_t begin;
3070
3071 len += sprintf(buffer,
3072 "JFS TxStats\n" 3066 "JFS TxStats\n"
3073 "===========\n" 3067 "===========\n"
3074 "calls to txBegin = %d\n" 3068 "calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
3089 TxStat.txBeginAnon_lockslow, 3083 TxStat.txBeginAnon_lockslow,
3090 TxStat.txLockAlloc, 3084 TxStat.txLockAlloc,
3091 TxStat.txLockAlloc_freelock); 3085 TxStat.txLockAlloc_freelock);
3086 return 0;
3087}
3092 3088
3093 begin = offset; 3089static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
3094 *start = buffer + begin; 3090{
3095 len -= begin; 3091 return single_open(file, jfs_txstats_proc_show, NULL);
3096
3097 if (len > length)
3098 len = length;
3099 else
3100 *eof = 1;
3101
3102 if (len < 0)
3103 len = 0;
3104
3105 return len;
3106} 3092}
3093
3094const struct file_operations jfs_txstats_proc_fops = {
3095 .owner = THIS_MODULE,
3096 .open = jfs_txstats_proc_open,
3097 .read = seq_read,
3098 .llseek = seq_lseek,
3099 .release = single_release,
3100};
3107#endif 3101#endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbcc..ae3acafb447b 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/module.h>
23#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/seq_file.h>
24#include "jfs_incore.h" 26#include "jfs_incore.h"
25#include "jfs_filsys.h" 27#include "jfs_filsys.h"
26#include "jfs_metapage.h" 28#include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4134} 4136}
4135 4137
4136#ifdef CONFIG_JFS_STATISTICS 4138#ifdef CONFIG_JFS_STATISTICS
4137int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, 4139static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
4138 int *eof, void *data)
4139{ 4140{
4140 int len = 0; 4141 seq_printf(m,
4141 off_t begin;
4142
4143 len += sprintf(buffer,
4144 "JFS Xtree statistics\n" 4142 "JFS Xtree statistics\n"
4145 "====================\n" 4143 "====================\n"
4146 "searches = %d\n" 4144 "searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
4149 xtStat.search, 4147 xtStat.search,
4150 xtStat.fastSearch, 4148 xtStat.fastSearch,
4151 xtStat.split); 4149 xtStat.split);
4150 return 0;
4151}
4152 4152
4153 begin = offset; 4153static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
4154 *start = buffer + begin; 4154{
4155 len -= begin; 4155 return single_open(file, jfs_xtstat_proc_show, NULL);
4156
4157 if (len > length)
4158 len = length;
4159 else
4160 *eof = 1;
4161
4162 if (len < 0)
4163 len = 0;
4164
4165 return len;
4166} 4156}
4157
4158const struct file_operations jfs_xtstat_proc_fops = {
4159 .owner = THIS_MODULE,
4160 .open = jfs_xtstat_proc_open,
4161 .read = seq_read,
4162 .llseek = seq_lseek,
4163 .release = single_release,
4164};
4167#endif 4165#endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa2..2aba82386810 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1455 free_UCSname(&key); 1455 free_UCSname(&key);
1456 if (rc == -ENOENT) { 1456 if (rc == -ENOENT) {
1457 d_add(dentry, NULL); 1457 d_add(dentry, NULL);
1458 return ERR_PTR(0); 1458 return NULL;
1459 } else if (rc) { 1459 } else if (rc) {
1460 jfs_err("jfs_lookup: dtSearch returned %d", rc); 1460 jfs_err("jfs_lookup: dtSearch returned %d", rc);
1461 return ERR_PTR(rc); 1461 return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea65451732..0288e6d7936a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
499 inode = jfs_iget(sb, ROOT_I); 499 inode = jfs_iget(sb, ROOT_I);
500 if (IS_ERR(inode)) { 500 if (IS_ERR(inode)) {
501 ret = PTR_ERR(inode); 501 ret = PTR_ERR(inode);
502 goto out_no_root; 502 goto out_no_rw;
503 } 503 }
504 sb->s_root = d_alloc_root(inode); 504 sb->s_root = d_alloc_root(inode);
505 if (!sb->s_root) 505 if (!sb->s_root)
@@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
521 return 0; 521 return 0;
522 522
523out_no_root: 523out_no_root:
524 jfs_err("jfs_read_super: get root inode failed"); 524 jfs_err("jfs_read_super: get root dentry failed");
525 if (inode) 525 iput(inode);
526 iput(inode);
527 526
528out_no_rw: 527out_no_rw:
529 rc = jfs_umount(sb); 528 rc = jfs_umount(sb);
diff --git a/fs/libfs.c b/fs/libfs.c
index 892d41cb3382..baeb71ee1cde 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count)
512 mntput(mnt); 512 mntput(mnt);
513} 513}
514 514
515/**
516 * simple_read_from_buffer - copy data from the buffer to user space
517 * @to: the user space buffer to read to
518 * @count: the maximum number of bytes to read
519 * @ppos: the current position in the buffer
520 * @from: the buffer to read from
521 * @available: the size of the buffer
522 *
523 * The simple_read_from_buffer() function reads up to @count bytes from the
524 * buffer @from at offset @ppos into the user space address starting at @to.
525 *
526 * On success, the number of bytes read is returned and the offset @ppos is
527 * advanced by this number, or negative value is returned on error.
528 **/
515ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, 529ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
516 const void *from, size_t available) 530 const void *from, size_t available)
517{ 531{
@@ -528,6 +542,20 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
528 return count; 542 return count;
529} 543}
530 544
545/**
546 * memory_read_from_buffer - copy data from the buffer
547 * @to: the kernel space buffer to read to
548 * @count: the maximum number of bytes to read
549 * @ppos: the current position in the buffer
550 * @from: the buffer to read from
551 * @available: the size of the buffer
552 *
553 * The memory_read_from_buffer() function reads up to @count bytes from the
554 * buffer @from at offset @ppos into the kernel space address starting at @to.
555 *
556 * On success, the number of bytes read is returned and the offset @ppos is
557 * advanced by this number, or negative value is returned on error.
558 **/
531ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 559ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
532 const void *from, size_t available) 560 const void *from, size_t available)
533{ 561{
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5df517b81f3f..1f6dc518505c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call)
224 224
225static void nlmclnt_rpc_release(void *data) 225static void nlmclnt_rpc_release(void *data)
226{ 226{
227 lock_kernel();
227 nlm_release_call(data); 228 nlm_release_call(data);
229 unlock_kernel();
228} 230}
229 231
230static int nlm_wait_on_grace(wait_queue_head_t *queue) 232static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
430 * Report the conflicting lock back to the application. 432 * Report the conflicting lock back to the application.
431 */ 433 */
432 fl->fl_start = req->a_res.lock.fl.fl_start; 434 fl->fl_start = req->a_res.lock.fl.fl_start;
433 fl->fl_end = req->a_res.lock.fl.fl_start; 435 fl->fl_end = req->a_res.lock.fl.fl_end;
434 fl->fl_type = req->a_res.lock.fl.fl_type; 436 fl->fl_type = req->a_res.lock.fl.fl_type;
435 fl->fl_pid = 0; 437 fl->fl_pid = 0;
436 break; 438 break;
@@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
710die: 712die:
711 return; 713 return;
712 retry_rebind: 714 retry_rebind:
715 lock_kernel();
713 nlm_rebind_host(req->a_host); 716 nlm_rebind_host(req->a_host);
717 unlock_kernel();
714 retry_unlock: 718 retry_unlock:
715 rpc_restart_call(task); 719 rpc_restart_call(task);
716} 720}
@@ -788,7 +792,9 @@ retry_cancel:
788 /* Don't ever retry more than 3 times */ 792 /* Don't ever retry more than 3 times */
789 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 793 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
790 goto die; 794 goto die;
795 lock_kernel();
791 nlm_rebind_host(req->a_host); 796 nlm_rebind_host(req->a_host);
797 unlock_kernel();
792 rpc_restart_call(task); 798 rpc_restart_call(task);
793 rpc_delay(task, 30 * HZ); 799 rpc_delay(task, 30 * HZ);
794} 800}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 385437e3387d..2e27176ff42f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
248 248
249static void nlm4svc_callback_release(void *data) 249static void nlm4svc_callback_release(void *data)
250{ 250{
251 lock_kernel();
251 nlm_release_call(data); 252 nlm_release_call(data);
253 unlock_kernel();
252} 254}
253 255
254static const struct rpc_call_ops nlm4svc_callback_ops = { 256static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 81aca859bfde..56a08ab9a4cb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
795 795
796 dprintk("lockd: GRANT_MSG RPC callback\n"); 796 dprintk("lockd: GRANT_MSG RPC callback\n");
797 797
798 lock_kernel();
798 /* if the block is not on a list at this point then it has 799 /* if the block is not on a list at this point then it has
799 * been invalidated. Don't try to requeue it. 800 * been invalidated. Don't try to requeue it.
800 * 801 *
@@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
804 * for nlm_blocked? 805 * for nlm_blocked?
805 */ 806 */
806 if (list_empty(&block->b_list)) 807 if (list_empty(&block->b_list))
807 return; 808 goto out;
808 809
809 /* Technically, we should down the file semaphore here. Since we 810 /* Technically, we should down the file semaphore here. Since we
810 * move the block towards the head of the queue only, no harm 811 * move the block towards the head of the queue only, no harm
@@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
818 } 819 }
819 nlmsvc_insert_block(block, timeout); 820 nlmsvc_insert_block(block, timeout);
820 svc_wake_up(block->b_daemon); 821 svc_wake_up(block->b_daemon);
822out:
823 unlock_kernel();
821} 824}
822 825
823static void nlmsvc_grant_release(void *data) 826static void nlmsvc_grant_release(void *data)
824{ 827{
825 struct nlm_rqst *call = data; 828 struct nlm_rqst *call = data;
826 829
830 lock_kernel();
827 nlmsvc_release_block(call->a_block); 831 nlmsvc_release_block(call->a_block);
832 unlock_kernel();
828} 833}
829 834
830static const struct rpc_call_ops nlmsvc_grant_ops = { 835static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 88379cc6e0b1..ce6952b50a75 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
278 278
279static void nlmsvc_callback_release(void *data) 279static void nlmsvc_callback_release(void *data)
280{ 280{
281 lock_kernel();
281 nlm_release_call(data); 282 nlm_release_call(data);
283 unlock_kernel();
282} 284}
283 285
284static const struct rpc_call_ops nlmsvc_callback_ops = { 286static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85static struct bio *mpage_bio_submit(int rw, struct bio *bio) 85struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
93 94
94static struct bio * 95static struct bio *
95mpage_alloc(struct block_device *bdev, 96mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
435 * written, so it can intelligently allocate a suitably-sized BIO. For now, 436 * written, so it can intelligently allocate a suitably-sized BIO. For now,
436 * just allocate full-size (16-page) BIOs. 437 * just allocate full-size (16-page) BIOs.
437 */ 438 */
438struct mpage_data {
439 struct bio *bio;
440 sector_t last_block_in_bio;
441 get_block_t *get_block;
442 unsigned use_writepage;
443};
444 439
445static int __mpage_writepage(struct page *page, struct writeback_control *wbc, 440int __mpage_writepage(struct page *page, struct writeback_control *wbc,
446 void *data) 441 void *data)
447{ 442{
448 struct mpage_data *mpd = data; 443 struct mpage_data *mpd = data;
449 struct bio *bio = mpd->bio; 444 struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
651 mpd->bio = bio; 646 mpd->bio = bio;
652 return ret; 647 return ret;
653} 648}
649EXPORT_SYMBOL(__mpage_writepage);
654 650
655/** 651/**
656 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 652 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
214 214
215 dentry->d_op = &msdos_dentry_operations; 215 dentry->d_op = &msdos_dentry_operations;
216 216
217 lock_kernel(); 217 lock_super(sb);
218 res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 218 res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
219 if (res == -ENOENT) 219 if (res == -ENOENT)
220 goto add; 220 goto add;
@@ -232,7 +232,7 @@ add:
232 if (dentry) 232 if (dentry)
233 dentry->d_op = &msdos_dentry_operations; 233 dentry->d_op = &msdos_dentry_operations;
234out: 234out:
235 unlock_kernel(); 235 unlock_super(sb);
236 if (!res) 236 if (!res)
237 return dentry; 237 return dentry;
238 return ERR_PTR(res); 238 return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
286 unsigned char msdos_name[MSDOS_NAME]; 286 unsigned char msdos_name[MSDOS_NAME];
287 int err, is_hid; 287 int err, is_hid;
288 288
289 lock_kernel(); 289 lock_super(sb);
290 290
291 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 291 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
292 msdos_name, &MSDOS_SB(sb)->options); 292 msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
315 315
316 d_instantiate(dentry, inode); 316 d_instantiate(dentry, inode);
317out: 317out:
318 unlock_kernel(); 318 unlock_super(sb);
319 if (!err) 319 if (!err)
320 err = fat_flush_inodes(sb, dir, inode); 320 err = fat_flush_inodes(sb, dir, inode);
321 return err; 321 return err;
@@ -324,11 +324,12 @@ out:
324/***** Remove a directory */ 324/***** Remove a directory */
325static int msdos_rmdir(struct inode *dir, struct dentry *dentry) 325static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326{ 326{
327 struct super_block *sb = dir->i_sb;
327 struct inode *inode = dentry->d_inode; 328 struct inode *inode = dentry->d_inode;
328 struct fat_slot_info sinfo; 329 struct fat_slot_info sinfo;
329 int err; 330 int err;
330 331
331 lock_kernel(); 332 lock_super(sb);
332 /* 333 /*
333 * Check whether the directory is not in use, then check 334 * Check whether the directory is not in use, then check
334 * whether it is empty. 335 * whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
349 inode->i_ctime = CURRENT_TIME_SEC; 350 inode->i_ctime = CURRENT_TIME_SEC;
350 fat_detach(inode); 351 fat_detach(inode);
351out: 352out:
352 unlock_kernel(); 353 unlock_super(sb);
353 if (!err) 354 if (!err)
354 err = fat_flush_inodes(inode->i_sb, dir, inode); 355 err = fat_flush_inodes(sb, dir, inode);
355 356
356 return err; 357 return err;
357} 358}
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
366 struct timespec ts; 367 struct timespec ts;
367 int err, is_hid, cluster; 368 int err, is_hid, cluster;
368 369
369 lock_kernel(); 370 lock_super(sb);
370 371
371 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 372 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
372 msdos_name, &MSDOS_SB(sb)->options); 373 msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
404 405
405 d_instantiate(dentry, inode); 406 d_instantiate(dentry, inode);
406 407
407 unlock_kernel(); 408 unlock_super(sb);
408 fat_flush_inodes(sb, dir, inode); 409 fat_flush_inodes(sb, dir, inode);
409 return 0; 410 return 0;
410 411
411out_free: 412out_free:
412 fat_free_clusters(dir, cluster); 413 fat_free_clusters(dir, cluster);
413out: 414out:
414 unlock_kernel(); 415 unlock_super(sb);
415 return err; 416 return err;
416} 417}
417 418
@@ -419,10 +420,11 @@ out:
419static int msdos_unlink(struct inode *dir, struct dentry *dentry) 420static int msdos_unlink(struct inode *dir, struct dentry *dentry)
420{ 421{
421 struct inode *inode = dentry->d_inode; 422 struct inode *inode = dentry->d_inode;
423 struct super_block *sb= inode->i_sb;
422 struct fat_slot_info sinfo; 424 struct fat_slot_info sinfo;
423 int err; 425 int err;
424 426
425 lock_kernel(); 427 lock_super(sb);
426 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 428 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
427 if (err) 429 if (err)
428 goto out; 430 goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
434 inode->i_ctime = CURRENT_TIME_SEC; 436 inode->i_ctime = CURRENT_TIME_SEC;
435 fat_detach(inode); 437 fat_detach(inode);
436out: 438out:
437 unlock_kernel(); 439 unlock_super(sb);
438 if (!err) 440 if (!err)
439 err = fat_flush_inodes(inode->i_sb, dir, inode); 441 err = fat_flush_inodes(sb, dir, inode);
440 442
441 return err; 443 return err;
442} 444}
@@ -618,10 +620,11 @@ error_inode:
618static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, 620static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
619 struct inode *new_dir, struct dentry *new_dentry) 621 struct inode *new_dir, struct dentry *new_dentry)
620{ 622{
623 struct super_block *sb = old_dir->i_sb;
621 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; 624 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
622 int err, is_hid; 625 int err, is_hid;
623 626
624 lock_kernel(); 627 lock_super(sb);
625 628
626 err = msdos_format_name(old_dentry->d_name.name, 629 err = msdos_format_name(old_dentry->d_name.name,
627 old_dentry->d_name.len, old_msdos_name, 630 old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
640 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, 643 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
641 new_dir, new_msdos_name, new_dentry, is_hid); 644 new_dir, new_msdos_name, new_dentry, is_hid);
642out: 645out:
643 unlock_kernel(); 646 unlock_super(sb);
644 if (!err) 647 if (!err)
645 err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); 648 err = fat_flush_inodes(sb, old_dir, new_dir);
646 return err; 649 return err;
647} 650}
648 651
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
750 const char *str; 750 const char *str;
751}; 751};
752 752
753static void show_sb_opts(struct seq_file *m, struct super_block *sb) 753static int show_sb_opts(struct seq_file *m, struct super_block *sb)
754{ 754{
755 static const struct proc_fs_info fs_info[] = { 755 static const struct proc_fs_info fs_info[] = {
756 { MS_SYNCHRONOUS, ",sync" }, 756 { MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
764 if (sb->s_flags & fs_infop->flag) 764 if (sb->s_flags & fs_infop->flag)
765 seq_puts(m, fs_infop->str); 765 seq_puts(m, fs_infop->str);
766 } 766 }
767
768 return security_sb_show_options(m, sb);
767} 769}
768 770
769static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) 771static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
806 seq_putc(m, ' '); 808 seq_putc(m, ' ');
807 show_type(m, mnt->mnt_sb); 809 show_type(m, mnt->mnt_sb);
808 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); 810 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
809 show_sb_opts(m, mnt->mnt_sb); 811 err = show_sb_opts(m, mnt->mnt_sb);
812 if (err)
813 goto out;
810 show_mnt_opts(m, mnt); 814 show_mnt_opts(m, mnt);
811 if (mnt->mnt_sb->s_op->show_options) 815 if (mnt->mnt_sb->s_op->show_options)
812 err = mnt->mnt_sb->s_op->show_options(m, mnt); 816 err = mnt->mnt_sb->s_op->show_options(m, mnt);
813 seq_puts(m, " 0 0\n"); 817 seq_puts(m, " 0 0\n");
818out:
814 return err; 819 return err;
815} 820}
816 821
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
865 seq_putc(m, ' '); 870 seq_putc(m, ' ');
866 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 871 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
867 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 872 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
868 show_sb_opts(m, sb); 873 err = show_sb_opts(m, sb);
874 if (err)
875 goto out;
869 if (sb->s_op->show_options) 876 if (sb->s_op->show_options)
870 err = sb->s_op->show_options(m, mnt); 877 err = sb->s_op->show_options(m, mnt);
871 seq_putc(m, '\n'); 878 seq_putc(m, '\n');
879out:
872 return err; 880 return err;
873} 881}
874 882
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp_lock.h>
21 22
22#include <linux/ncp_fs.h> 23#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 24#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
281 return 0; 282 return 0;
282} 283}
283 284
285static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
286{
287 loff_t ret;
288 lock_kernel();
289 ret = generic_file_llseek_unlocked(file, offset, origin);
290 unlock_kernel();
291 return ret;
292}
293
284const struct file_operations ncp_file_operations = 294const struct file_operations ncp_file_operations =
285{ 295{
286 .llseek = remote_llseek, 296 .llseek = ncp_remote_llseek,
287 .read = ncp_file_read, 297 .read = ncp_file_read,
288 .write = ncp_file_write, 298 .write = ncp_file_write,
289 .ioctl = ncp_ioctl, 299 .ioctl = ncp_ioctl,
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c1e7c8300629..f447f4b4476c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
27 27
28struct nfs_callback_data { 28struct nfs_callback_data {
29 unsigned int users; 29 unsigned int users;
30 struct svc_serv *serv; 30 struct svc_rqst *rqst;
31 struct task_struct *task; 31 struct task_struct *task;
32}; 32};
33 33
@@ -91,21 +91,17 @@ nfs_callback_svc(void *vrqstp)
91 svc_process(rqstp); 91 svc_process(rqstp);
92 } 92 }
93 unlock_kernel(); 93 unlock_kernel();
94 nfs_callback_info.task = NULL;
95 svc_exit_thread(rqstp);
96 return 0; 94 return 0;
97} 95}
98 96
99/* 97/*
100 * Bring up the server process if it is not already up. 98 * Bring up the callback thread if it is not already up.
101 */ 99 */
102int nfs_callback_up(void) 100int nfs_callback_up(void)
103{ 101{
104 struct svc_serv *serv = NULL; 102 struct svc_serv *serv = NULL;
105 struct svc_rqst *rqstp;
106 int ret = 0; 103 int ret = 0;
107 104
108 lock_kernel();
109 mutex_lock(&nfs_callback_mutex); 105 mutex_lock(&nfs_callback_mutex);
110 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
111 goto out; 107 goto out;
@@ -121,22 +117,23 @@ int nfs_callback_up(void)
121 nfs_callback_tcpport = ret; 117 nfs_callback_tcpport = ret;
122 dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); 118 dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
123 119
124 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 120 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
125 if (IS_ERR(rqstp)) { 121 if (IS_ERR(nfs_callback_info.rqst)) {
126 ret = PTR_ERR(rqstp); 122 ret = PTR_ERR(nfs_callback_info.rqst);
123 nfs_callback_info.rqst = NULL;
127 goto out_err; 124 goto out_err;
128 } 125 }
129 126
130 svc_sock_update_bufs(serv); 127 svc_sock_update_bufs(serv);
131 nfs_callback_info.serv = serv;
132 128
133 nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, 129 nfs_callback_info.task = kthread_run(nfs_callback_svc,
130 nfs_callback_info.rqst,
134 "nfsv4-svc"); 131 "nfsv4-svc");
135 if (IS_ERR(nfs_callback_info.task)) { 132 if (IS_ERR(nfs_callback_info.task)) {
136 ret = PTR_ERR(nfs_callback_info.task); 133 ret = PTR_ERR(nfs_callback_info.task);
137 nfs_callback_info.serv = NULL; 134 svc_exit_thread(nfs_callback_info.rqst);
135 nfs_callback_info.rqst = NULL;
138 nfs_callback_info.task = NULL; 136 nfs_callback_info.task = NULL;
139 svc_exit_thread(rqstp);
140 goto out_err; 137 goto out_err;
141 } 138 }
142out: 139out:
@@ -149,7 +146,6 @@ out:
149 if (serv) 146 if (serv)
150 svc_destroy(serv); 147 svc_destroy(serv);
151 mutex_unlock(&nfs_callback_mutex); 148 mutex_unlock(&nfs_callback_mutex);
152 unlock_kernel();
153 return ret; 149 return ret;
154out_err: 150out_err:
155 dprintk("Couldn't create callback socket or server thread; err = %d\n", 151 dprintk("Couldn't create callback socket or server thread; err = %d\n",
@@ -159,17 +155,19 @@ out_err:
159} 155}
160 156
161/* 157/*
162 * Kill the server process if it is not already down. 158 * Kill the callback thread if it's no longer being used.
163 */ 159 */
164void nfs_callback_down(void) 160void nfs_callback_down(void)
165{ 161{
166 lock_kernel();
167 mutex_lock(&nfs_callback_mutex); 162 mutex_lock(&nfs_callback_mutex);
168 nfs_callback_info.users--; 163 nfs_callback_info.users--;
169 if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) 164 if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
170 kthread_stop(nfs_callback_info.task); 165 kthread_stop(nfs_callback_info.task);
166 svc_exit_thread(nfs_callback_info.rqst);
167 nfs_callback_info.rqst = NULL;
168 nfs_callback_info.task = NULL;
169 }
171 mutex_unlock(&nfs_callback_mutex); 170 mutex_unlock(&nfs_callback_mutex);
172 unlock_kernel();
173} 171}
174 172
175static int nfs_callback_authenticate(struct svc_rqst *rqstp) 173static int nfs_callback_authenticate(struct svc_rqst *rqstp)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f2a092ca69b5..5ee23e7058b3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
431{ 431{
432 to->to_initval = timeo * HZ / 10; 432 to->to_initval = timeo * HZ / 10;
433 to->to_retries = retrans; 433 to->to_retries = retrans;
434 if (!to->to_retries)
435 to->to_retries = 2;
436 434
437 switch (proto) { 435 switch (proto) {
438 case XPRT_TRANSPORT_TCP: 436 case XPRT_TRANSPORT_TCP:
439 case XPRT_TRANSPORT_RDMA: 437 case XPRT_TRANSPORT_RDMA:
438 if (to->to_retries == 0)
439 to->to_retries = NFS_DEF_TCP_RETRANS;
440 if (to->to_initval == 0) 440 if (to->to_initval == 0)
441 to->to_initval = 60 * HZ; 441 to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
442 if (to->to_initval > NFS_MAX_TCP_TIMEOUT) 442 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
443 to->to_initval = NFS_MAX_TCP_TIMEOUT; 443 to->to_initval = NFS_MAX_TCP_TIMEOUT;
444 to->to_increment = to->to_initval; 444 to->to_increment = to->to_initval;
@@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
450 to->to_exponential = 0; 450 to->to_exponential = 0;
451 break; 451 break;
452 case XPRT_TRANSPORT_UDP: 452 case XPRT_TRANSPORT_UDP:
453 default: 453 if (to->to_retries == 0)
454 to->to_retries = NFS_DEF_UDP_RETRANS;
454 if (!to->to_initval) 455 if (!to->to_initval)
455 to->to_initval = 11 * HZ / 10; 456 to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
456 if (to->to_initval > NFS_MAX_UDP_TIMEOUT) 457 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
457 to->to_initval = NFS_MAX_UDP_TIMEOUT; 458 to->to_initval = NFS_MAX_UDP_TIMEOUT;
458 to->to_maxval = NFS_MAX_UDP_TIMEOUT; 459 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
459 to->to_exponential = 1; 460 to->to_exponential = 1;
460 break; 461 break;
462 default:
463 BUG();
461 } 464 }
462} 465}
463 466
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58d43daec084..28a238dab23a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp)
133{ 133{
134 int res; 134 int res;
135 135
136 dfprintk(VFS, "NFS: opendir(%s/%ld)\n", 136 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
137 inode->i_sb->s_id, inode->i_ino); 137 filp->f_path.dentry->d_parent->d_name.name,
138 filp->f_path.dentry->d_name.name);
139
140 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
138 141
139 lock_kernel();
140 /* Call generic open code in order to cache credentials */ 142 /* Call generic open code in order to cache credentials */
141 res = nfs_open(inode, filp); 143 res = nfs_open(inode, filp);
142 unlock_kernel();
143 return res; 144 return res;
144} 145}
145 146
@@ -204,7 +205,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
204 * Note: assumes we have exclusive access to this mapping either 205 * Note: assumes we have exclusive access to this mapping either
205 * through inode->i_mutex or some other mechanism. 206 * through inode->i_mutex or some other mechanism.
206 */ 207 */
207 if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) { 208 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
208 /* Should never happen */ 209 /* Should never happen */
209 nfs_zap_mapping(inode, inode->i_mapping); 210 nfs_zap_mapping(inode, inode->i_mapping);
210 } 211 }
@@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
528 struct nfs_fattr fattr; 529 struct nfs_fattr fattr;
529 long res; 530 long res;
530 531
531 dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", 532 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
532 dentry->d_parent->d_name.name, dentry->d_name.name, 533 dentry->d_parent->d_name.name, dentry->d_name.name,
533 (long long)filp->f_pos); 534 (long long)filp->f_pos);
534 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 535 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
535 536
536 lock_kernel();
537
538 /* 537 /*
539 * filp->f_pos points to the dirent entry number. 538 * filp->f_pos points to the dirent entry number.
540 * *desc->dir_cookie has the cookie for the next entry. We have 539 * *desc->dir_cookie has the cookie for the next entry. We have
@@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
592 } 591 }
593out: 592out:
594 nfs_unblock_sillyrename(dentry); 593 nfs_unblock_sillyrename(dentry);
595 unlock_kernel();
596 if (res > 0) 594 if (res > 0)
597 res = 0; 595 res = 0;
598 dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", 596 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
599 dentry->d_parent->d_name.name, dentry->d_name.name, 597 dentry->d_parent->d_name.name, dentry->d_name.name,
600 res); 598 res);
601 return res; 599 return res;
@@ -603,7 +601,15 @@ out:
603 601
604static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 602static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
605{ 603{
606 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 604 struct dentry *dentry = filp->f_path.dentry;
605 struct inode *inode = dentry->d_inode;
606
607 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
608 dentry->d_parent->d_name.name,
609 dentry->d_name.name,
610 offset, origin);
611
612 mutex_lock(&inode->i_mutex);
607 switch (origin) { 613 switch (origin) {
608 case 1: 614 case 1:
609 offset += filp->f_pos; 615 offset += filp->f_pos;
@@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
619 nfs_file_open_context(filp)->dir_cookie = 0; 625 nfs_file_open_context(filp)->dir_cookie = 0;
620 } 626 }
621out: 627out:
622 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 628 mutex_unlock(&inode->i_mutex);
623 return offset; 629 return offset;
624} 630}
625 631
@@ -629,10 +635,11 @@ out:
629 */ 635 */
630static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 636static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
631{ 637{
632 dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", 638 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
633 dentry->d_parent->d_name.name, dentry->d_name.name, 639 dentry->d_parent->d_name.name, dentry->d_name.name,
634 datasync); 640 datasync);
635 641
642 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
636 return 0; 643 return 0;
637} 644}
638 645
@@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
767 struct nfs_fattr fattr; 774 struct nfs_fattr fattr;
768 775
769 parent = dget_parent(dentry); 776 parent = dget_parent(dentry);
770 lock_kernel();
771 dir = parent->d_inode; 777 dir = parent->d_inode;
772 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 778 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
773 inode = dentry->d_inode; 779 inode = dentry->d_inode;
@@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
805 811
806 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 812 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
807 out_valid: 813 out_valid:
808 unlock_kernel();
809 dput(parent); 814 dput(parent);
810 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", 815 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
811 __func__, dentry->d_parent->d_name.name, 816 __func__, dentry->d_parent->d_name.name,
@@ -824,7 +829,6 @@ out_zap_parent:
824 shrink_dcache_parent(dentry); 829 shrink_dcache_parent(dentry);
825 } 830 }
826 d_drop(dentry); 831 d_drop(dentry);
827 unlock_kernel();
828 dput(parent); 832 dput(parent);
829 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 833 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
830 __func__, dentry->d_parent->d_name.name, 834 __func__, dentry->d_parent->d_name.name,
@@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry)
858 862
859} 863}
860 864
865static void nfs_drop_nlink(struct inode *inode)
866{
867 spin_lock(&inode->i_lock);
868 if (inode->i_nlink > 0)
869 drop_nlink(inode);
870 spin_unlock(&inode->i_lock);
871}
872
861/* 873/*
862 * Called when the dentry loses inode. 874 * Called when the dentry loses inode.
863 * We use it to clean up silly-renamed files. 875 * We use it to clean up silly-renamed files.
@@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
869 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 881 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
870 882
871 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 883 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
872 lock_kernel();
873 drop_nlink(inode); 884 drop_nlink(inode);
874 nfs_complete_unlink(dentry, inode); 885 nfs_complete_unlink(dentry, inode);
875 unlock_kernel();
876 } 886 }
877 iput(inode); 887 iput(inode);
878} 888}
@@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
903 res = ERR_PTR(-ENOMEM); 913 res = ERR_PTR(-ENOMEM);
904 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 914 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
905 915
906 lock_kernel();
907
908 /* 916 /*
909 * If we're doing an exclusive create, optimize away the lookup 917 * If we're doing an exclusive create, optimize away the lookup
910 * but don't hash the dentry. 918 * but don't hash the dentry.
@@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
912 if (nfs_is_exclusive_create(dir, nd)) { 920 if (nfs_is_exclusive_create(dir, nd)) {
913 d_instantiate(dentry, NULL); 921 d_instantiate(dentry, NULL);
914 res = NULL; 922 res = NULL;
915 goto out_unlock; 923 goto out;
916 } 924 }
917 925
918 parent = dentry->d_parent; 926 parent = dentry->d_parent;
@@ -940,8 +948,6 @@ no_entry:
940 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 948 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
941out_unblock_sillyrename: 949out_unblock_sillyrename:
942 nfs_unblock_sillyrename(parent); 950 nfs_unblock_sillyrename(parent);
943out_unlock:
944 unlock_kernel();
945out: 951out:
946 return res; 952 return res;
947} 953}
@@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
999 } 1005 }
1000 1006
1001 /* Open the file on the server */ 1007 /* Open the file on the server */
1002 lock_kernel();
1003 res = nfs4_atomic_open(dir, dentry, nd); 1008 res = nfs4_atomic_open(dir, dentry, nd);
1004 unlock_kernel();
1005 if (IS_ERR(res)) { 1009 if (IS_ERR(res)) {
1006 error = PTR_ERR(res); 1010 error = PTR_ERR(res);
1007 switch (error) { 1011 switch (error) {
@@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1063 * operations that change the directory. We therefore save the 1067 * operations that change the directory. We therefore save the
1064 * change attribute *before* we do the RPC call. 1068 * change attribute *before* we do the RPC call.
1065 */ 1069 */
1066 lock_kernel();
1067 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1070 ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
1068 unlock_kernel();
1069out: 1071out:
1070 dput(parent); 1072 dput(parent);
1071 if (!ret) 1073 if (!ret)
@@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1218 if ((nd->flags & LOOKUP_CREATE) != 0) 1220 if ((nd->flags & LOOKUP_CREATE) != 0)
1219 open_flags = nd->intent.open.flags; 1221 open_flags = nd->intent.open.flags;
1220 1222
1221 lock_kernel();
1222 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); 1223 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1223 if (error != 0) 1224 if (error != 0)
1224 goto out_err; 1225 goto out_err;
1225 unlock_kernel();
1226 return 0; 1226 return 0;
1227out_err: 1227out_err:
1228 unlock_kernel();
1229 d_drop(dentry); 1228 d_drop(dentry);
1230 return error; 1229 return error;
1231} 1230}
@@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1248 attr.ia_mode = mode; 1247 attr.ia_mode = mode;
1249 attr.ia_valid = ATTR_MODE; 1248 attr.ia_valid = ATTR_MODE;
1250 1249
1251 lock_kernel();
1252 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1250 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
1253 if (status != 0) 1251 if (status != 0)
1254 goto out_err; 1252 goto out_err;
1255 unlock_kernel();
1256 return 0; 1253 return 0;
1257out_err: 1254out_err:
1258 unlock_kernel();
1259 d_drop(dentry); 1255 d_drop(dentry);
1260 return status; 1256 return status;
1261} 1257}
@@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1274 attr.ia_valid = ATTR_MODE; 1270 attr.ia_valid = ATTR_MODE;
1275 attr.ia_mode = mode | S_IFDIR; 1271 attr.ia_mode = mode | S_IFDIR;
1276 1272
1277 lock_kernel();
1278 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1273 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
1279 if (error != 0) 1274 if (error != 0)
1280 goto out_err; 1275 goto out_err;
1281 unlock_kernel();
1282 return 0; 1276 return 0;
1283out_err: 1277out_err:
1284 d_drop(dentry); 1278 d_drop(dentry);
1285 unlock_kernel();
1286 return error; 1279 return error;
1287} 1280}
1288 1281
@@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1299 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", 1292 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
1300 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1293 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1301 1294
1302 lock_kernel();
1303 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1295 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1304 /* Ensure the VFS deletes this inode */ 1296 /* Ensure the VFS deletes this inode */
1305 if (error == 0 && dentry->d_inode != NULL) 1297 if (error == 0 && dentry->d_inode != NULL)
1306 clear_nlink(dentry->d_inode); 1298 clear_nlink(dentry->d_inode);
1307 else if (error == -ENOENT) 1299 else if (error == -ENOENT)
1308 nfs_dentry_handle_enoent(dentry); 1300 nfs_dentry_handle_enoent(dentry);
1309 unlock_kernel();
1310 1301
1311 return error; 1302 return error;
1312} 1303}
@@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry)
1408 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1399 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1409 /* The VFS may want to delete this inode */ 1400 /* The VFS may want to delete this inode */
1410 if (error == 0) 1401 if (error == 0)
1411 drop_nlink(inode); 1402 nfs_drop_nlink(inode);
1412 nfs_mark_for_revalidate(inode); 1403 nfs_mark_for_revalidate(inode);
1413 } else 1404 } else
1414 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1405 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1431 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1422 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1432 dir->i_ino, dentry->d_name.name); 1423 dir->i_ino, dentry->d_name.name);
1433 1424
1434 lock_kernel();
1435 spin_lock(&dcache_lock); 1425 spin_lock(&dcache_lock);
1436 spin_lock(&dentry->d_lock); 1426 spin_lock(&dentry->d_lock);
1437 if (atomic_read(&dentry->d_count) > 1) { 1427 if (atomic_read(&dentry->d_count) > 1) {
@@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1440 /* Start asynchronous writeout of the inode */ 1430 /* Start asynchronous writeout of the inode */
1441 write_inode_now(dentry->d_inode, 0); 1431 write_inode_now(dentry->d_inode, 0);
1442 error = nfs_sillyrename(dir, dentry); 1432 error = nfs_sillyrename(dir, dentry);
1443 unlock_kernel();
1444 return error; 1433 return error;
1445 } 1434 }
1446 if (!d_unhashed(dentry)) { 1435 if (!d_unhashed(dentry)) {
@@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1454 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1443 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1455 } else if (need_rehash) 1444 } else if (need_rehash)
1456 d_rehash(dentry); 1445 d_rehash(dentry);
1457 unlock_kernel();
1458 return error; 1446 return error;
1459} 1447}
1460 1448
@@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1491 attr.ia_mode = S_IFLNK | S_IRWXUGO; 1479 attr.ia_mode = S_IFLNK | S_IRWXUGO;
1492 attr.ia_valid = ATTR_MODE; 1480 attr.ia_valid = ATTR_MODE;
1493 1481
1494 lock_kernel();
1495
1496 page = alloc_page(GFP_HIGHUSER); 1482 page = alloc_page(GFP_HIGHUSER);
1497 if (!page) { 1483 if (!page)
1498 unlock_kernel();
1499 return -ENOMEM; 1484 return -ENOMEM;
1500 }
1501 1485
1502 kaddr = kmap_atomic(page, KM_USER0); 1486 kaddr = kmap_atomic(page, KM_USER0);
1503 memcpy(kaddr, symname, pathlen); 1487 memcpy(kaddr, symname, pathlen);
@@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1512 dentry->d_name.name, symname, error); 1496 dentry->d_name.name, symname, error);
1513 d_drop(dentry); 1497 d_drop(dentry);
1514 __free_page(page); 1498 __free_page(page);
1515 unlock_kernel();
1516 return error; 1499 return error;
1517 } 1500 }
1518 1501
@@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1530 } else 1513 } else
1531 __free_page(page); 1514 __free_page(page);
1532 1515
1533 unlock_kernel();
1534 return 0; 1516 return 0;
1535} 1517}
1536 1518
@@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1544 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1526 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1545 dentry->d_parent->d_name.name, dentry->d_name.name); 1527 dentry->d_parent->d_name.name, dentry->d_name.name);
1546 1528
1547 lock_kernel();
1548 d_drop(dentry); 1529 d_drop(dentry);
1549 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1530 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1550 if (error == 0) { 1531 if (error == 0) {
1551 atomic_inc(&inode->i_count); 1532 atomic_inc(&inode->i_count);
1552 d_add(dentry, inode); 1533 d_add(dentry, inode);
1553 } 1534 }
1554 unlock_kernel();
1555 return error; 1535 return error;
1556} 1536}
1557 1537
@@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1591 * To prevent any new references to the target during the rename, 1571 * To prevent any new references to the target during the rename,
1592 * we unhash the dentry and free the inode in advance. 1572 * we unhash the dentry and free the inode in advance.
1593 */ 1573 */
1594 lock_kernel();
1595 if (!d_unhashed(new_dentry)) { 1574 if (!d_unhashed(new_dentry)) {
1596 d_drop(new_dentry); 1575 d_drop(new_dentry);
1597 rehash = new_dentry; 1576 rehash = new_dentry;
@@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1635 /* dentry still busy? */ 1614 /* dentry still busy? */
1636 goto out; 1615 goto out;
1637 } else 1616 } else
1638 drop_nlink(new_inode); 1617 nfs_drop_nlink(new_inode);
1639 1618
1640go_ahead: 1619go_ahead:
1641 /* 1620 /*
@@ -1669,7 +1648,6 @@ out:
1669 /* new dentry created? */ 1648 /* new dentry created? */
1670 if (dentry) 1649 if (dentry)
1671 dput(dentry); 1650 dput(dentry);
1672 unlock_kernel();
1673 return error; 1651 return error;
1674} 1652}
1675 1653
@@ -1962,8 +1940,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
1962 } 1940 }
1963 1941
1964force_lookup: 1942force_lookup:
1965 lock_kernel();
1966
1967 if (!NFS_PROTO(inode)->access) 1943 if (!NFS_PROTO(inode)->access)
1968 goto out_notsup; 1944 goto out_notsup;
1969 1945
@@ -1973,7 +1949,6 @@ force_lookup:
1973 put_rpccred(cred); 1949 put_rpccred(cred);
1974 } else 1950 } else
1975 res = PTR_ERR(cred); 1951 res = PTR_ERR(cred);
1976 unlock_kernel();
1977out: 1952out:
1978 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", 1953 dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
1979 inode->i_sb->s_id, inode->i_ino, mask, res); 1954 inode->i_sb->s_id, inode->i_ino, mask, res);
@@ -1982,7 +1957,6 @@ out_notsup:
1982 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 1957 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
1983 if (res == 0) 1958 if (res == 0)
1984 res = generic_permission(inode, mask, NULL); 1959 res = generic_permission(inode, mask, NULL);
1985 unlock_kernel();
1986 goto out; 1960 goto out;
1987} 1961}
1988 1962
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4757a2b326a1..08f6b040d289 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
890 count = iov_length(iov, nr_segs); 890 count = iov_length(iov, nr_segs);
891 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); 891 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
892 892
893 dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", 893 dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
894 file->f_path.dentry->d_parent->d_name.name, 894 file->f_path.dentry->d_parent->d_name.name,
895 file->f_path.dentry->d_name.name, 895 file->f_path.dentry->d_name.name,
896 count, (long long) pos); 896 count, (long long) pos);
@@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
947 count = iov_length(iov, nr_segs); 947 count = iov_length(iov, nr_segs);
948 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); 948 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
949 949
950 dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", 950 dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
951 file->f_path.dentry->d_parent->d_name.name, 951 file->f_path.dentry->d_parent->d_name.name,
952 file->f_path.dentry->d_name.name, 952 file->f_path.dentry->d_name.name,
953 count, (long long) pos); 953 count, (long long) pos);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32af..78460657f5cb 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
50static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 50static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
51 unsigned long nr_segs, loff_t pos); 51 unsigned long nr_segs, loff_t pos);
52static int nfs_file_flush(struct file *, fl_owner_t id); 52static int nfs_file_flush(struct file *, fl_owner_t id);
53static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); 53static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
54static int nfs_check_flags(int flags); 54static int nfs_check_flags(int flags);
55static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 55static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
56static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 56static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = {
72 .open = nfs_file_open, 72 .open = nfs_file_open,
73 .flush = nfs_file_flush, 73 .flush = nfs_file_flush,
74 .release = nfs_file_release, 74 .release = nfs_file_release,
75 .fsync = nfs_fsync, 75 .fsync = nfs_file_fsync,
76 .lock = nfs_lock, 76 .lock = nfs_lock,
77 .flock = nfs_flock, 77 .flock = nfs_flock,
78 .splice_read = nfs_file_splice_read, 78 .splice_read = nfs_file_splice_read,
@@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp)
119{ 119{
120 int res; 120 int res;
121 121
122 dprintk("NFS: open file(%s/%s)\n",
123 filp->f_path.dentry->d_parent->d_name.name,
124 filp->f_path.dentry->d_name.name);
125
122 res = nfs_check_flags(filp->f_flags); 126 res = nfs_check_flags(filp->f_flags);
123 if (res) 127 if (res)
124 return res; 128 return res;
125 129
126 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 130 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
127 lock_kernel(); 131 res = nfs_open(inode, filp);
128 res = NFS_PROTO(inode)->file_open(inode, filp);
129 unlock_kernel();
130 return res; 132 return res;
131} 133}
132 134
133static int 135static int
134nfs_file_release(struct inode *inode, struct file *filp) 136nfs_file_release(struct inode *inode, struct file *filp)
135{ 137{
138 struct dentry *dentry = filp->f_path.dentry;
139
140 dprintk("NFS: release(%s/%s)\n",
141 dentry->d_parent->d_name.name,
142 dentry->d_name.name);
143
136 /* Ensure that dirty pages are flushed out with the right creds */ 144 /* Ensure that dirty pages are flushed out with the right creds */
137 if (filp->f_mode & FMODE_WRITE) 145 if (filp->f_mode & FMODE_WRITE)
138 nfs_wb_all(filp->f_path.dentry->d_inode); 146 nfs_wb_all(dentry->d_inode);
139 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 147 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
140 return NFS_PROTO(inode)->file_release(inode, filp); 148 return nfs_release(inode, filp);
141} 149}
142 150
143/** 151/**
@@ -170,6 +178,13 @@ force_reval:
170 178
171static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 179static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
172{ 180{
181 loff_t loff;
182
183 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
184 filp->f_path.dentry->d_parent->d_name.name,
185 filp->f_path.dentry->d_name.name,
186 offset, origin);
187
173 /* origin == SEEK_END => we must revalidate the cached file length */ 188 /* origin == SEEK_END => we must revalidate the cached file length */
174 if (origin == SEEK_END) { 189 if (origin == SEEK_END) {
175 struct inode *inode = filp->f_mapping->host; 190 struct inode *inode = filp->f_mapping->host;
@@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
177 if (retval < 0) 192 if (retval < 0)
178 return (loff_t)retval; 193 return (loff_t)retval;
179 } 194 }
180 return remote_llseek(filp, offset, origin); 195 lock_kernel(); /* BKL needed? */
196 loff = generic_file_llseek_unlocked(filp, offset, origin);
197 unlock_kernel();
198 return loff;
181} 199}
182 200
183/* 201/*
184 * Helper for nfs_file_flush() and nfs_fsync() 202 * Helper for nfs_file_flush() and nfs_file_fsync()
185 * 203 *
186 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to 204 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
187 * disk, but it retrieves and clears ctx->error after synching, despite 205 * disk, but it retrieves and clears ctx->error after synching, despite
@@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
207 225
208/* 226/*
209 * Flush all dirty pages, and check for write errors. 227 * Flush all dirty pages, and check for write errors.
210 *
211 */ 228 */
212static int 229static int
213nfs_file_flush(struct file *file, fl_owner_t id) 230nfs_file_flush(struct file *file, fl_owner_t id)
214{ 231{
215 struct nfs_open_context *ctx = nfs_file_open_context(file); 232 struct nfs_open_context *ctx = nfs_file_open_context(file);
216 struct inode *inode = file->f_path.dentry->d_inode; 233 struct dentry *dentry = file->f_path.dentry;
234 struct inode *inode = dentry->d_inode;
217 int status; 235 int status;
218 236
219 dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 237 dprintk("NFS: flush(%s/%s)\n",
238 dentry->d_parent->d_name.name,
239 dentry->d_name.name);
220 240
221 if ((file->f_mode & FMODE_WRITE) == 0) 241 if ((file->f_mode & FMODE_WRITE) == 0)
222 return 0; 242 return 0;
@@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
241 if (iocb->ki_filp->f_flags & O_DIRECT) 261 if (iocb->ki_filp->f_flags & O_DIRECT)
242 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 262 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
243 263
244 dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", 264 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
245 dentry->d_parent->d_name.name, dentry->d_name.name, 265 dentry->d_parent->d_name.name, dentry->d_name.name,
246 (unsigned long) count, (unsigned long) pos); 266 (unsigned long) count, (unsigned long) pos);
247 267
@@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
261 struct inode *inode = dentry->d_inode; 281 struct inode *inode = dentry->d_inode;
262 ssize_t res; 282 ssize_t res;
263 283
264 dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", 284 dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
265 dentry->d_parent->d_name.name, dentry->d_name.name, 285 dentry->d_parent->d_name.name, dentry->d_name.name,
266 (unsigned long) count, (unsigned long long) *ppos); 286 (unsigned long) count, (unsigned long long) *ppos);
267 287
@@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
278 struct inode *inode = dentry->d_inode; 298 struct inode *inode = dentry->d_inode;
279 int status; 299 int status;
280 300
281 dfprintk(VFS, "nfs: mmap(%s/%s)\n", 301 dprintk("NFS: mmap(%s/%s)\n",
282 dentry->d_parent->d_name.name, dentry->d_name.name); 302 dentry->d_parent->d_name.name, dentry->d_name.name);
283 303
284 status = nfs_revalidate_mapping(inode, file->f_mapping); 304 status = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
296 * whether any write errors occurred for this process. 316 * whether any write errors occurred for this process.
297 */ 317 */
298static int 318static int
299nfs_fsync(struct file *file, struct dentry *dentry, int datasync) 319nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
300{ 320{
301 struct nfs_open_context *ctx = nfs_file_open_context(file); 321 struct nfs_open_context *ctx = nfs_file_open_context(file);
302 struct inode *inode = dentry->d_inode; 322 struct inode *inode = dentry->d_inode;
303 323
304 dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 324 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
325 dentry->d_parent->d_name.name, dentry->d_name.name,
326 datasync);
305 327
306 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 328 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
307 return nfs_do_fsync(ctx, inode); 329 return nfs_do_fsync(ctx, inode);
@@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
324 struct page *page; 346 struct page *page;
325 index = pos >> PAGE_CACHE_SHIFT; 347 index = pos >> PAGE_CACHE_SHIFT;
326 348
349 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
350 file->f_path.dentry->d_parent->d_name.name,
351 file->f_path.dentry->d_name.name,
352 mapping->host->i_ino, len, (long long) pos);
353
327 page = __grab_cache_page(mapping, index); 354 page = __grab_cache_page(mapping, index);
328 if (!page) 355 if (!page)
329 return -ENOMEM; 356 return -ENOMEM;
@@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
344 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 371 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
345 int status; 372 int status;
346 373
347 lock_kernel(); 374 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
375 file->f_path.dentry->d_parent->d_name.name,
376 file->f_path.dentry->d_name.name,
377 mapping->host->i_ino, len, (long long) pos);
378
379 /*
380 * Zero any uninitialised parts of the page, and then mark the page
381 * as up to date if it turns out that we're extending the file.
382 */
383 if (!PageUptodate(page)) {
384 unsigned pglen = nfs_page_length(page);
385 unsigned end = offset + len;
386
387 if (pglen == 0) {
388 zero_user_segments(page, 0, offset,
389 end, PAGE_CACHE_SIZE);
390 SetPageUptodate(page);
391 } else if (end >= pglen) {
392 zero_user_segment(page, end, PAGE_CACHE_SIZE);
393 if (offset == 0)
394 SetPageUptodate(page);
395 } else
396 zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
397 }
398
348 status = nfs_updatepage(file, page, offset, copied); 399 status = nfs_updatepage(file, page, offset, copied);
349 unlock_kernel();
350 400
351 unlock_page(page); 401 unlock_page(page);
352 page_cache_release(page); 402 page_cache_release(page);
@@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
358 408
359static void nfs_invalidate_page(struct page *page, unsigned long offset) 409static void nfs_invalidate_page(struct page *page, unsigned long offset)
360{ 410{
411 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
412
361 if (offset != 0) 413 if (offset != 0)
362 return; 414 return;
363 /* Cancel any unstarted writes on this page */ 415 /* Cancel any unstarted writes on this page */
@@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
366 418
367static int nfs_release_page(struct page *page, gfp_t gfp) 419static int nfs_release_page(struct page *page, gfp_t gfp)
368{ 420{
421 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
422
369 /* If PagePrivate() is set, then the page is not freeable */ 423 /* If PagePrivate() is set, then the page is not freeable */
370 return 0; 424 return 0;
371} 425}
372 426
373static int nfs_launder_page(struct page *page) 427static int nfs_launder_page(struct page *page)
374{ 428{
375 return nfs_wb_page(page->mapping->host, page); 429 struct inode *inode = page->mapping->host;
430
431 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
432 inode->i_ino, (long long)page_offset(page));
433
434 return nfs_wb_page(inode, page);
376} 435}
377 436
378const struct address_space_operations nfs_file_aops = { 437const struct address_space_operations nfs_file_aops = {
@@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = {
392static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 451static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
393{ 452{
394 struct file *filp = vma->vm_file; 453 struct file *filp = vma->vm_file;
454 struct dentry *dentry = filp->f_path.dentry;
395 unsigned pagelen; 455 unsigned pagelen;
396 int ret = -EINVAL; 456 int ret = -EINVAL;
397 struct address_space *mapping; 457 struct address_space *mapping;
398 458
459 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
460 dentry->d_parent->d_name.name, dentry->d_name.name,
461 filp->f_mapping->host->i_ino,
462 (long long)page_offset(page));
463
399 lock_page(page); 464 lock_page(page);
400 mapping = page->mapping; 465 mapping = page->mapping;
401 if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) 466 if (mapping != dentry->d_inode->i_mapping)
402 goto out_unlock; 467 goto out_unlock;
403 468
404 ret = 0; 469 ret = 0;
@@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
446 if (iocb->ki_filp->f_flags & O_DIRECT) 511 if (iocb->ki_filp->f_flags & O_DIRECT)
447 return nfs_file_direct_write(iocb, iov, nr_segs, pos); 512 return nfs_file_direct_write(iocb, iov, nr_segs, pos);
448 513
449 dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", 514 dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
450 dentry->d_parent->d_name.name, dentry->d_name.name, 515 dentry->d_parent->d_name.name, dentry->d_name.name,
451 inode->i_ino, (unsigned long) count, (long long) pos); 516 (unsigned long) count, (long long) pos);
452 517
453 result = -EBUSY; 518 result = -EBUSY;
454 if (IS_SWAPFILE(inode)) 519 if (IS_SWAPFILE(inode))
@@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
582 * This makes locking act as a cache coherency point. 647 * This makes locking act as a cache coherency point.
583 */ 648 */
584 nfs_sync_mapping(filp->f_mapping); 649 nfs_sync_mapping(filp->f_mapping);
585 nfs_zap_caches(inode); 650 if (!nfs_have_delegation(inode, FMODE_READ))
651 nfs_zap_caches(inode);
586out: 652out:
587 return status; 653 return status;
588} 654}
@@ -592,23 +658,35 @@ out:
592 */ 658 */
593static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) 659static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
594{ 660{
595 struct inode * inode = filp->f_mapping->host; 661 struct inode *inode = filp->f_mapping->host;
662 int ret = -ENOLCK;
596 663
597 dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", 664 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
598 inode->i_sb->s_id, inode->i_ino, 665 filp->f_path.dentry->d_parent->d_name.name,
666 filp->f_path.dentry->d_name.name,
599 fl->fl_type, fl->fl_flags, 667 fl->fl_type, fl->fl_flags,
600 (long long)fl->fl_start, (long long)fl->fl_end); 668 (long long)fl->fl_start, (long long)fl->fl_end);
669
601 nfs_inc_stats(inode, NFSIOS_VFSLOCK); 670 nfs_inc_stats(inode, NFSIOS_VFSLOCK);
602 671
603 /* No mandatory locks over NFS */ 672 /* No mandatory locks over NFS */
604 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 673 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
605 return -ENOLCK; 674 goto out_err;
675
676 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
677 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
678 if (ret < 0)
679 goto out_err;
680 }
606 681
607 if (IS_GETLK(cmd)) 682 if (IS_GETLK(cmd))
608 return do_getlk(filp, cmd, fl); 683 ret = do_getlk(filp, cmd, fl);
609 if (fl->fl_type == F_UNLCK) 684 else if (fl->fl_type == F_UNLCK)
610 return do_unlk(filp, cmd, fl); 685 ret = do_unlk(filp, cmd, fl);
611 return do_setlk(filp, cmd, fl); 686 else
687 ret = do_setlk(filp, cmd, fl);
688out_err:
689 return ret;
612} 690}
613 691
614/* 692/*
@@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
616 */ 694 */
617static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 695static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
618{ 696{
619 dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", 697 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
620 filp->f_path.dentry->d_inode->i_sb->s_id, 698 filp->f_path.dentry->d_parent->d_name.name,
621 filp->f_path.dentry->d_inode->i_ino, 699 filp->f_path.dentry->d_name.name,
622 fl->fl_type, fl->fl_flags); 700 fl->fl_type, fl->fl_flags);
623 701
624 /* 702 /*
@@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
641 return do_setlk(filp, cmd, fl); 719 return do_setlk(filp, cmd, fl);
642} 720}
643 721
722/*
723 * There is no protocol support for leases, so we have no way to implement
724 * them correctly in the face of opens by other clients.
725 */
644static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) 726static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
645{ 727{
646 /* 728 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
647 * There is no protocol support for leases, so we have no way 729 file->f_path.dentry->d_parent->d_name.name,
648 * to implement them correctly in the face of opens by other 730 file->f_path.dentry->d_name.name, arg);
649 * clients. 731
650 */
651 return -EINVAL; 732 return -EINVAL;
652} 733}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596c5d8e86f4..df23f987da6b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
57static void nfs_invalidate_inode(struct inode *); 57static void nfs_invalidate_inode(struct inode *);
58static int nfs_update_inode(struct inode *, struct nfs_fattr *); 58static int nfs_update_inode(struct inode *, struct nfs_fattr *);
59 59
60static void nfs_zap_acl_cache(struct inode *);
61
62static struct kmem_cache * nfs_inode_cachep; 60static struct kmem_cache * nfs_inode_cachep;
63 61
64static inline unsigned long 62static inline unsigned long
@@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
167 } 165 }
168} 166}
169 167
170static void nfs_zap_acl_cache(struct inode *inode) 168void nfs_zap_acl_cache(struct inode *inode)
171{ 169{
172 void (*clear_acl_cache)(struct inode *); 170 void (*clear_acl_cache)(struct inode *);
173 171
@@ -347,7 +345,7 @@ out_no_inode:
347 goto out; 345 goto out;
348} 346}
349 347
350#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) 348#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
351 349
352int 350int
353nfs_setattr(struct dentry *dentry, struct iattr *attr) 351nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
369 367
370 /* Optimization: if the end result is no change, don't RPC */ 368 /* Optimization: if the end result is no change, don't RPC */
371 attr->ia_valid &= NFS_VALID_ATTRS; 369 attr->ia_valid &= NFS_VALID_ATTRS;
372 if (attr->ia_valid == 0) 370 if ((attr->ia_valid & ~ATTR_FILE) == 0)
373 return 0; 371 return 0;
374 372
375 lock_kernel();
376 /* Write all dirty data */ 373 /* Write all dirty data */
377 if (S_ISREG(inode->i_mode)) { 374 if (S_ISREG(inode->i_mode)) {
378 filemap_write_and_wait(inode->i_mapping); 375 filemap_write_and_wait(inode->i_mapping);
@@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
386 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 383 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
387 if (error == 0) 384 if (error == 0)
388 nfs_refresh_inode(inode, &fattr); 385 nfs_refresh_inode(inode, &fattr);
389 unlock_kernel();
390 return error; 386 return error;
391} 387}
392 388
393/** 389/**
390 * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
391 * @inode: inode of the file used
392 * @offset: file offset to start truncating
393 *
394 * This is a copy of the common vmtruncate, but with the locking
395 * corrected to take into account the fact that NFS requires
396 * inode->i_size to be updated under the inode->i_lock.
397 */
398static int nfs_vmtruncate(struct inode * inode, loff_t offset)
399{
400 if (i_size_read(inode) < offset) {
401 unsigned long limit;
402
403 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
404 if (limit != RLIM_INFINITY && offset > limit)
405 goto out_sig;
406 if (offset > inode->i_sb->s_maxbytes)
407 goto out_big;
408 spin_lock(&inode->i_lock);
409 i_size_write(inode, offset);
410 spin_unlock(&inode->i_lock);
411 } else {
412 struct address_space *mapping = inode->i_mapping;
413
414 /*
415 * truncation of in-use swapfiles is disallowed - it would
416 * cause subsequent swapout to scribble on the now-freed
417 * blocks.
418 */
419 if (IS_SWAPFILE(inode))
420 return -ETXTBSY;
421 spin_lock(&inode->i_lock);
422 i_size_write(inode, offset);
423 spin_unlock(&inode->i_lock);
424
425 /*
426 * unmap_mapping_range is called twice, first simply for
427 * efficiency so that truncate_inode_pages does fewer
428 * single-page unmaps. However after this first call, and
429 * before truncate_inode_pages finishes, it is possible for
430 * private pages to be COWed, which remain after
431 * truncate_inode_pages finishes, hence the second
432 * unmap_mapping_range call must be made for correctness.
433 */
434 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
435 truncate_inode_pages(mapping, offset);
436 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
437 }
438 return 0;
439out_sig:
440 send_sig(SIGXFSZ, current, 0);
441out_big:
442 return -EFBIG;
443}
444
445/**
394 * nfs_setattr_update_inode - Update inode metadata after a setattr call. 446 * nfs_setattr_update_inode - Update inode metadata after a setattr call.
395 * @inode: pointer to struct inode 447 * @inode: pointer to struct inode
396 * @attr: pointer to struct iattr 448 * @attr: pointer to struct iattr
@@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
416 } 468 }
417 if ((attr->ia_valid & ATTR_SIZE) != 0) { 469 if ((attr->ia_valid & ATTR_SIZE) != 0) {
418 nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); 470 nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
419 inode->i_size = attr->ia_size; 471 nfs_vmtruncate(inode, attr->ia_size);
420 vmtruncate(inode, attr->ia_size);
421 } 472 }
422} 473}
423 474
@@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
647 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 698 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
648 699
649 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 700 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
650 lock_kernel();
651 if (is_bad_inode(inode)) 701 if (is_bad_inode(inode))
652 goto out_nowait; 702 goto out_nowait;
653 if (NFS_STALE(inode)) 703 if (NFS_STALE(inode))
@@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
696 nfs_wake_up_inode(inode); 746 nfs_wake_up_inode(inode);
697 747
698 out_nowait: 748 out_nowait:
699 unlock_kernel();
700 return status; 749 return status;
701} 750}
702 751
@@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
831 if (S_ISDIR(inode->i_mode)) 880 if (S_ISDIR(inode->i_mode))
832 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 881 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
833 } 882 }
834 if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && 883 if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
835 nfsi->npages == 0) 884 nfsi->npages == 0)
836 inode->i_size = nfs_size_to_loff_t(fattr->size); 885 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
837 } 886 }
838} 887}
839 888
@@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
974 (fattr->valid & NFS_ATTR_WCC) == 0) { 1023 (fattr->valid & NFS_ATTR_WCC) == 0) {
975 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 1024 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
976 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 1025 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
977 fattr->pre_size = inode->i_size; 1026 fattr->pre_size = i_size_read(inode);
978 fattr->valid |= NFS_ATTR_WCC; 1027 fattr->valid |= NFS_ATTR_WCC;
979 } 1028 }
980 return nfs_post_op_update_inode(inode, fattr); 1029 return nfs_post_op_update_inode(inode, fattr);
@@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1059 /* Do we perhaps have any outstanding writes, or has 1108 /* Do we perhaps have any outstanding writes, or has
1060 * the file grown beyond our last write? */ 1109 * the file grown beyond our last write? */
1061 if (nfsi->npages == 0 || new_isize > cur_isize) { 1110 if (nfsi->npages == 0 || new_isize > cur_isize) {
1062 inode->i_size = new_isize; 1111 i_size_write(inode, new_isize);
1063 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1112 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1064 } 1113 }
1065 dprintk("NFS: isize change on server for file %s/%ld\n", 1114 dprintk("NFS: isize change on server for file %s/%ld\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 04ae867dddba..24241fcbb98d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *);
150#ifdef CONFIG_NFS_V4 150#ifdef CONFIG_NFS_V4
151extern void nfs4_clear_inode(struct inode *); 151extern void nfs4_clear_inode(struct inode *);
152#endif 152#endif
153void nfs_zap_acl_cache(struct inode *inode);
153 154
154/* super.c */ 155/* super.c */
155extern struct file_system_type nfs_xdev_fs_type; 156extern struct file_system_type nfs_xdev_fs_type;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 6350ecbde589..a36952810032 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -5,135 +5,41 @@
5 * 5 *
6 * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> 6 * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
7 * 7 *
8 * NFS client per-mount statistics provide information about the health of
9 * the NFS client and the health of each NFS mount point. Generally these
10 * are not for detailed problem diagnosis, but simply to indicate that there
11 * is a problem.
12 *
13 * These counters are not meant to be human-readable, but are meant to be
14 * integrated into system monitoring tools such as "sar" and "iostat". As
15 * such, the counters are sampled by the tools over time, and are never
16 * zeroed after a file system is mounted. Moving averages can be computed
17 * by the tools by taking the difference between two instantaneous samples
18 * and dividing that by the time between the samples.
19 */ 8 */
20 9
21#ifndef _NFS_IOSTAT 10#ifndef _NFS_IOSTAT
22#define _NFS_IOSTAT 11#define _NFS_IOSTAT
23 12
24#define NFS_IOSTAT_VERS "1.0"
25
26/*
27 * NFS byte counters
28 *
29 * 1. SERVER - the number of payload bytes read from or written to the
30 * server by the NFS client via an NFS READ or WRITE request.
31 *
32 * 2. NORMAL - the number of bytes read or written by applications via
33 * the read(2) and write(2) system call interfaces.
34 *
35 * 3. DIRECT - the number of bytes read or written from files opened
36 * with the O_DIRECT flag.
37 *
38 * These counters give a view of the data throughput into and out of the NFS
39 * client. Comparing the number of bytes requested by an application with the
40 * number of bytes the client requests from the server can provide an
41 * indication of client efficiency (per-op, cache hits, etc).
42 *
43 * These counters can also help characterize which access methods are in
44 * use. DIRECT by itself shows whether there is any O_DIRECT traffic.
45 * NORMAL + DIRECT shows how much data is going through the system call
46 * interface. A large amount of SERVER traffic without much NORMAL or
47 * DIRECT traffic shows that applications are using mapped files.
48 *
49 * NFS page counters
50 *
51 * These count the number of pages read or written via nfs_readpage(),
52 * nfs_readpages(), or their write equivalents.
53 */
54enum nfs_stat_bytecounters {
55 NFSIOS_NORMALREADBYTES = 0,
56 NFSIOS_NORMALWRITTENBYTES,
57 NFSIOS_DIRECTREADBYTES,
58 NFSIOS_DIRECTWRITTENBYTES,
59 NFSIOS_SERVERREADBYTES,
60 NFSIOS_SERVERWRITTENBYTES,
61 NFSIOS_READPAGES,
62 NFSIOS_WRITEPAGES,
63 __NFSIOS_BYTESMAX,
64};
65
66/*
67 * NFS event counters
68 *
69 * These counters provide a low-overhead way of monitoring client activity
70 * without enabling NFS trace debugging. The counters show the rate at
71 * which VFS requests are made, and how often the client invalidates its
72 * data and attribute caches. This allows system administrators to monitor
73 * such things as how close-to-open is working, and answer questions such
74 * as "why are there so many GETATTR requests on the wire?"
75 *
76 * They also count anamolous events such as short reads and writes, silly
77 * renames due to close-after-delete, and operations that change the size
78 * of a file (such operations can often be the source of data corruption
79 * if applications aren't using file locking properly).
80 */
81enum nfs_stat_eventcounters {
82 NFSIOS_INODEREVALIDATE = 0,
83 NFSIOS_DENTRYREVALIDATE,
84 NFSIOS_DATAINVALIDATE,
85 NFSIOS_ATTRINVALIDATE,
86 NFSIOS_VFSOPEN,
87 NFSIOS_VFSLOOKUP,
88 NFSIOS_VFSACCESS,
89 NFSIOS_VFSUPDATEPAGE,
90 NFSIOS_VFSREADPAGE,
91 NFSIOS_VFSREADPAGES,
92 NFSIOS_VFSWRITEPAGE,
93 NFSIOS_VFSWRITEPAGES,
94 NFSIOS_VFSGETDENTS,
95 NFSIOS_VFSSETATTR,
96 NFSIOS_VFSFLUSH,
97 NFSIOS_VFSFSYNC,
98 NFSIOS_VFSLOCK,
99 NFSIOS_VFSRELEASE,
100 NFSIOS_CONGESTIONWAIT,
101 NFSIOS_SETATTRTRUNC,
102 NFSIOS_EXTENDWRITE,
103 NFSIOS_SILLYRENAME,
104 NFSIOS_SHORTREAD,
105 NFSIOS_SHORTWRITE,
106 NFSIOS_DELAY,
107 __NFSIOS_COUNTSMAX,
108};
109
110#ifdef __KERNEL__
111
112#include <linux/percpu.h> 13#include <linux/percpu.h>
113#include <linux/cache.h> 14#include <linux/cache.h>
15#include <linux/nfs_iostat.h>
114 16
115struct nfs_iostats { 17struct nfs_iostats {
116 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
117 unsigned long events[__NFSIOS_COUNTSMAX]; 19 unsigned long events[__NFSIOS_COUNTSMAX];
118} ____cacheline_aligned; 20} ____cacheline_aligned;
119 21
120static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) 22static inline void nfs_inc_server_stats(const struct nfs_server *server,
23 enum nfs_stat_eventcounters stat)
121{ 24{
122 struct nfs_iostats *iostats; 25 struct nfs_iostats *iostats;
123 int cpu; 26 int cpu;
124 27
125 cpu = get_cpu(); 28 cpu = get_cpu();
126 iostats = per_cpu_ptr(server->io_stats, cpu); 29 iostats = per_cpu_ptr(server->io_stats, cpu);
127 iostats->events[stat] ++; 30 iostats->events[stat]++;
128 put_cpu_no_resched(); 31 put_cpu_no_resched();
129} 32}
130 33
131static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) 34static inline void nfs_inc_stats(const struct inode *inode,
35 enum nfs_stat_eventcounters stat)
132{ 36{
133 nfs_inc_server_stats(NFS_SERVER(inode), stat); 37 nfs_inc_server_stats(NFS_SERVER(inode), stat);
134} 38}
135 39
136static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) 40static inline void nfs_add_server_stats(const struct nfs_server *server,
41 enum nfs_stat_bytecounters stat,
42 unsigned long addend)
137{ 43{
138 struct nfs_iostats *iostats; 44 struct nfs_iostats *iostats;
139 int cpu; 45 int cpu;
@@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat
144 put_cpu_no_resched(); 50 put_cpu_no_resched();
145} 51}
146 52
147static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) 53static inline void nfs_add_stats(const struct inode *inode,
54 enum nfs_stat_bytecounters stat,
55 unsigned long addend)
148{ 56{
149 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
150} 58}
@@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats)
160 free_percpu(stats); 68 free_percpu(stats);
161} 69}
162 70
163#endif 71#endif /* _NFS_IOSTAT */
164#endif
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9b7362565c0c..423842f51ac9 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -5,6 +5,8 @@
5#include <linux/posix_acl_xattr.h> 5#include <linux/posix_acl_xattr.h>
6#include <linux/nfsacl.h> 6#include <linux/nfsacl.h>
7 7
8#include "internal.h"
9
8#define NFSDBG_FACILITY NFSDBG_PROC 10#define NFSDBG_FACILITY NFSDBG_PROC
9 11
10ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) 12ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
205 status = nfs_revalidate_inode(server, inode); 207 status = nfs_revalidate_inode(server, inode);
206 if (status < 0) 208 if (status < 0)
207 return ERR_PTR(status); 209 return ERR_PTR(status);
210 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
211 nfs_zap_acl_cache(inode);
208 acl = nfs3_get_cached_acl(inode, type); 212 acl = nfs3_get_cached_acl(inode, type);
209 if (acl != ERR_PTR(-EAGAIN)) 213 if (acl != ERR_PTR(-EAGAIN))
210 return acl; 214 return acl;
@@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
319 dprintk("NFS call setacl\n"); 323 dprintk("NFS call setacl\n");
320 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 324 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
321 status = rpc_call_sync(server->client_acl, &msg, 0); 325 status = rpc_call_sync(server->client_acl, &msg, 0);
322 spin_lock(&inode->i_lock); 326 nfs_access_zap_cache(inode);
323 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; 327 nfs_zap_acl_cache(inode);
324 spin_unlock(&inode->i_lock);
325 dprintk("NFS reply setacl: %d\n", status); 328 dprintk("NFS reply setacl: %d\n", status);
326 329
327 /* pages may have been allocated at the xdr layer. */ 330 /* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c3523ad03ed1..1e750e4574a9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
129 int status; 129 int status;
130 130
131 dprintk("NFS call setattr\n"); 131 dprintk("NFS call setattr\n");
132 if (sattr->ia_valid & ATTR_FILE)
133 msg.rpc_cred = nfs_file_cred(sattr->ia_file);
132 nfs_fattr_init(fattr); 134 nfs_fattr_init(fattr);
133 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 135 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
134 if (status == 0) 136 if (status == 0)
@@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
248 return status; 250 return status;
249} 251}
250 252
253struct nfs3_createdata {
254 struct rpc_message msg;
255 union {
256 struct nfs3_createargs create;
257 struct nfs3_mkdirargs mkdir;
258 struct nfs3_symlinkargs symlink;
259 struct nfs3_mknodargs mknod;
260 } arg;
261 struct nfs3_diropres res;
262 struct nfs_fh fh;
263 struct nfs_fattr fattr;
264 struct nfs_fattr dir_attr;
265};
266
267static struct nfs3_createdata *nfs3_alloc_createdata(void)
268{
269 struct nfs3_createdata *data;
270
271 data = kzalloc(sizeof(*data), GFP_KERNEL);
272 if (data != NULL) {
273 data->msg.rpc_argp = &data->arg;
274 data->msg.rpc_resp = &data->res;
275 data->res.fh = &data->fh;
276 data->res.fattr = &data->fattr;
277 data->res.dir_attr = &data->dir_attr;
278 nfs_fattr_init(data->res.fattr);
279 nfs_fattr_init(data->res.dir_attr);
280 }
281 return data;
282}
283
284static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
285{
286 int status;
287
288 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
289 nfs_post_op_update_inode(dir, data->res.dir_attr);
290 if (status == 0)
291 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
292 return status;
293}
294
295static void nfs3_free_createdata(struct nfs3_createdata *data)
296{
297 kfree(data);
298}
299
251/* 300/*
252 * Create a regular file. 301 * Create a regular file.
253 * For now, we don't implement O_EXCL. 302 * For now, we don't implement O_EXCL.
@@ -256,70 +305,60 @@ static int
256nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 305nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
257 int flags, struct nameidata *nd) 306 int flags, struct nameidata *nd)
258{ 307{
259 struct nfs_fh fhandle; 308 struct nfs3_createdata *data;
260 struct nfs_fattr fattr;
261 struct nfs_fattr dir_attr;
262 struct nfs3_createargs arg = {
263 .fh = NFS_FH(dir),
264 .name = dentry->d_name.name,
265 .len = dentry->d_name.len,
266 .sattr = sattr,
267 };
268 struct nfs3_diropres res = {
269 .dir_attr = &dir_attr,
270 .fh = &fhandle,
271 .fattr = &fattr
272 };
273 struct rpc_message msg = {
274 .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE],
275 .rpc_argp = &arg,
276 .rpc_resp = &res,
277 };
278 mode_t mode = sattr->ia_mode; 309 mode_t mode = sattr->ia_mode;
279 int status; 310 int status = -ENOMEM;
280 311
281 dprintk("NFS call create %s\n", dentry->d_name.name); 312 dprintk("NFS call create %s\n", dentry->d_name.name);
282 arg.createmode = NFS3_CREATE_UNCHECKED; 313
314 data = nfs3_alloc_createdata();
315 if (data == NULL)
316 goto out;
317
318 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
319 data->arg.create.fh = NFS_FH(dir);
320 data->arg.create.name = dentry->d_name.name;
321 data->arg.create.len = dentry->d_name.len;
322 data->arg.create.sattr = sattr;
323
324 data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
283 if (flags & O_EXCL) { 325 if (flags & O_EXCL) {
284 arg.createmode = NFS3_CREATE_EXCLUSIVE; 326 data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
285 arg.verifier[0] = jiffies; 327 data->arg.create.verifier[0] = jiffies;
286 arg.verifier[1] = current->pid; 328 data->arg.create.verifier[1] = current->pid;
287 } 329 }
288 330
289 sattr->ia_mode &= ~current->fs->umask; 331 sattr->ia_mode &= ~current->fs->umask;
290 332
291again: 333 for (;;) {
292 nfs_fattr_init(&dir_attr); 334 status = nfs3_do_create(dir, dentry, data);
293 nfs_fattr_init(&fattr);
294 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
295 nfs_refresh_inode(dir, &dir_attr);
296 335
297 /* If the server doesn't support the exclusive creation semantics, 336 if (status != -ENOTSUPP)
298 * try again with simple 'guarded' mode. */ 337 break;
299 if (status == -ENOTSUPP) { 338 /* If the server doesn't support the exclusive creation
300 switch (arg.createmode) { 339 * semantics, try again with simple 'guarded' mode. */
340 switch (data->arg.create.createmode) {
301 case NFS3_CREATE_EXCLUSIVE: 341 case NFS3_CREATE_EXCLUSIVE:
302 arg.createmode = NFS3_CREATE_GUARDED; 342 data->arg.create.createmode = NFS3_CREATE_GUARDED;
303 break; 343 break;
304 344
305 case NFS3_CREATE_GUARDED: 345 case NFS3_CREATE_GUARDED:
306 arg.createmode = NFS3_CREATE_UNCHECKED; 346 data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
307 break; 347 break;
308 348
309 case NFS3_CREATE_UNCHECKED: 349 case NFS3_CREATE_UNCHECKED:
310 goto out; 350 goto out;
311 } 351 }
312 goto again; 352 nfs_fattr_init(data->res.dir_attr);
353 nfs_fattr_init(data->res.fattr);
313 } 354 }
314 355
315 if (status == 0)
316 status = nfs_instantiate(dentry, &fhandle, &fattr);
317 if (status != 0) 356 if (status != 0)
318 goto out; 357 goto out;
319 358
320 /* When we created the file with exclusive semantics, make 359 /* When we created the file with exclusive semantics, make
321 * sure we set the attributes afterwards. */ 360 * sure we set the attributes afterwards. */
322 if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { 361 if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
323 dprintk("NFS call setattr (post-create)\n"); 362 dprintk("NFS call setattr (post-create)\n");
324 363
325 if (!(sattr->ia_valid & ATTR_ATIME_SET)) 364 if (!(sattr->ia_valid & ATTR_ATIME_SET))
@@ -330,14 +369,15 @@ again:
330 /* Note: we could use a guarded setattr here, but I'm 369 /* Note: we could use a guarded setattr here, but I'm
331 * not sure this buys us anything (and I'd have 370 * not sure this buys us anything (and I'd have
332 * to revamp the NFSv3 XDR code) */ 371 * to revamp the NFSv3 XDR code) */
333 status = nfs3_proc_setattr(dentry, &fattr, sattr); 372 status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
334 nfs_post_op_update_inode(dentry->d_inode, &fattr); 373 nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
335 dprintk("NFS reply setattr (post-create): %d\n", status); 374 dprintk("NFS reply setattr (post-create): %d\n", status);
375 if (status != 0)
376 goto out;
336 } 377 }
337 if (status != 0)
338 goto out;
339 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 378 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
340out: 379out:
380 nfs3_free_createdata(data);
341 dprintk("NFS reply create: %d\n", status); 381 dprintk("NFS reply create: %d\n", status);
342 return status; 382 return status;
343} 383}
@@ -452,40 +492,28 @@ static int
452nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, 492nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
453 unsigned int len, struct iattr *sattr) 493 unsigned int len, struct iattr *sattr)
454{ 494{
455 struct nfs_fh fhandle; 495 struct nfs3_createdata *data;
456 struct nfs_fattr fattr, dir_attr; 496 int status = -ENOMEM;
457 struct nfs3_symlinkargs arg = {
458 .fromfh = NFS_FH(dir),
459 .fromname = dentry->d_name.name,
460 .fromlen = dentry->d_name.len,
461 .pages = &page,
462 .pathlen = len,
463 .sattr = sattr
464 };
465 struct nfs3_diropres res = {
466 .dir_attr = &dir_attr,
467 .fh = &fhandle,
468 .fattr = &fattr
469 };
470 struct rpc_message msg = {
471 .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK],
472 .rpc_argp = &arg,
473 .rpc_resp = &res,
474 };
475 int status;
476 497
477 if (len > NFS3_MAXPATHLEN) 498 if (len > NFS3_MAXPATHLEN)
478 return -ENAMETOOLONG; 499 return -ENAMETOOLONG;
479 500
480 dprintk("NFS call symlink %s\n", dentry->d_name.name); 501 dprintk("NFS call symlink %s\n", dentry->d_name.name);
481 502
482 nfs_fattr_init(&dir_attr); 503 data = nfs3_alloc_createdata();
483 nfs_fattr_init(&fattr); 504 if (data == NULL)
484 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
485 nfs_post_op_update_inode(dir, &dir_attr);
486 if (status != 0)
487 goto out; 505 goto out;
488 status = nfs_instantiate(dentry, &fhandle, &fattr); 506 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
507 data->arg.symlink.fromfh = NFS_FH(dir);
508 data->arg.symlink.fromname = dentry->d_name.name;
509 data->arg.symlink.fromlen = dentry->d_name.len;
510 data->arg.symlink.pages = &page;
511 data->arg.symlink.pathlen = len;
512 data->arg.symlink.sattr = sattr;
513
514 status = nfs3_do_create(dir, dentry, data);
515
516 nfs3_free_createdata(data);
489out: 517out:
490 dprintk("NFS reply symlink: %d\n", status); 518 dprintk("NFS reply symlink: %d\n", status);
491 return status; 519 return status;
@@ -494,42 +522,31 @@ out:
494static int 522static int
495nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 523nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
496{ 524{
497 struct nfs_fh fhandle; 525 struct nfs3_createdata *data;
498 struct nfs_fattr fattr, dir_attr;
499 struct nfs3_mkdirargs arg = {
500 .fh = NFS_FH(dir),
501 .name = dentry->d_name.name,
502 .len = dentry->d_name.len,
503 .sattr = sattr
504 };
505 struct nfs3_diropres res = {
506 .dir_attr = &dir_attr,
507 .fh = &fhandle,
508 .fattr = &fattr
509 };
510 struct rpc_message msg = {
511 .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR],
512 .rpc_argp = &arg,
513 .rpc_resp = &res,
514 };
515 int mode = sattr->ia_mode; 526 int mode = sattr->ia_mode;
516 int status; 527 int status = -ENOMEM;
517 528
518 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 529 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
519 530
520 sattr->ia_mode &= ~current->fs->umask; 531 sattr->ia_mode &= ~current->fs->umask;
521 532
522 nfs_fattr_init(&dir_attr); 533 data = nfs3_alloc_createdata();
523 nfs_fattr_init(&fattr); 534 if (data == NULL)
524 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
525 nfs_post_op_update_inode(dir, &dir_attr);
526 if (status != 0)
527 goto out; 535 goto out;
528 status = nfs_instantiate(dentry, &fhandle, &fattr); 536
537 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
538 data->arg.mkdir.fh = NFS_FH(dir);
539 data->arg.mkdir.name = dentry->d_name.name;
540 data->arg.mkdir.len = dentry->d_name.len;
541 data->arg.mkdir.sattr = sattr;
542
543 status = nfs3_do_create(dir, dentry, data);
529 if (status != 0) 544 if (status != 0)
530 goto out; 545 goto out;
546
531 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 547 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
532out: 548out:
549 nfs3_free_createdata(data);
533 dprintk("NFS reply mkdir: %d\n", status); 550 dprintk("NFS reply mkdir: %d\n", status);
534 return status; 551 return status;
535} 552}
@@ -615,52 +632,50 @@ static int
615nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 632nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
616 dev_t rdev) 633 dev_t rdev)
617{ 634{
618 struct nfs_fh fh; 635 struct nfs3_createdata *data;
619 struct nfs_fattr fattr, dir_attr;
620 struct nfs3_mknodargs arg = {
621 .fh = NFS_FH(dir),
622 .name = dentry->d_name.name,
623 .len = dentry->d_name.len,
624 .sattr = sattr,
625 .rdev = rdev
626 };
627 struct nfs3_diropres res = {
628 .dir_attr = &dir_attr,
629 .fh = &fh,
630 .fattr = &fattr
631 };
632 struct rpc_message msg = {
633 .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD],
634 .rpc_argp = &arg,
635 .rpc_resp = &res,
636 };
637 mode_t mode = sattr->ia_mode; 636 mode_t mode = sattr->ia_mode;
638 int status; 637 int status = -ENOMEM;
639
640 switch (sattr->ia_mode & S_IFMT) {
641 case S_IFBLK: arg.type = NF3BLK; break;
642 case S_IFCHR: arg.type = NF3CHR; break;
643 case S_IFIFO: arg.type = NF3FIFO; break;
644 case S_IFSOCK: arg.type = NF3SOCK; break;
645 default: return -EINVAL;
646 }
647 638
648 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
649 MAJOR(rdev), MINOR(rdev)); 640 MAJOR(rdev), MINOR(rdev));
650 641
651 sattr->ia_mode &= ~current->fs->umask; 642 sattr->ia_mode &= ~current->fs->umask;
652 643
653 nfs_fattr_init(&dir_attr); 644 data = nfs3_alloc_createdata();
654 nfs_fattr_init(&fattr); 645 if (data == NULL)
655 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
656 nfs_post_op_update_inode(dir, &dir_attr);
657 if (status != 0)
658 goto out; 646 goto out;
659 status = nfs_instantiate(dentry, &fh, &fattr); 647
648 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
649 data->arg.mknod.fh = NFS_FH(dir);
650 data->arg.mknod.name = dentry->d_name.name;
651 data->arg.mknod.len = dentry->d_name.len;
652 data->arg.mknod.sattr = sattr;
653 data->arg.mknod.rdev = rdev;
654
655 switch (sattr->ia_mode & S_IFMT) {
656 case S_IFBLK:
657 data->arg.mknod.type = NF3BLK;
658 break;
659 case S_IFCHR:
660 data->arg.mknod.type = NF3CHR;
661 break;
662 case S_IFIFO:
663 data->arg.mknod.type = NF3FIFO;
664 break;
665 case S_IFSOCK:
666 data->arg.mknod.type = NF3SOCK;
667 break;
668 default:
669 status = -EINVAL;
670 goto out;
671 }
672
673 status = nfs3_do_create(dir, dentry, data);
660 if (status != 0) 674 if (status != 0)
661 goto out; 675 goto out;
662 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 676 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
663out: 677out:
678 nfs3_free_createdata(data);
664 dprintk("NFS reply mknod: %d\n", status); 679 dprintk("NFS reply mknod: %d\n", status);
665 return status; 680 return status;
666} 681}
@@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
801 .write_done = nfs3_write_done, 816 .write_done = nfs3_write_done,
802 .commit_setup = nfs3_proc_commit_setup, 817 .commit_setup = nfs3_proc_commit_setup,
803 .commit_done = nfs3_commit_done, 818 .commit_done = nfs3_commit_done,
804 .file_open = nfs_open,
805 .file_release = nfs_release,
806 .lock = nfs3_proc_lock, 819 .lock = nfs3_proc_lock,
807 .clear_acl_cache = nfs3_forget_cached_acls, 820 .clear_acl_cache = nfs3_forget_cached_acls,
808}; 821};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1293e0acd82b..c910413eaeca 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
451 /* Save the delegation */ 451 /* Save the delegation */
452 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); 452 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
453 rcu_read_unlock(); 453 rcu_read_unlock();
454 lock_kernel();
455 ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); 454 ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
456 unlock_kernel();
457 if (ret != 0) 455 if (ret != 0)
458 goto out; 456 goto out;
459 ret = -EAGAIN; 457 ret = -EAGAIN;
@@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int
1139 return res; 1137 return res;
1140} 1138}
1141 1139
1142static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, 1140static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1143 struct iattr *sattr, struct nfs4_state *state) 1141 struct nfs_fattr *fattr, struct iattr *sattr,
1142 struct nfs4_state *state)
1144{ 1143{
1145 struct nfs_server *server = NFS_SERVER(inode); 1144 struct nfs_server *server = NFS_SERVER(inode);
1146 struct nfs_setattrargs arg = { 1145 struct nfs_setattrargs arg = {
@@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1154 .server = server, 1153 .server = server,
1155 }; 1154 };
1156 struct rpc_message msg = { 1155 struct rpc_message msg = {
1157 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], 1156 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
1158 .rpc_argp = &arg, 1157 .rpc_argp = &arg,
1159 .rpc_resp = &res, 1158 .rpc_resp = &res,
1159 .rpc_cred = cred,
1160 }; 1160 };
1161 unsigned long timestamp = jiffies; 1161 unsigned long timestamp = jiffies;
1162 int status; 1162 int status;
@@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1166 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { 1166 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
1167 /* Use that stateid */ 1167 /* Use that stateid */
1168 } else if (state != NULL) { 1168 } else if (state != NULL) {
1169 msg.rpc_cred = state->owner->so_cred;
1170 nfs4_copy_stateid(&arg.stateid, state, current->files); 1169 nfs4_copy_stateid(&arg.stateid, state, current->files);
1171 } else 1170 } else
1172 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1171 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1177 return status; 1176 return status;
1178} 1177}
1179 1178
1180static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, 1179static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1181 struct iattr *sattr, struct nfs4_state *state) 1180 struct nfs_fattr *fattr, struct iattr *sattr,
1181 struct nfs4_state *state)
1182{ 1182{
1183 struct nfs_server *server = NFS_SERVER(inode); 1183 struct nfs_server *server = NFS_SERVER(inode);
1184 struct nfs4_exception exception = { }; 1184 struct nfs4_exception exception = { };
1185 int err; 1185 int err;
1186 do { 1186 do {
1187 err = nfs4_handle_exception(server, 1187 err = nfs4_handle_exception(server,
1188 _nfs4_do_setattr(inode, fattr, sattr, state), 1188 _nfs4_do_setattr(inode, cred, fattr, sattr, state),
1189 &exception); 1189 &exception);
1190 } while (exception.retry); 1190 } while (exception.retry);
1191 return err; 1191 return err;
@@ -1647,29 +1647,25 @@ static int
1647nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, 1647nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
1648 struct iattr *sattr) 1648 struct iattr *sattr)
1649{ 1649{
1650 struct rpc_cred *cred;
1651 struct inode *inode = dentry->d_inode; 1650 struct inode *inode = dentry->d_inode;
1652 struct nfs_open_context *ctx; 1651 struct rpc_cred *cred = NULL;
1653 struct nfs4_state *state = NULL; 1652 struct nfs4_state *state = NULL;
1654 int status; 1653 int status;
1655 1654
1656 nfs_fattr_init(fattr); 1655 nfs_fattr_init(fattr);
1657 1656
1658 cred = rpc_lookup_cred();
1659 if (IS_ERR(cred))
1660 return PTR_ERR(cred);
1661
1662 /* Search for an existing open(O_WRITE) file */ 1657 /* Search for an existing open(O_WRITE) file */
1663 ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); 1658 if (sattr->ia_valid & ATTR_FILE) {
1664 if (ctx != NULL) 1659 struct nfs_open_context *ctx;
1660
1661 ctx = nfs_file_open_context(sattr->ia_file);
1662 cred = ctx->cred;
1665 state = ctx->state; 1663 state = ctx->state;
1664 }
1666 1665
1667 status = nfs4_do_setattr(inode, fattr, sattr, state); 1666 status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
1668 if (status == 0) 1667 if (status == 0)
1669 nfs_setattr_update_inode(inode, sattr); 1668 nfs_setattr_update_inode(inode, sattr);
1670 if (ctx != NULL)
1671 put_nfs_open_context(ctx);
1672 put_rpccred(cred);
1673 return status; 1669 return status;
1674} 1670}
1675 1671
@@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1897 goto out; 1893 goto out;
1898 } 1894 }
1899 state = nfs4_do_open(dir, &path, flags, sattr, cred); 1895 state = nfs4_do_open(dir, &path, flags, sattr, cred);
1900 put_rpccred(cred);
1901 d_drop(dentry); 1896 d_drop(dentry);
1902 if (IS_ERR(state)) { 1897 if (IS_ERR(state)) {
1903 status = PTR_ERR(state); 1898 status = PTR_ERR(state);
1904 goto out; 1899 goto out_putcred;
1905 } 1900 }
1906 d_add(dentry, igrab(state->inode)); 1901 d_add(dentry, igrab(state->inode));
1907 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1902 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1908 if (flags & O_EXCL) { 1903 if (flags & O_EXCL) {
1909 struct nfs_fattr fattr; 1904 struct nfs_fattr fattr;
1910 status = nfs4_do_setattr(state->inode, &fattr, sattr, state); 1905 status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
1911 if (status == 0) 1906 if (status == 0)
1912 nfs_setattr_update_inode(state->inode, sattr); 1907 nfs_setattr_update_inode(state->inode, sattr);
1913 nfs_post_op_update_inode(state->inode, &fattr); 1908 nfs_post_op_update_inode(state->inode, &fattr);
@@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1916 status = nfs4_intent_set_file(nd, &path, state); 1911 status = nfs4_intent_set_file(nd, &path, state);
1917 else 1912 else
1918 nfs4_close_sync(&path, state, flags); 1913 nfs4_close_sync(&path, state, flags);
1914out_putcred:
1915 put_rpccred(cred);
1919out: 1916out:
1920 return status; 1917 return status;
1921} 1918}
@@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
2079 return err; 2076 return err;
2080} 2077}
2081 2078
2079struct nfs4_createdata {
2080 struct rpc_message msg;
2081 struct nfs4_create_arg arg;
2082 struct nfs4_create_res res;
2083 struct nfs_fh fh;
2084 struct nfs_fattr fattr;
2085 struct nfs_fattr dir_fattr;
2086};
2087
2088static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2089 struct qstr *name, struct iattr *sattr, u32 ftype)
2090{
2091 struct nfs4_createdata *data;
2092
2093 data = kzalloc(sizeof(*data), GFP_KERNEL);
2094 if (data != NULL) {
2095 struct nfs_server *server = NFS_SERVER(dir);
2096
2097 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
2098 data->msg.rpc_argp = &data->arg;
2099 data->msg.rpc_resp = &data->res;
2100 data->arg.dir_fh = NFS_FH(dir);
2101 data->arg.server = server;
2102 data->arg.name = name;
2103 data->arg.attrs = sattr;
2104 data->arg.ftype = ftype;
2105 data->arg.bitmask = server->attr_bitmask;
2106 data->res.server = server;
2107 data->res.fh = &data->fh;
2108 data->res.fattr = &data->fattr;
2109 data->res.dir_fattr = &data->dir_fattr;
2110 nfs_fattr_init(data->res.fattr);
2111 nfs_fattr_init(data->res.dir_fattr);
2112 }
2113 return data;
2114}
2115
2116static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2117{
2118 int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
2119 if (status == 0) {
2120 update_changeattr(dir, &data->res.dir_cinfo);
2121 nfs_post_op_update_inode(dir, data->res.dir_fattr);
2122 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
2123 }
2124 return status;
2125}
2126
2127static void nfs4_free_createdata(struct nfs4_createdata *data)
2128{
2129 kfree(data);
2130}
2131
2082static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, 2132static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
2083 struct page *page, unsigned int len, struct iattr *sattr) 2133 struct page *page, unsigned int len, struct iattr *sattr)
2084{ 2134{
2085 struct nfs_server *server = NFS_SERVER(dir); 2135 struct nfs4_createdata *data;
2086 struct nfs_fh fhandle; 2136 int status = -ENAMETOOLONG;
2087 struct nfs_fattr fattr, dir_fattr;
2088 struct nfs4_create_arg arg = {
2089 .dir_fh = NFS_FH(dir),
2090 .server = server,
2091 .name = &dentry->d_name,
2092 .attrs = sattr,
2093 .ftype = NF4LNK,
2094 .bitmask = server->attr_bitmask,
2095 };
2096 struct nfs4_create_res res = {
2097 .server = server,
2098 .fh = &fhandle,
2099 .fattr = &fattr,
2100 .dir_fattr = &dir_fattr,
2101 };
2102 struct rpc_message msg = {
2103 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
2104 .rpc_argp = &arg,
2105 .rpc_resp = &res,
2106 };
2107 int status;
2108 2137
2109 if (len > NFS4_MAXPATHLEN) 2138 if (len > NFS4_MAXPATHLEN)
2110 return -ENAMETOOLONG; 2139 goto out;
2111 2140
2112 arg.u.symlink.pages = &page; 2141 status = -ENOMEM;
2113 arg.u.symlink.len = len; 2142 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
2114 nfs_fattr_init(&fattr); 2143 if (data == NULL)
2115 nfs_fattr_init(&dir_fattr); 2144 goto out;
2145
2146 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
2147 data->arg.u.symlink.pages = &page;
2148 data->arg.u.symlink.len = len;
2116 2149
2117 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2150 status = nfs4_do_create(dir, dentry, data);
2118 if (!status) { 2151
2119 update_changeattr(dir, &res.dir_cinfo); 2152 nfs4_free_createdata(data);
2120 nfs_post_op_update_inode(dir, res.dir_fattr); 2153out:
2121 status = nfs_instantiate(dentry, &fhandle, &fattr);
2122 }
2123 return status; 2154 return status;
2124} 2155}
2125 2156
@@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
2140static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, 2171static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2141 struct iattr *sattr) 2172 struct iattr *sattr)
2142{ 2173{
2143 struct nfs_server *server = NFS_SERVER(dir); 2174 struct nfs4_createdata *data;
2144 struct nfs_fh fhandle; 2175 int status = -ENOMEM;
2145 struct nfs_fattr fattr, dir_fattr;
2146 struct nfs4_create_arg arg = {
2147 .dir_fh = NFS_FH(dir),
2148 .server = server,
2149 .name = &dentry->d_name,
2150 .attrs = sattr,
2151 .ftype = NF4DIR,
2152 .bitmask = server->attr_bitmask,
2153 };
2154 struct nfs4_create_res res = {
2155 .server = server,
2156 .fh = &fhandle,
2157 .fattr = &fattr,
2158 .dir_fattr = &dir_fattr,
2159 };
2160 struct rpc_message msg = {
2161 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
2162 .rpc_argp = &arg,
2163 .rpc_resp = &res,
2164 };
2165 int status;
2166 2176
2167 nfs_fattr_init(&fattr); 2177 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
2168 nfs_fattr_init(&dir_fattr); 2178 if (data == NULL)
2169 2179 goto out;
2170 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2180
2171 if (!status) { 2181 status = nfs4_do_create(dir, dentry, data);
2172 update_changeattr(dir, &res.dir_cinfo); 2182
2173 nfs_post_op_update_inode(dir, res.dir_fattr); 2183 nfs4_free_createdata(data);
2174 status = nfs_instantiate(dentry, &fhandle, &fattr); 2184out:
2175 }
2176 return status; 2185 return status;
2177} 2186}
2178 2187
@@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2242static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, 2251static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2243 struct iattr *sattr, dev_t rdev) 2252 struct iattr *sattr, dev_t rdev)
2244{ 2253{
2245 struct nfs_server *server = NFS_SERVER(dir); 2254 struct nfs4_createdata *data;
2246 struct nfs_fh fh; 2255 int mode = sattr->ia_mode;
2247 struct nfs_fattr fattr, dir_fattr; 2256 int status = -ENOMEM;
2248 struct nfs4_create_arg arg = {
2249 .dir_fh = NFS_FH(dir),
2250 .server = server,
2251 .name = &dentry->d_name,
2252 .attrs = sattr,
2253 .bitmask = server->attr_bitmask,
2254 };
2255 struct nfs4_create_res res = {
2256 .server = server,
2257 .fh = &fh,
2258 .fattr = &fattr,
2259 .dir_fattr = &dir_fattr,
2260 };
2261 struct rpc_message msg = {
2262 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
2263 .rpc_argp = &arg,
2264 .rpc_resp = &res,
2265 };
2266 int status;
2267 int mode = sattr->ia_mode;
2268
2269 nfs_fattr_init(&fattr);
2270 nfs_fattr_init(&dir_fattr);
2271 2257
2272 BUG_ON(!(sattr->ia_valid & ATTR_MODE)); 2258 BUG_ON(!(sattr->ia_valid & ATTR_MODE));
2273 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); 2259 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
2260
2261 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
2262 if (data == NULL)
2263 goto out;
2264
2274 if (S_ISFIFO(mode)) 2265 if (S_ISFIFO(mode))
2275 arg.ftype = NF4FIFO; 2266 data->arg.ftype = NF4FIFO;
2276 else if (S_ISBLK(mode)) { 2267 else if (S_ISBLK(mode)) {
2277 arg.ftype = NF4BLK; 2268 data->arg.ftype = NF4BLK;
2278 arg.u.device.specdata1 = MAJOR(rdev); 2269 data->arg.u.device.specdata1 = MAJOR(rdev);
2279 arg.u.device.specdata2 = MINOR(rdev); 2270 data->arg.u.device.specdata2 = MINOR(rdev);
2280 } 2271 }
2281 else if (S_ISCHR(mode)) { 2272 else if (S_ISCHR(mode)) {
2282 arg.ftype = NF4CHR; 2273 data->arg.ftype = NF4CHR;
2283 arg.u.device.specdata1 = MAJOR(rdev); 2274 data->arg.u.device.specdata1 = MAJOR(rdev);
2284 arg.u.device.specdata2 = MINOR(rdev); 2275 data->arg.u.device.specdata2 = MINOR(rdev);
2285 } 2276 }
2286 else
2287 arg.ftype = NF4SOCK;
2288 2277
2289 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2278 status = nfs4_do_create(dir, dentry, data);
2290 if (status == 0) { 2279
2291 update_changeattr(dir, &res.dir_cinfo); 2280 nfs4_free_createdata(data);
2292 nfs_post_op_update_inode(dir, res.dir_fattr); 2281out:
2293 status = nfs_instantiate(dentry, &fh, &fattr);
2294 }
2295 return status; 2282 return status;
2296} 2283}
2297 2284
@@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
2706 ret = nfs_revalidate_inode(server, inode); 2693 ret = nfs_revalidate_inode(server, inode);
2707 if (ret < 0) 2694 if (ret < 0)
2708 return ret; 2695 return ret;
2696 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
2697 nfs_zap_acl_cache(inode);
2709 ret = nfs4_read_cached_acl(inode, buf, buflen); 2698 ret = nfs4_read_cached_acl(inode, buf, buflen);
2710 if (ret != -ENOENT) 2699 if (ret != -ENOENT)
2711 return ret; 2700 return ret;
@@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2733 nfs_inode_return_delegation(inode); 2722 nfs_inode_return_delegation(inode);
2734 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 2723 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2735 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2724 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
2736 nfs_zap_caches(inode); 2725 nfs_access_zap_cache(inode);
2726 nfs_zap_acl_cache(inode);
2737 return ret; 2727 return ret;
2738} 2728}
2739 2729
@@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2767 task->tk_status = 0; 2757 task->tk_status = 0;
2768 return -EAGAIN; 2758 return -EAGAIN;
2769 case -NFS4ERR_DELAY: 2759 case -NFS4ERR_DELAY:
2770 nfs_inc_server_stats((struct nfs_server *) server, 2760 nfs_inc_server_stats(server, NFSIOS_DELAY);
2771 NFSIOS_DELAY);
2772 case -NFS4ERR_GRACE: 2761 case -NFS4ERR_GRACE:
2773 rpc_delay(task, NFS4_POLL_RETRY_MAX); 2762 rpc_delay(task, NFS4_POLL_RETRY_MAX);
2774 task->tk_status = 0; 2763 task->tk_status = 0;
@@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
2933 2922
2934int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 2923int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
2935{ 2924{
2936 long timeout; 2925 long timeout = 0;
2937 int err; 2926 int err;
2938 do { 2927 do {
2939 err = _nfs4_proc_setclientid_confirm(clp, cred); 2928 err = _nfs4_proc_setclientid_confirm(clp, cred);
@@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
3725 .write_done = nfs4_write_done, 3714 .write_done = nfs4_write_done,
3726 .commit_setup = nfs4_proc_commit_setup, 3715 .commit_setup = nfs4_proc_commit_setup,
3727 .commit_done = nfs4_commit_done, 3716 .commit_done = nfs4_commit_done,
3728 .file_open = nfs_open,
3729 .file_release = nfs_release,
3730 .lock = nfs4_proc_lock, 3717 .lock = nfs4_proc_lock,
3731 .clear_acl_cache = nfs4_zap_acl_attr, 3718 .clear_acl_cache = nfs4_zap_acl_attr,
3732}; 3719};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 856a8934f610..401ef8b28f97 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -940,7 +940,6 @@ static int reclaimer(void *ptr)
940 allow_signal(SIGKILL); 940 allow_signal(SIGKILL);
941 941
942 /* Ensure exclusive access to NFSv4 state */ 942 /* Ensure exclusive access to NFSv4 state */
943 lock_kernel();
944 down_write(&clp->cl_sem); 943 down_write(&clp->cl_sem);
945 /* Are there any NFS mounts out there? */ 944 /* Are there any NFS mounts out there? */
946 if (list_empty(&clp->cl_superblocks)) 945 if (list_empty(&clp->cl_superblocks))
@@ -1000,7 +999,6 @@ restart_loop:
1000 nfs_delegation_reap_unclaimed(clp); 999 nfs_delegation_reap_unclaimed(clp);
1001out: 1000out:
1002 up_write(&clp->cl_sem); 1001 up_write(&clp->cl_sem);
1003 unlock_kernel();
1004 if (status == -NFS4ERR_CB_PATH_DOWN) 1002 if (status == -NFS4ERR_CB_PATH_DOWN)
1005 nfs_handle_cb_pathdown(clp); 1003 nfs_handle_cb_pathdown(clp);
1006 nfs4_clear_recover_bit(clp); 1004 nfs4_clear_recover_bit(clp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d36823..46763d1cd397 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $
3 *
4 * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> 2 * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
5 * 3 *
6 * Allow an NFS filesystem to be mounted as root. The way this works is: 4 * Allow an NFS filesystem to be mounted as root. The way this works is:
@@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name)
297 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 295 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
298 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 296 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
299 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; 297 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
300 nfs_data.acregmin = 3; 298 nfs_data.acregmin = NFS_DEF_ACREGMIN;
301 nfs_data.acregmax = 60; 299 nfs_data.acregmax = NFS_DEF_ACREGMAX;
302 nfs_data.acdirmin = 30; 300 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
303 nfs_data.acdirmax = 60; 301 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
304 strcpy(buf, NFS_ROOT); 302 strcpy(buf, NFS_ROOT);
305 303
306 /* Process options received from the remote server */ 304 /* Process options received from the remote server */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 03599bfe81cf..4dbb84df1b68 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
129 sattr->ia_mode &= S_IALLUGO; 129 sattr->ia_mode &= S_IALLUGO;
130 130
131 dprintk("NFS call setattr\n"); 131 dprintk("NFS call setattr\n");
132 if (sattr->ia_valid & ATTR_FILE)
133 msg.rpc_cred = nfs_file_cred(sattr->ia_file);
132 nfs_fattr_init(fattr); 134 nfs_fattr_init(fattr);
133 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 135 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
134 if (status == 0) 136 if (status == 0)
@@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
598 return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); 600 return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
599} 601}
600 602
603/* Helper functions for NFS lock bounds checking */
604#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
605static int nfs_lock_check_bounds(const struct file_lock *fl)
606{
607 __s32 start, end;
608
609 start = (__s32)fl->fl_start;
610 if ((loff_t)start != fl->fl_start)
611 goto out_einval;
612
613 if (fl->fl_end != OFFSET_MAX) {
614 end = (__s32)fl->fl_end;
615 if ((loff_t)end != fl->fl_end)
616 goto out_einval;
617 } else
618 end = NFS_LOCK32_OFFSET_MAX;
619
620 if (start < 0 || start > end)
621 goto out_einval;
622 return 0;
623out_einval:
624 return -EINVAL;
625}
601 626
602const struct nfs_rpc_ops nfs_v2_clientops = { 627const struct nfs_rpc_ops nfs_v2_clientops = {
603 .version = 2, /* protocol version */ 628 .version = 2, /* protocol version */
@@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
630 .write_setup = nfs_proc_write_setup, 655 .write_setup = nfs_proc_write_setup,
631 .write_done = nfs_write_done, 656 .write_done = nfs_write_done,
632 .commit_setup = nfs_proc_commit_setup, 657 .commit_setup = nfs_proc_commit_setup,
633 .file_open = nfs_open,
634 .file_release = nfs_release,
635 .lock = nfs_proc_lock, 658 .lock = nfs_proc_lock,
659 .lock_check_bounds = nfs_lock_check_bounds,
636}; 660};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 614efeed5437..1b94e3650f5c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
47#include <linux/inet.h> 47#include <linux/inet.h>
48#include <linux/in6.h> 48#include <linux/in6.h>
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <linux/netdevice.h>
50#include <linux/nfs_xdr.h> 51#include <linux/nfs_xdr.h>
51#include <linux/magic.h> 52#include <linux/magic.h>
52#include <linux/parser.h> 53#include <linux/parser.h>
@@ -65,7 +66,6 @@
65enum { 66enum {
66 /* Mount options that take no arguments */ 67 /* Mount options that take no arguments */
67 Opt_soft, Opt_hard, 68 Opt_soft, Opt_hard,
68 Opt_intr, Opt_nointr,
69 Opt_posix, Opt_noposix, 69 Opt_posix, Opt_noposix,
70 Opt_cto, Opt_nocto, 70 Opt_cto, Opt_nocto,
71 Opt_ac, Opt_noac, 71 Opt_ac, Opt_noac,
@@ -92,8 +92,8 @@ enum {
92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
93 Opt_addr, Opt_mountaddr, Opt_clientaddr, 93 Opt_addr, Opt_mountaddr, Opt_clientaddr,
94 94
95 /* Mount options that are ignored */ 95 /* Special mount options */
96 Opt_userspace, Opt_deprecated, 96 Opt_userspace, Opt_deprecated, Opt_sloppy,
97 97
98 Opt_err 98 Opt_err
99}; 99};
@@ -101,10 +101,14 @@ enum {
101static match_table_t nfs_mount_option_tokens = { 101static match_table_t nfs_mount_option_tokens = {
102 { Opt_userspace, "bg" }, 102 { Opt_userspace, "bg" },
103 { Opt_userspace, "fg" }, 103 { Opt_userspace, "fg" },
104 { Opt_userspace, "retry=%s" },
105
106 { Opt_sloppy, "sloppy" },
107
104 { Opt_soft, "soft" }, 108 { Opt_soft, "soft" },
105 { Opt_hard, "hard" }, 109 { Opt_hard, "hard" },
106 { Opt_intr, "intr" }, 110 { Opt_deprecated, "intr" },
107 { Opt_nointr, "nointr" }, 111 { Opt_deprecated, "nointr" },
108 { Opt_posix, "posix" }, 112 { Opt_posix, "posix" },
109 { Opt_noposix, "noposix" }, 113 { Opt_noposix, "noposix" },
110 { Opt_cto, "cto" }, 114 { Opt_cto, "cto" },
@@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = {
136 { Opt_acdirmin, "acdirmin=%u" }, 140 { Opt_acdirmin, "acdirmin=%u" },
137 { Opt_acdirmax, "acdirmax=%u" }, 141 { Opt_acdirmax, "acdirmax=%u" },
138 { Opt_actimeo, "actimeo=%u" }, 142 { Opt_actimeo, "actimeo=%u" },
139 { Opt_userspace, "retry=%u" },
140 { Opt_namelen, "namlen=%u" }, 143 { Opt_namelen, "namlen=%u" },
141 { Opt_mountport, "mountport=%u" }, 144 { Opt_mountport, "mountport=%u" },
142 { Opt_mountvers, "mountvers=%u" }, 145 { Opt_mountvers, "mountvers=%u" },
@@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type,
207 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 210 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
208static void nfs_kill_super(struct super_block *); 211static void nfs_kill_super(struct super_block *);
209static void nfs_put_super(struct super_block *); 212static void nfs_put_super(struct super_block *);
213static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
210 214
211static struct file_system_type nfs_fs_type = { 215static struct file_system_type nfs_fs_type = {
212 .owner = THIS_MODULE, 216 .owner = THIS_MODULE,
@@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = {
234 .umount_begin = nfs_umount_begin, 238 .umount_begin = nfs_umount_begin,
235 .show_options = nfs_show_options, 239 .show_options = nfs_show_options,
236 .show_stats = nfs_show_stats, 240 .show_stats = nfs_show_stats,
241 .remount_fs = nfs_remount,
237}; 242};
238 243
239#ifdef CONFIG_NFS_V4 244#ifdef CONFIG_NFS_V4
@@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = {
278 .umount_begin = nfs_umount_begin, 283 .umount_begin = nfs_umount_begin,
279 .show_options = nfs_show_options, 284 .show_options = nfs_show_options,
280 .show_stats = nfs_show_stats, 285 .show_stats = nfs_show_stats,
286 .remount_fs = nfs_remount,
281}; 287};
282#endif 288#endif
283 289
@@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
368 }; 374 };
369 int error; 375 int error;
370 376
371 lock_kernel();
372
373 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 377 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
374 if (error < 0) 378 if (error < 0)
375 goto out_err; 379 goto out_err;
@@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
401 405
402 buf->f_namelen = server->namelen; 406 buf->f_namelen = server->namelen;
403 407
404 unlock_kernel();
405 return 0; 408 return 0;
406 409
407 out_err: 410 out_err:
408 dprintk("%s: statfs error = %d\n", __func__, -error); 411 dprintk("%s: statfs error = %d\n", __func__, -error);
409 unlock_kernel();
410 return error; 412 return error;
411} 413}
412 414
@@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
514 if (nfss->bsize != 0) 516 if (nfss->bsize != 0)
515 seq_printf(m, ",bsize=%u", nfss->bsize); 517 seq_printf(m, ",bsize=%u", nfss->bsize);
516 seq_printf(m, ",namlen=%u", nfss->namelen); 518 seq_printf(m, ",namlen=%u", nfss->namelen);
517 if (nfss->acregmin != 3*HZ || showdefaults) 519 if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
518 seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); 520 seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
519 if (nfss->acregmax != 60*HZ || showdefaults) 521 if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
520 seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); 522 seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
521 if (nfss->acdirmin != 30*HZ || showdefaults) 523 if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
522 seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); 524 seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
523 if (nfss->acdirmax != 60*HZ || showdefaults) 525 if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
524 seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); 526 seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
525 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { 527 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
526 if (nfss->flags & nfs_infop->flag) 528 if (nfss->flags & nfs_infop->flag)
@@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr)
702 return 0; 704 return 0;
703} 705}
704 706
707static void nfs_parse_ipv4_address(char *string, size_t str_len,
708 struct sockaddr *sap, size_t *addr_len)
709{
710 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
711 u8 *addr = (u8 *)&sin->sin_addr.s_addr;
712
713 if (str_len <= INET_ADDRSTRLEN) {
714 dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
715 (int)str_len, string);
716
717 sin->sin_family = AF_INET;
718 *addr_len = sizeof(*sin);
719 if (in4_pton(string, str_len, addr, '\0', NULL))
720 return;
721 }
722
723 sap->sa_family = AF_UNSPEC;
724 *addr_len = 0;
725}
726
727#define IPV6_SCOPE_DELIMITER '%'
728
729#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
730static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
731 const char *delim,
732 struct sockaddr_in6 *sin6)
733{
734 char *p;
735 size_t len;
736
737 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
738 return ;
739 if (*delim != IPV6_SCOPE_DELIMITER)
740 return;
741
742 len = (string + str_len) - delim - 1;
743 p = kstrndup(delim + 1, len, GFP_KERNEL);
744 if (p) {
745 unsigned long scope_id = 0;
746 struct net_device *dev;
747
748 dev = dev_get_by_name(&init_net, p);
749 if (dev != NULL) {
750 scope_id = dev->ifindex;
751 dev_put(dev);
752 } else {
753 /* scope_id is set to zero on error */
754 strict_strtoul(p, 10, &scope_id);
755 }
756
757 kfree(p);
758 sin6->sin6_scope_id = scope_id;
759 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
760 }
761}
762
763static void nfs_parse_ipv6_address(char *string, size_t str_len,
764 struct sockaddr *sap, size_t *addr_len)
765{
766 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
767 u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
768 const char *delim;
769
770 if (str_len <= INET6_ADDRSTRLEN) {
771 dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
772 (int)str_len, string);
773
774 sin6->sin6_family = AF_INET6;
775 *addr_len = sizeof(*sin6);
776 if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
777 nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
778 return;
779 }
780 }
781
782 sap->sa_family = AF_UNSPEC;
783 *addr_len = 0;
784}
785#else
786static void nfs_parse_ipv6_address(char *string, size_t str_len,
787 struct sockaddr *sap, size_t *addr_len)
788{
789 sap->sa_family = AF_UNSPEC;
790 *addr_len = 0;
791}
792#endif
793
705/* 794/*
706 * Parse string addresses passed in via a mount option, 795 * Construct a sockaddr based on the contents of a string that contains
707 * and construct a sockaddr based on the result. 796 * an IP address in presentation format.
708 * 797 *
709 * If address parsing fails, set the sockaddr's address 798 * If there is a problem constructing the new sockaddr, set the address
710 * family to AF_UNSPEC to force nfs_verify_server_address() 799 * family to AF_UNSPEC.
711 * to punt the mount.
712 */ 800 */
713static void nfs_parse_server_address(char *value, 801static void nfs_parse_ip_address(char *string, size_t str_len,
714 struct sockaddr *sap, 802 struct sockaddr *sap, size_t *addr_len)
715 size_t *len)
716{ 803{
717 if (strchr(value, ':')) { 804 unsigned int i, colons;
718 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
719 u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
720 805
721 ap->sin6_family = AF_INET6; 806 colons = 0;
722 *len = sizeof(*ap); 807 for (i = 0; i < str_len; i++)
723 if (in6_pton(value, -1, addr, '\0', NULL)) 808 if (string[i] == ':')
724 return; 809 colons++;
725 } else { 810
726 struct sockaddr_in *ap = (struct sockaddr_in *)sap; 811 if (colons >= 2)
727 u8 *addr = (u8 *)&ap->sin_addr.s_addr; 812 nfs_parse_ipv6_address(string, str_len, sap, addr_len);
813 else
814 nfs_parse_ipv4_address(string, str_len, sap, addr_len);
815}
816
817/*
818 * Sanity check the NFS transport protocol.
819 *
820 */
821static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
822{
823 switch (mnt->nfs_server.protocol) {
824 case XPRT_TRANSPORT_UDP:
825 case XPRT_TRANSPORT_TCP:
826 case XPRT_TRANSPORT_RDMA:
827 break;
828 default:
829 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
830 }
831}
832
833/*
834 * For text based NFSv2/v3 mounts, the mount protocol transport default
835 * settings should depend upon the specified NFS transport.
836 */
837static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
838{
839 nfs_validate_transport_protocol(mnt);
728 840
729 ap->sin_family = AF_INET; 841 if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
730 *len = sizeof(*ap); 842 mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
731 if (in4_pton(value, -1, addr, '\0', NULL))
732 return; 843 return;
844 switch (mnt->nfs_server.protocol) {
845 case XPRT_TRANSPORT_UDP:
846 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
847 break;
848 case XPRT_TRANSPORT_TCP:
849 case XPRT_TRANSPORT_RDMA:
850 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
733 } 851 }
852}
734 853
735 sap->sa_family = AF_UNSPEC; 854/*
736 *len = 0; 855 * Parse the value of the 'sec=' option.
856 *
857 * The flavor_len setting is for v4 mounts.
858 */
859static int nfs_parse_security_flavors(char *value,
860 struct nfs_parsed_mount_data *mnt)
861{
862 substring_t args[MAX_OPT_ARGS];
863
864 dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
865
866 switch (match_token(value, nfs_secflavor_tokens, args)) {
867 case Opt_sec_none:
868 mnt->auth_flavor_len = 0;
869 mnt->auth_flavors[0] = RPC_AUTH_NULL;
870 break;
871 case Opt_sec_sys:
872 mnt->auth_flavor_len = 0;
873 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
874 break;
875 case Opt_sec_krb5:
876 mnt->auth_flavor_len = 1;
877 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
878 break;
879 case Opt_sec_krb5i:
880 mnt->auth_flavor_len = 1;
881 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
882 break;
883 case Opt_sec_krb5p:
884 mnt->auth_flavor_len = 1;
885 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
886 break;
887 case Opt_sec_lkey:
888 mnt->auth_flavor_len = 1;
889 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
890 break;
891 case Opt_sec_lkeyi:
892 mnt->auth_flavor_len = 1;
893 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
894 break;
895 case Opt_sec_lkeyp:
896 mnt->auth_flavor_len = 1;
897 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
898 break;
899 case Opt_sec_spkm:
900 mnt->auth_flavor_len = 1;
901 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
902 break;
903 case Opt_sec_spkmi:
904 mnt->auth_flavor_len = 1;
905 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
906 break;
907 case Opt_sec_spkmp:
908 mnt->auth_flavor_len = 1;
909 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
910 break;
911 default:
912 return 0;
913 }
914
915 return 1;
916}
917
918static void nfs_parse_invalid_value(const char *option)
919{
920 dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
737} 921}
738 922
739/* 923/*
740 * Error-check and convert a string of mount options from user space into 924 * Error-check and convert a string of mount options from user space into
741 * a data structure 925 * a data structure. The whole mount string is processed; bad options are
926 * skipped as they are encountered. If there were no errors, return 1;
927 * otherwise return 0 (zero).
742 */ 928 */
743static int nfs_parse_mount_options(char *raw, 929static int nfs_parse_mount_options(char *raw,
744 struct nfs_parsed_mount_data *mnt) 930 struct nfs_parsed_mount_data *mnt)
745{ 931{
746 char *p, *string, *secdata; 932 char *p, *string, *secdata;
747 int rc; 933 int rc, sloppy = 0, errors = 0;
748 934
749 if (!raw) { 935 if (!raw) {
750 dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); 936 dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw,
777 963
778 token = match_token(p, nfs_mount_option_tokens, args); 964 token = match_token(p, nfs_mount_option_tokens, args);
779 switch (token) { 965 switch (token) {
966
967 /*
968 * boolean options: foo/nofoo
969 */
780 case Opt_soft: 970 case Opt_soft:
781 mnt->flags |= NFS_MOUNT_SOFT; 971 mnt->flags |= NFS_MOUNT_SOFT;
782 break; 972 break;
783 case Opt_hard: 973 case Opt_hard:
784 mnt->flags &= ~NFS_MOUNT_SOFT; 974 mnt->flags &= ~NFS_MOUNT_SOFT;
785 break; 975 break;
786 case Opt_intr:
787 case Opt_nointr:
788 break;
789 case Opt_posix: 976 case Opt_posix:
790 mnt->flags |= NFS_MOUNT_POSIX; 977 mnt->flags |= NFS_MOUNT_POSIX;
791 break; 978 break;
@@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
819 case Opt_udp: 1006 case Opt_udp:
820 mnt->flags &= ~NFS_MOUNT_TCP; 1007 mnt->flags &= ~NFS_MOUNT_TCP;
821 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1008 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
822 mnt->timeo = 7;
823 mnt->retrans = 5;
824 break; 1009 break;
825 case Opt_tcp: 1010 case Opt_tcp:
826 mnt->flags |= NFS_MOUNT_TCP; 1011 mnt->flags |= NFS_MOUNT_TCP;
827 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1012 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
828 mnt->timeo = 600;
829 mnt->retrans = 2;
830 break; 1013 break;
831 case Opt_rdma: 1014 case Opt_rdma:
832 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1015 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
833 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1016 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
834 mnt->timeo = 600;
835 mnt->retrans = 2;
836 break; 1017 break;
837 case Opt_acl: 1018 case Opt_acl:
838 mnt->flags &= ~NFS_MOUNT_NOACL; 1019 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw,
853 mnt->flags |= NFS_MOUNT_UNSHARED; 1034 mnt->flags |= NFS_MOUNT_UNSHARED;
854 break; 1035 break;
855 1036
1037 /*
1038 * options that take numeric values
1039 */
856 case Opt_port: 1040 case Opt_port:
857 if (match_int(args, &option)) 1041 if (match_int(args, &option) ||
858 return 0; 1042 option < 0 || option > USHORT_MAX) {
859 if (option < 0 || option > 65535) 1043 errors++;
860 return 0; 1044 nfs_parse_invalid_value("port");
861 mnt->nfs_server.port = option; 1045 } else
1046 mnt->nfs_server.port = option;
862 break; 1047 break;
863 case Opt_rsize: 1048 case Opt_rsize:
864 if (match_int(args, &mnt->rsize)) 1049 if (match_int(args, &option) || option < 0) {
865 return 0; 1050 errors++;
1051 nfs_parse_invalid_value("rsize");
1052 } else
1053 mnt->rsize = option;
866 break; 1054 break;
867 case Opt_wsize: 1055 case Opt_wsize:
868 if (match_int(args, &mnt->wsize)) 1056 if (match_int(args, &option) || option < 0) {
869 return 0; 1057 errors++;
1058 nfs_parse_invalid_value("wsize");
1059 } else
1060 mnt->wsize = option;
870 break; 1061 break;
871 case Opt_bsize: 1062 case Opt_bsize:
872 if (match_int(args, &option)) 1063 if (match_int(args, &option) || option < 0) {
873 return 0; 1064 errors++;
874 if (option < 0) 1065 nfs_parse_invalid_value("bsize");
875 return 0; 1066 } else
876 mnt->bsize = option; 1067 mnt->bsize = option;
877 break; 1068 break;
878 case Opt_timeo: 1069 case Opt_timeo:
879 if (match_int(args, &mnt->timeo)) 1070 if (match_int(args, &option) || option <= 0) {
880 return 0; 1071 errors++;
1072 nfs_parse_invalid_value("timeo");
1073 } else
1074 mnt->timeo = option;
881 break; 1075 break;
882 case Opt_retrans: 1076 case Opt_retrans:
883 if (match_int(args, &mnt->retrans)) 1077 if (match_int(args, &option) || option <= 0) {
884 return 0; 1078 errors++;
1079 nfs_parse_invalid_value("retrans");
1080 } else
1081 mnt->retrans = option;
885 break; 1082 break;
886 case Opt_acregmin: 1083 case Opt_acregmin:
887 if (match_int(args, &mnt->acregmin)) 1084 if (match_int(args, &option) || option < 0) {
888 return 0; 1085 errors++;
1086 nfs_parse_invalid_value("acregmin");
1087 } else
1088 mnt->acregmin = option;
889 break; 1089 break;
890 case Opt_acregmax: 1090 case Opt_acregmax:
891 if (match_int(args, &mnt->acregmax)) 1091 if (match_int(args, &option) || option < 0) {
892 return 0; 1092 errors++;
1093 nfs_parse_invalid_value("acregmax");
1094 } else
1095 mnt->acregmax = option;
893 break; 1096 break;
894 case Opt_acdirmin: 1097 case Opt_acdirmin:
895 if (match_int(args, &mnt->acdirmin)) 1098 if (match_int(args, &option) || option < 0) {
896 return 0; 1099 errors++;
1100 nfs_parse_invalid_value("acdirmin");
1101 } else
1102 mnt->acdirmin = option;
897 break; 1103 break;
898 case Opt_acdirmax: 1104 case Opt_acdirmax:
899 if (match_int(args, &mnt->acdirmax)) 1105 if (match_int(args, &option) || option < 0) {
900 return 0; 1106 errors++;
1107 nfs_parse_invalid_value("acdirmax");
1108 } else
1109 mnt->acdirmax = option;
901 break; 1110 break;
902 case Opt_actimeo: 1111 case Opt_actimeo:
903 if (match_int(args, &option)) 1112 if (match_int(args, &option) || option < 0) {
904 return 0; 1113 errors++;
905 if (option < 0) 1114 nfs_parse_invalid_value("actimeo");
906 return 0; 1115 } else
907 mnt->acregmin = 1116 mnt->acregmin = mnt->acregmax =
908 mnt->acregmax = 1117 mnt->acdirmin = mnt->acdirmax = option;
909 mnt->acdirmin =
910 mnt->acdirmax = option;
911 break; 1118 break;
912 case Opt_namelen: 1119 case Opt_namelen:
913 if (match_int(args, &mnt->namlen)) 1120 if (match_int(args, &option) || option < 0) {
914 return 0; 1121 errors++;
1122 nfs_parse_invalid_value("namlen");
1123 } else
1124 mnt->namlen = option;
915 break; 1125 break;
916 case Opt_mountport: 1126 case Opt_mountport:
917 if (match_int(args, &option)) 1127 if (match_int(args, &option) ||
918 return 0; 1128 option < 0 || option > USHORT_MAX) {
919 if (option < 0 || option > 65535) 1129 errors++;
920 return 0; 1130 nfs_parse_invalid_value("mountport");
921 mnt->mount_server.port = option; 1131 } else
1132 mnt->mount_server.port = option;
922 break; 1133 break;
923 case Opt_mountvers: 1134 case Opt_mountvers:
924 if (match_int(args, &option)) 1135 if (match_int(args, &option) ||
925 return 0; 1136 option < NFS_MNT_VERSION ||
926 if (option < 0) 1137 option > NFS_MNT3_VERSION) {
927 return 0; 1138 errors++;
928 mnt->mount_server.version = option; 1139 nfs_parse_invalid_value("mountvers");
1140 } else
1141 mnt->mount_server.version = option;
929 break; 1142 break;
930 case Opt_nfsvers: 1143 case Opt_nfsvers:
931 if (match_int(args, &option)) 1144 if (match_int(args, &option)) {
932 return 0; 1145 errors++;
1146 nfs_parse_invalid_value("nfsvers");
1147 break;
1148 }
933 switch (option) { 1149 switch (option) {
934 case 2: 1150 case NFS2_VERSION:
935 mnt->flags &= ~NFS_MOUNT_VER3; 1151 mnt->flags &= ~NFS_MOUNT_VER3;
936 break; 1152 break;
937 case 3: 1153 case NFS3_VERSION:
938 mnt->flags |= NFS_MOUNT_VER3; 1154 mnt->flags |= NFS_MOUNT_VER3;
939 break; 1155 break;
940 default: 1156 default:
941 goto out_unrec_vers; 1157 errors++;
1158 nfs_parse_invalid_value("nfsvers");
942 } 1159 }
943 break; 1160 break;
944 1161
1162 /*
1163 * options that take text values
1164 */
945 case Opt_sec: 1165 case Opt_sec:
946 string = match_strdup(args); 1166 string = match_strdup(args);
947 if (string == NULL) 1167 if (string == NULL)
948 goto out_nomem; 1168 goto out_nomem;
949 token = match_token(string, nfs_secflavor_tokens, args); 1169 rc = nfs_parse_security_flavors(string, mnt);
950 kfree(string); 1170 kfree(string);
951 1171 if (!rc) {
952 /* 1172 errors++;
953 * The flags setting is for v2/v3. The flavor_len 1173 dfprintk(MOUNT, "NFS: unrecognized "
954 * setting is for v4. v2/v3 also need to know the 1174 "security flavor\n");
955 * difference between NULL and UNIX.
956 */
957 switch (token) {
958 case Opt_sec_none:
959 mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
960 mnt->auth_flavor_len = 0;
961 mnt->auth_flavors[0] = RPC_AUTH_NULL;
962 break;
963 case Opt_sec_sys:
964 mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
965 mnt->auth_flavor_len = 0;
966 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
967 break;
968 case Opt_sec_krb5:
969 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
970 mnt->auth_flavor_len = 1;
971 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
972 break;
973 case Opt_sec_krb5i:
974 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
975 mnt->auth_flavor_len = 1;
976 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
977 break;
978 case Opt_sec_krb5p:
979 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
980 mnt->auth_flavor_len = 1;
981 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
982 break;
983 case Opt_sec_lkey:
984 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
985 mnt->auth_flavor_len = 1;
986 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
987 break;
988 case Opt_sec_lkeyi:
989 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
990 mnt->auth_flavor_len = 1;
991 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
992 break;
993 case Opt_sec_lkeyp:
994 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
995 mnt->auth_flavor_len = 1;
996 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
997 break;
998 case Opt_sec_spkm:
999 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1000 mnt->auth_flavor_len = 1;
1001 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
1002 break;
1003 case Opt_sec_spkmi:
1004 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1005 mnt->auth_flavor_len = 1;
1006 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
1007 break;
1008 case Opt_sec_spkmp:
1009 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1010 mnt->auth_flavor_len = 1;
1011 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
1012 break;
1013 default:
1014 goto out_unrec_sec;
1015 } 1175 }
1016 break; 1176 break;
1017 case Opt_proto: 1177 case Opt_proto:
@@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw,
1026 case Opt_xprt_udp: 1186 case Opt_xprt_udp:
1027 mnt->flags &= ~NFS_MOUNT_TCP; 1187 mnt->flags &= ~NFS_MOUNT_TCP;
1028 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1188 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1029 mnt->timeo = 7;
1030 mnt->retrans = 5;
1031 break; 1189 break;
1032 case Opt_xprt_tcp: 1190 case Opt_xprt_tcp:
1033 mnt->flags |= NFS_MOUNT_TCP; 1191 mnt->flags |= NFS_MOUNT_TCP;
1034 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1192 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1035 mnt->timeo = 600;
1036 mnt->retrans = 2;
1037 break; 1193 break;
1038 case Opt_xprt_rdma: 1194 case Opt_xprt_rdma:
1039 /* vector side protocols to TCP */ 1195 /* vector side protocols to TCP */
1040 mnt->flags |= NFS_MOUNT_TCP; 1196 mnt->flags |= NFS_MOUNT_TCP;
1041 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1197 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1042 mnt->timeo = 600;
1043 mnt->retrans = 2;
1044 break; 1198 break;
1045 default: 1199 default:
1046 goto out_unrec_xprt; 1200 errors++;
1201 dfprintk(MOUNT, "NFS: unrecognized "
1202 "transport protocol\n");
1047 } 1203 }
1048 break; 1204 break;
1049 case Opt_mountproto: 1205 case Opt_mountproto:
@@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw,
1063 break; 1219 break;
1064 case Opt_xprt_rdma: /* not used for side protocols */ 1220 case Opt_xprt_rdma: /* not used for side protocols */
1065 default: 1221 default:
1066 goto out_unrec_xprt; 1222 errors++;
1223 dfprintk(MOUNT, "NFS: unrecognized "
1224 "transport protocol\n");
1067 } 1225 }
1068 break; 1226 break;
1069 case Opt_addr: 1227 case Opt_addr:
1070 string = match_strdup(args); 1228 string = match_strdup(args);
1071 if (string == NULL) 1229 if (string == NULL)
1072 goto out_nomem; 1230 goto out_nomem;
1073 nfs_parse_server_address(string, (struct sockaddr *) 1231 nfs_parse_ip_address(string, strlen(string),
1074 &mnt->nfs_server.address, 1232 (struct sockaddr *)
1075 &mnt->nfs_server.addrlen); 1233 &mnt->nfs_server.address,
1234 &mnt->nfs_server.addrlen);
1076 kfree(string); 1235 kfree(string);
1077 break; 1236 break;
1078 case Opt_clientaddr: 1237 case Opt_clientaddr:
@@ -1093,24 +1252,33 @@ static int nfs_parse_mount_options(char *raw,
1093 string = match_strdup(args); 1252 string = match_strdup(args);
1094 if (string == NULL) 1253 if (string == NULL)
1095 goto out_nomem; 1254 goto out_nomem;
1096 nfs_parse_server_address(string, (struct sockaddr *) 1255 nfs_parse_ip_address(string, strlen(string),
1097 &mnt->mount_server.address, 1256 (struct sockaddr *)
1098 &mnt->mount_server.addrlen); 1257 &mnt->mount_server.address,
1258 &mnt->mount_server.addrlen);
1099 kfree(string); 1259 kfree(string);
1100 break; 1260 break;
1101 1261
1262 /*
1263 * Special options
1264 */
1265 case Opt_sloppy:
1266 sloppy = 1;
1267 dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
1268 break;
1102 case Opt_userspace: 1269 case Opt_userspace:
1103 case Opt_deprecated: 1270 case Opt_deprecated:
1271 dfprintk(MOUNT, "NFS: ignoring mount option "
1272 "'%s'\n", p);
1104 break; 1273 break;
1105 1274
1106 default: 1275 default:
1107 goto out_unknown; 1276 errors++;
1277 dfprintk(MOUNT, "NFS: unrecognized mount option "
1278 "'%s'\n", p);
1108 } 1279 }
1109 } 1280 }
1110 1281
1111 nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
1112 mnt->nfs_server.port);
1113
1114 return 1; 1282 return 1;
1115 1283
1116out_nomem: 1284out_nomem:
@@ -1120,21 +1288,6 @@ out_security_failure:
1120 free_secdata(secdata); 1288 free_secdata(secdata);
1121 printk(KERN_INFO "NFS: security options invalid: %d\n", rc); 1289 printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
1122 return 0; 1290 return 0;
1123out_unrec_vers:
1124 printk(KERN_INFO "NFS: unrecognized NFS version number\n");
1125 return 0;
1126
1127out_unrec_xprt:
1128 printk(KERN_INFO "NFS: unrecognized transport protocol\n");
1129 return 0;
1130
1131out_unrec_sec:
1132 printk(KERN_INFO "NFS: unrecognized security flavor\n");
1133 return 0;
1134
1135out_unknown:
1136 printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
1137 return 0;
1138} 1291}
1139 1292
1140/* 1293/*
@@ -1188,11 +1341,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1188 if (status == 0) 1341 if (status == 0)
1189 return 0; 1342 return 0;
1190 1343
1191 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", 1344 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1192 hostname, status); 1345 hostname, status);
1193 return status; 1346 return status;
1194} 1347}
1195 1348
1349static int nfs_parse_simple_hostname(const char *dev_name,
1350 char **hostname, size_t maxnamlen,
1351 char **export_path, size_t maxpathlen)
1352{
1353 size_t len;
1354 char *colon, *comma;
1355
1356 colon = strchr(dev_name, ':');
1357 if (colon == NULL)
1358 goto out_bad_devname;
1359
1360 len = colon - dev_name;
1361 if (len > maxnamlen)
1362 goto out_hostname;
1363
1364 /* N.B. caller will free nfs_server.hostname in all cases */
1365 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1366 if (!*hostname)
1367 goto out_nomem;
1368
1369 /* kill possible hostname list: not supported */
1370 comma = strchr(*hostname, ',');
1371 if (comma != NULL) {
1372 if (comma == *hostname)
1373 goto out_bad_devname;
1374 *comma = '\0';
1375 }
1376
1377 colon++;
1378 len = strlen(colon);
1379 if (len > maxpathlen)
1380 goto out_path;
1381 *export_path = kstrndup(colon, len, GFP_KERNEL);
1382 if (!*export_path)
1383 goto out_nomem;
1384
1385 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1386 return 0;
1387
1388out_bad_devname:
1389 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1390 return -EINVAL;
1391
1392out_nomem:
1393 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1394 return -ENOMEM;
1395
1396out_hostname:
1397 dfprintk(MOUNT, "NFS: server hostname too long\n");
1398 return -ENAMETOOLONG;
1399
1400out_path:
1401 dfprintk(MOUNT, "NFS: export pathname too long\n");
1402 return -ENAMETOOLONG;
1403}
1404
1405/*
1406 * Hostname has square brackets around it because it contains one or
1407 * more colons. We look for the first closing square bracket, and a
1408 * colon must follow it.
1409 */
1410static int nfs_parse_protected_hostname(const char *dev_name,
1411 char **hostname, size_t maxnamlen,
1412 char **export_path, size_t maxpathlen)
1413{
1414 size_t len;
1415 char *start, *end;
1416
1417 start = (char *)(dev_name + 1);
1418
1419 end = strchr(start, ']');
1420 if (end == NULL)
1421 goto out_bad_devname;
1422 if (*(end + 1) != ':')
1423 goto out_bad_devname;
1424
1425 len = end - start;
1426 if (len > maxnamlen)
1427 goto out_hostname;
1428
1429 /* N.B. caller will free nfs_server.hostname in all cases */
1430 *hostname = kstrndup(start, len, GFP_KERNEL);
1431 if (*hostname == NULL)
1432 goto out_nomem;
1433
1434 end += 2;
1435 len = strlen(end);
1436 if (len > maxpathlen)
1437 goto out_path;
1438 *export_path = kstrndup(end, len, GFP_KERNEL);
1439 if (!*export_path)
1440 goto out_nomem;
1441
1442 return 0;
1443
1444out_bad_devname:
1445 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1446 return -EINVAL;
1447
1448out_nomem:
1449 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1450 return -ENOMEM;
1451
1452out_hostname:
1453 dfprintk(MOUNT, "NFS: server hostname too long\n");
1454 return -ENAMETOOLONG;
1455
1456out_path:
1457 dfprintk(MOUNT, "NFS: export pathname too long\n");
1458 return -ENAMETOOLONG;
1459}
1460
1461/*
1462 * Split "dev_name" into "hostname:export_path".
1463 *
1464 * The leftmost colon demarks the split between the server's hostname
1465 * and the export path. If the hostname starts with a left square
1466 * bracket, then it may contain colons.
1467 *
1468 * Note: caller frees hostname and export path, even on error.
1469 */
1470static int nfs_parse_devname(const char *dev_name,
1471 char **hostname, size_t maxnamlen,
1472 char **export_path, size_t maxpathlen)
1473{
1474 if (*dev_name == '[')
1475 return nfs_parse_protected_hostname(dev_name,
1476 hostname, maxnamlen,
1477 export_path, maxpathlen);
1478
1479 return nfs_parse_simple_hostname(dev_name,
1480 hostname, maxnamlen,
1481 export_path, maxpathlen);
1482}
1483
1196/* 1484/*
1197 * Validate the NFS2/NFS3 mount data 1485 * Validate the NFS2/NFS3 mount data
1198 * - fills in the mount root filehandle 1486 * - fills in the mount root filehandle
@@ -1222,16 +1510,14 @@ static int nfs_validate_mount_data(void *options,
1222 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); 1510 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
1223 args->rsize = NFS_MAX_FILE_IO_SIZE; 1511 args->rsize = NFS_MAX_FILE_IO_SIZE;
1224 args->wsize = NFS_MAX_FILE_IO_SIZE; 1512 args->wsize = NFS_MAX_FILE_IO_SIZE;
1225 args->timeo = 600; 1513 args->acregmin = NFS_DEF_ACREGMIN;
1226 args->retrans = 2; 1514 args->acregmax = NFS_DEF_ACREGMAX;
1227 args->acregmin = 3; 1515 args->acdirmin = NFS_DEF_ACDIRMIN;
1228 args->acregmax = 60; 1516 args->acdirmax = NFS_DEF_ACDIRMAX;
1229 args->acdirmin = 30;
1230 args->acdirmax = 60;
1231 args->mount_server.port = 0; /* autobind unless user sets port */ 1517 args->mount_server.port = 0; /* autobind unless user sets port */
1232 args->mount_server.protocol = XPRT_TRANSPORT_UDP;
1233 args->nfs_server.port = 0; /* autobind unless user sets port */ 1518 args->nfs_server.port = 0; /* autobind unless user sets port */
1234 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1519 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1520 args->auth_flavors[0] = RPC_AUTH_UNIX;
1235 1521
1236 switch (data->version) { 1522 switch (data->version) {
1237 case 1: 1523 case 1:
@@ -1289,7 +1575,9 @@ static int nfs_validate_mount_data(void *options,
1289 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); 1575 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
1290 args->namlen = data->namlen; 1576 args->namlen = data->namlen;
1291 args->bsize = data->bsize; 1577 args->bsize = data->bsize;
1292 args->auth_flavors[0] = data->pseudoflavor; 1578
1579 if (data->flags & NFS_MOUNT_SECFLAVOUR)
1580 args->auth_flavors[0] = data->pseudoflavor;
1293 if (!args->nfs_server.hostname) 1581 if (!args->nfs_server.hostname)
1294 goto out_nomem; 1582 goto out_nomem;
1295 1583
@@ -1321,8 +1609,6 @@ static int nfs_validate_mount_data(void *options,
1321 1609
1322 break; 1610 break;
1323 default: { 1611 default: {
1324 unsigned int len;
1325 char *c;
1326 int status; 1612 int status;
1327 1613
1328 if (nfs_parse_mount_options((char *)options, args) == 0) 1614 if (nfs_parse_mount_options((char *)options, args) == 0)
@@ -1332,21 +1618,22 @@ static int nfs_validate_mount_data(void *options,
1332 &args->nfs_server.address)) 1618 &args->nfs_server.address))
1333 goto out_no_address; 1619 goto out_no_address;
1334 1620
1335 c = strchr(dev_name, ':'); 1621 nfs_set_port((struct sockaddr *)&args->nfs_server.address,
1336 if (c == NULL) 1622 args->nfs_server.port);
1337 return -EINVAL;
1338 len = c - dev_name;
1339 /* N.B. caller will free nfs_server.hostname in all cases */
1340 args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
1341 if (!args->nfs_server.hostname)
1342 goto out_nomem;
1343 1623
1344 c++; 1624 nfs_set_mount_transport_protocol(args);
1345 if (strlen(c) > NFS_MAXPATHLEN) 1625
1346 return -ENAMETOOLONG; 1626 status = nfs_parse_devname(dev_name,
1347 args->nfs_server.export_path = c; 1627 &args->nfs_server.hostname,
1628 PAGE_SIZE,
1629 &args->nfs_server.export_path,
1630 NFS_MAXPATHLEN);
1631 if (!status)
1632 status = nfs_try_mount(args, mntfh);
1633
1634 kfree(args->nfs_server.export_path);
1635 args->nfs_server.export_path = NULL;
1348 1636
1349 status = nfs_try_mount(args, mntfh);
1350 if (status) 1637 if (status)
1351 return status; 1638 return status;
1352 1639
@@ -1354,9 +1641,6 @@ static int nfs_validate_mount_data(void *options,
1354 } 1641 }
1355 } 1642 }
1356 1643
1357 if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
1358 args->auth_flavors[0] = RPC_AUTH_UNIX;
1359
1360#ifndef CONFIG_NFS_V3 1644#ifndef CONFIG_NFS_V3
1361 if (args->flags & NFS_MOUNT_VER3) 1645 if (args->flags & NFS_MOUNT_VER3)
1362 goto out_v3_not_compiled; 1646 goto out_v3_not_compiled;
@@ -1396,6 +1680,80 @@ out_invalid_fh:
1396 return -EINVAL; 1680 return -EINVAL;
1397} 1681}
1398 1682
1683static int
1684nfs_compare_remount_data(struct nfs_server *nfss,
1685 struct nfs_parsed_mount_data *data)
1686{
1687 if (data->flags != nfss->flags ||
1688 data->rsize != nfss->rsize ||
1689 data->wsize != nfss->wsize ||
1690 data->retrans != nfss->client->cl_timeout->to_retries ||
1691 data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
1692 data->acregmin != nfss->acregmin / HZ ||
1693 data->acregmax != nfss->acregmax / HZ ||
1694 data->acdirmin != nfss->acdirmin / HZ ||
1695 data->acdirmax != nfss->acdirmax / HZ ||
1696 data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
1697 data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
1698 memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
1699 data->nfs_server.addrlen) != 0)
1700 return -EINVAL;
1701
1702 return 0;
1703}
1704
1705static int
1706nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1707{
1708 int error;
1709 struct nfs_server *nfss = sb->s_fs_info;
1710 struct nfs_parsed_mount_data *data;
1711 struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
1712 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
1713 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
1714
1715 /*
1716 * Userspace mount programs that send binary options generally send
1717 * them populated with default values. We have no way to know which
1718 * ones were explicitly specified. Fall back to legacy behavior and
1719 * just return success.
1720 */
1721 if ((nfsvers == 4 && options4->version == 1) ||
1722 (nfsvers <= 3 && options->version >= 1 &&
1723 options->version <= 6))
1724 return 0;
1725
1726 data = kzalloc(sizeof(*data), GFP_KERNEL);
1727 if (data == NULL)
1728 return -ENOMEM;
1729
1730 /* fill out struct with values from existing mount */
1731 data->flags = nfss->flags;
1732 data->rsize = nfss->rsize;
1733 data->wsize = nfss->wsize;
1734 data->retrans = nfss->client->cl_timeout->to_retries;
1735 data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
1736 data->acregmin = nfss->acregmin / HZ;
1737 data->acregmax = nfss->acregmax / HZ;
1738 data->acdirmin = nfss->acdirmin / HZ;
1739 data->acdirmax = nfss->acdirmax / HZ;
1740 data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
1741 data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
1742 memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
1743 data->nfs_server.addrlen);
1744
1745 /* overwrite those values with any that were specified */
1746 error = nfs_parse_mount_options((char *)options, data);
1747 if (error < 0)
1748 goto out;
1749
1750 /* compare new mount options with old ones */
1751 error = nfs_compare_remount_data(nfss, data);
1752out:
1753 kfree(data);
1754 return error;
1755}
1756
1399/* 1757/*
1400 * Initialise the common bits of the superblock 1758 * Initialise the common bits of the superblock
1401 */ 1759 */
@@ -1811,14 +2169,13 @@ static int nfs4_validate_mount_data(void *options,
1811 2169
1812 args->rsize = NFS_MAX_FILE_IO_SIZE; 2170 args->rsize = NFS_MAX_FILE_IO_SIZE;
1813 args->wsize = NFS_MAX_FILE_IO_SIZE; 2171 args->wsize = NFS_MAX_FILE_IO_SIZE;
1814 args->timeo = 600; 2172 args->acregmin = NFS_DEF_ACREGMIN;
1815 args->retrans = 2; 2173 args->acregmax = NFS_DEF_ACREGMAX;
1816 args->acregmin = 3; 2174 args->acdirmin = NFS_DEF_ACDIRMIN;
1817 args->acregmax = 60; 2175 args->acdirmax = NFS_DEF_ACDIRMAX;
1818 args->acdirmin = 30;
1819 args->acdirmax = 60;
1820 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ 2176 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
1821 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 2177 args->auth_flavors[0] = RPC_AUTH_UNIX;
2178 args->auth_flavor_len = 0;
1822 2179
1823 switch (data->version) { 2180 switch (data->version) {
1824 case 1: 2181 case 1:
@@ -1834,18 +2191,13 @@ static int nfs4_validate_mount_data(void *options,
1834 &args->nfs_server.address)) 2191 &args->nfs_server.address))
1835 goto out_no_address; 2192 goto out_no_address;
1836 2193
1837 switch (data->auth_flavourlen) { 2194 if (data->auth_flavourlen) {
1838 case 0: 2195 if (data->auth_flavourlen > 1)
1839 args->auth_flavors[0] = RPC_AUTH_UNIX; 2196 goto out_inval_auth;
1840 break;
1841 case 1:
1842 if (copy_from_user(&args->auth_flavors[0], 2197 if (copy_from_user(&args->auth_flavors[0],
1843 data->auth_flavours, 2198 data->auth_flavours,
1844 sizeof(args->auth_flavors[0]))) 2199 sizeof(args->auth_flavors[0])))
1845 return -EFAULT; 2200 return -EFAULT;
1846 break;
1847 default:
1848 goto out_inval_auth;
1849 } 2201 }
1850 2202
1851 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); 2203 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
@@ -1879,10 +2231,11 @@ static int nfs4_validate_mount_data(void *options,
1879 args->acdirmin = data->acdirmin; 2231 args->acdirmin = data->acdirmin;
1880 args->acdirmax = data->acdirmax; 2232 args->acdirmax = data->acdirmax;
1881 args->nfs_server.protocol = data->proto; 2233 args->nfs_server.protocol = data->proto;
2234 nfs_validate_transport_protocol(args);
1882 2235
1883 break; 2236 break;
1884 default: { 2237 default: {
1885 unsigned int len; 2238 int status;
1886 2239
1887 if (nfs_parse_mount_options((char *)options, args) == 0) 2240 if (nfs_parse_mount_options((char *)options, args) == 0)
1888 return -EINVAL; 2241 return -EINVAL;
@@ -1891,44 +2244,25 @@ static int nfs4_validate_mount_data(void *options,
1891 &args->nfs_server.address)) 2244 &args->nfs_server.address))
1892 return -EINVAL; 2245 return -EINVAL;
1893 2246
1894 switch (args->auth_flavor_len) { 2247 nfs_set_port((struct sockaddr *)&args->nfs_server.address,
1895 case 0: 2248 args->nfs_server.port);
1896 args->auth_flavors[0] = RPC_AUTH_UNIX;
1897 break;
1898 case 1:
1899 break;
1900 default:
1901 goto out_inval_auth;
1902 }
1903 2249
1904 /* 2250 nfs_validate_transport_protocol(args);
1905 * Split "dev_name" into "hostname:mntpath".
1906 */
1907 c = strchr(dev_name, ':');
1908 if (c == NULL)
1909 return -EINVAL;
1910 /* while calculating len, pretend ':' is '\0' */
1911 len = c - dev_name;
1912 if (len > NFS4_MAXNAMLEN)
1913 return -ENAMETOOLONG;
1914 /* N.B. caller will free nfs_server.hostname in all cases */
1915 args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
1916 if (!args->nfs_server.hostname)
1917 goto out_nomem;
1918
1919 c++; /* step over the ':' */
1920 len = strlen(c);
1921 if (len > NFS4_MAXPATHLEN)
1922 return -ENAMETOOLONG;
1923 args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
1924 if (!args->nfs_server.export_path)
1925 goto out_nomem;
1926 2251
1927 dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); 2252 if (args->auth_flavor_len > 1)
2253 goto out_inval_auth;
1928 2254
1929 if (args->client_address == NULL) 2255 if (args->client_address == NULL)
1930 goto out_no_client_address; 2256 goto out_no_client_address;
1931 2257
2258 status = nfs_parse_devname(dev_name,
2259 &args->nfs_server.hostname,
2260 NFS4_MAXNAMLEN,
2261 &args->nfs_server.export_path,
2262 NFS4_MAXPATHLEN);
2263 if (status < 0)
2264 return status;
2265
1932 break; 2266 break;
1933 } 2267 }
1934 } 2268 }
@@ -1944,10 +2278,6 @@ out_inval_auth:
1944 data->auth_flavourlen); 2278 data->auth_flavourlen);
1945 return -EINVAL; 2279 return -EINVAL;
1946 2280
1947out_nomem:
1948 dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
1949 return -ENOMEM;
1950
1951out_no_address: 2281out_no_address:
1952 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); 2282 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
1953 return -EINVAL; 2283 return -EINVAL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f333848fd3be..3229e217c773 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -34,9 +34,6 @@
34/* 34/*
35 * Local function declarations 35 * Local function declarations
36 */ 36 */
37static struct nfs_page * nfs_update_request(struct nfs_open_context*,
38 struct page *,
39 unsigned int, unsigned int);
40static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, 37static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
41 struct inode *inode, int ioflags); 38 struct inode *inode, int ioflags);
42static void nfs_redirty_request(struct nfs_page *req); 39static void nfs_redirty_request(struct nfs_page *req);
@@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
136static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) 133static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
137{ 134{
138 struct inode *inode = page->mapping->host; 135 struct inode *inode = page->mapping->host;
139 loff_t end, i_size = i_size_read(inode); 136 loff_t end, i_size;
140 pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 137 pgoff_t end_index;
141 138
139 spin_lock(&inode->i_lock);
140 i_size = i_size_read(inode);
141 end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
142 if (i_size > 0 && page->index < end_index) 142 if (i_size > 0 && page->index < end_index)
143 return; 143 goto out;
144 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); 144 end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
145 if (i_size >= end) 145 if (i_size >= end)
146 return; 146 goto out;
147 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
148 i_size_write(inode, end); 147 i_size_write(inode, end);
148 nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
149out:
150 spin_unlock(&inode->i_lock);
149} 151}
150 152
151/* A writeback failed: mark the page as bad, and invalidate the page cache */ 153/* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
169 SetPageUptodate(page); 171 SetPageUptodate(page);
170} 172}
171 173
172static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
173 unsigned int offset, unsigned int count)
174{
175 struct nfs_page *req;
176 int ret;
177
178 for (;;) {
179 req = nfs_update_request(ctx, page, offset, count);
180 if (!IS_ERR(req))
181 break;
182 ret = PTR_ERR(req);
183 if (ret != -EBUSY)
184 return ret;
185 ret = nfs_wb_page(page->mapping->host, page);
186 if (ret != 0)
187 return ret;
188 }
189 /* Update file length */
190 nfs_grow_file(page, offset, count);
191 nfs_clear_page_tag_locked(req);
192 return 0;
193}
194
195static int wb_priority(struct writeback_control *wbc) 174static int wb_priority(struct writeback_control *wbc)
196{ 175{
197 if (wbc->for_reclaim) 176 if (wbc->for_reclaim)
@@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
268 return ret; 247 return ret;
269 spin_lock(&inode->i_lock); 248 spin_lock(&inode->i_lock);
270 } 249 }
271 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 250 if (test_bit(PG_CLEAN, &req->wb_flags)) {
272 /* This request is marked for commit */
273 spin_unlock(&inode->i_lock); 251 spin_unlock(&inode->i_lock);
274 nfs_clear_page_tag_locked(req); 252 BUG();
275 nfs_pageio_complete(pgio);
276 return 0;
277 } 253 }
278 if (nfs_set_page_writeback(page) != 0) { 254 if (nfs_set_page_writeback(page) != 0) {
279 spin_unlock(&inode->i_lock); 255 spin_unlock(&inode->i_lock);
@@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
355/* 331/*
356 * Insert a write request into an inode 332 * Insert a write request into an inode
357 */ 333 */
358static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) 334static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
359{ 335{
360 struct nfs_inode *nfsi = NFS_I(inode); 336 struct nfs_inode *nfsi = NFS_I(inode);
361 int error; 337 int error;
362 338
339 error = radix_tree_preload(GFP_NOFS);
340 if (error != 0)
341 goto out;
342
343 /* Lock the request! */
344 nfs_lock_request_dontget(req);
345
346 spin_lock(&inode->i_lock);
363 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 347 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
364 BUG_ON(error); 348 BUG_ON(error);
365 if (!nfsi->npages) { 349 if (!nfsi->npages) {
@@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
373 kref_get(&req->wb_kref); 357 kref_get(&req->wb_kref);
374 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 358 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
375 NFS_PAGE_TAG_LOCKED); 359 NFS_PAGE_TAG_LOCKED);
360 spin_unlock(&inode->i_lock);
361 radix_tree_preload_end();
362out:
363 return error;
376} 364}
377 365
378/* 366/*
@@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
405 __set_page_dirty_nobuffers(req->wb_page); 393 __set_page_dirty_nobuffers(req->wb_page);
406} 394}
407 395
408/*
409 * Check if a request is dirty
410 */
411static inline int
412nfs_dirty_request(struct nfs_page *req)
413{
414 struct page *page = req->wb_page;
415
416 if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
417 return 0;
418 return !PageWriteback(page);
419}
420
421#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 396#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
422/* 397/*
423 * Add a request to the inode's commit list. 398 * Add a request to the inode's commit list.
@@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req)
430 405
431 spin_lock(&inode->i_lock); 406 spin_lock(&inode->i_lock);
432 nfsi->ncommit++; 407 nfsi->ncommit++;
433 set_bit(PG_NEED_COMMIT, &(req)->wb_flags); 408 set_bit(PG_CLEAN, &(req)->wb_flags);
434 radix_tree_tag_set(&nfsi->nfs_page_tree, 409 radix_tree_tag_set(&nfsi->nfs_page_tree,
435 req->wb_index, 410 req->wb_index,
436 NFS_PAGE_TAG_COMMIT); 411 NFS_PAGE_TAG_COMMIT);
@@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req)
440 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 415 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
441} 416}
442 417
418static int
419nfs_clear_request_commit(struct nfs_page *req)
420{
421 struct page *page = req->wb_page;
422
423 if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
424 dec_zone_page_state(page, NR_UNSTABLE_NFS);
425 dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
426 return 1;
427 }
428 return 0;
429}
430
443static inline 431static inline
444int nfs_write_need_commit(struct nfs_write_data *data) 432int nfs_write_need_commit(struct nfs_write_data *data)
445{ 433{
@@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
449static inline 437static inline
450int nfs_reschedule_unstable_write(struct nfs_page *req) 438int nfs_reschedule_unstable_write(struct nfs_page *req)
451{ 439{
452 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 440 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
453 nfs_mark_request_commit(req); 441 nfs_mark_request_commit(req);
454 return 1; 442 return 1;
455 } 443 }
@@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req)
465{ 453{
466} 454}
467 455
456static inline int
457nfs_clear_request_commit(struct nfs_page *req)
458{
459 return 0;
460}
461
468static inline 462static inline
469int nfs_write_need_commit(struct nfs_write_data *data) 463int nfs_write_need_commit(struct nfs_write_data *data)
470{ 464{
@@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
522 516
523 while(!list_empty(head)) { 517 while(!list_empty(head)) {
524 req = nfs_list_entry(head->next); 518 req = nfs_list_entry(head->next);
525 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
526 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
527 BDI_RECLAIMABLE);
528 nfs_list_remove_request(req); 519 nfs_list_remove_request(req);
529 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); 520 nfs_clear_request_commit(req);
530 nfs_inode_remove_request(req); 521 nfs_inode_remove_request(req);
531 nfs_unlock_request(req); 522 nfs_unlock_request(req);
532 } 523 }
@@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
564#endif 555#endif
565 556
566/* 557/*
567 * Try to update any existing write request, or create one if there is none. 558 * Search for an existing write request, and attempt to update
568 * In order to match, the request's credentials must match those of 559 * it to reflect a new dirty region on a given page.
569 * the calling process.
570 * 560 *
571 * Note: Should always be called with the Page Lock held! 561 * If the attempt fails, then the existing request is flushed out
562 * to disk.
572 */ 563 */
573static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, 564static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
574 struct page *page, unsigned int offset, unsigned int bytes) 565 struct page *page,
566 unsigned int offset,
567 unsigned int bytes)
575{ 568{
576 struct address_space *mapping = page->mapping; 569 struct nfs_page *req;
577 struct inode *inode = mapping->host; 570 unsigned int rqend;
578 struct nfs_page *req, *new = NULL; 571 unsigned int end;
579 pgoff_t rqend, end; 572 int error;
573
574 if (!PagePrivate(page))
575 return NULL;
580 576
581 end = offset + bytes; 577 end = offset + bytes;
578 spin_lock(&inode->i_lock);
582 579
583 for (;;) { 580 for (;;) {
584 /* Loop over all inode entries and see if we find 581 req = nfs_page_find_request_locked(page);
585 * A request for the page we wish to update 582 if (req == NULL)
583 goto out_unlock;
584
585 rqend = req->wb_offset + req->wb_bytes;
586 /*
587 * Tell the caller to flush out the request if
588 * the offsets are non-contiguous.
589 * Note: nfs_flush_incompatible() will already
590 * have flushed out requests having wrong owners.
586 */ 591 */
587 if (new) { 592 if (offset > rqend
588 if (radix_tree_preload(GFP_NOFS)) { 593 || end < req->wb_offset)
589 nfs_release_request(new); 594 goto out_flushme;
590 return ERR_PTR(-ENOMEM);
591 }
592 }
593 595
594 spin_lock(&inode->i_lock); 596 if (nfs_set_page_tag_locked(req))
595 req = nfs_page_find_request_locked(page);
596 if (req) {
597 if (!nfs_set_page_tag_locked(req)) {
598 int error;
599
600 spin_unlock(&inode->i_lock);
601 error = nfs_wait_on_request(req);
602 nfs_release_request(req);
603 if (error < 0) {
604 if (new) {
605 radix_tree_preload_end();
606 nfs_release_request(new);
607 }
608 return ERR_PTR(error);
609 }
610 continue;
611 }
612 spin_unlock(&inode->i_lock);
613 if (new) {
614 radix_tree_preload_end();
615 nfs_release_request(new);
616 }
617 break; 597 break;
618 }
619 598
620 if (new) { 599 /* The request is locked, so wait and then retry */
621 nfs_lock_request_dontget(new);
622 nfs_inode_add_request(inode, new);
623 spin_unlock(&inode->i_lock);
624 radix_tree_preload_end();
625 req = new;
626 goto zero_page;
627 }
628 spin_unlock(&inode->i_lock); 600 spin_unlock(&inode->i_lock);
629 601 error = nfs_wait_on_request(req);
630 new = nfs_create_request(ctx, inode, page, offset, bytes); 602 nfs_release_request(req);
631 if (IS_ERR(new)) 603 if (error != 0)
632 return new; 604 goto out_err;
605 spin_lock(&inode->i_lock);
633 } 606 }
634 607
635 /* We have a request for our page. 608 if (nfs_clear_request_commit(req))
636 * If the creds don't match, or the 609 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
637 * page addresses don't match, 610 req->wb_index, NFS_PAGE_TAG_COMMIT);
638 * tell the caller to wait on the conflicting
639 * request.
640 */
641 rqend = req->wb_offset + req->wb_bytes;
642 if (req->wb_context != ctx
643 || req->wb_page != page
644 || !nfs_dirty_request(req)
645 || offset > rqend || end < req->wb_offset) {
646 nfs_clear_page_tag_locked(req);
647 return ERR_PTR(-EBUSY);
648 }
649 611
650 /* Okay, the request matches. Update the region */ 612 /* Okay, the request matches. Update the region */
651 if (offset < req->wb_offset) { 613 if (offset < req->wb_offset) {
652 req->wb_offset = offset; 614 req->wb_offset = offset;
653 req->wb_pgbase = offset; 615 req->wb_pgbase = offset;
654 req->wb_bytes = max(end, rqend) - req->wb_offset;
655 goto zero_page;
656 } 616 }
657
658 if (end > rqend) 617 if (end > rqend)
659 req->wb_bytes = end - req->wb_offset; 618 req->wb_bytes = end - req->wb_offset;
660 619 else
620 req->wb_bytes = rqend - req->wb_offset;
621out_unlock:
622 spin_unlock(&inode->i_lock);
661 return req; 623 return req;
662zero_page: 624out_flushme:
663 /* If this page might potentially be marked as up to date, 625 spin_unlock(&inode->i_lock);
664 * then we need to zero any uninitalised data. */ 626 nfs_release_request(req);
665 if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE 627 error = nfs_wb_page(inode, page);
666 && !PageUptodate(req->wb_page)) 628out_err:
667 zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); 629 return ERR_PTR(error);
630}
631
632/*
633 * Try to update an existing write request, or create one if there is none.
634 *
635 * Note: Should always be called with the Page Lock held to prevent races
636 * if we have to add a new request. Also assumes that the caller has
637 * already called nfs_flush_incompatible() if necessary.
638 */
639static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
640 struct page *page, unsigned int offset, unsigned int bytes)
641{
642 struct inode *inode = page->mapping->host;
643 struct nfs_page *req;
644 int error;
645
646 req = nfs_try_to_update_request(inode, page, offset, bytes);
647 if (req != NULL)
648 goto out;
649 req = nfs_create_request(ctx, inode, page, offset, bytes);
650 if (IS_ERR(req))
651 goto out;
652 error = nfs_inode_add_request(inode, req);
653 if (error != 0) {
654 nfs_release_request(req);
655 req = ERR_PTR(error);
656 }
657out:
668 return req; 658 return req;
669} 659}
670 660
661static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
662 unsigned int offset, unsigned int count)
663{
664 struct nfs_page *req;
665
666 req = nfs_setup_write_request(ctx, page, offset, count);
667 if (IS_ERR(req))
668 return PTR_ERR(req);
669 /* Update file length */
670 nfs_grow_file(page, offset, count);
671 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
672 nfs_clear_page_tag_locked(req);
673 return 0;
674}
675
671int nfs_flush_incompatible(struct file *file, struct page *page) 676int nfs_flush_incompatible(struct file *file, struct page *page)
672{ 677{
673 struct nfs_open_context *ctx = nfs_file_open_context(file); 678 struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
685 req = nfs_page_find_request(page); 690 req = nfs_page_find_request(page);
686 if (req == NULL) 691 if (req == NULL)
687 return 0; 692 return 0;
688 do_flush = req->wb_page != page || req->wb_context != ctx 693 do_flush = req->wb_page != page || req->wb_context != ctx;
689 || !nfs_dirty_request(req);
690 nfs_release_request(req); 694 nfs_release_request(req);
691 if (!do_flush) 695 if (!do_flush)
692 return 0; 696 return 0;
@@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page,
721 725
722 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); 726 nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
723 727
724 dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", 728 dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
725 file->f_path.dentry->d_parent->d_name.name, 729 file->f_path.dentry->d_parent->d_name.name,
726 file->f_path.dentry->d_name.name, count, 730 file->f_path.dentry->d_name.name, count,
727 (long long)(page_offset(page) +offset)); 731 (long long)(page_offset(page) + offset));
728 732
729 /* If we're not using byte range locks, and we know the page 733 /* If we're not using byte range locks, and we know the page
730 * is up to date, it may be more efficient to extend the write 734 * is up to date, it may be more efficient to extend the write
@@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page,
744 else 748 else
745 __set_page_dirty_nobuffers(page); 749 __set_page_dirty_nobuffers(page);
746 750
747 dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", 751 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
748 status, (long long)i_size_read(inode)); 752 status, (long long)i_size_read(inode));
749 return status; 753 return status;
750} 754}
@@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page,
752static void nfs_writepage_release(struct nfs_page *req) 756static void nfs_writepage_release(struct nfs_page *req)
753{ 757{
754 758
755 if (PageError(req->wb_page)) { 759 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
756 nfs_end_page_writeback(req->wb_page);
757 nfs_inode_remove_request(req);
758 } else if (!nfs_reschedule_unstable_write(req)) {
759 /* Set the PG_uptodate flag */
760 nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
761 nfs_end_page_writeback(req->wb_page); 760 nfs_end_page_writeback(req->wb_page);
762 nfs_inode_remove_request(req); 761 nfs_inode_remove_request(req);
763 } else 762 } else
@@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
834 NFS_PROTO(inode)->write_setup(data, &msg); 833 NFS_PROTO(inode)->write_setup(data, &msg);
835 834
836 dprintk("NFS: %5u initiated write call " 835 dprintk("NFS: %5u initiated write call "
837 "(req %s/%Ld, %u bytes @ offset %Lu)\n", 836 "(req %s/%lld, %u bytes @ offset %llu)\n",
838 data->task.tk_pid, 837 data->task.tk_pid,
839 inode->i_sb->s_id, 838 inode->i_sb->s_id,
840 (long long)NFS_FILEID(inode), 839 (long long)NFS_FILEID(inode),
@@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
978static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) 977static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
979{ 978{
980 struct nfs_write_data *data = calldata; 979 struct nfs_write_data *data = calldata;
981 struct nfs_page *req = data->req;
982 980
983 dprintk("NFS: write (%s/%Ld %d@%Ld)", 981 dprintk("NFS: %5u write(%s/%lld %d@%lld)",
984 req->wb_context->path.dentry->d_inode->i_sb->s_id, 982 task->tk_pid,
985 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 983 data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
986 req->wb_bytes, 984 (long long)
987 (long long)req_offset(req)); 985 NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
986 data->req->wb_bytes, (long long)req_offset(data->req));
988 987
989 nfs_writeback_done(task, data); 988 nfs_writeback_done(task, data);
990} 989}
@@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata)
1058 1057
1059 nfs_list_remove_request(req); 1058 nfs_list_remove_request(req);
1060 1059
1061 dprintk("NFS: write (%s/%Ld %d@%Ld)", 1060 dprintk("NFS: %5u write (%s/%lld %d@%lld)",
1061 data->task.tk_pid,
1062 req->wb_context->path.dentry->d_inode->i_sb->s_id, 1062 req->wb_context->path.dentry->d_inode->i_sb->s_id,
1063 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 1063 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
1064 req->wb_bytes, 1064 req->wb_bytes,
@@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata)
1078 dprintk(" marked for commit\n"); 1078 dprintk(" marked for commit\n");
1079 goto next; 1079 goto next;
1080 } 1080 }
1081 /* Set the PG_uptodate flag? */
1082 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
1083 dprintk(" OK\n"); 1081 dprintk(" OK\n");
1084remove_request: 1082remove_request:
1085 nfs_end_page_writeback(page); 1083 nfs_end_page_writeback(page);
@@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1133 static unsigned long complain; 1131 static unsigned long complain;
1134 1132
1135 if (time_before(complain, jiffies)) { 1133 if (time_before(complain, jiffies)) {
1136 dprintk("NFS: faulty NFS server %s:" 1134 dprintk("NFS: faulty NFS server %s:"
1137 " (committed = %d) != (stable = %d)\n", 1135 " (committed = %d) != (stable = %d)\n",
1138 NFS_SERVER(data->inode)->nfs_client->cl_hostname, 1136 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1139 resp->verf->committed, argp->stable); 1137 resp->verf->committed, argp->stable);
@@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata)
1297 while (!list_empty(&data->pages)) { 1295 while (!list_empty(&data->pages)) {
1298 req = nfs_list_entry(data->pages.next); 1296 req = nfs_list_entry(data->pages.next);
1299 nfs_list_remove_request(req); 1297 nfs_list_remove_request(req);
1300 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); 1298 nfs_clear_request_commit(req);
1301 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1302 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1303 BDI_RECLAIMABLE);
1304 1299
1305 dprintk("NFS: commit (%s/%Ld %d@%Ld)", 1300 dprintk("NFS: commit (%s/%lld %d@%lld)",
1306 req->wb_context->path.dentry->d_inode->i_sb->s_id, 1301 req->wb_context->path.dentry->d_inode->i_sb->s_id,
1307 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 1302 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
1308 req->wb_bytes, 1303 req->wb_bytes,
@@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata)
1318 * returned by the server against all stored verfs. */ 1313 * returned by the server against all stored verfs. */
1319 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { 1314 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
1320 /* We have a match */ 1315 /* We have a match */
1321 /* Set the PG_uptodate flag */
1322 nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
1323 req->wb_bytes);
1324 nfs_inode_remove_request(req); 1316 nfs_inode_remove_request(req);
1325 dprintk(" OK\n"); 1317 dprintk(" OK\n");
1326 goto next; 1318 goto next;
@@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1479 req = nfs_page_find_request(page); 1471 req = nfs_page_find_request(page);
1480 if (req == NULL) 1472 if (req == NULL)
1481 goto out; 1473 goto out;
1482 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 1474 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1483 nfs_release_request(req); 1475 nfs_release_request(req);
1484 break; 1476 break;
1485 } 1477 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4d4760e687c3..702fa577aa6e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -381,7 +381,7 @@ static int do_probe_callback(void *data)
381 .program = &cb_program, 381 .program = &cb_program,
382 .version = nfs_cb_version[1]->number, 382 .version = nfs_cb_version[1]->number,
383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
384 .flags = (RPC_CLNT_CREATE_NOPING), 384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 }; 385 };
386 struct rpc_message msg = { 386 struct rpc_message msg = {
387 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 387 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 17964c0505a9..1db080135c6d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -174,10 +174,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
174 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
175 * system which doesn't support holes, in which case BH_New 175 * system which doesn't support holes, in which case BH_New
176 * allows block_prepare_write() to zero. 176 * allows block_prepare_write() to zero.
177 *
178 * If we see this on a sparse file system, then a truncate has
179 * raced us and removed the cluster. In this case, we clear
180 * the buffers dirty and uptodate bits and let the buffer code
181 * ignore it as a hole.
177 */ 182 */
178 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), 183 if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
179 "ino %lu, iblock %llu\n", inode->i_ino, 184 clear_buffer_dirty(bh_result);
180 (unsigned long long)iblock); 185 clear_buffer_uptodate(bh_result);
186 goto bail;
187 }
181 188
182 /* Treat the unwritten extent as a hole for zeroing purposes. */ 189 /* Treat the unwritten extent as a hole for zeroing purposes. */
183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 190 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f02ccb34604d..443d108211ab 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1489,25 +1489,28 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1489 : NULL; 1489 : NULL;
1490} 1490}
1491 1491
1492static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 1492static int o2hb_heartbeat_group_make_item(struct config_group *group,
1493 const char *name) 1493 const char *name,
1494 struct config_item **new_item)
1494{ 1495{
1495 struct o2hb_region *reg = NULL; 1496 struct o2hb_region *reg = NULL;
1496 struct config_item *ret = NULL; 1497 int ret = 0;
1497 1498
1498 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 1499 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1499 if (reg == NULL) 1500 if (reg == NULL) {
1500 goto out; /* ENOMEM */ 1501 ret = -ENOMEM;
1502 goto out;
1503 }
1501 1504
1502 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 1505 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1503 1506
1504 ret = &reg->hr_item; 1507 *new_item = &reg->hr_item;
1505 1508
1506 spin_lock(&o2hb_live_lock); 1509 spin_lock(&o2hb_live_lock);
1507 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 1510 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1508 spin_unlock(&o2hb_live_lock); 1511 spin_unlock(&o2hb_live_lock);
1509out: 1512out:
1510 if (ret == NULL) 1513 if (ret)
1511 kfree(reg); 1514 kfree(reg);
1512 1515
1513 return ret; 1516 return ret;
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7bf3c0ea7bd9..d8bfa0eb41b2 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -146,8 +146,10 @@ static int nst_seq_show(struct seq_file *seq, void *v)
146 nst->st_task->comm, nst->st_node, 146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type, 147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key, 148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, 149 nst->st_sock_time.tv_sec,
150 nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, 150 (unsigned long)nst->st_sock_time.tv_usec,
151 nst->st_send_time.tv_sec,
152 (unsigned long)nst->st_send_time.tv_usec,
151 nst->st_status_time.tv_sec, 153 nst->st_status_time.tv_sec,
152 nst->st_status_time.tv_usec); 154 nst->st_status_time.tv_usec);
153 } 155 }
@@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
274 return sc; /* unused, just needs to be null when done */ 276 return sc; /* unused, just needs to be null when done */
275} 277}
276 278
277#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec 279#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec
278 280
279static int sc_seq_show(struct seq_file *seq, void *v) 281static int sc_seq_show(struct seq_file *seq, void *v)
280{ 282{
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cfdb08b484ed..b364b7052e46 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -644,27 +644,32 @@ out:
644 return ret; 644 return ret;
645} 645}
646 646
647static struct config_item *o2nm_node_group_make_item(struct config_group *group, 647static int o2nm_node_group_make_item(struct config_group *group,
648 const char *name) 648 const char *name,
649 struct config_item **new_item)
649{ 650{
650 struct o2nm_node *node = NULL; 651 struct o2nm_node *node = NULL;
651 struct config_item *ret = NULL; 652 int ret = 0;
652 653
653 if (strlen(name) > O2NM_MAX_NAME_LEN) 654 if (strlen(name) > O2NM_MAX_NAME_LEN) {
654 goto out; /* ENAMETOOLONG */ 655 ret = -ENAMETOOLONG;
656 goto out;
657 }
655 658
656 node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); 659 node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
657 if (node == NULL) 660 if (node == NULL) {
658 goto out; /* ENOMEM */ 661 ret = -ENOMEM;
662 goto out;
663 }
659 664
660 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ 665 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
661 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 666 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
662 spin_lock_init(&node->nd_lock); 667 spin_lock_init(&node->nd_lock);
663 668
664 ret = &node->nd_item; 669 *new_item = &node->nd_item;
665 670
666out: 671out:
667 if (ret == NULL) 672 if (ret)
668 kfree(node); 673 kfree(node);
669 674
670 return ret; 675 return ret;
@@ -751,25 +756,31 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro
751} 756}
752#endif 757#endif
753 758
754static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, 759static int o2nm_cluster_group_make_group(struct config_group *group,
755 const char *name) 760 const char *name,
761 struct config_group **new_group)
756{ 762{
757 struct o2nm_cluster *cluster = NULL; 763 struct o2nm_cluster *cluster = NULL;
758 struct o2nm_node_group *ns = NULL; 764 struct o2nm_node_group *ns = NULL;
759 struct config_group *o2hb_group = NULL, *ret = NULL; 765 struct config_group *o2hb_group = NULL;
760 void *defs = NULL; 766 void *defs = NULL;
767 int ret = 0;
761 768
762 /* this runs under the parent dir's i_mutex; there can be only 769 /* this runs under the parent dir's i_mutex; there can be only
763 * one caller in here at a time */ 770 * one caller in here at a time */
764 if (o2nm_single_cluster) 771 if (o2nm_single_cluster) {
765 goto out; /* ENOSPC */ 772 ret = -ENOSPC;
773 goto out;
774 }
766 775
767 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); 776 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
768 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); 777 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
769 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); 778 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
770 o2hb_group = o2hb_alloc_hb_set(); 779 o2hb_group = o2hb_alloc_hb_set();
771 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) 780 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) {
781 ret = -ENOMEM;
772 goto out; 782 goto out;
783 }
773 784
774 config_group_init_type_name(&cluster->cl_group, name, 785 config_group_init_type_name(&cluster->cl_group, name,
775 &o2nm_cluster_type); 786 &o2nm_cluster_type);
@@ -786,11 +797,11 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
786 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; 797 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
787 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; 798 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
788 799
789 ret = &cluster->cl_group; 800 *new_group = &cluster->cl_group;
790 o2nm_single_cluster = cluster; 801 o2nm_single_cluster = cluster;
791 802
792out: 803out:
793 if (ret == NULL) { 804 if (ret) {
794 kfree(cluster); 805 kfree(cluster);
795 kfree(ns); 806 kfree(ns);
796 o2hb_free_hb_set(o2hb_group); 807 o2hb_free_hb_set(o2hb_group);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128a..44f87caf3683 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
606 606
607 res->last_used = 0; 607 res->last_used = 0;
608 608
609 spin_lock(&dlm->spinlock);
609 list_add_tail(&res->tracking, &dlm->tracking_list); 610 list_add_tail(&res->tracking, &dlm->tracking_list);
611 spin_unlock(&dlm->spinlock);
610 612
611 memset(res->lvb, 0, DLM_LVB_LEN); 613 memset(res->lvb, 0, DLM_LVB_LEN);
612 memset(res->refmap, 0, sizeof(res->refmap)); 614 memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a5..eae3d643a5e4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -31,6 +31,7 @@
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/time.h>
34 35
35#define MLOG_MASK_PREFIX ML_DLM_GLUE 36#define MLOG_MASK_PREFIX ML_DLM_GLUE
36#include <cluster/masklog.h> 37#include <cluster/masklog.h>
@@ -59,6 +60,9 @@ struct ocfs2_mask_waiter {
59 struct completion mw_complete; 60 struct completion mw_complete;
60 unsigned long mw_mask; 61 unsigned long mw_mask;
61 unsigned long mw_goal; 62 unsigned long mw_goal;
63#ifdef CONFIG_OCFS2_FS_STATS
64 unsigned long long mw_lock_start;
65#endif
62}; 66};
63 67
64static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 68static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
@@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
366 spin_unlock(&ocfs2_dlm_tracking_lock); 370 spin_unlock(&ocfs2_dlm_tracking_lock);
367} 371}
368 372
373#ifdef CONFIG_OCFS2_FS_STATS
374static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
375{
376 res->l_lock_num_prmode = 0;
377 res->l_lock_num_prmode_failed = 0;
378 res->l_lock_total_prmode = 0;
379 res->l_lock_max_prmode = 0;
380 res->l_lock_num_exmode = 0;
381 res->l_lock_num_exmode_failed = 0;
382 res->l_lock_total_exmode = 0;
383 res->l_lock_max_exmode = 0;
384 res->l_lock_refresh = 0;
385}
386
387static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
388 struct ocfs2_mask_waiter *mw, int ret)
389{
390 unsigned long long *num, *sum;
391 unsigned int *max, *failed;
392 struct timespec ts = current_kernel_time();
393 unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
394
395 if (level == LKM_PRMODE) {
396 num = &res->l_lock_num_prmode;
397 sum = &res->l_lock_total_prmode;
398 max = &res->l_lock_max_prmode;
399 failed = &res->l_lock_num_prmode_failed;
400 } else if (level == LKM_EXMODE) {
401 num = &res->l_lock_num_exmode;
402 sum = &res->l_lock_total_exmode;
403 max = &res->l_lock_max_exmode;
404 failed = &res->l_lock_num_exmode_failed;
405 } else
406 return;
407
408 (*num)++;
409 (*sum) += time;
410 if (time > *max)
411 *max = time;
412 if (ret)
413 (*failed)++;
414}
415
416static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
417{
418 lockres->l_lock_refresh++;
419}
420
421static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
422{
423 struct timespec ts = current_kernel_time();
424 mw->mw_lock_start = timespec_to_ns(&ts);
425}
426#else
427static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
428{
429}
430static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
431 int level, struct ocfs2_mask_waiter *mw, int ret)
432{
433}
434static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
435{
436}
437static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
438{
439}
440#endif
441
369static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, 442static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
370 struct ocfs2_lock_res *res, 443 struct ocfs2_lock_res *res,
371 enum ocfs2_lock_type type, 444 enum ocfs2_lock_type type,
@@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
385 res->l_flags = OCFS2_LOCK_INITIALIZED; 458 res->l_flags = OCFS2_LOCK_INITIALIZED;
386 459
387 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); 460 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
461
462 ocfs2_init_lock_stats(res);
388} 463}
389 464
390void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) 465void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
1048{ 1123{
1049 INIT_LIST_HEAD(&mw->mw_item); 1124 INIT_LIST_HEAD(&mw->mw_item);
1050 init_completion(&mw->mw_complete); 1125 init_completion(&mw->mw_complete);
1126 ocfs2_init_start_time(mw);
1051} 1127}
1052 1128
1053static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) 1129static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
@@ -1254,6 +1330,7 @@ out:
1254 goto again; 1330 goto again;
1255 mlog_errno(ret); 1331 mlog_errno(ret);
1256 } 1332 }
1333 ocfs2_update_lock_stats(lockres, level, &mw, ret);
1257 1334
1258 mlog_exit(ret); 1335 mlog_exit(ret);
1259 return ret; 1336 return ret;
@@ -1554,8 +1631,8 @@ out:
1554 */ 1631 */
1555int ocfs2_file_lock(struct file *file, int ex, int trylock) 1632int ocfs2_file_lock(struct file *file, int ex, int trylock)
1556{ 1633{
1557 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; 1634 int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1558 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; 1635 unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
1559 unsigned long flags; 1636 unsigned long flags;
1560 struct ocfs2_file_private *fp = file->private_data; 1637 struct ocfs2_file_private *fp = file->private_data;
1561 struct ocfs2_lock_res *lockres = &fp->fp_flock; 1638 struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1659,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1582 * Get the lock at NLMODE to start - that way we 1659 * Get the lock at NLMODE to start - that way we
1583 * can cancel the upconvert request if need be. 1660 * can cancel the upconvert request if need be.
1584 */ 1661 */
1585 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); 1662 ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
1586 if (ret < 0) { 1663 if (ret < 0) {
1587 mlog_errno(ret); 1664 mlog_errno(ret);
1588 goto out; 1665 goto out;
@@ -1597,7 +1674,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1597 } 1674 }
1598 1675
1599 lockres->l_action = OCFS2_AST_CONVERT; 1676 lockres->l_action = OCFS2_AST_CONVERT;
1600 lkm_flags |= LKM_CONVERT; 1677 lkm_flags |= DLM_LKF_CONVERT;
1601 lockres->l_requested = level; 1678 lockres->l_requested = level;
1602 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1679 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1603 1680
@@ -1664,7 +1741,7 @@ void ocfs2_file_unlock(struct file *file)
1664 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) 1741 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1665 return; 1742 return;
1666 1743
1667 if (lockres->l_level == LKM_NLMODE) 1744 if (lockres->l_level == DLM_LOCK_NL)
1668 return; 1745 return;
1669 1746
1670 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", 1747 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1755,11 @@ void ocfs2_file_unlock(struct file *file)
1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 1755 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1679 lockres->l_blocking = DLM_LOCK_EX; 1756 lockres->l_blocking = DLM_LOCK_EX;
1680 1757
1681 gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); 1758 gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1759 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1683 spin_unlock_irqrestore(&lockres->l_lock, flags); 1760 spin_unlock_irqrestore(&lockres->l_lock, flags);
1684 1761
1685 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); 1762 ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
1686 if (ret) { 1763 if (ret) {
1687 mlog_errno(ret); 1764 mlog_errno(ret);
1688 return; 1765 return;
@@ -1983,6 +2060,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
1983 le32_to_cpu(fe->i_flags)); 2060 le32_to_cpu(fe->i_flags));
1984 2061
1985 ocfs2_refresh_inode(inode, fe); 2062 ocfs2_refresh_inode(inode, fe);
2063 ocfs2_track_lock_refresh(lockres);
1986 } 2064 }
1987 2065
1988 status = 0; 2066 status = 0;
@@ -2267,6 +2345,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2267 2345
2268 if (status < 0) 2346 if (status < 0)
2269 mlog_errno(status); 2347 mlog_errno(status);
2348 ocfs2_track_lock_refresh(lockres);
2270 } 2349 }
2271bail: 2350bail:
2272 mlog_exit(status); 2351 mlog_exit(status);
@@ -2461,7 +2540,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
2461} 2540}
2462 2541
2463/* So that debugfs.ocfs2 can determine which format is being used */ 2542/* So that debugfs.ocfs2 can determine which format is being used */
2464#define OCFS2_DLM_DEBUG_STR_VERSION 1 2543#define OCFS2_DLM_DEBUG_STR_VERSION 2
2465static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) 2544static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2466{ 2545{
2467 int i; 2546 int i;
@@ -2502,6 +2581,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2502 for(i = 0; i < DLM_LVB_LEN; i++) 2581 for(i = 0; i < DLM_LVB_LEN; i++)
2503 seq_printf(m, "0x%x\t", lvb[i]); 2582 seq_printf(m, "0x%x\t", lvb[i]);
2504 2583
2584#ifdef CONFIG_OCFS2_FS_STATS
2585# define lock_num_prmode(_l) (_l)->l_lock_num_prmode
2586# define lock_num_exmode(_l) (_l)->l_lock_num_exmode
2587# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed
2588# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed
2589# define lock_total_prmode(_l) (_l)->l_lock_total_prmode
2590# define lock_total_exmode(_l) (_l)->l_lock_total_exmode
2591# define lock_max_prmode(_l) (_l)->l_lock_max_prmode
2592# define lock_max_exmode(_l) (_l)->l_lock_max_exmode
2593# define lock_refresh(_l) (_l)->l_lock_refresh
2594#else
2595# define lock_num_prmode(_l) (0ULL)
2596# define lock_num_exmode(_l) (0ULL)
2597# define lock_num_prmode_failed(_l) (0)
2598# define lock_num_exmode_failed(_l) (0)
2599# define lock_total_prmode(_l) (0ULL)
2600# define lock_total_exmode(_l) (0ULL)
2601# define lock_max_prmode(_l) (0)
2602# define lock_max_exmode(_l) (0)
2603# define lock_refresh(_l) (0)
2604#endif
2605 /* The following seq_print was added in version 2 of this output */
2606 seq_printf(m, "%llu\t"
2607 "%llu\t"
2608 "%u\t"
2609 "%u\t"
2610 "%llu\t"
2611 "%llu\t"
2612 "%u\t"
2613 "%u\t"
2614 "%u\t",
2615 lock_num_prmode(lockres),
2616 lock_num_exmode(lockres),
2617 lock_num_prmode_failed(lockres),
2618 lock_num_exmode_failed(lockres),
2619 lock_total_prmode(lockres),
2620 lock_total_exmode(lockres),
2621 lock_max_prmode(lockres),
2622 lock_max_exmode(lockres),
2623 lock_refresh(lockres));
2624
2505 /* End the line */ 2625 /* End the line */
2506 seq_printf(m, "\n"); 2626 seq_printf(m, "\n");
2507 return 0; 2627 return 0;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 57e0d30cde98..e8514e8b6ce8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2202,7 +2202,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2202 2202
2203 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2203 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2204 if (ret == -EINVAL) 2204 if (ret == -EINVAL)
2205 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 2205 mlog(0, "generic_file_aio_read returned -EINVAL\n");
2206 2206
2207 /* buffered aio wouldn't have proper lock coverage today */ 2207 /* buffered aio wouldn't have proper lock coverage today */
2208 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2208 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9698338adc39..a8c19cb3cfdd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -329,7 +329,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
329 329
330 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 330 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
331 331
332#ifdef OCFS2_DEBUG_FS 332#ifdef CONFIG_OCFS2_DEBUG_FS
333 status = 1; 333 status = 1;
334#else 334#else
335 status = journal_extend(handle, nblocks); 335 status = journal_extend(handle, nblocks);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index be774bdc8b36..28e492e4ec88 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -498,7 +498,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
498 498
499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
500 500
501#ifdef OCFS2_DEBUG_FS 501#ifdef CONFIG_OCFS2_DEBUG_FS
502 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 502 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
503 ocfs2_local_alloc_count_bits(alloc)) { 503 ocfs2_local_alloc_count_bits(alloc)) {
504 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 504 ocfs2_error(osb->sb, "local alloc inode %llu says it has "
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 31692379c170..1cb814be8ef1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -132,6 +132,18 @@ struct ocfs2_lock_res {
132 wait_queue_head_t l_event; 132 wait_queue_head_t l_event;
133 133
134 struct list_head l_debug_list; 134 struct list_head l_debug_list;
135
136#ifdef CONFIG_OCFS2_FS_STATS
137 unsigned long long l_lock_num_prmode; /* PR acquires */
138 unsigned long long l_lock_num_exmode; /* EX acquires */
139 unsigned int l_lock_num_prmode_failed; /* Failed PR gets */
140 unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
141 unsigned long long l_lock_total_prmode; /* Tot wait for PR */
142 unsigned long long l_lock_total_exmode; /* Tot wait for EX */
143 unsigned int l_lock_max_prmode; /* Max wait for PR */
144 unsigned int l_lock_max_exmode; /* Max wait for EX */
145 unsigned int l_lock_refresh; /* Disk refreshes */
146#endif
135}; 147};
136 148
137struct ocfs2_dlm_debug { 149struct ocfs2_dlm_debug {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 52c426665154..3f1945177629 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -901,7 +901,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
901 * list has a copy per slot. 901 * list has a copy per slot.
902 */ 902 */
903 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) 903 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
904 chars = snprintf(buf, len, 904 chars = snprintf(buf, len, "%s",
905 ocfs2_system_inodes[type].si_name); 905 ocfs2_system_inodes[type].si_name);
906 else 906 else
907 chars = snprintf(buf, len, 907 chars = snprintf(buf, len,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd462..353fc35c6748 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/smp_lock.h>
24#include <linux/reboot.h> 25#include <linux/reboot.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -549,26 +550,17 @@ static ssize_t ocfs2_control_read(struct file *file,
549 size_t count, 550 size_t count,
550 loff_t *ppos) 551 loff_t *ppos)
551{ 552{
552 char *proto_string = OCFS2_CONTROL_PROTO; 553 ssize_t ret;
553 size_t to_write = 0;
554
555 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
556 return 0;
557
558 to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
559 if (to_write > count)
560 to_write = count;
561 if (copy_to_user(buf, proto_string + *ppos, to_write))
562 return -EFAULT;
563 554
564 *ppos += to_write; 555 ret = simple_read_from_buffer(buf, count, ppos,
556 OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
565 557
566 /* Have we read the whole protocol list? */ 558 /* Have we read the whole protocol list? */
567 if (*ppos >= OCFS2_CONTROL_PROTO_LEN) 559 if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
568 ocfs2_control_set_handshake_state(file, 560 ocfs2_control_set_handshake_state(file,
569 OCFS2_CONTROL_HANDSHAKE_READ); 561 OCFS2_CONTROL_HANDSHAKE_READ);
570 562
571 return to_write; 563 return ret;
572} 564}
573 565
574static int ocfs2_control_release(struct inode *inode, struct file *file) 566static int ocfs2_control_release(struct inode *inode, struct file *file)
@@ -619,10 +611,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
619 return -ENOMEM; 611 return -ENOMEM;
620 p->op_this_node = -1; 612 p->op_this_node = -1;
621 613
614 lock_kernel();
622 mutex_lock(&ocfs2_control_lock); 615 mutex_lock(&ocfs2_control_lock);
623 file->private_data = p; 616 file->private_data = p;
624 list_add(&p->op_list, &ocfs2_control_private_list); 617 list_add(&p->op_list, &ocfs2_control_private_list);
625 mutex_unlock(&ocfs2_control_lock); 618 mutex_unlock(&ocfs2_control_lock);
619 unlock_kernel();
626 620
627 return 0; 621 return 0;
628} 622}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index df63ba20ae90..ccecfe5094fa 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1703,7 +1703,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
1703 local = ocfs2_mount_local(osb); 1703 local = ocfs2_mount_local(osb);
1704 1704
1705 /* will play back anything left in the journal. */ 1705 /* will play back anything left in the journal. */
1706 ocfs2_journal_load(osb->journal, local); 1706 status = ocfs2_journal_load(osb->journal, local);
1707 if (status < 0) {
1708 mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
1709 goto finally;
1710 }
1707 1711
1708 if (dirty) { 1712 if (dirty) {
1709 /* recover my local alloc if we didn't unmount cleanly. */ 1713 /* recover my local alloc if we didn't unmount cleanly. */
diff --git a/fs/open.c b/fs/open.c
index a1450086e92f..a99ad09c3197 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,6 +16,7 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/backing-dev.h> 17#include <linux/backing-dev.h>
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/securebits.h>
19#include <linux/security.h> 20#include <linux/security.h>
20#include <linux/mount.h> 21#include <linux/mount.h>
21#include <linux/vfs.h> 22#include <linux/vfs.h>
@@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
425{ 426{
426 struct nameidata nd; 427 struct nameidata nd;
427 int old_fsuid, old_fsgid; 428 int old_fsuid, old_fsgid;
428 kernel_cap_t old_cap; 429 kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */
429 int res; 430 int res;
430 431
431 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ 432 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
@@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
433 434
434 old_fsuid = current->fsuid; 435 old_fsuid = current->fsuid;
435 old_fsgid = current->fsgid; 436 old_fsgid = current->fsgid;
436 old_cap = current->cap_effective;
437 437
438 current->fsuid = current->uid; 438 current->fsuid = current->uid;
439 current->fsgid = current->gid; 439 current->fsgid = current->gid;
440 440
441 /* 441 if (!issecure(SECURE_NO_SETUID_FIXUP)) {
442 * Clear the capabilities if we switch to a non-root user 442 /*
443 * 443 * Clear the capabilities if we switch to a non-root user
444 * FIXME: There is a race here against sys_capset. The 444 */
445 * capabilities can change yet we will restore the old 445#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
446 * value below. We should hold task_capabilities_lock, 446 /*
447 * but we cannot because user_path_walk can sleep. 447 * FIXME: There is a race here against sys_capset. The
448 */ 448 * capabilities can change yet we will restore the old
449 if (current->uid) 449 * value below. We should hold task_capabilities_lock,
450 cap_clear(current->cap_effective); 450 * but we cannot because user_path_walk can sleep.
451 else 451 */
452 current->cap_effective = current->cap_permitted; 452#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
453 if (current->uid)
454 old_cap = cap_set_effective(__cap_empty_set);
455 else
456 old_cap = cap_set_effective(current->cap_permitted);
457 }
453 458
454 res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); 459 res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
455 if (res) 460 if (res)
@@ -478,7 +483,9 @@ out_path_release:
478out: 483out:
479 current->fsuid = old_fsuid; 484 current->fsuid = old_fsuid;
480 current->fsgid = old_fsgid; 485 current->fsgid = old_fsgid;
481 current->cap_effective = old_cap; 486
487 if (!issecure(SECURE_NO_SETUID_FIXUP))
488 cap_set_effective(old_cap);
482 489
483 return res; 490 return res;
484} 491}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7ff..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
233 */ 233 */
234 if (task->parent == current && (task->ptrace & PT_PTRACED) && 234 if (task->parent == current && (task->ptrace & PT_PTRACED) &&
235 task_is_stopped_or_traced(task) && 235 task_is_stopped_or_traced(task) &&
236 ptrace_may_attach(task)) 236 ptrace_may_access(task, PTRACE_MODE_ATTACH))
237 return 0; 237 return 0;
238 238
239 /* 239 /*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
251 task_lock(task); 251 task_lock(task);
252 if (task->mm != mm) 252 if (task->mm != mm)
253 goto out; 253 goto out;
254 if (task->mm != current->mm && __ptrace_may_attach(task) < 0) 254 if (task->mm != current->mm &&
255 __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
255 goto out; 256 goto out;
256 task_unlock(task); 257 task_unlock(task);
257 return mm; 258 return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
518 */ 519 */
519 task = get_proc_task(inode); 520 task = get_proc_task(inode);
520 if (task) { 521 if (task) {
521 allowed = ptrace_may_attach(task); 522 allowed = ptrace_may_access(task, PTRACE_MODE_READ);
522 put_task_struct(task); 523 put_task_struct(task);
523 } 524 }
524 return allowed; 525 return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
904 if (!task) 905 if (!task)
905 goto out_no_task; 906 goto out_no_task;
906 907
907 if (!ptrace_may_attach(task)) 908 if (!ptrace_may_access(task, PTRACE_MODE_READ))
908 goto out; 909 goto out;
909 910
910 ret = -ENOMEM; 911 ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad466..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
123 return proc_calc_metrics(page, start, off, count, eof, len); 123 return proc_calc_metrics(page, start, off, count, eof, len);
124} 124}
125 125
126int __attribute__((weak)) arch_report_meminfo(char *page)
127{
128 return 0;
129}
130
126static int meminfo_read_proc(char *page, char **start, off_t off, 131static int meminfo_read_proc(char *page, char **start, off_t off,
127 int count, int *eof, void *data) 132 int count, int *eof, void *data)
128{ 133{
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
221 226
222 len += hugetlb_report_meminfo(page + len); 227 len += hugetlb_report_meminfo(page + len);
223 228
229 len += arch_report_meminfo(page + len);
230
224 return proc_calc_metrics(page, start, off, count, eof, len); 231 return proc_calc_metrics(page, start, off, count, eof, len);
225#undef K 232#undef K
226} 233}
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
472}; 479};
473#endif 480#endif
474 481
482#ifndef arch_irq_stat_cpu
483#define arch_irq_stat_cpu(cpu) 0
484#endif
485#ifndef arch_irq_stat
486#define arch_irq_stat() 0
487#endif
488
475static int show_stat(struct seq_file *p, void *v) 489static int show_stat(struct seq_file *p, void *v)
476{ 490{
477 int i; 491 int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
509 sum += temp; 523 sum += temp;
510 per_irq_sum[j] += temp; 524 per_irq_sum[j] += temp;
511 } 525 }
526 sum += arch_irq_stat_cpu(i);
512 } 527 }
528 sum += arch_irq_stat();
513 529
514 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", 530 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
515 (unsigned long long)cputime64_to_clock_t(user), 531 (unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ab8ccc9d14ff..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
210 dev_t dev = 0; 210 dev_t dev = 0;
211 int len; 211 int len;
212 212
213 if (maps_protect && !ptrace_may_attach(task)) 213 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
214 return -EACCES; 214 return -EACCES;
215 215
216 if (file) { 216 if (file) {
@@ -476,10 +476,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
476 return -ESRCH; 476 return -ESRCH;
477 mm = get_task_mm(task); 477 mm = get_task_mm(task);
478 if (mm) { 478 if (mm) {
479 static struct mm_walk clear_refs_walk; 479 struct mm_walk clear_refs_walk = {
480 memset(&clear_refs_walk, 0, sizeof(clear_refs_walk)); 480 .pmd_entry = clear_refs_pte_range,
481 clear_refs_walk.pmd_entry = clear_refs_pte_range; 481 .mm = mm,
482 clear_refs_walk.mm = mm; 482 };
483 down_read(&mm->mmap_sem); 483 down_read(&mm->mmap_sem);
484 for (vma = mm->mmap; vma; vma = vma->vm_next) { 484 for (vma = mm->mmap; vma; vma = vma->vm_next) {
485 clear_refs_walk.private = vma; 485 clear_refs_walk.private = vma;
@@ -602,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
602 return err; 602 return err;
603} 603}
604 604
605static struct mm_walk pagemap_walk = {
606 .pmd_entry = pagemap_pte_range,
607 .pte_hole = pagemap_pte_hole
608};
609
610/* 605/*
611 * /proc/pid/pagemap - an array mapping virtual pages to pfns 606 * /proc/pid/pagemap - an array mapping virtual pages to pfns
612 * 607 *
@@ -641,12 +636,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
641 struct pagemapread pm; 636 struct pagemapread pm;
642 int pagecount; 637 int pagecount;
643 int ret = -ESRCH; 638 int ret = -ESRCH;
639 struct mm_walk pagemap_walk;
640 unsigned long src;
641 unsigned long svpfn;
642 unsigned long start_vaddr;
643 unsigned long end_vaddr;
644 644
645 if (!task) 645 if (!task)
646 goto out; 646 goto out;
647 647
648 ret = -EACCES; 648 ret = -EACCES;
649 if (!ptrace_may_attach(task)) 649 if (!ptrace_may_access(task, PTRACE_MODE_READ))
650 goto out_task; 650 goto out_task;
651 651
652 ret = -EINVAL; 652 ret = -EINVAL;
@@ -659,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
659 if (!mm) 659 if (!mm)
660 goto out_task; 660 goto out_task;
661 661
662 ret = -ENOMEM; 662
663 uaddr = (unsigned long)buf & PAGE_MASK; 663 uaddr = (unsigned long)buf & PAGE_MASK;
664 uend = (unsigned long)(buf + count); 664 uend = (unsigned long)(buf + count);
665 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; 665 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
666 pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); 666 ret = 0;
667 if (pagecount == 0)
668 goto out_mm;
669 pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
670 ret = -ENOMEM;
667 if (!pages) 671 if (!pages)
668 goto out_mm; 672 goto out_mm;
669 673
@@ -684,33 +688,33 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
684 pm.out = (u64 *)buf; 688 pm.out = (u64 *)buf;
685 pm.end = (u64 *)(buf + count); 689 pm.end = (u64 *)(buf + count);
686 690
687 if (!ptrace_may_attach(task)) { 691 pagemap_walk.pmd_entry = pagemap_pte_range;
688 ret = -EIO; 692 pagemap_walk.pte_hole = pagemap_pte_hole;
689 } else { 693 pagemap_walk.mm = mm;
690 unsigned long src = *ppos; 694 pagemap_walk.private = &pm;
691 unsigned long svpfn = src / PM_ENTRY_BYTES; 695
692 unsigned long start_vaddr = svpfn << PAGE_SHIFT; 696 src = *ppos;
693 unsigned long end_vaddr = TASK_SIZE_OF(task); 697 svpfn = src / PM_ENTRY_BYTES;
694 698 start_vaddr = svpfn << PAGE_SHIFT;
695 /* watch out for wraparound */ 699 end_vaddr = TASK_SIZE_OF(task);
696 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) 700
697 start_vaddr = end_vaddr; 701 /* watch out for wraparound */
698 702 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
699 /* 703 start_vaddr = end_vaddr;
700 * The odds are that this will stop walking way 704
701 * before end_vaddr, because the length of the 705 /*
702 * user buffer is tracked in "pm", and the walk 706 * The odds are that this will stop walking way
703 * will stop when we hit the end of the buffer. 707 * before end_vaddr, because the length of the
704 */ 708 * user buffer is tracked in "pm", and the walk
705 ret = walk_page_range(start_vaddr, end_vaddr, 709 * will stop when we hit the end of the buffer.
706 &pagemap_walk); 710 */
707 if (ret == PM_END_OF_BUFFER) 711 ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
708 ret = 0; 712 if (ret == PM_END_OF_BUFFER)
709 /* don't need mmap_sem for these, but this looks cleaner */ 713 ret = 0;
710 *ppos += (char *)pm.out - buf; 714 /* don't need mmap_sem for these, but this looks cleaner */
711 if (!ret) 715 *ppos += (char *)pm.out - buf;
712 ret = (char *)pm.out - buf; 716 if (!ret)
713 } 717 ret = (char *)pm.out - buf;
714 718
715out_pages: 719out_pages:
716 for (; pagecount; pagecount--) { 720 for (; pagecount; pagecount--) {
@@ -743,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
743 struct proc_maps_private *priv = m->private; 747 struct proc_maps_private *priv = m->private;
744 struct task_struct *task = priv->task; 748 struct task_struct *task = priv->task;
745 749
746 if (maps_protect && !ptrace_may_attach(task)) 750 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
747 return -EACCES; 751 return -EACCES;
748 752
749 return show_numa_map(m, v); 753 return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
113 struct proc_maps_private *priv = m->private; 113 struct proc_maps_private *priv = m->private;
114 struct task_struct *task = priv->task; 114 struct task_struct *task = priv->task;
115 115
116 if (maps_protect && !ptrace_may_attach(task)) 116 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
117 return -EACCES; 117 return -EACCES;
118 118
119 return nommu_vma_show(m, vml->vma); 119 return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = simple_sync_file,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
49}; 50};
50 51
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
43 .aio_write = generic_file_aio_write, 43 .aio_write = generic_file_aio_write,
44 .fsync = simple_sync_file, 44 .fsync = simple_sync_file,
45 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
46 .splice_write = generic_file_splice_write,
46 .llseek = generic_file_llseek, 47 .llseek = generic_file_llseek,
47}; 48};
48 49
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
31 31
32EXPORT_SYMBOL(generic_ro_fops); 32EXPORT_SYMBOL(generic_ro_fops);
33 33
34loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 34loff_t
35generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
35{ 36{
36 loff_t retval; 37 loff_t retval;
37 struct inode *inode = file->f_mapping->host; 38 struct inode *inode = file->f_mapping->host;
38 39
39 mutex_lock(&inode->i_mutex);
40 switch (origin) { 40 switch (origin) {
41 case SEEK_END: 41 case SEEK_END:
42 offset += inode->i_size; 42 offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
46 } 46 }
47 retval = -EINVAL; 47 retval = -EINVAL;
48 if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { 48 if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
49 /* Special lock needed here? */
49 if (offset != file->f_pos) { 50 if (offset != file->f_pos) {
50 file->f_pos = offset; 51 file->f_pos = offset;
51 file->f_version = 0; 52 file->f_version = 0;
52 } 53 }
53 retval = offset; 54 retval = offset;
54 } 55 }
55 mutex_unlock(&inode->i_mutex);
56 return retval; 56 return retval;
57} 57}
58EXPORT_SYMBOL(generic_file_llseek_unlocked);
58 59
59EXPORT_SYMBOL(generic_file_llseek); 60loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
60
61loff_t remote_llseek(struct file *file, loff_t offset, int origin)
62{ 61{
63 loff_t retval; 62 loff_t n;
64 63 mutex_lock(&file->f_dentry->d_inode->i_mutex);
65 lock_kernel(); 64 n = generic_file_llseek_unlocked(file, offset, origin);
66 switch (origin) { 65 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
67 case SEEK_END: 66 return n;
68 offset += i_size_read(file->f_path.dentry->d_inode);
69 break;
70 case SEEK_CUR:
71 offset += file->f_pos;
72 }
73 retval = -EINVAL;
74 if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
75 if (offset != file->f_pos) {
76 file->f_pos = offset;
77 file->f_version = 0;
78 }
79 retval = offset;
80 }
81 unlock_kernel();
82 return retval;
83} 67}
84EXPORT_SYMBOL(remote_llseek); 68EXPORT_SYMBOL(generic_file_llseek);
85 69
86loff_t no_llseek(struct file *file, loff_t offset, int origin) 70loff_t no_llseek(struct file *file, loff_t offset, int origin)
87{ 71{
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 57917932212e..192269698a8a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode)
45 goto out; 45 goto out;
46 reiserfs_update_inode_transaction(inode); 46 reiserfs_update_inode_transaction(inode);
47 47
48 reiserfs_discard_prealloc(&th, inode);
49
48 err = reiserfs_delete_object(&th, inode); 50 err = reiserfs_delete_object(&th, inode);
49 51
50 /* Do quota update inside a transaction for journaled quotas. We must do that 52 /* Do quota update inside a transaction for journaled quotas. We must do that
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ed424d708e69..1d40f2bd1970 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2165 blk++; 2165 blk++;
2166 } 2166 }
2167out: 2167out:
2168 if (len == towrite) 2168 if (len == towrite) {
2169 mutex_unlock(&inode->i_mutex);
2169 return err; 2170 return err;
2171 }
2170 if (inode->i_size < off + len - towrite) 2172 if (inode->i_size < off + len - towrite)
2171 i_size_write(inode, off + len - towrite); 2173 i_size_write(inode, off + len - towrite);
2172 inode->i_version++; 2174 inode->i_version++;
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
422 return error; 422 return error;
423} 423}
424 424
425static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
426{
427 loff_t ret;
428 lock_kernel();
429 ret = generic_file_llseek_unlocked(file, offset, origin);
430 unlock_kernel();
431 return ret;
432}
433
425const struct file_operations smb_file_operations = 434const struct file_operations smb_file_operations =
426{ 435{
427 .llseek = remote_llseek, 436 .llseek = smb_remote_llseek,
428 .read = do_sync_read, 437 .read = do_sync_read,
429 .aio_read = smb_file_aio_read, 438 .aio_read = smb_file_aio_read,
430 .write = do_sync_write, 439 .write = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b305..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
379 lock_page(page); 379 lock_page(page);
380 380
381 /* 381 /*
382 * page was truncated, stop here. if this isn't the 382 * Page was truncated, or invalidated by the
383 * first page, we'll just complete what we already 383 * filesystem. Redo the find/create, but this time the
384 * added 384 * page is kept locked, so there's no chance of another
385 * race with truncate/invalidate.
385 */ 386 */
386 if (!page->mapping) { 387 if (!page->mapping) {
387 unlock_page(page); 388 unlock_page(page);
388 break; 389 page = find_or_create_page(mapping, index,
390 mapping_gfp_mask(mapping));
391
392 if (!page) {
393 error = -ENOMEM;
394 break;
395 }
396 page_cache_release(pages[page_nr]);
397 pages[page_nr] = page;
389 } 398 }
390 /* 399 /*
391 * page was already under io and is now done, great 400 * page was already under io and is now done, great
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
new file mode 100644
index 000000000000..91ceeda7e5bf
--- /dev/null
+++ b/fs/ubifs/Kconfig
@@ -0,0 +1,72 @@
1config UBIFS_FS
2 tristate "UBIFS file system support"
3 select CRC16
4 select CRC32
5 select CRYPTO if UBIFS_FS_ADVANCED_COMPR
6 select CRYPTO if UBIFS_FS_LZO
7 select CRYPTO if UBIFS_FS_ZLIB
8 select CRYPTO_LZO if UBIFS_FS_LZO
9 select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
10 depends on MTD_UBI
11 help
12 UBIFS is a file system for flash devices which works on top of UBI.
13
14config UBIFS_FS_XATTR
15 bool "Extended attributes support"
16 depends on UBIFS_FS
17 help
18 This option enables support of extended attributes.
19
20config UBIFS_FS_ADVANCED_COMPR
21 bool "Advanced compression options"
22 depends on UBIFS_FS
23 help
24 This option allows you to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems.
27
28 If unsure, say 'N'.
29
30config UBIFS_FS_LZO
31 bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
32 depends on UBIFS_FS
33 default y
34 help
35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure.
37
38config UBIFS_FS_ZLIB
39 bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
40 depends on UBIFS_FS
41 default y
42 help
43 Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
44
45# Debugging-related stuff
46config UBIFS_FS_DEBUG
47 bool "Enable debugging"
48 depends on UBIFS_FS
49 select DEBUG_FS
50 select KALLSYMS_ALL
51 help
52 This option enables UBIFS debugging.
53
54config UBIFS_FS_DEBUG_MSG_LVL
55 int "Default message level (0 = no extra messages, 3 = lots)"
56 depends on UBIFS_FS_DEBUG
57 default "0"
58 help
59 This controls the amount of debugging messages produced by UBIFS.
60 If reporting bugs, please try to have available a full dump of the
61 messages at level 1 while the misbehaviour was occurring. Level 2
62 may become necessary if level 1 messages were not enough to find the
63 bug. Generally Level 3 should be avoided.
64
65config UBIFS_FS_DEBUG_CHKS
66 bool "Enable extra checks"
67 depends on UBIFS_FS_DEBUG
68 help
69 If extra checks are enabled UBIFS will check the consistency of its
70 internal data structures during operation. However, UBIFS performance
71 is dramatically slower when this option is selected especially if the
72 file system is large.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
new file mode 100644
index 000000000000..80e93c35e496
--- /dev/null
+++ b/fs/ubifs/Makefile
@@ -0,0 +1,9 @@
1obj-$(CONFIG_UBIFS_FS) += ubifs.o
2
3ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
4ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
5ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
6ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o
7
8ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o
9ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
new file mode 100644
index 000000000000..d81fb9ed2b8e
--- /dev/null
+++ b/fs/ubifs/budget.c
@@ -0,0 +1,731 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the budgeting sub-system which is responsible for UBIFS
25 * space management.
26 *
27 * Factors such as compression, wasted space at the ends of LEBs, space in other
28 * journal heads, the effect of updates on the index, and so on, make it
29 * impossible to accurately predict the amount of space needed. Consequently
30 * approximations are used.
31 */
32
33#include "ubifs.h"
34#include <linux/writeback.h>
35#include <asm/div64.h>
36
37/*
38 * When pessimistic budget calculations say that there is not enough space,
39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
40 * or committing. The below constants define maximum number of times UBIFS
41 * repeats the operations.
42 */
43#define MAX_SHRINK_RETRIES 8
44#define MAX_GC_RETRIES 4
45#define MAX_CMT_RETRIES 2
46#define MAX_NOSPC_RETRIES 1
47
48/*
49 * The constant below defines the number of dirty pages which should be written
50 * back when trying to shrink the liability.
51 */
52#define NR_TO_WRITE 16
53
54/**
55 * struct retries_info - information about re-tries while making free space.
56 * @prev_liability: previous liability
57 * @shrink_cnt: how many times the liability was shrunk
58 * @shrink_retries: count of liability shrink re-tries (increased when
59 * liability does not shrink)
60 * @try_gc: GC should be tried first
61 * @gc_retries: how many times GC was run
62 * @cmt_retries: how many times commit has been done
63 * @nospc_retries: how many times GC returned %-ENOSPC
64 *
65 * Since we consider budgeting to be the fast-path, and this structure has to
66 * be allocated on stack and zeroed out, we make it smaller using bit-fields.
67 */
68struct retries_info {
69 long long prev_liability;
70 unsigned int shrink_cnt;
71 unsigned int shrink_retries:5;
72 unsigned int try_gc:1;
73 unsigned int gc_retries:4;
74 unsigned int cmt_retries:3;
75 unsigned int nospc_retries:1;
76};
77
78/**
79 * shrink_liability - write-back some dirty pages/inodes.
80 * @c: UBIFS file-system description object
81 * @nr_to_write: how many dirty pages to write-back
82 *
83 * This function shrinks UBIFS liability by means of writing back some amount
84 * of dirty inodes and their pages. Returns the amount of pages which were
85 * written back. The returned value does not include dirty inodes which were
86 * synchronized.
87 *
88 * Note, this function synchronizes even VFS inodes which are locked
89 * (@i_mutex) by the caller of the budgeting function, because write-back does
90 * not touch @i_mutex.
91 */
92static int shrink_liability(struct ubifs_info *c, int nr_to_write)
93{
94 int nr_written;
95 struct writeback_control wbc = {
96 .sync_mode = WB_SYNC_NONE,
97 .range_end = LLONG_MAX,
98 .nr_to_write = nr_to_write,
99 };
100
101 generic_sync_sb_inodes(c->vfs_sb, &wbc);
102 nr_written = nr_to_write - wbc.nr_to_write;
103
104 if (!nr_written) {
105 /*
106 * Re-try again but wait on pages/inodes which are being
107 * written-back concurrently (e.g., by pdflush).
108 */
109 memset(&wbc, 0, sizeof(struct writeback_control));
110 wbc.sync_mode = WB_SYNC_ALL;
111 wbc.range_end = LLONG_MAX;
112 wbc.nr_to_write = nr_to_write;
113 generic_sync_sb_inodes(c->vfs_sb, &wbc);
114 nr_written = nr_to_write - wbc.nr_to_write;
115 }
116
117 dbg_budg("%d pages were written back", nr_written);
118 return nr_written;
119}
120
121
122/**
123 * run_gc - run garbage collector.
124 * @c: UBIFS file-system description object
125 *
126 * This function runs garbage collector to make some more free space. Returns
127 * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a
128 * negative error code in case of failure.
129 */
130static int run_gc(struct ubifs_info *c)
131{
132 int err, lnum;
133
134 /* Make some free space by garbage-collecting dirty space */
135 down_read(&c->commit_sem);
136 lnum = ubifs_garbage_collect(c, 1);
137 up_read(&c->commit_sem);
138 if (lnum < 0)
139 return lnum;
140
141 /* GC freed one LEB, return it to lprops */
142 dbg_budg("GC freed LEB %d", lnum);
143 err = ubifs_return_leb(c, lnum);
144 if (err)
145 return err;
146 return 0;
147}
148
149/**
150 * make_free_space - make more free space on the file-system.
151 * @c: UBIFS file-system description object
152 * @ri: information about previous invocations of this function
153 *
154 * This function is called when an operation cannot be budgeted because there
155 * is supposedly no free space. But in most cases there is some free space:
156 * o budgeting is pessimistic, so it always budgets more than is actually
157 * needed, so shrinking the liability is one way to make free space - the
158 * cached data will take less space than it was budgeted for;
159 * o GC may turn some dark space into free space (budgeting treats dark space
160 * as not available);
161 * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs.
162 *
163 * So this function tries to do the above. Returns %-EAGAIN if some free space
164 * was presumably made and the caller has to re-try budgeting the operation.
165 * Returns %-ENOSPC if it couldn't make more free space, and other negative error
166 * codes on failures.
167 */
168static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
169{
170 int err;
171
172 /*
173 * If we have some dirty pages and inodes (liability), try to write
174 * them back unless this was tried too many times without effect
175 * already.
176 */
177 if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
178 long long liability;
179
180 spin_lock(&c->space_lock);
181 liability = c->budg_idx_growth + c->budg_data_growth +
182 c->budg_dd_growth;
183 spin_unlock(&c->space_lock);
184
185 if (ri->prev_liability >= liability) {
186 /* Liability does not shrink, next time try GC then */
187 ri->shrink_retries += 1;
188 if (ri->gc_retries < MAX_GC_RETRIES)
189 ri->try_gc = 1;
190 dbg_budg("liability did not shrink: retries %d of %d",
191 ri->shrink_retries, MAX_SHRINK_RETRIES);
192 }
193
194 dbg_budg("force write-back (count %d)", ri->shrink_cnt);
195 shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
196
197 ri->prev_liability = liability;
198 ri->shrink_cnt += 1;
199 return -EAGAIN;
200 }
201
202 /*
203 * Try to run garbage collector unless it was already tried too many
204 * times.
205 */
206 if (ri->gc_retries < MAX_GC_RETRIES) {
207 ri->gc_retries += 1;
208 dbg_budg("run GC, retries %d of %d",
209 ri->gc_retries, MAX_GC_RETRIES);
210
211 ri->try_gc = 0;
212 err = run_gc(c);
213 if (!err)
214 return -EAGAIN;
215
216 if (err == -EAGAIN) {
217 dbg_budg("GC asked to commit");
218 err = ubifs_run_commit(c);
219 if (err)
220 return err;
221 return -EAGAIN;
222 }
223
224 if (err != -ENOSPC)
225 return err;
226
227 /*
228 * GC could not make any progress. If this is the first time,
229 * then it makes sense to try to commit, because it might make
230 * some dirty space.
231 */
232 dbg_budg("GC returned -ENOSPC, retries %d",
233 ri->nospc_retries);
234 if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
235 return err;
236 ri->nospc_retries += 1;
237 }
238
239 /* Neither GC nor write-back helped, try to commit */
240 if (ri->cmt_retries < MAX_CMT_RETRIES) {
241 ri->cmt_retries += 1;
242 dbg_budg("run commit, retries %d of %d",
243 ri->cmt_retries, MAX_CMT_RETRIES);
244 err = ubifs_run_commit(c);
245 if (err)
246 return err;
247 return -EAGAIN;
248 }
249 return -ENOSPC;
250}
251
252/**
253 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
254 * @c: UBIFS file-system description object
255 *
256 * This function calculates and returns the number of eraseblocks which should
257 * be kept for index usage.
258 */
259int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
260{
261 int ret;
262 uint64_t idx_size;
263
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265
266 /* And make sure we have twice the index size of space reserved */
267 idx_size <<= 1;
268
269 /*
270 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
271 * pair, nor similarly the two variables for the new index size, so we
272 * have to do this costly 64-bit division on fast-path.
273 */
274 if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
275 ret = idx_size + 1;
276 else
277 ret = idx_size;
278 /*
279 * The index head is not available for the in-the-gaps method, so add an
280 * extra LEB to compensate.
281 */
282 ret += 1;
283 /*
284 * At present the index needs at least 2 LEBs: one for the index head
285 * and one for in-the-gaps method (which currently does not cater for
286 * the index head and so excludes it from consideration).
287 */
288 if (ret < 2)
289 ret = 2;
290 return ret;
291}
292
293/**
294 * ubifs_calc_available - calculate available FS space.
295 * @c: UBIFS file-system description object
296 * @min_idx_lebs: minimum number of LEBs reserved for the index
297 *
298 * This function calculates and returns amount of FS space available for use.
299 */
300long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
301{
302 int subtract_lebs;
303 long long available;
304
305 /*
306 * Force the amount available to the total size reported if the used
307 * space is zero.
308 */
309 if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
310 c->budg_data_growth + c->budg_dd_growth == 0) {
311 /* Do the same calculation as for c->block_cnt */
312 available = c->main_lebs - 2;
313 available *= c->leb_size - c->dark_wm;
314 return available;
315 }
316
317 available = c->main_bytes - c->lst.total_used;
318
319 /*
320 * Now 'available' contains theoretically available flash space
321 * assuming there is no index, so we have to subtract the space which
322 * is reserved for the index.
323 */
324 subtract_lebs = min_idx_lebs;
325
326 /* Take into account that GC reserves one LEB for its own needs */
327 subtract_lebs += 1;
328
329 /*
330 * The GC journal head LEB is not really accessible. And since
331 * different write types go to different heads, we may count only on
332 * one head's space.
333 */
334 subtract_lebs += c->jhead_cnt - 1;
335
336 /* We also reserve one LEB for deletions, which bypass budgeting */
337 subtract_lebs += 1;
338
339 available -= (long long)subtract_lebs * c->leb_size;
340
341 /* Subtract the dead space which is not available for use */
342 available -= c->lst.total_dead;
343
344 /*
345 * Subtract dark space, which might or might not be usable - it depends
346 * on the data which we have on the media and which will be written. If
347 * this is a lot of uncompressed or not-compressible data, the dark
348 * space cannot be used.
349 */
350 available -= c->lst.total_dark;
351
352 /*
353 * However, there is more dark space. The index may be bigger than
354 * @min_idx_lebs. Those extra LEBs are assumed to be available, but
355 * their dark space is not included in total_dark, so it is subtracted
356 * here.
357 */
358 if (c->lst.idx_lebs > min_idx_lebs) {
359 subtract_lebs = c->lst.idx_lebs - min_idx_lebs;
360 available -= subtract_lebs * c->dark_wm;
361 }
362
363 /* The calculations are rough and may end up with a negative number */
364 return available > 0 ? available : 0;
365}
366
367/**
368 * can_use_rp - check whether the user is allowed to use reserved pool.
369 * @c: UBIFS file-system description object
370 *
371 * UBIFS has so-called "reserved pool" which is flash space reserved
372 * for the superuser and for users whose UID/GID is recorded in UBIFS superblock.
373 * This function checks whether current user is allowed to use reserved pool.
374 * Returns %1 if current user is allowed to use reserved pool and %0 otherwise.
375 */
376static int can_use_rp(struct ubifs_info *c)
377{
378 if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) ||
379 (c->rp_gid != 0 && in_group_p(c->rp_gid)))
380 return 1;
381 return 0;
382}
383
384/**
385 * do_budget_space - reserve flash space for index and data growth.
386 * @c: UBIFS file-system description object
387 *
388 * This function makes sure UBIFS has enough free eraseblocks for index growth
389 * and data.
390 *
391 * When budgeting index space, UBIFS reserves twice as many LEBs as the index
392 * would take if it was consolidated and written to the flash. This guarantees
393 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
394 * be able to commit dirty index. So this function basically adds amount of
395 * budgeted index space to the size of the current index, multiplies this by 2,
396 * and makes sure this does not exceed the amount of free eraseblocks.
397 *
398 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
399 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
400 * be large, because UBIFS does not do any index consolidation as long as
401 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
402 * will contain a lot of dirt.
403 * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
404 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
405 *
406 * This function returns zero in case of success, and %-ENOSPC in case of
407 * failure.
408 */
409static int do_budget_space(struct ubifs_info *c)
410{
411 long long outstanding, available;
412 int lebs, rsvd_idx_lebs, min_idx_lebs;
413
414 /* First budget index space */
415 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
416
417 /* Now 'min_idx_lebs' contains number of LEBs to reserve */
418 if (min_idx_lebs > c->lst.idx_lebs)
419 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
420 else
421 rsvd_idx_lebs = 0;
422
423 /*
424 * The number of LEBs that are available to be used by the index is:
425 *
426 * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
427 * @c->lst.taken_empty_lebs
428 *
429 * @empty_lebs are available because they are empty. @freeable_cnt are
430 * available because they contain only free and dirty space and the
431 * index allocation always occurs after wbufs are synch'ed.
432 * @idx_gc_cnt are available because they are index LEBs that have been
433 * garbage collected (including trivial GC) and are awaiting the commit
434 * before they can be unmapped - note that the in-the-gaps method will
435 * grab these if it needs them. @taken_empty_lebs are empty_lebs that
436 * have already been allocated for some purpose (also includes those
437 * LEBs on the @idx_gc list).
438 *
439 * Note, @taken_empty_lebs may temporarily be higher by one because of
440 * the way we serialize LEB allocations and budgeting. See a comment in
441 * 'ubifs_find_free_space()'.
442 */
443 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
444 c->lst.taken_empty_lebs;
445 if (unlikely(rsvd_idx_lebs > lebs)) {
446 dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
447 "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
448 rsvd_idx_lebs);
449 return -ENOSPC;
450 }
451
452 available = ubifs_calc_available(c, min_idx_lebs);
453 outstanding = c->budg_data_growth + c->budg_dd_growth;
454
455 if (unlikely(available < outstanding)) {
456 dbg_budg("out of data space: available %lld, outstanding %lld",
457 available, outstanding);
458 return -ENOSPC;
459 }
460
461 if (available - outstanding <= c->rp_size && !can_use_rp(c))
462 return -ENOSPC;
463
464 c->min_idx_lebs = min_idx_lebs;
465 return 0;
466}
467
468/**
469 * calc_idx_growth - calculate approximate index growth from budgeting request.
470 * @c: UBIFS file-system description object
471 * @req: budgeting request
472 *
473 * For now we assume each new node adds one znode. This is a rather poor
474 * approximation, though.
475 */
476static int calc_idx_growth(const struct ubifs_info *c,
477 const struct ubifs_budget_req *req)
478{
479 int znodes;
480
481 znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) +
482 req->new_dent;
483 return znodes * c->max_idx_node_sz;
484}
485
486/**
487 * calc_data_growth - calculate approximate amount of new data from budgeting
488 * request.
489 * @c: UBIFS file-system description object
490 * @req: budgeting request
491 */
492static int calc_data_growth(const struct ubifs_info *c,
493 const struct ubifs_budget_req *req)
494{
495 int data_growth;
496
497 data_growth = req->new_ino ? c->inode_budget : 0;
498 if (req->new_page)
499 data_growth += c->page_budget;
500 if (req->new_dent)
501 data_growth += c->dent_budget;
502 data_growth += req->new_ino_d;
503 return data_growth;
504}
505
506/**
507 * calc_dd_growth - calculate approximate amount of data which makes other data
508 * dirty from budgeting request.
509 * @c: UBIFS file-system description object
510 * @req: budgeting request
511 */
512static int calc_dd_growth(const struct ubifs_info *c,
513 const struct ubifs_budget_req *req)
514{
515 int dd_growth;
516
517 dd_growth = req->dirtied_page ? c->page_budget : 0;
518
519 if (req->dirtied_ino)
520 dd_growth += c->inode_budget << (req->dirtied_ino - 1);
521 if (req->mod_dent)
522 dd_growth += c->dent_budget;
523 dd_growth += req->dirtied_ino_d;
524 return dd_growth;
525}
526
527/**
528 * ubifs_budget_space - ensure there is enough space to complete an operation.
529 * @c: UBIFS file-system description object
530 * @req: budget request
531 *
532 * This function allocates budget for an operation. It uses pessimistic
533 * approximation of how much flash space the operation needs. The goal of this
534 * function is to make sure UBIFS always has flash space to flush all dirty
535 * pages, dirty inodes, and dirty znodes (liability). This function may force
536 * commit, garbage-collection or write-back. Returns zero in case of success,
537 * %-ENOSPC if there is no free space and other negative error codes in case of
538 * failures.
539 */
540int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
541{
542 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
543 int err, idx_growth, data_growth, dd_growth;
544 struct retries_info ri;
545
546 ubifs_assert(req->dirtied_ino <= 4);
547 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
548
549 data_growth = calc_data_growth(c, req);
550 dd_growth = calc_dd_growth(c, req);
551 if (!data_growth && !dd_growth)
552 return 0;
553 idx_growth = calc_idx_growth(c, req);
554 memset(&ri, 0, sizeof(struct retries_info));
555
556again:
557 spin_lock(&c->space_lock);
558 ubifs_assert(c->budg_idx_growth >= 0);
559 ubifs_assert(c->budg_data_growth >= 0);
560 ubifs_assert(c->budg_dd_growth >= 0);
561
562 if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
563 dbg_budg("no space");
564 spin_unlock(&c->space_lock);
565 return -ENOSPC;
566 }
567
568 c->budg_idx_growth += idx_growth;
569 c->budg_data_growth += data_growth;
570 c->budg_dd_growth += dd_growth;
571
572 err = do_budget_space(c);
573 if (likely(!err)) {
574 req->idx_growth = idx_growth;
575 req->data_growth = data_growth;
576 req->dd_growth = dd_growth;
577 spin_unlock(&c->space_lock);
578 return 0;
579 }
580
581 /* Restore the old values */
582 c->budg_idx_growth -= idx_growth;
583 c->budg_data_growth -= data_growth;
584 c->budg_dd_growth -= dd_growth;
585 spin_unlock(&c->space_lock);
586
587 if (req->fast) {
588 dbg_budg("no space for fast budgeting");
589 return err;
590 }
591
592 err = make_free_space(c, &ri);
593 if (err == -EAGAIN) {
594 dbg_budg("try again");
595 cond_resched();
596 goto again;
597 } else if (err == -ENOSPC) {
598 dbg_budg("FS is full, -ENOSPC");
599 c->nospace = 1;
600 if (can_use_rp(c) || c->rp_size == 0)
601 c->nospace_rp = 1;
602 smp_wmb();
603 } else
604 ubifs_err("cannot budget space, error %d", err);
605 return err;
606}
607
608/**
609 * ubifs_release_budget - release budgeted free space.
610 * @c: UBIFS file-system description object
611 * @req: budget request
612 *
613 * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
614 * since the index changes (which were budgeted for in @req->idx_growth) will
615 * only be written to the media on commit, this function moves the index budget
616 * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
617 * zeroed by the commit operation.
618 */
619void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
620{
621 ubifs_assert(req->dirtied_ino <= 4);
622 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
623 if (!req->recalculate) {
624 ubifs_assert(req->idx_growth >= 0);
625 ubifs_assert(req->data_growth >= 0);
626 ubifs_assert(req->dd_growth >= 0);
627 }
628
629 if (req->recalculate) {
630 req->data_growth = calc_data_growth(c, req);
631 req->dd_growth = calc_dd_growth(c, req);
632 req->idx_growth = calc_idx_growth(c, req);
633 }
634
635 if (!req->data_growth && !req->dd_growth)
636 return;
637
638 c->nospace = c->nospace_rp = 0;
639 smp_wmb();
640
641 spin_lock(&c->space_lock);
642 c->budg_idx_growth -= req->idx_growth;
643 c->budg_uncommitted_idx += req->idx_growth;
644 c->budg_data_growth -= req->data_growth;
645 c->budg_dd_growth -= req->dd_growth;
646 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
647
648 ubifs_assert(c->budg_idx_growth >= 0);
649 ubifs_assert(c->budg_data_growth >= 0);
650 ubifs_assert(c->min_idx_lebs < c->main_lebs);
651 spin_unlock(&c->space_lock);
652}
653
654/**
655 * ubifs_convert_page_budget - convert budget of a new page.
656 * @c: UBIFS file-system description object
657 *
658 * This function converts budget which was allocated for a new page of data to
659 * the budget of changing an existing page of data. The latter is smaller then
660 * the former, so this function only does simple re-calculation and does not
661 * involve any write-back.
662 */
663void ubifs_convert_page_budget(struct ubifs_info *c)
664{
665 spin_lock(&c->space_lock);
666 /* Release the index growth reservation */
667 c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
668 /* Release the data growth reservation */
669 c->budg_data_growth -= c->page_budget;
670 /* Increase the dirty data growth reservation instead */
671 c->budg_dd_growth += c->page_budget;
672 /* And re-calculate the indexing space reservation */
673 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
674 spin_unlock(&c->space_lock);
675}
676
677/**
678 * ubifs_release_dirty_inode_budget - release dirty inode budget.
679 * @c: UBIFS file-system description object
680 * @ui: UBIFS inode to release the budget for
681 *
682 * This function releases budget corresponding to a dirty inode. It is usually
683 * called when after the inode has been written to the media and marked as
684 * clean.
685 */
686void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
687 struct ubifs_inode *ui)
688{
689 struct ubifs_budget_req req = {.dd_growth = c->inode_budget,
690 .dirtied_ino_d = ui->data_len};
691
692 ubifs_release_budget(c, &req);
693}
694
695/**
696 * ubifs_budg_get_free_space - return amount of free space.
697 * @c: UBIFS file-system description object
698 *
699 * This function returns amount of free space on the file-system.
700 */
701long long ubifs_budg_get_free_space(struct ubifs_info *c)
702{
703 int min_idx_lebs, rsvd_idx_lebs;
704 long long available, outstanding, free;
705
706 /* Do exactly the same calculations as in 'do_budget_space()' */
707 spin_lock(&c->space_lock);
708 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
709
710 if (min_idx_lebs > c->lst.idx_lebs)
711 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
712 else
713 rsvd_idx_lebs = 0;
714
715 if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
716 - c->lst.taken_empty_lebs) {
717 spin_unlock(&c->space_lock);
718 return 0;
719 }
720
721 available = ubifs_calc_available(c, min_idx_lebs);
722 outstanding = c->budg_data_growth + c->budg_dd_growth;
723 c->min_idx_lebs = min_idx_lebs;
724 spin_unlock(&c->space_lock);
725
726 if (available > outstanding)
727 free = ubifs_reported_space(c, available - outstanding);
728 else
729 free = 0;
730 return free;
731}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
new file mode 100644
index 000000000000..3b516316c9b3
--- /dev/null
+++ b/fs/ubifs/commit.c
@@ -0,0 +1,677 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements functions that manage the running of the commit process.
25 * Each affected module has its own functions to accomplish their part in the
26 * commit and those functions are called here.
27 *
28 * The commit is the process whereby all updates to the index and LEB properties
29 * are written out together and the journal becomes empty. This keeps the
30 * file system consistent - at all times the state can be recreated by reading
31 * the index and LEB properties and then replaying the journal.
32 *
33 * The commit is split into two parts named "commit start" and "commit end".
34 * During commit start, the commit process has exclusive access to the journal
35 * by holding the commit semaphore down for writing. As few I/O operations as
36 * possible are performed during commit start, instead the nodes that are to be
37 * written are merely identified. During commit end, the commit semaphore is no
38 * longer held and the journal is again in operation, allowing users to continue
39 * to use the file system while the bulk of the commit I/O is performed. The
40 * purpose of this two-step approach is to prevent the commit from causing any
41 * latency blips. Note that in any case, the commit does not prevent lookups
42 * (as permitted by the TNC mutex), or access to VFS data structures e.g. page
43 * cache.
44 */
45
46#include <linux/freezer.h>
47#include <linux/kthread.h>
48#include "ubifs.h"
49
50/**
51 * do_commit - commit the journal.
52 * @c: UBIFS file-system description object
53 *
54 * This function implements UBIFS commit. It has to be called with commit lock
55 * locked. Returns zero in case of success and a negative error code in case of
56 * failure.
57 */
58static int do_commit(struct ubifs_info *c)
59{
60 int err, new_ltail_lnum, old_ltail_lnum, i;
61 struct ubifs_zbranch zroot;
62 struct ubifs_lp_stats lst;
63
64 dbg_cmt("start");
65 if (c->ro_media) {
66 err = -EROFS;
67 goto out_up;
68 }
69
70 /* Sync all write buffers (necessary for recovery) */
71 for (i = 0; i < c->jhead_cnt; i++) {
72 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
73 if (err)
74 goto out_up;
75 }
76
77 err = ubifs_gc_start_commit(c);
78 if (err)
79 goto out_up;
80 err = dbg_check_lprops(c);
81 if (err)
82 goto out_up;
83 err = ubifs_log_start_commit(c, &new_ltail_lnum);
84 if (err)
85 goto out_up;
86 err = ubifs_tnc_start_commit(c, &zroot);
87 if (err)
88 goto out_up;
89 err = ubifs_lpt_start_commit(c);
90 if (err)
91 goto out_up;
92 err = ubifs_orphan_start_commit(c);
93 if (err)
94 goto out_up;
95
96 ubifs_get_lp_stats(c, &lst);
97
98 up_write(&c->commit_sem);
99
100 err = ubifs_tnc_end_commit(c);
101 if (err)
102 goto out;
103 err = ubifs_lpt_end_commit(c);
104 if (err)
105 goto out;
106 err = ubifs_orphan_end_commit(c);
107 if (err)
108 goto out;
109 old_ltail_lnum = c->ltail_lnum;
110 err = ubifs_log_end_commit(c, new_ltail_lnum);
111 if (err)
112 goto out;
113 err = dbg_check_old_index(c, &zroot);
114 if (err)
115 goto out;
116
117 mutex_lock(&c->mst_mutex);
118 c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no);
119 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
120 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
121 c->mst_node->root_offs = cpu_to_le32(zroot.offs);
122 c->mst_node->root_len = cpu_to_le32(zroot.len);
123 c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
124 c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
125 c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
126 c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
127 c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
128 c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
129 c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs);
130 c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum);
131 c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs);
132 c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum);
133 c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs);
134 c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum);
135 c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs);
136 c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs);
137 c->mst_node->total_free = cpu_to_le64(lst.total_free);
138 c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty);
139 c->mst_node->total_used = cpu_to_le64(lst.total_used);
140 c->mst_node->total_dead = cpu_to_le64(lst.total_dead);
141 c->mst_node->total_dark = cpu_to_le64(lst.total_dark);
142 if (c->no_orphs)
143 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
144 else
145 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
146 err = ubifs_write_master(c);
147 mutex_unlock(&c->mst_mutex);
148 if (err)
149 goto out;
150
151 err = ubifs_log_post_commit(c, old_ltail_lnum);
152 if (err)
153 goto out;
154 err = ubifs_gc_end_commit(c);
155 if (err)
156 goto out;
157 err = ubifs_lpt_post_commit(c);
158 if (err)
159 goto out;
160
161 spin_lock(&c->cs_lock);
162 c->cmt_state = COMMIT_RESTING;
163 wake_up(&c->cmt_wq);
164 dbg_cmt("commit end");
165 spin_unlock(&c->cs_lock);
166
167 return 0;
168
169out_up:
170 up_write(&c->commit_sem);
171out:
172 ubifs_err("commit failed, error %d", err);
173 spin_lock(&c->cs_lock);
174 c->cmt_state = COMMIT_BROKEN;
175 wake_up(&c->cmt_wq);
176 spin_unlock(&c->cs_lock);
177 ubifs_ro_mode(c, err);
178 return err;
179}
180
181/**
182 * run_bg_commit - run background commit if it is needed.
183 * @c: UBIFS file-system description object
184 *
185 * This function runs background commit if it is needed. Returns zero in case
186 * of success and a negative error code in case of failure.
187 */
188static int run_bg_commit(struct ubifs_info *c)
189{
190 spin_lock(&c->cs_lock);
191 /*
192 * Run background commit only if background commit was requested or if
193 * commit is required.
194 */
195 if (c->cmt_state != COMMIT_BACKGROUND &&
196 c->cmt_state != COMMIT_REQUIRED)
197 goto out;
198 spin_unlock(&c->cs_lock);
199
200 down_write(&c->commit_sem);
201 spin_lock(&c->cs_lock);
202 if (c->cmt_state == COMMIT_REQUIRED)
203 c->cmt_state = COMMIT_RUNNING_REQUIRED;
204 else if (c->cmt_state == COMMIT_BACKGROUND)
205 c->cmt_state = COMMIT_RUNNING_BACKGROUND;
206 else
207 goto out_cmt_unlock;
208 spin_unlock(&c->cs_lock);
209
210 return do_commit(c);
211
212out_cmt_unlock:
213 up_write(&c->commit_sem);
214out:
215 spin_unlock(&c->cs_lock);
216 return 0;
217}
218
219/**
220 * ubifs_bg_thread - UBIFS background thread function.
221 * @info: points to the file-system description object
222 *
223 * This function implements various file-system background activities:
224 * o when a write-buffer timer expires it synchronizes the appropriate
225 * write-buffer;
226 * o when the journal is about to be full, it starts in-advance commit.
227 *
228 * Note, other stuff like background garbage collection may be added here in
229 * future.
230 */
231int ubifs_bg_thread(void *info)
232{
233 int err;
234 struct ubifs_info *c = info;
235
236 ubifs_msg("background thread \"%s\" started, PID %d",
237 c->bgt_name, current->pid);
238 set_freezable();
239
240 while (1) {
241 if (kthread_should_stop())
242 break;
243
244 if (try_to_freeze())
245 continue;
246
247 set_current_state(TASK_INTERRUPTIBLE);
248 /* Check if there is something to do */
249 if (!c->need_bgt) {
250 /*
251 * Nothing prevents us from going sleep now and
252 * be never woken up and block the task which
253 * could wait in 'kthread_stop()' forever.
254 */
255 if (kthread_should_stop())
256 break;
257 schedule();
258 continue;
259 } else
260 __set_current_state(TASK_RUNNING);
261
262 c->need_bgt = 0;
263 err = ubifs_bg_wbufs_sync(c);
264 if (err)
265 ubifs_ro_mode(c, err);
266
267 run_bg_commit(c);
268 cond_resched();
269 }
270
271 dbg_msg("background thread \"%s\" stops", c->bgt_name);
272 return 0;
273}
274
275/**
276 * ubifs_commit_required - set commit state to "required".
277 * @c: UBIFS file-system description object
278 *
279 * This function is called if a commit is required but cannot be done from the
280 * calling function, so it is just flagged instead.
281 */
282void ubifs_commit_required(struct ubifs_info *c)
283{
284 spin_lock(&c->cs_lock);
285 switch (c->cmt_state) {
286 case COMMIT_RESTING:
287 case COMMIT_BACKGROUND:
288 dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
289 dbg_cstate(COMMIT_REQUIRED));
290 c->cmt_state = COMMIT_REQUIRED;
291 break;
292 case COMMIT_RUNNING_BACKGROUND:
293 dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
294 dbg_cstate(COMMIT_RUNNING_REQUIRED));
295 c->cmt_state = COMMIT_RUNNING_REQUIRED;
296 break;
297 case COMMIT_REQUIRED:
298 case COMMIT_RUNNING_REQUIRED:
299 case COMMIT_BROKEN:
300 break;
301 }
302 spin_unlock(&c->cs_lock);
303}
304
305/**
306 * ubifs_request_bg_commit - notify the background thread to do a commit.
307 * @c: UBIFS file-system description object
308 *
309 * This function is called if the journal is full enough to make a commit
310 * worthwhile, so background thread is kicked to start it.
311 */
312void ubifs_request_bg_commit(struct ubifs_info *c)
313{
314 spin_lock(&c->cs_lock);
315 if (c->cmt_state == COMMIT_RESTING) {
316 dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
317 dbg_cstate(COMMIT_BACKGROUND));
318 c->cmt_state = COMMIT_BACKGROUND;
319 spin_unlock(&c->cs_lock);
320 ubifs_wake_up_bgt(c);
321 } else
322 spin_unlock(&c->cs_lock);
323}
324
325/**
326 * wait_for_commit - wait for commit.
327 * @c: UBIFS file-system description object
328 *
329 * This function sleeps until the commit operation is no longer running.
330 */
331static int wait_for_commit(struct ubifs_info *c)
332{
333 dbg_cmt("pid %d goes sleep", current->pid);
334
335 /*
336 * The following sleeps if the condition is false, and will be woken
337 * when the commit ends. It is possible, although very unlikely, that we
338 * will wake up and see the subsequent commit running, rather than the
339 * one we were waiting for, and go back to sleep. However, we will be
340 * woken again, so there is no danger of sleeping forever.
341 */
342 wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND &&
343 c->cmt_state != COMMIT_RUNNING_REQUIRED);
344 dbg_cmt("commit finished, pid %d woke up", current->pid);
345 return 0;
346}
347
348/**
349 * ubifs_run_commit - run or wait for commit.
350 * @c: UBIFS file-system description object
351 *
352 * This function runs commit and returns zero in case of success and a negative
353 * error code in case of failure.
354 */
355int ubifs_run_commit(struct ubifs_info *c)
356{
357 int err = 0;
358
359 spin_lock(&c->cs_lock);
360 if (c->cmt_state == COMMIT_BROKEN) {
361 err = -EINVAL;
362 goto out;
363 }
364
365 if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
366 /*
367 * We set the commit state to 'running required' to indicate
368 * that we want it to complete as quickly as possible.
369 */
370 c->cmt_state = COMMIT_RUNNING_REQUIRED;
371
372 if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
373 spin_unlock(&c->cs_lock);
374 return wait_for_commit(c);
375 }
376 spin_unlock(&c->cs_lock);
377
378 /* Ok, the commit is indeed needed */
379
380 down_write(&c->commit_sem);
381 spin_lock(&c->cs_lock);
382 /*
383 * Since we unlocked 'c->cs_lock', the state may have changed, so
384 * re-check it.
385 */
386 if (c->cmt_state == COMMIT_BROKEN) {
387 err = -EINVAL;
388 goto out_cmt_unlock;
389 }
390
391 if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
392 c->cmt_state = COMMIT_RUNNING_REQUIRED;
393
394 if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
395 up_write(&c->commit_sem);
396 spin_unlock(&c->cs_lock);
397 return wait_for_commit(c);
398 }
399 c->cmt_state = COMMIT_RUNNING_REQUIRED;
400 spin_unlock(&c->cs_lock);
401
402 err = do_commit(c);
403 return err;
404
405out_cmt_unlock:
406 up_write(&c->commit_sem);
407out:
408 spin_unlock(&c->cs_lock);
409 return err;
410}
411
412/**
413 * ubifs_gc_should_commit - determine if it is time for GC to run commit.
414 * @c: UBIFS file-system description object
415 *
416 * This function is called by garbage collection to determine if commit should
417 * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal
418 * is full enough to start commit, this function returns true. It is not
419 * absolutely necessary to commit yet, but it feels like this should be better
420 * then to keep doing GC. This function returns %1 if GC has to initiate commit
421 * and %0 if not.
422 */
423int ubifs_gc_should_commit(struct ubifs_info *c)
424{
425 int ret = 0;
426
427 spin_lock(&c->cs_lock);
428 if (c->cmt_state == COMMIT_BACKGROUND) {
429 dbg_cmt("commit required now");
430 c->cmt_state = COMMIT_REQUIRED;
431 } else
432 dbg_cmt("commit not requested");
433 if (c->cmt_state == COMMIT_REQUIRED)
434 ret = 1;
435 spin_unlock(&c->cs_lock);
436 return ret;
437}
438
439#ifdef CONFIG_UBIFS_FS_DEBUG
440
441/**
442 * struct idx_node - hold index nodes during index tree traversal.
443 * @list: list
444 * @iip: index in parent (slot number of this indexing node in the parent
445 * indexing node)
446 * @upper_key: all keys in this indexing node have to be less or equivalent to
447 * this key
448 * @idx: index node (8-byte aligned because all node structures must be 8-byte
449 * aligned)
450 */
451struct idx_node {
452 struct list_head list;
453 int iip;
454 union ubifs_key upper_key;
455 struct ubifs_idx_node idx __attribute__((aligned(8)));
456};
457
458/**
459 * dbg_old_index_check_init - get information for the next old index check.
460 * @c: UBIFS file-system description object
461 * @zroot: root of the index
462 *
463 * This function records information about the index that will be needed for the
464 * next old index check i.e. 'dbg_check_old_index()'.
465 *
466 * This function returns %0 on success and a negative error code on failure.
467 */
468int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
469{
470 struct ubifs_idx_node *idx;
471 int lnum, offs, len, err = 0;
472
473 c->old_zroot = *zroot;
474
475 lnum = c->old_zroot.lnum;
476 offs = c->old_zroot.offs;
477 len = c->old_zroot.len;
478
479 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
480 if (!idx)
481 return -ENOMEM;
482
483 err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
484 if (err)
485 goto out;
486
487 c->old_zroot_level = le16_to_cpu(idx->level);
488 c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
489out:
490 kfree(idx);
491 return err;
492}
493
494/**
495 * dbg_check_old_index - check the old copy of the index.
496 * @c: UBIFS file-system description object
497 * @zroot: root of the new index
498 *
499 * In order to be able to recover from an unclean unmount, a complete copy of
500 * the index must exist on flash. This is the "old" index. The commit process
501 * must write the "new" index to flash without overwriting or destroying any
502 * part of the old index. This function is run at commit end in order to check
503 * that the old index does indeed exist completely intact.
504 *
505 * This function returns %0 on success and a negative error code on failure.
506 */
507int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
508{
509 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
510 int first = 1, iip;
511 union ubifs_key lower_key, upper_key, l_key, u_key;
512 unsigned long long uninitialized_var(last_sqnum);
513 struct ubifs_idx_node *idx;
514 struct list_head list;
515 struct idx_node *i;
516 size_t sz;
517
518 if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
519 goto out;
520
521 INIT_LIST_HEAD(&list);
522
523 sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) -
524 UBIFS_IDX_NODE_SZ;
525
526 /* Start at the old zroot */
527 lnum = c->old_zroot.lnum;
528 offs = c->old_zroot.offs;
529 len = c->old_zroot.len;
530 iip = 0;
531
532 /*
533 * Traverse the index tree preorder depth-first i.e. do a node and then
534 * its subtrees from left to right.
535 */
536 while (1) {
537 struct ubifs_branch *br;
538
539 /* Get the next index node */
540 i = kmalloc(sz, GFP_NOFS);
541 if (!i) {
542 err = -ENOMEM;
543 goto out_free;
544 }
545 i->iip = iip;
546 /* Keep the index nodes on our path in a linked list */
547 list_add_tail(&i->list, &list);
548 /* Read the index node */
549 idx = &i->idx;
550 err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
551 if (err)
552 goto out_free;
553 /* Validate index node */
554 child_cnt = le16_to_cpu(idx->child_cnt);
555 if (child_cnt < 1 || child_cnt > c->fanout) {
556 err = 1;
557 goto out_dump;
558 }
559 if (first) {
560 first = 0;
561 /* Check root level and sqnum */
562 if (le16_to_cpu(idx->level) != c->old_zroot_level) {
563 err = 2;
564 goto out_dump;
565 }
566 if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
567 err = 3;
568 goto out_dump;
569 }
570 /* Set last values as though root had a parent */
571 last_level = le16_to_cpu(idx->level) + 1;
572 last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1;
573 key_read(c, ubifs_idx_key(c, idx), &lower_key);
574 highest_ino_key(c, &upper_key, INUM_WATERMARK);
575 }
576 key_copy(c, &upper_key, &i->upper_key);
577 if (le16_to_cpu(idx->level) != last_level - 1) {
578 err = 3;
579 goto out_dump;
580 }
581 /*
582 * The index is always written bottom up hence a child's sqnum
583 * is always less than the parents.
584 */
585 if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) {
586 err = 4;
587 goto out_dump;
588 }
589 /* Check key range */
590 key_read(c, ubifs_idx_key(c, idx), &l_key);
591 br = ubifs_idx_branch(c, idx, child_cnt - 1);
592 key_read(c, &br->key, &u_key);
593 if (keys_cmp(c, &lower_key, &l_key) > 0) {
594 err = 5;
595 goto out_dump;
596 }
597 if (keys_cmp(c, &upper_key, &u_key) < 0) {
598 err = 6;
599 goto out_dump;
600 }
601 if (keys_cmp(c, &upper_key, &u_key) == 0)
602 if (!is_hash_key(c, &u_key)) {
603 err = 7;
604 goto out_dump;
605 }
606 /* Go to next index node */
607 if (le16_to_cpu(idx->level) == 0) {
608 /* At the bottom, so go up until can go right */
609 while (1) {
610 /* Drop the bottom of the list */
611 list_del(&i->list);
612 kfree(i);
613 /* No more list means we are done */
614 if (list_empty(&list))
615 goto out;
616 /* Look at the new bottom */
617 i = list_entry(list.prev, struct idx_node,
618 list);
619 idx = &i->idx;
620 /* Can we go right */
621 if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
622 iip = iip + 1;
623 break;
624 } else
625 /* Nope, so go up again */
626 iip = i->iip;
627 }
628 } else
629 /* Go down left */
630 iip = 0;
631 /*
632 * We have the parent in 'idx' and now we set up for reading the
633 * child pointed to by slot 'iip'.
634 */
635 last_level = le16_to_cpu(idx->level);
636 last_sqnum = le64_to_cpu(idx->ch.sqnum);
637 br = ubifs_idx_branch(c, idx, iip);
638 lnum = le32_to_cpu(br->lnum);
639 offs = le32_to_cpu(br->offs);
640 len = le32_to_cpu(br->len);
641 key_read(c, &br->key, &lower_key);
642 if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
643 br = ubifs_idx_branch(c, idx, iip + 1);
644 key_read(c, &br->key, &upper_key);
645 } else
646 key_copy(c, &i->upper_key, &upper_key);
647 }
648out:
649 err = dbg_old_index_check_init(c, zroot);
650 if (err)
651 goto out_free;
652
653 return 0;
654
655out_dump:
656 dbg_err("dumping index node (iip=%d)", i->iip);
657 dbg_dump_node(c, idx);
658 list_del(&i->list);
659 kfree(i);
660 if (!list_empty(&list)) {
661 i = list_entry(list.prev, struct idx_node, list);
662 dbg_err("dumping parent index node");
663 dbg_dump_node(c, &i->idx);
664 }
665out_free:
666 while (!list_empty(&list)) {
667 i = list_entry(list.next, struct idx_node, list);
668 list_del(&i->list);
669 kfree(i);
670 }
671 ubifs_err("failed, error %d", err);
672 if (err > 0)
673 err = -EINVAL;
674 return err;
675}
676
677#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
new file mode 100644
index 000000000000..5bb51dac3c16
--- /dev/null
+++ b/fs/ubifs/compress.c
@@ -0,0 +1,253 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 * Copyright (C) 2006, 2007 University of Szeged, Hungary
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published by
9 * the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Authors: Adrian Hunter
21 * Artem Bityutskiy (Битюцкий Артём)
22 * Zoltan Sogor
23 */
24
25/*
26 * This file provides a single place to access compression and
27 * decompression.
28 */
29
30#include <linux/crypto.h>
31#include "ubifs.h"
32
33/* Fake description object for the "none" compressor */
34static struct ubifs_compressor none_compr = {
35 .compr_type = UBIFS_COMPR_NONE,
36 .name = "no compression",
37 .capi_name = "",
38};
39
40#ifdef CONFIG_UBIFS_FS_LZO
41static DEFINE_MUTEX(lzo_mutex);
42
43static struct ubifs_compressor lzo_compr = {
44 .compr_type = UBIFS_COMPR_LZO,
45 .comp_mutex = &lzo_mutex,
46 .name = "LZO",
47 .capi_name = "lzo",
48};
49#else
50static struct ubifs_compressor lzo_compr = {
51 .compr_type = UBIFS_COMPR_LZO,
52 .name = "LZO",
53};
54#endif
55
56#ifdef CONFIG_UBIFS_FS_ZLIB
57static DEFINE_MUTEX(deflate_mutex);
58static DEFINE_MUTEX(inflate_mutex);
59
60static struct ubifs_compressor zlib_compr = {
61 .compr_type = UBIFS_COMPR_ZLIB,
62 .comp_mutex = &deflate_mutex,
63 .decomp_mutex = &inflate_mutex,
64 .name = "zlib",
65 .capi_name = "deflate",
66};
67#else
68static struct ubifs_compressor zlib_compr = {
69 .compr_type = UBIFS_COMPR_ZLIB,
70 .name = "zlib",
71};
72#endif
73
74/* All UBIFS compressors */
75struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
76
77/**
78 * ubifs_compress - compress data.
79 * @in_buf: data to compress
80 * @in_len: length of the data to compress
81 * @out_buf: output buffer where compressed data should be stored
82 * @out_len: output buffer length is returned here
83 * @compr_type: type of compression to use on enter, actually used compression
84 * type on exit
85 *
86 * This function compresses input buffer @in_buf of length @in_len and stores
87 * the result in the output buffer @out_buf and the resulting length in
88 * @out_len. If the input buffer does not compress, it is just copied to the
89 * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if
90 * compression error occurred.
91 *
92 * Note, if the input buffer was not compressed, it is copied to the output
93 * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
94 *
95 * This functions returns %0 on success or a negative error code on failure.
96 */
97void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
98 int *compr_type)
99{
100 int err;
101 struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
102
103 if (*compr_type == UBIFS_COMPR_NONE)
104 goto no_compr;
105
106 /* If the input data is small, do not even try to compress it */
107 if (in_len < UBIFS_MIN_COMPR_LEN)
108 goto no_compr;
109
110 if (compr->comp_mutex)
111 mutex_lock(compr->comp_mutex);
112 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
113 out_len);
114 if (compr->comp_mutex)
115 mutex_unlock(compr->comp_mutex);
116 if (unlikely(err)) {
117 ubifs_warn("cannot compress %d bytes, compressor %s, "
118 "error %d, leave data uncompressed",
119 in_len, compr->name, err);
120 goto no_compr;
121 }
122
123 /*
124 * Presently, we just require that compression results in less data,
125 * rather than any defined minimum compression ratio or amount.
126 */
127 if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
128 goto no_compr;
129
130 return;
131
132no_compr:
133 memcpy(out_buf, in_buf, in_len);
134 *out_len = in_len;
135 *compr_type = UBIFS_COMPR_NONE;
136}
137
138/**
139 * ubifs_decompress - decompress data.
140 * @in_buf: data to decompress
141 * @in_len: length of the data to decompress
142 * @out_buf: output buffer where decompressed data should
143 * @out_len: output length is returned here
144 * @compr_type: type of compression
145 *
146 * This function decompresses data from buffer @in_buf into buffer @out_buf.
147 * The length of the uncompressed data is returned in @out_len. This functions
148 * returns %0 on success or a negative error code on failure.
149 */
150int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
151 int *out_len, int compr_type)
152{
153 int err;
154 struct ubifs_compressor *compr;
155
156 if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
157 ubifs_err("invalid compression type %d", compr_type);
158 return -EINVAL;
159 }
160
161 compr = ubifs_compressors[compr_type];
162
163 if (unlikely(!compr->capi_name)) {
164 ubifs_err("%s compression is not compiled in", compr->name);
165 return -EINVAL;
166 }
167
168 if (compr_type == UBIFS_COMPR_NONE) {
169 memcpy(out_buf, in_buf, in_len);
170 *out_len = in_len;
171 return 0;
172 }
173
174 if (compr->decomp_mutex)
175 mutex_lock(compr->decomp_mutex);
176 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
177 out_len);
178 if (compr->decomp_mutex)
179 mutex_unlock(compr->decomp_mutex);
180 if (err)
181 ubifs_err("cannot decompress %d bytes, compressor %s, "
182 "error %d", in_len, compr->name, err);
183
184 return err;
185}
186
187/**
188 * compr_init - initialize a compressor.
189 * @compr: compressor description object
190 *
191 * This function initializes the requested compressor and returns zero in case
192 * of success or a negative error code in case of failure.
193 */
194static int __init compr_init(struct ubifs_compressor *compr)
195{
196 if (compr->capi_name) {
197 compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
198 if (IS_ERR(compr->cc)) {
199 ubifs_err("cannot initialize compressor %s, error %ld",
200 compr->name, PTR_ERR(compr->cc));
201 return PTR_ERR(compr->cc);
202 }
203 }
204
205 ubifs_compressors[compr->compr_type] = compr;
206 return 0;
207}
208
209/**
210 * compr_exit - de-initialize a compressor.
211 * @compr: compressor description object
212 */
213static void compr_exit(struct ubifs_compressor *compr)
214{
215 if (compr->capi_name)
216 crypto_free_comp(compr->cc);
217 return;
218}
219
220/**
221 * ubifs_compressors_init - initialize UBIFS compressors.
222 *
223 * This function initializes the compressor which were compiled in. Returns
224 * zero in case of success and a negative error code in case of failure.
225 */
226int __init ubifs_compressors_init(void)
227{
228 int err;
229
230 err = compr_init(&lzo_compr);
231 if (err)
232 return err;
233
234 err = compr_init(&zlib_compr);
235 if (err)
236 goto out_lzo;
237
238 ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr;
239 return 0;
240
241out_lzo:
242 compr_exit(&lzo_compr);
243 return err;
244}
245
246/**
247 * ubifs_compressors_exit - de-initialize UBIFS compressors.
248 */
249void __exit ubifs_compressors_exit(void)
250{
251 compr_exit(&lzo_compr);
252 compr_exit(&zlib_compr);
253}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
new file mode 100644
index 000000000000..4e3aaeba4eca
--- /dev/null
+++ b/fs/ubifs/debug.c
@@ -0,0 +1,2289 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements most of the debugging stuff which is compiled in only
25 * when it is enabled. But some debugging check functions are implemented in
26 * corresponding subsystem, just because they are closely related and utilize
27 * various local functions of those subsystems.
28 */
29
30#define UBIFS_DBG_PRESERVE_UBI
31
32#include "ubifs.h"
33#include <linux/module.h>
34#include <linux/moduleparam.h>
35
36#ifdef CONFIG_UBIFS_FS_DEBUG
37
38DEFINE_SPINLOCK(dbg_lock);
39
40static char dbg_key_buf0[128];
41static char dbg_key_buf1[128];
42
43unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT;
44unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT;
45unsigned int ubifs_tst_flags;
46
47module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
48module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
49module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
50
51MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
52MODULE_PARM_DESC(debug_chks, "Debug check flags");
53MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
54
55static const char *get_key_fmt(int fmt)
56{
57 switch (fmt) {
58 case UBIFS_SIMPLE_KEY_FMT:
59 return "simple";
60 default:
61 return "unknown/invalid format";
62 }
63}
64
65static const char *get_key_hash(int hash)
66{
67 switch (hash) {
68 case UBIFS_KEY_HASH_R5:
69 return "R5";
70 case UBIFS_KEY_HASH_TEST:
71 return "test";
72 default:
73 return "unknown/invalid name hash";
74 }
75}
76
77static const char *get_key_type(int type)
78{
79 switch (type) {
80 case UBIFS_INO_KEY:
81 return "inode";
82 case UBIFS_DENT_KEY:
83 return "direntry";
84 case UBIFS_XENT_KEY:
85 return "xentry";
86 case UBIFS_DATA_KEY:
87 return "data";
88 case UBIFS_TRUN_KEY:
89 return "truncate";
90 default:
91 return "unknown/invalid key";
92 }
93}
94
95static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
96 char *buffer)
97{
98 char *p = buffer;
99 int type = key_type(c, key);
100
101 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
102 switch (type) {
103 case UBIFS_INO_KEY:
104 sprintf(p, "(%lu, %s)", key_inum(c, key),
105 get_key_type(type));
106 break;
107 case UBIFS_DENT_KEY:
108 case UBIFS_XENT_KEY:
109 sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key),
110 get_key_type(type), key_hash(c, key));
111 break;
112 case UBIFS_DATA_KEY:
113 sprintf(p, "(%lu, %s, %u)", key_inum(c, key),
114 get_key_type(type), key_block(c, key));
115 break;
116 case UBIFS_TRUN_KEY:
117 sprintf(p, "(%lu, %s)",
118 key_inum(c, key), get_key_type(type));
119 break;
120 default:
121 sprintf(p, "(bad key type: %#08x, %#08x)",
122 key->u32[0], key->u32[1]);
123 }
124 } else
125 sprintf(p, "bad key format %d", c->key_fmt);
126}
127
128const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
129{
130 /* dbg_lock must be held */
131 sprintf_key(c, key, dbg_key_buf0);
132 return dbg_key_buf0;
133}
134
135const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
136{
137 /* dbg_lock must be held */
138 sprintf_key(c, key, dbg_key_buf1);
139 return dbg_key_buf1;
140}
141
142const char *dbg_ntype(int type)
143{
144 switch (type) {
145 case UBIFS_PAD_NODE:
146 return "padding node";
147 case UBIFS_SB_NODE:
148 return "superblock node";
149 case UBIFS_MST_NODE:
150 return "master node";
151 case UBIFS_REF_NODE:
152 return "reference node";
153 case UBIFS_INO_NODE:
154 return "inode node";
155 case UBIFS_DENT_NODE:
156 return "direntry node";
157 case UBIFS_XENT_NODE:
158 return "xentry node";
159 case UBIFS_DATA_NODE:
160 return "data node";
161 case UBIFS_TRUN_NODE:
162 return "truncate node";
163 case UBIFS_IDX_NODE:
164 return "indexing node";
165 case UBIFS_CS_NODE:
166 return "commit start node";
167 case UBIFS_ORPH_NODE:
168 return "orphan node";
169 default:
170 return "unknown node";
171 }
172}
173
174static const char *dbg_gtype(int type)
175{
176 switch (type) {
177 case UBIFS_NO_NODE_GROUP:
178 return "no node group";
179 case UBIFS_IN_NODE_GROUP:
180 return "in node group";
181 case UBIFS_LAST_OF_NODE_GROUP:
182 return "last of node group";
183 default:
184 return "unknown";
185 }
186}
187
188const char *dbg_cstate(int cmt_state)
189{
190 switch (cmt_state) {
191 case COMMIT_RESTING:
192 return "commit resting";
193 case COMMIT_BACKGROUND:
194 return "background commit requested";
195 case COMMIT_REQUIRED:
196 return "commit required";
197 case COMMIT_RUNNING_BACKGROUND:
198 return "BACKGROUND commit running";
199 case COMMIT_RUNNING_REQUIRED:
200 return "commit running and required";
201 case COMMIT_BROKEN:
202 return "broken commit";
203 default:
204 return "unknown commit state";
205 }
206}
207
208static void dump_ch(const struct ubifs_ch *ch)
209{
210 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
211 printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc));
212 printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type,
213 dbg_ntype(ch->node_type));
214 printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type,
215 dbg_gtype(ch->group_type));
216 printk(KERN_DEBUG "\tsqnum %llu\n",
217 (unsigned long long)le64_to_cpu(ch->sqnum));
218 printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
219}
220
221void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
222{
223 const struct ubifs_inode *ui = ubifs_inode(inode);
224
225 printk(KERN_DEBUG "inode %lu\n", inode->i_ino);
226 printk(KERN_DEBUG "size %llu\n",
227 (unsigned long long)i_size_read(inode));
228 printk(KERN_DEBUG "nlink %u\n", inode->i_nlink);
229 printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid);
230 printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid);
231 printk(KERN_DEBUG "atime %u.%u\n",
232 (unsigned int)inode->i_atime.tv_sec,
233 (unsigned int)inode->i_atime.tv_nsec);
234 printk(KERN_DEBUG "mtime %u.%u\n",
235 (unsigned int)inode->i_mtime.tv_sec,
236 (unsigned int)inode->i_mtime.tv_nsec);
237 printk(KERN_DEBUG "ctime %u.%u\n",
238 (unsigned int)inode->i_ctime.tv_sec,
239 (unsigned int)inode->i_ctime.tv_nsec);
240 printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
241 printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size);
242 printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt);
243 printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
244 printk(KERN_DEBUG "dirty %u\n", ui->dirty);
245 printk(KERN_DEBUG "xattr %u\n", ui->xattr);
246 printk(KERN_DEBUG "flags %d\n", ui->flags);
247 printk(KERN_DEBUG "compr_type %d\n", ui->compr_type);
248 printk(KERN_DEBUG "data_len %d\n", ui->data_len);
249}
250
251void dbg_dump_node(const struct ubifs_info *c, const void *node)
252{
253 int i, n;
254 union ubifs_key key;
255 const struct ubifs_ch *ch = node;
256
257 if (dbg_failure_mode)
258 return;
259
260 /* If the magic is incorrect, just hexdump the first bytes */
261 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
262 printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ);
263 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
264 (void *)node, UBIFS_CH_SZ, 1);
265 return;
266 }
267
268 spin_lock(&dbg_lock);
269 dump_ch(node);
270
271 switch (ch->node_type) {
272 case UBIFS_PAD_NODE:
273 {
274 const struct ubifs_pad_node *pad = node;
275
276 printk(KERN_DEBUG "\tpad_len %u\n",
277 le32_to_cpu(pad->pad_len));
278 break;
279 }
280 case UBIFS_SB_NODE:
281 {
282 const struct ubifs_sb_node *sup = node;
283 unsigned int sup_flags = le32_to_cpu(sup->flags);
284
285 printk(KERN_DEBUG "\tkey_hash %d (%s)\n",
286 (int)sup->key_hash, get_key_hash(sup->key_hash));
287 printk(KERN_DEBUG "\tkey_fmt %d (%s)\n",
288 (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
289 printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
290 printk(KERN_DEBUG "\t big_lpt %u\n",
291 !!(sup_flags & UBIFS_FLG_BIGLPT));
292 printk(KERN_DEBUG "\tmin_io_size %u\n",
293 le32_to_cpu(sup->min_io_size));
294 printk(KERN_DEBUG "\tleb_size %u\n",
295 le32_to_cpu(sup->leb_size));
296 printk(KERN_DEBUG "\tleb_cnt %u\n",
297 le32_to_cpu(sup->leb_cnt));
298 printk(KERN_DEBUG "\tmax_leb_cnt %u\n",
299 le32_to_cpu(sup->max_leb_cnt));
300 printk(KERN_DEBUG "\tmax_bud_bytes %llu\n",
301 (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
302 printk(KERN_DEBUG "\tlog_lebs %u\n",
303 le32_to_cpu(sup->log_lebs));
304 printk(KERN_DEBUG "\tlpt_lebs %u\n",
305 le32_to_cpu(sup->lpt_lebs));
306 printk(KERN_DEBUG "\torph_lebs %u\n",
307 le32_to_cpu(sup->orph_lebs));
308 printk(KERN_DEBUG "\tjhead_cnt %u\n",
309 le32_to_cpu(sup->jhead_cnt));
310 printk(KERN_DEBUG "\tfanout %u\n",
311 le32_to_cpu(sup->fanout));
312 printk(KERN_DEBUG "\tlsave_cnt %u\n",
313 le32_to_cpu(sup->lsave_cnt));
314 printk(KERN_DEBUG "\tdefault_compr %u\n",
315 (int)le16_to_cpu(sup->default_compr));
316 printk(KERN_DEBUG "\trp_size %llu\n",
317 (unsigned long long)le64_to_cpu(sup->rp_size));
318 printk(KERN_DEBUG "\trp_uid %u\n",
319 le32_to_cpu(sup->rp_uid));
320 printk(KERN_DEBUG "\trp_gid %u\n",
321 le32_to_cpu(sup->rp_gid));
322 printk(KERN_DEBUG "\tfmt_version %u\n",
323 le32_to_cpu(sup->fmt_version));
324 printk(KERN_DEBUG "\ttime_gran %u\n",
325 le32_to_cpu(sup->time_gran));
326 printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X"
327 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
328 sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
329 sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
330 sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
331 sup->uuid[12], sup->uuid[13], sup->uuid[14],
332 sup->uuid[15]);
333 break;
334 }
335 case UBIFS_MST_NODE:
336 {
337 const struct ubifs_mst_node *mst = node;
338
339 printk(KERN_DEBUG "\thighest_inum %llu\n",
340 (unsigned long long)le64_to_cpu(mst->highest_inum));
341 printk(KERN_DEBUG "\tcommit number %llu\n",
342 (unsigned long long)le64_to_cpu(mst->cmt_no));
343 printk(KERN_DEBUG "\tflags %#x\n",
344 le32_to_cpu(mst->flags));
345 printk(KERN_DEBUG "\tlog_lnum %u\n",
346 le32_to_cpu(mst->log_lnum));
347 printk(KERN_DEBUG "\troot_lnum %u\n",
348 le32_to_cpu(mst->root_lnum));
349 printk(KERN_DEBUG "\troot_offs %u\n",
350 le32_to_cpu(mst->root_offs));
351 printk(KERN_DEBUG "\troot_len %u\n",
352 le32_to_cpu(mst->root_len));
353 printk(KERN_DEBUG "\tgc_lnum %u\n",
354 le32_to_cpu(mst->gc_lnum));
355 printk(KERN_DEBUG "\tihead_lnum %u\n",
356 le32_to_cpu(mst->ihead_lnum));
357 printk(KERN_DEBUG "\tihead_offs %u\n",
358 le32_to_cpu(mst->ihead_offs));
359 printk(KERN_DEBUG "\tindex_size %u\n",
360 le32_to_cpu(mst->index_size));
361 printk(KERN_DEBUG "\tlpt_lnum %u\n",
362 le32_to_cpu(mst->lpt_lnum));
363 printk(KERN_DEBUG "\tlpt_offs %u\n",
364 le32_to_cpu(mst->lpt_offs));
365 printk(KERN_DEBUG "\tnhead_lnum %u\n",
366 le32_to_cpu(mst->nhead_lnum));
367 printk(KERN_DEBUG "\tnhead_offs %u\n",
368 le32_to_cpu(mst->nhead_offs));
369 printk(KERN_DEBUG "\tltab_lnum %u\n",
370 le32_to_cpu(mst->ltab_lnum));
371 printk(KERN_DEBUG "\tltab_offs %u\n",
372 le32_to_cpu(mst->ltab_offs));
373 printk(KERN_DEBUG "\tlsave_lnum %u\n",
374 le32_to_cpu(mst->lsave_lnum));
375 printk(KERN_DEBUG "\tlsave_offs %u\n",
376 le32_to_cpu(mst->lsave_offs));
377 printk(KERN_DEBUG "\tlscan_lnum %u\n",
378 le32_to_cpu(mst->lscan_lnum));
379 printk(KERN_DEBUG "\tleb_cnt %u\n",
380 le32_to_cpu(mst->leb_cnt));
381 printk(KERN_DEBUG "\tempty_lebs %u\n",
382 le32_to_cpu(mst->empty_lebs));
383 printk(KERN_DEBUG "\tidx_lebs %u\n",
384 le32_to_cpu(mst->idx_lebs));
385 printk(KERN_DEBUG "\ttotal_free %llu\n",
386 (unsigned long long)le64_to_cpu(mst->total_free));
387 printk(KERN_DEBUG "\ttotal_dirty %llu\n",
388 (unsigned long long)le64_to_cpu(mst->total_dirty));
389 printk(KERN_DEBUG "\ttotal_used %llu\n",
390 (unsigned long long)le64_to_cpu(mst->total_used));
391 printk(KERN_DEBUG "\ttotal_dead %llu\n",
392 (unsigned long long)le64_to_cpu(mst->total_dead));
393 printk(KERN_DEBUG "\ttotal_dark %llu\n",
394 (unsigned long long)le64_to_cpu(mst->total_dark));
395 break;
396 }
397 case UBIFS_REF_NODE:
398 {
399 const struct ubifs_ref_node *ref = node;
400
401 printk(KERN_DEBUG "\tlnum %u\n",
402 le32_to_cpu(ref->lnum));
403 printk(KERN_DEBUG "\toffs %u\n",
404 le32_to_cpu(ref->offs));
405 printk(KERN_DEBUG "\tjhead %u\n",
406 le32_to_cpu(ref->jhead));
407 break;
408 }
409 case UBIFS_INO_NODE:
410 {
411 const struct ubifs_ino_node *ino = node;
412
413 key_read(c, &ino->key, &key);
414 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
415 printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
416 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
417 printk(KERN_DEBUG "\tsize %llu\n",
418 (unsigned long long)le64_to_cpu(ino->size));
419 printk(KERN_DEBUG "\tnlink %u\n",
420 le32_to_cpu(ino->nlink));
421 printk(KERN_DEBUG "\tatime %lld.%u\n",
422 (long long)le64_to_cpu(ino->atime_sec),
423 le32_to_cpu(ino->atime_nsec));
424 printk(KERN_DEBUG "\tmtime %lld.%u\n",
425 (long long)le64_to_cpu(ino->mtime_sec),
426 le32_to_cpu(ino->mtime_nsec));
427 printk(KERN_DEBUG "\tctime %lld.%u\n",
428 (long long)le64_to_cpu(ino->ctime_sec),
429 le32_to_cpu(ino->ctime_nsec));
430 printk(KERN_DEBUG "\tuid %u\n",
431 le32_to_cpu(ino->uid));
432 printk(KERN_DEBUG "\tgid %u\n",
433 le32_to_cpu(ino->gid));
434 printk(KERN_DEBUG "\tmode %u\n",
435 le32_to_cpu(ino->mode));
436 printk(KERN_DEBUG "\tflags %#x\n",
437 le32_to_cpu(ino->flags));
438 printk(KERN_DEBUG "\txattr_cnt %u\n",
439 le32_to_cpu(ino->xattr_cnt));
440 printk(KERN_DEBUG "\txattr_size %u\n",
441 le32_to_cpu(ino->xattr_size));
442 printk(KERN_DEBUG "\txattr_names %u\n",
443 le32_to_cpu(ino->xattr_names));
444 printk(KERN_DEBUG "\tcompr_type %#x\n",
445 (int)le16_to_cpu(ino->compr_type));
446 printk(KERN_DEBUG "\tdata len %u\n",
447 le32_to_cpu(ino->data_len));
448 break;
449 }
450 case UBIFS_DENT_NODE:
451 case UBIFS_XENT_NODE:
452 {
453 const struct ubifs_dent_node *dent = node;
454 int nlen = le16_to_cpu(dent->nlen);
455
456 key_read(c, &dent->key, &key);
457 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
458 printk(KERN_DEBUG "\tinum %llu\n",
459 (unsigned long long)le64_to_cpu(dent->inum));
460 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
461 printk(KERN_DEBUG "\tnlen %d\n", nlen);
462 printk(KERN_DEBUG "\tname ");
463
464 if (nlen > UBIFS_MAX_NLEN)
465 printk(KERN_DEBUG "(bad name length, not printing, "
466 "bad or corrupted node)");
467 else {
468 for (i = 0; i < nlen && dent->name[i]; i++)
469 printk("%c", dent->name[i]);
470 }
471 printk("\n");
472
473 break;
474 }
475 case UBIFS_DATA_NODE:
476 {
477 const struct ubifs_data_node *dn = node;
478 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
479
480 key_read(c, &dn->key, &key);
481 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
482 printk(KERN_DEBUG "\tsize %u\n",
483 le32_to_cpu(dn->size));
484 printk(KERN_DEBUG "\tcompr_typ %d\n",
485 (int)le16_to_cpu(dn->compr_type));
486 printk(KERN_DEBUG "\tdata size %d\n",
487 dlen);
488 printk(KERN_DEBUG "\tdata:\n");
489 print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1,
490 (void *)&dn->data, dlen, 0);
491 break;
492 }
493 case UBIFS_TRUN_NODE:
494 {
495 const struct ubifs_trun_node *trun = node;
496
497 printk(KERN_DEBUG "\tinum %u\n",
498 le32_to_cpu(trun->inum));
499 printk(KERN_DEBUG "\told_size %llu\n",
500 (unsigned long long)le64_to_cpu(trun->old_size));
501 printk(KERN_DEBUG "\tnew_size %llu\n",
502 (unsigned long long)le64_to_cpu(trun->new_size));
503 break;
504 }
505 case UBIFS_IDX_NODE:
506 {
507 const struct ubifs_idx_node *idx = node;
508
509 n = le16_to_cpu(idx->child_cnt);
510 printk(KERN_DEBUG "\tchild_cnt %d\n", n);
511 printk(KERN_DEBUG "\tlevel %d\n",
512 (int)le16_to_cpu(idx->level));
513 printk(KERN_DEBUG "\tBranches:\n");
514
515 for (i = 0; i < n && i < c->fanout - 1; i++) {
516 const struct ubifs_branch *br;
517
518 br = ubifs_idx_branch(c, idx, i);
519 key_read(c, &br->key, &key);
520 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
521 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
522 le32_to_cpu(br->len), DBGKEY(&key));
523 }
524 break;
525 }
526 case UBIFS_CS_NODE:
527 break;
528 case UBIFS_ORPH_NODE:
529 {
530 const struct ubifs_orph_node *orph = node;
531
532 printk(KERN_DEBUG "\tcommit number %llu\n",
533 (unsigned long long)
534 le64_to_cpu(orph->cmt_no) & LLONG_MAX);
535 printk(KERN_DEBUG "\tlast node flag %llu\n",
536 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
537 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
539 for (i = 0; i < n; i++)
540 printk(KERN_DEBUG "\t ino %llu\n",
541 le64_to_cpu(orph->inos[i]));
542 break;
543 }
544 default:
545 printk(KERN_DEBUG "node type %d was not recognized\n",
546 (int)ch->node_type);
547 }
548 spin_unlock(&dbg_lock);
549}
550
551void dbg_dump_budget_req(const struct ubifs_budget_req *req)
552{
553 spin_lock(&dbg_lock);
554 printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
555 req->new_ino, req->dirtied_ino);
556 printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n",
557 req->new_ino_d, req->dirtied_ino_d);
558 printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n",
559 req->new_page, req->dirtied_page);
560 printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n",
561 req->new_dent, req->mod_dent);
562 printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth);
563 printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n",
564 req->data_growth, req->dd_growth);
565 spin_unlock(&dbg_lock);
566}
567
568void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
569{
570 spin_lock(&dbg_lock);
571 printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n",
572 lst->empty_lebs, lst->idx_lebs);
573 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
574 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
575 lst->total_dirty);
576 printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
577 "total_dead %lld\n", lst->total_used, lst->total_dark,
578 lst->total_dead);
579 spin_unlock(&dbg_lock);
580}
581
582void dbg_dump_budg(struct ubifs_info *c)
583{
584 int i;
585 struct rb_node *rb;
586 struct ubifs_bud *bud;
587 struct ubifs_gced_idx_leb *idx_gc;
588
589 spin_lock(&dbg_lock);
590 printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, "
591 "budg_dd_growth %lld, budg_idx_growth %lld\n",
592 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
593 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
594 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
595 c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
596 c->freeable_cnt);
597 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
598 "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
599 c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
600 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
601 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
602 atomic_long_read(&c->dirty_zn_cnt),
603 atomic_long_read(&c->clean_zn_cnt));
604 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
605 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
606 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
607 c->gc_lnum, c->ihead_lnum);
608 for (i = 0; i < c->jhead_cnt; i++)
609 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
610 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
611 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
612 bud = rb_entry(rb, struct ubifs_bud, rb);
613 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
614 }
615 list_for_each_entry(bud, &c->old_buds, list)
616 printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
617 list_for_each_entry(idx_gc, &c->idx_gc, list)
618 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
619 idx_gc->lnum, idx_gc->unmap);
620 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
621 spin_unlock(&dbg_lock);
622}
623
624void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
625{
626 printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
627 "flags %#x\n", lp->lnum, lp->free, lp->dirty,
628 c->leb_size - lp->free - lp->dirty, lp->flags);
629}
630
631void dbg_dump_lprops(struct ubifs_info *c)
632{
633 int lnum, err;
634 struct ubifs_lprops lp;
635 struct ubifs_lp_stats lst;
636
637 printk(KERN_DEBUG "Dumping LEB properties\n");
638 ubifs_get_lp_stats(c, &lst);
639 dbg_dump_lstats(&lst);
640
641 for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
642 err = ubifs_read_one_lp(c, lnum, &lp);
643 if (err)
644 ubifs_err("cannot read lprops for LEB %d", lnum);
645
646 dbg_dump_lprop(c, &lp);
647 }
648}
649
650void dbg_dump_leb(const struct ubifs_info *c, int lnum)
651{
652 struct ubifs_scan_leb *sleb;
653 struct ubifs_scan_node *snod;
654
655 if (dbg_failure_mode)
656 return;
657
658 printk(KERN_DEBUG "Dumping LEB %d\n", lnum);
659
660 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
661 if (IS_ERR(sleb)) {
662 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
663 return;
664 }
665
666 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
667 sleb->nodes_cnt, sleb->endpt);
668
669 list_for_each_entry(snod, &sleb->nodes, list) {
670 cond_resched();
671 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum,
672 snod->offs, snod->len);
673 dbg_dump_node(c, snod->node);
674 }
675
676 ubifs_scan_destroy(sleb);
677 return;
678}
679
680void dbg_dump_znode(const struct ubifs_info *c,
681 const struct ubifs_znode *znode)
682{
683 int n;
684 const struct ubifs_zbranch *zbr;
685
686 spin_lock(&dbg_lock);
687 if (znode->parent)
688 zbr = &znode->parent->zbranch[znode->iip];
689 else
690 zbr = &c->zroot;
691
692 printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
693 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
694 zbr->len, znode->parent, znode->iip, znode->level,
695 znode->child_cnt, znode->flags);
696
697 if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
698 spin_unlock(&dbg_lock);
699 return;
700 }
701
702 printk(KERN_DEBUG "zbranches:\n");
703 for (n = 0; n < znode->child_cnt; n++) {
704 zbr = &znode->zbranch[n];
705 if (znode->level > 0)
706 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
707 "%s\n", n, zbr->znode, zbr->lnum,
708 zbr->offs, zbr->len,
709 DBGKEY(&zbr->key));
710 else
711 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
712 "%s\n", n, zbr->znode, zbr->lnum,
713 zbr->offs, zbr->len,
714 DBGKEY(&zbr->key));
715 }
716 spin_unlock(&dbg_lock);
717}
718
719void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
720{
721 int i;
722
723 printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n",
724 cat, heap->cnt);
725 for (i = 0; i < heap->cnt; i++) {
726 struct ubifs_lprops *lprops = heap->arr[i];
727
728 printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
729 "flags %d\n", i, lprops->lnum, lprops->hpos,
730 lprops->free, lprops->dirty, lprops->flags);
731 }
732}
733
734void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
735 struct ubifs_nnode *parent, int iip)
736{
737 int i;
738
739 printk(KERN_DEBUG "Dumping pnode:\n");
740 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
741 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
742 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
743 pnode->flags, iip, pnode->level, pnode->num);
744 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
745 struct ubifs_lprops *lp = &pnode->lprops[i];
746
747 printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
748 i, lp->free, lp->dirty, lp->flags, lp->lnum);
749 }
750}
751
752void dbg_dump_tnc(struct ubifs_info *c)
753{
754 struct ubifs_znode *znode;
755 int level;
756
757 printk(KERN_DEBUG "\n");
758 printk(KERN_DEBUG "Dumping the TNC tree\n");
759 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
760 level = znode->level;
761 printk(KERN_DEBUG "== Level %d ==\n", level);
762 while (znode) {
763 if (level != znode->level) {
764 level = znode->level;
765 printk(KERN_DEBUG "== Level %d ==\n", level);
766 }
767 dbg_dump_znode(c, znode);
768 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
769 }
770
771 printk(KERN_DEBUG "\n");
772}
773
/*
 * Znode callback passed to 'dbg_walk_index()' by 'dbg_dump_index()': dumps
 * @znode and always returns zero. @priv is not used.
 */
static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
		      void *priv)
{
	int ret = 0;

	dbg_dump_znode(c, znode);
	return ret;
}
780
781/**
782 * dbg_dump_index - dump the on-flash index.
783 * @c: UBIFS file-system description object
784 *
785 * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
786 * which dumps only in-memory znodes and does not read znodes which from flash.
787 */
788void dbg_dump_index(struct ubifs_info *c)
789{
790 dbg_walk_index(c, NULL, dump_znode, NULL);
791}
792
793/**
794 * dbg_check_synced_i_size - check synchronized inode size.
795 * @inode: inode to check
796 *
797 * If inode is clean, synchronized inode size has to be equivalent to current
798 * inode size. This function has to be called only for locked inodes (@i_mutex
799 * has to be locked). Returns %0 if synchronized inode size if correct, and
800 * %-EINVAL if not.
801 */
802int dbg_check_synced_i_size(struct inode *inode)
803{
804 int err = 0;
805 struct ubifs_inode *ui = ubifs_inode(inode);
806
807 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
808 return 0;
809 if (!S_ISREG(inode->i_mode))
810 return 0;
811
812 mutex_lock(&ui->ui_mutex);
813 spin_lock(&ui->ui_lock);
814 if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
815 ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
816 "is clean", ui->ui_size, ui->synced_i_size);
817 ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
818 inode->i_mode, i_size_read(inode));
819 dbg_dump_stack();
820 err = -EINVAL;
821 }
822 spin_unlock(&ui->ui_lock);
823 mutex_unlock(&ui->ui_mutex);
824 return err;
825}
826
827/*
828 * dbg_check_dir - check directory inode size and link count.
829 * @c: UBIFS file-system description object
830 * @dir: the directory to calculate size for
831 * @size: the result is returned here
832 *
833 * This function makes sure that directory size and link count are correct.
834 * Returns zero in case of success and a negative error code in case of
835 * failure.
836 *
837 * Note, it is good idea to make sure the @dir->i_mutex is locked before
838 * calling this function.
839 */
840int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
841{
842 unsigned int nlink = 2;
843 union ubifs_key key;
844 struct ubifs_dent_node *dent, *pdent = NULL;
845 struct qstr nm = { .name = NULL };
846 loff_t size = UBIFS_INO_NODE_SZ;
847
848 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
849 return 0;
850
851 if (!S_ISDIR(dir->i_mode))
852 return 0;
853
854 lowest_dent_key(c, &key, dir->i_ino);
855 while (1) {
856 int err;
857
858 dent = ubifs_tnc_next_ent(c, &key, &nm);
859 if (IS_ERR(dent)) {
860 err = PTR_ERR(dent);
861 if (err == -ENOENT)
862 break;
863 return err;
864 }
865
866 nm.name = dent->name;
867 nm.len = le16_to_cpu(dent->nlen);
868 size += CALC_DENT_SIZE(nm.len);
869 if (dent->type == UBIFS_ITYPE_DIR)
870 nlink += 1;
871 kfree(pdent);
872 pdent = dent;
873 key_read(c, &dent->key, &key);
874 }
875 kfree(pdent);
876
877 if (i_size_read(dir) != size) {
878 ubifs_err("directory inode %lu has size %llu, "
879 "but calculated size is %llu", dir->i_ino,
880 (unsigned long long)i_size_read(dir),
881 (unsigned long long)size);
882 dump_stack();
883 return -EINVAL;
884 }
885 if (dir->i_nlink != nlink) {
886 ubifs_err("directory inode %lu has nlink %u, but calculated "
887 "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
888 dump_stack();
889 return -EINVAL;
890 }
891
892 return 0;
893}
894
895/**
896 * dbg_check_key_order - make sure that colliding keys are properly ordered.
897 * @c: UBIFS file-system description object
898 * @zbr1: first zbranch
899 * @zbr2: following zbranch
900 *
901 * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of
902 * names of the direntries/xentries which are referred by the keys. This
903 * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
904 * sure the name of direntry/xentry referred by @zbr1 is less than
905 * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
906 * and a negative error code in case of failure.
907 */
908static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
909 struct ubifs_zbranch *zbr2)
910{
911 int err, nlen1, nlen2, cmp;
912 struct ubifs_dent_node *dent1, *dent2;
913 union ubifs_key key;
914
915 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
916 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
917 if (!dent1)
918 return -ENOMEM;
919 dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
920 if (!dent2) {
921 err = -ENOMEM;
922 goto out_free;
923 }
924
925 err = ubifs_tnc_read_node(c, zbr1, dent1);
926 if (err)
927 goto out_free;
928 err = ubifs_validate_entry(c, dent1);
929 if (err)
930 goto out_free;
931
932 err = ubifs_tnc_read_node(c, zbr2, dent2);
933 if (err)
934 goto out_free;
935 err = ubifs_validate_entry(c, dent2);
936 if (err)
937 goto out_free;
938
939 /* Make sure node keys are the same as in zbranch */
940 err = 1;
941 key_read(c, &dent1->key, &key);
942 if (keys_cmp(c, &zbr1->key, &key)) {
943 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
944 zbr1->offs, DBGKEY(&key));
945 dbg_err("but it should have key %s according to tnc",
946 DBGKEY(&zbr1->key));
947 dbg_dump_node(c, dent1);
948 goto out_free;
949 }
950
951 key_read(c, &dent2->key, &key);
952 if (keys_cmp(c, &zbr2->key, &key)) {
953 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
954 zbr1->offs, DBGKEY(&key));
955 dbg_err("but it should have key %s according to tnc",
956 DBGKEY(&zbr2->key));
957 dbg_dump_node(c, dent2);
958 goto out_free;
959 }
960
961 nlen1 = le16_to_cpu(dent1->nlen);
962 nlen2 = le16_to_cpu(dent2->nlen);
963
964 cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
965 if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
966 err = 0;
967 goto out_free;
968 }
969 if (cmp == 0 && nlen1 == nlen2)
970 dbg_err("2 xent/dent nodes with the same name");
971 else
972 dbg_err("bad order of colliding key %s",
973 DBGKEY(&key));
974
975 dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
976 dbg_dump_node(c, dent1);
977 dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
978 dbg_dump_node(c, dent2);
979
980out_free:
981 kfree(dent2);
982 kfree(dent1);
983 return err;
984}
985
986/**
987 * dbg_check_znode - check if znode is all right.
988 * @c: UBIFS file-system description object
989 * @zbr: zbranch which points to this znode
990 *
991 * This function makes sure that znode referred to by @zbr is all right.
992 * Returns zero if it is, and %-EINVAL if it is not.
 *
 * Every failed check stores a small positive code in 'err' which is printed
 * at the 'out' label together with dumps of the znode and of its parent. A
 * negative error code returned by 'dbg_check_key_order()' is passed through
 * to the caller unchanged, without those dumps.
993 */
994static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
995{
996 struct ubifs_znode *znode = zbr->znode;
997 struct ubifs_znode *zp = znode->parent;
998 int n, err, cmp;
999
1000 if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
1001 err = 1; /* child count outside the range 1..fanout */
1002 goto out;
1003 }
1004 if (znode->level < 0) {
1005 err = 2; /* negative level */
1006 goto out;
1007 }
1008 if (znode->iip < 0 || znode->iip >= c->fanout) {
1009 err = 3; /* index in parent outside the range 0..fanout-1 */
1010 goto out;
1011 }
1012
1013 if (zbr->len == 0)
1014 /* Only dirty zbranch may have no on-flash nodes */
1015 if (!ubifs_zn_dirty(znode)) {
1016 err = 4; /* clean znode whose zbranch has zero length */
1017 goto out;
1018 }
1019
1020 if (ubifs_zn_dirty(znode)) {
1021 /*
1022 * If znode is dirty, its parent has to be dirty as well. The
1023 * order of the operation is important, so we have to have
1024 * memory barriers.
1025 */
1026 smp_mb();
1027 if (zp && !ubifs_zn_dirty(zp)) {
1028 /*
1029 * The dirty flag is atomic and is cleared outside the
1030 * TNC mutex, so znode's dirty flag may now have
1031 * been cleared. The child is always cleared before the
1032 * parent, so we just need to check again.
1033 */
1034 smp_mb();
1035 if (ubifs_zn_dirty(znode)) {
1036 err = 5; /* znode still dirty although its parent is clean */
1037 goto out;
1038 }
1039 }
1040 }
1041
1042 if (zp) {
1043 const union ubifs_key *min, *max;
1044
1045 if (znode->level != zp->level - 1) {
1046 err = 6; /* level is not exactly one below the parent's level */
1047 goto out;
1048 }
1049
1050 /* Make sure the 'parent' pointer in our znode is correct */
1051 err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
1052 if (!err) {
1053 /* This zbranch does not exist in the parent */
1054 err = 7;
1055 goto out;
1056 }
1057
1058 if (znode->iip >= zp->child_cnt) {
1059 err = 8; /* index in parent is beyond the parent's child count */
1060 goto out;
1061 }
1062
1063 if (znode->iip != n) {
1064 /* This may happen only in case of collisions */
1065 if (keys_cmp(c, &zp->zbranch[n].key,
1066 &zp->zbranch[znode->iip].key)) {
1067 err = 9; /* found slot and 'iip' slot hold different keys */
1068 goto out;
1069 }
1070 n = znode->iip; /* from here on @n is our own slot in the parent */
1071 }
1072
1073 /*
1074 * Make sure that the first key in our znode is greater than or
1075 * equal to the key in the pointing zbranch.
1076 */
1077 min = &zbr->key;
1078 cmp = keys_cmp(c, min, &znode->zbranch[0].key);
1079 if (cmp == 1) {
1080 err = 10;
1081 goto out;
1082 }
1083
1084 if (n + 1 < zp->child_cnt) {
1085 max = &zp->zbranch[n + 1].key;
1086
1087 /*
1088 * Make sure the last key in our znode is less than or
1089 * equal to the key in the zbranch which goes
1090 * after our pointing zbranch.
1091 */
1092 cmp = keys_cmp(c, max,
1093 &znode->zbranch[znode->child_cnt - 1].key);
1094 if (cmp == -1) {
1095 err = 11;
1096 goto out;
1097 }
1098 }
1099 } else {
1100 /* This may only be root znode */
1101 if (zbr != &c->zroot) {
1102 err = 12;
1103 goto out;
1104 }
1105 }
1106
1107 /*
1108 * Make sure that next key is greater than or equal to the previous
1109 * one.
1110 */
1111 for (n = 1; n < znode->child_cnt; n++) {
1112 cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
1113 &znode->zbranch[n].key);
1114 if (cmp > 0) {
1115 err = 13; /* keys are not in ascending order */
1116 goto out;
1117 }
1118 if (cmp == 0) {
1119 /* This can only be keys with colliding hash */
1120 if (!is_hash_key(c, &znode->zbranch[n].key)) {
1121 err = 14;
1122 goto out;
1123 }
1124
1125 if (znode->level != 0 || c->replaying)
1126 continue; /* name order is checked only for leaf-level znodes and not during replay */
1127
1128 /*
1129 * Colliding keys should follow binary order of
1130 * corresponding xentry/dentry names.
1131 */
1132 err = dbg_check_key_order(c, &znode->zbranch[n - 1],
1133 &znode->zbranch[n]);
1134 if (err < 0)
1135 return err; /* propagated as is, no dump is printed */
1136 if (err) {
1137 err = 15; /* colliding entries are in the wrong order */
1138 goto out;
1139 }
1140 }
1141 }
1142
1143 for (n = 0; n < znode->child_cnt; n++) {
1144 if (!znode->zbranch[n].znode &&
1145 (znode->zbranch[n].lnum == 0 ||
1146 znode->zbranch[n].len == 0)) {
1147 err = 16; /* no in-memory child and no valid flash address either */
1148 goto out;
1149 }
1150
1151 if (znode->zbranch[n].lnum != 0 &&
1152 znode->zbranch[n].len == 0) {
1153 err = 17; /* LEB number set but length is zero */
1154 goto out;
1155 }
1156
1157 if (znode->zbranch[n].lnum == 0 &&
1158 znode->zbranch[n].len != 0) {
1159 err = 18; /* length set but LEB number is zero */
1160 goto out;
1161 }
1162
1163 if (znode->zbranch[n].lnum == 0 &&
1164 znode->zbranch[n].offs != 0) {
1165 err = 19; /* offset set but LEB number is zero */
1166 goto out;
1167 }
1168
1169 if (znode->level != 0 && znode->zbranch[n].znode)
1170 if (znode->zbranch[n].znode->parent != znode) {
1171 err = 20; /* child's 'parent' pointer does not point back to us */
1172 goto out;
1173 }
1174 }
1175
1176 return 0;
1177
1178out:
1179 ubifs_err("failed, error %d", err);
1180 ubifs_msg("dump of the znode");
1181 dbg_dump_znode(c, znode);
1182 if (zp) {
1183 ubifs_msg("dump of the parent znode");
1184 dbg_dump_znode(c, zp);
1185 }
1186 dump_stack();
1187 return -EINVAL;
1188}
1189
1190/**
1191 * dbg_check_tnc - check TNC tree.
1192 * @c: UBIFS file-system description object
1193 * @extra: do extra checks that are possible at start commit
1194 *
1195 * This function traverses whole TNC tree and checks every znode. Returns zero
1196 * if everything is all right and %-EINVAL if something is wrong with TNC.
1197 */
1198int dbg_check_tnc(struct ubifs_info *c, int extra)
1199{
1200 struct ubifs_znode *znode;
1201 long clean_cnt = 0, dirty_cnt = 0;
1202 int err, last;
1203
1204 if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
1205 return 0;
1206
1207 ubifs_assert(mutex_is_locked(&c->tnc_mutex));
1208 if (!c->zroot.znode)
1209 return 0;
1210
1211 znode = ubifs_tnc_postorder_first(c->zroot.znode);
1212 while (1) {
1213 struct ubifs_znode *prev;
1214 struct ubifs_zbranch *zbr;
1215
1216 if (!znode->parent)
1217 zbr = &c->zroot;
1218 else
1219 zbr = &znode->parent->zbranch[znode->iip];
1220
1221 err = dbg_check_znode(c, zbr);
1222 if (err)
1223 return err;
1224
1225 if (extra) {
1226 if (ubifs_zn_dirty(znode))
1227 dirty_cnt += 1;
1228 else
1229 clean_cnt += 1;
1230 }
1231
1232 prev = znode;
1233 znode = ubifs_tnc_postorder_next(znode);
1234 if (!znode)
1235 break;
1236
1237 /*
1238 * If the last key of this znode is equivalent to the first key
1239 * of the next znode (collision), then check order of the keys.
1240 */
1241 last = prev->child_cnt - 1;
1242 if (prev->level == 0 && znode->level == 0 && !c->replaying &&
1243 !keys_cmp(c, &prev->zbranch[last].key,
1244 &znode->zbranch[0].key)) {
1245 err = dbg_check_key_order(c, &prev->zbranch[last],
1246 &znode->zbranch[0]);
1247 if (err < 0)
1248 return err;
1249 if (err) {
1250 ubifs_msg("first znode");
1251 dbg_dump_znode(c, prev);
1252 ubifs_msg("second znode");
1253 dbg_dump_znode(c, znode);
1254 return -EINVAL;
1255 }
1256 }
1257 }
1258
1259 if (extra) {
1260 if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
1261 ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
1262 atomic_long_read(&c->clean_zn_cnt),
1263 clean_cnt);
1264 return -EINVAL;
1265 }
1266 if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
1267 ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
1268 atomic_long_read(&c->dirty_zn_cnt),
1269 dirty_cnt);
1270 return -EINVAL;
1271 }
1272 }
1273
1274 return 0;
1275}
1276
1277/**
1278 * dbg_walk_index - walk the on-flash index.
1279 * @c: UBIFS file-system description object
1280 * @leaf_cb: called for each leaf node
1281 * @znode_cb: called for each indexing node
1282 * @priv: private date which is passed to callbacks
1283 *
1284 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
1285 * node and @znode_cb for each indexing node. Returns zero in case of success
1286 * and a negative error code in case of failure.
1287 *
1288 * It would be better if this function removed every znode it pulled to into
1289 * the TNC, so that the behavior more closely matched the non-debugging
1290 * behavior.
1291 */
1292int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
1293 dbg_znode_callback znode_cb, void *priv)
1294{
1295 int err;
1296 struct ubifs_zbranch *zbr;
1297 struct ubifs_znode *znode, *child;
1298
1299 mutex_lock(&c->tnc_mutex);
1300 /* If the root indexing node is not in TNC - pull it */
1301 if (!c->zroot.znode) {
1302 c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
1303 if (IS_ERR(c->zroot.znode)) {
1304 err = PTR_ERR(c->zroot.znode);
1305 c->zroot.znode = NULL;
1306 goto out_unlock;
1307 }
1308 }
1309
1310 /*
1311 * We are going to traverse the indexing tree in the postorder manner.
1312 * Go down and find the leftmost indexing node where we are going to
1313 * start from.
1314 */
1315 znode = c->zroot.znode;
1316 while (znode->level > 0) {
1317 zbr = &znode->zbranch[0];
1318 child = zbr->znode;
1319 if (!child) {
1320 child = ubifs_load_znode(c, zbr, znode, 0);
1321 if (IS_ERR(child)) {
1322 err = PTR_ERR(child);
1323 goto out_unlock;
1324 }
1325 zbr->znode = child;
1326 }
1327
1328 znode = child;
1329 }
1330
1331 /* Iterate over all indexing nodes */
1332 while (1) {
1333 int idx;
1334
1335 cond_resched();
1336
1337 if (znode_cb) {
1338 err = znode_cb(c, znode, priv);
1339 if (err) {
1340 ubifs_err("znode checking function returned "
1341 "error %d", err);
1342 dbg_dump_znode(c, znode);
1343 goto out_dump;
1344 }
1345 }
1346 if (leaf_cb && znode->level == 0) {
1347 for (idx = 0; idx < znode->child_cnt; idx++) {
1348 zbr = &znode->zbranch[idx];
1349 err = leaf_cb(c, zbr, priv);
1350 if (err) {
1351 ubifs_err("leaf checking function "
1352 "returned error %d, for leaf "
1353 "at LEB %d:%d",
1354 err, zbr->lnum, zbr->offs);
1355 goto out_dump;
1356 }
1357 }
1358 }
1359
1360 if (!znode->parent)
1361 break;
1362
1363 idx = znode->iip + 1;
1364 znode = znode->parent;
1365 if (idx < znode->child_cnt) {
1366 /* Switch to the next index in the parent */
1367 zbr = &znode->zbranch[idx];
1368 child = zbr->znode;
1369 if (!child) {
1370 child = ubifs_load_znode(c, zbr, znode, idx);
1371 if (IS_ERR(child)) {
1372 err = PTR_ERR(child);
1373 goto out_unlock;
1374 }
1375 zbr->znode = child;
1376 }
1377 znode = child;
1378 } else
1379 /*
1380 * This is the last child, switch to the parent and
1381 * continue.
1382 */
1383 continue;
1384
1385 /* Go to the lowest leftmost znode in the new sub-tree */
1386 while (znode->level > 0) {
1387 zbr = &znode->zbranch[0];
1388 child = zbr->znode;
1389 if (!child) {
1390 child = ubifs_load_znode(c, zbr, znode, 0);
1391 if (IS_ERR(child)) {
1392 err = PTR_ERR(child);
1393 goto out_unlock;
1394 }
1395 zbr->znode = child;
1396 }
1397 znode = child;
1398 }
1399 }
1400
1401 mutex_unlock(&c->tnc_mutex);
1402 return 0;
1403
1404out_dump:
1405 if (znode->parent)
1406 zbr = &znode->parent->zbranch[znode->iip];
1407 else
1408 zbr = &c->zroot;
1409 ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
1410 dbg_dump_znode(c, znode);
1411out_unlock:
1412 mutex_unlock(&c->tnc_mutex);
1413 return err;
1414}
1415
1416/**
1417 * add_size - add znode size to partially calculated index size.
1418 * @c: UBIFS file-system description object
1419 * @znode: znode to add size for
1420 * @priv: partially calculated index size
1421 *
1422 * This is a helper function for 'dbg_check_idx_size()' which is called for
1423 * every indexing node and adds its size to the 'long long' variable pointed to
1424 * by @priv.
1425 */
1426static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv)
1427{
1428 long long *idx_size = priv;
1429 int add;
1430
1431 add = ubifs_idx_node_sz(c, znode->child_cnt);
1432 add = ALIGN(add, 8);
1433 *idx_size += add;
1434 return 0;
1435}
1436
1437/**
1438 * dbg_check_idx_size - check index size.
1439 * @c: UBIFS file-system description object
1440 * @idx_size: size to check
1441 *
1442 * This function walks the UBIFS index, calculates its size and checks that the
1443 * size is equivalent to @idx_size. Returns zero in case of success and a
1444 * negative error code in case of failure.
1445 */
1446int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
1447{
1448 int err;
1449 long long calc = 0;
1450
1451 if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ))
1452 return 0;
1453
1454 err = dbg_walk_index(c, NULL, add_size, &calc);
1455 if (err) {
1456 ubifs_err("error %d while walking the index", err);
1457 return err;
1458 }
1459
1460 if (calc != idx_size) {
1461 ubifs_err("index size check failed: calculated size is %lld, "
1462 "should be %lld", calc, idx_size);
1463 dump_stack();
1464 return -EINVAL;
1465 }
1466
1467 return 0;
1468}
1469
1470/**
1471 * struct fsck_inode - information about an inode used when checking the file-system.
1472 * @rb: link in the RB-tree of inodes
1473 * @inum: inode number
1474 * @mode: inode type, permissions, etc
1475 * @nlink: inode link count (read from on-flash inode)
1476 * @xattr_cnt: count of extended attributes (read from on-flash inode)
1477 * @references: how many directory/xattr entries refer this inode (calculated
1478 * while walking the index)
1479 * @calc_cnt: for directories, calculated link count: starts at 2 and grows by one for every sub-directory entry found while walking the index
1480 * @size: inode size (read from on-flash inode)
1481 * @xattr_sz: summary size of all extended attributes (read from on-flash
1482 * inode)
1483 * @calc_sz: for directories calculated directory size
1484 * @calc_xcnt: calculated count of extended attributes
1485 * @calc_xsz: calculated summary size of all extended attributes
1486 * @xattr_nms: sum of lengths of all extended attribute names belonging to this
1487 * inode (read from on-flash inode)
1488 * @calc_xnms: calculated sum of lengths of all extended attribute names
 *
 * The "read from on-flash inode" fields are filled in by 'add_inode()', the
 * 'calc_*' fields and @references are accumulated by 'check_leaf()', and
 * 'check_inodes()' compares the two groups against each other.
1489 */
1490struct fsck_inode {
1491 struct rb_node rb;
1492 ino_t inum;
1493 umode_t mode;
1494 unsigned int nlink;
1495 unsigned int xattr_cnt;
1496 int references;
1497 int calc_cnt;
1498 long long size;
1499 unsigned int xattr_sz;
1500 long long calc_sz;
1501 long long calc_xcnt;
1502 long long calc_xsz;
1503 unsigned int xattr_nms;
1504 long long calc_xnms;
1505};
1506
1507/**
1508 * struct fsck_data - private FS checking information.
1509 * @inodes: RB-tree of all inodes (contains 'struct fsck_inode' objects, sorted by inode number)
1510 */
1511struct fsck_data {
1512 struct rb_root inodes; /* filled by 'add_inode()', released by 'free_inodes()' */
1513};
1514
1515/**
1516 * add_inode - add inode information to RB-tree of inodes.
1517 * @c: UBIFS file-system description object
1518 * @fsckd: FS checking information
1519 * @ino: raw UBIFS inode to add
1520 *
1521 * This is a helper function for 'check_leaf()' which adds information about
1522 * inode @ino to the RB-tree of inodes. Returns inode information pointer in
1523 * case of success and a negative error code in case of failure.
1524 */
1525static struct fsck_inode *add_inode(struct ubifs_info *c,
1526 struct fsck_data *fsckd,
1527 struct ubifs_ino_node *ino)
1528{
1529 struct rb_node **p, *parent = NULL;
1530 struct fsck_inode *fscki;
1531 ino_t inum = key_inum_flash(c, &ino->key);
1532
1533 p = &fsckd->inodes.rb_node;
1534 while (*p) {
1535 parent = *p;
1536 fscki = rb_entry(parent, struct fsck_inode, rb);
1537 if (inum < fscki->inum)
1538 p = &(*p)->rb_left;
1539 else if (inum > fscki->inum)
1540 p = &(*p)->rb_right;
1541 else
1542 return fscki;
1543 }
1544
1545 if (inum > c->highest_inum) {
1546 ubifs_err("too high inode number, max. is %lu",
1547 c->highest_inum);
1548 return ERR_PTR(-EINVAL);
1549 }
1550
1551 fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
1552 if (!fscki)
1553 return ERR_PTR(-ENOMEM);
1554
1555 fscki->inum = inum;
1556 fscki->nlink = le32_to_cpu(ino->nlink);
1557 fscki->size = le64_to_cpu(ino->size);
1558 fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
1559 fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
1560 fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
1561 fscki->mode = le32_to_cpu(ino->mode);
1562 if (S_ISDIR(fscki->mode)) {
1563 fscki->calc_sz = UBIFS_INO_NODE_SZ;
1564 fscki->calc_cnt = 2;
1565 }
1566 rb_link_node(&fscki->rb, parent, p);
1567 rb_insert_color(&fscki->rb, &fsckd->inodes);
1568 return fscki;
1569}
1570
1571/**
1572 * search_inode - search inode in the RB-tree of inodes.
1573 * @fsckd: FS checking information
1574 * @inum: inode number to search
1575 *
1576 * This is a helper function for 'check_leaf()' which searches inode @inum in
1577 * the RB-tree of inodes and returns an inode information pointer or %NULL if
1578 * the inode was not found.
1579 */
1580static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum)
1581{
1582 struct rb_node *p;
1583 struct fsck_inode *fscki;
1584
1585 p = fsckd->inodes.rb_node;
1586 while (p) {
1587 fscki = rb_entry(p, struct fsck_inode, rb);
1588 if (inum < fscki->inum)
1589 p = p->rb_left;
1590 else if (inum > fscki->inum)
1591 p = p->rb_right;
1592 else
1593 return fscki;
1594 }
1595 return NULL;
1596}
1597
1598/**
1599 * read_add_inode - read inode node and add it to RB-tree of inodes.
1600 * @c: UBIFS file-system description object
1601 * @fsckd: FS checking information
1602 * @inum: inode number to read
1603 *
1604 * This is a helper function for 'check_leaf()' which finds inode node @inum in
1605 * the index, reads it, and adds it to the RB-tree of inodes. Returns inode
1606 * information pointer in case of success and a negative error code in case of
1607 * failure.
1608 */
1609static struct fsck_inode *read_add_inode(struct ubifs_info *c,
1610 struct fsck_data *fsckd, ino_t inum)
1611{
1612 int n, err;
1613 union ubifs_key key;
1614 struct ubifs_znode *znode;
1615 struct ubifs_zbranch *zbr;
1616 struct ubifs_ino_node *ino;
1617 struct fsck_inode *fscki;
1618
1619 fscki = search_inode(fsckd, inum);
1620 if (fscki)
1621 return fscki;
1622
1623 ino_key_init(c, &key, inum);
1624 err = ubifs_lookup_level0(c, &key, &znode, &n);
1625 if (!err) {
1626 ubifs_err("inode %lu not found in index", inum);
1627 return ERR_PTR(-ENOENT);
1628 } else if (err < 0) {
1629 ubifs_err("error %d while looking up inode %lu", err, inum);
1630 return ERR_PTR(err);
1631 }
1632
1633 zbr = &znode->zbranch[n];
1634 if (zbr->len < UBIFS_INO_NODE_SZ) {
1635 ubifs_err("bad node %lu node length %d", inum, zbr->len);
1636 return ERR_PTR(-EINVAL);
1637 }
1638
1639 ino = kmalloc(zbr->len, GFP_NOFS);
1640 if (!ino)
1641 return ERR_PTR(-ENOMEM);
1642
1643 err = ubifs_tnc_read_node(c, zbr, ino);
1644 if (err) {
1645 ubifs_err("cannot read inode node at LEB %d:%d, error %d",
1646 zbr->lnum, zbr->offs, err);
1647 kfree(ino);
1648 return ERR_PTR(err);
1649 }
1650
1651 fscki = add_inode(c, fsckd, ino);
1652 kfree(ino);
1653 if (IS_ERR(fscki)) {
1654 ubifs_err("error %ld while adding inode %lu node",
1655 PTR_ERR(fscki), inum);
1656 return fscki;
1657 }
1658
1659 return fscki;
1660}
1661
1662/**
1663 * check_leaf - check leaf node.
1664 * @c: UBIFS file-system description object
1665 * @zbr: zbranch of the leaf node to check
1666 * @priv: FS checking information
1667 *
1668 * This is a helper function for 'dbg_check_filesystem()' which is called for
1669 * every single leaf node while walking the indexing tree. It checks that the
1670 * leaf node referred from the indexing tree exists, has correct CRC, and does
1671 * some other basic validation. This function is also responsible for building
1672 * an RB-tree of inodes - it adds all inodes into the RB-tree. It also
1673 * calculates reference count, size, etc for each inode in order to later
1674 * compare them to the information stored inside the inodes and detect possible
1675 * inconsistencies. Returns zero in case of success and a negative error code
1676 * in case of failure.
1677 */
1678static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
1679 void *priv)
1680{
1681 ino_t inum;
1682 void *node;
1683 struct ubifs_ch *ch;
1684 int err, type = key_type(c, &zbr->key);
1685 struct fsck_inode *fscki;
1686
1687 if (zbr->len < UBIFS_CH_SZ) {
1688 ubifs_err("bad leaf length %d (LEB %d:%d)",
1689 zbr->len, zbr->lnum, zbr->offs);
1690 return -EINVAL;
1691 }
1692
1693 node = kmalloc(zbr->len, GFP_NOFS);
1694 if (!node)
1695 return -ENOMEM;
1696
1697 err = ubifs_tnc_read_node(c, zbr, node);
1698 if (err) {
1699 ubifs_err("cannot read leaf node at LEB %d:%d, error %d",
1700 zbr->lnum, zbr->offs, err);
1701 goto out_free;
1702 }
1703
1704 /* If this is an inode node, add it to RB-tree of inodes */
1705 if (type == UBIFS_INO_KEY) {
1706 fscki = add_inode(c, priv, node);
1707 if (IS_ERR(fscki)) {
1708 err = PTR_ERR(fscki);
1709 ubifs_err("error %d while adding inode node", err);
1710 goto out_dump;
1711 }
1712 goto out;
1713 }
1714
1715 if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY &&
1716 type != UBIFS_DATA_KEY) {
1717 ubifs_err("unexpected node type %d at LEB %d:%d",
1718 type, zbr->lnum, zbr->offs);
1719 err = -EINVAL;
1720 goto out_free;
1721 }
1722
1723 ch = node;
1724 if (le64_to_cpu(ch->sqnum) > c->max_sqnum) {
1725 ubifs_err("too high sequence number, max. is %llu",
1726 c->max_sqnum);
1727 err = -EINVAL;
1728 goto out_dump;
1729 }
1730
1731 if (type == UBIFS_DATA_KEY) {
1732 long long blk_offs;
1733 struct ubifs_data_node *dn = node;
1734
1735 /*
1736 * Search the inode node this data node belongs to and insert
1737 * it to the RB-tree of inodes.
1738 */
1739 inum = key_inum_flash(c, &dn->key);
1740 fscki = read_add_inode(c, priv, inum);
1741 if (IS_ERR(fscki)) {
1742 err = PTR_ERR(fscki);
1743 ubifs_err("error %d while processing data node and "
1744 "trying to find inode node %lu", err, inum);
1745 goto out_dump;
1746 }
1747
1748 /* Make sure the data node is within inode size */
1749 blk_offs = key_block_flash(c, &dn->key);
1750 blk_offs <<= UBIFS_BLOCK_SHIFT;
1751 blk_offs += le32_to_cpu(dn->size);
1752 if (blk_offs > fscki->size) {
1753 ubifs_err("data node at LEB %d:%d is not within inode "
1754 "size %lld", zbr->lnum, zbr->offs,
1755 fscki->size);
1756 err = -EINVAL;
1757 goto out_dump;
1758 }
1759 } else {
1760 int nlen;
1761 struct ubifs_dent_node *dent = node;
1762 struct fsck_inode *fscki1;
1763
1764 err = ubifs_validate_entry(c, dent);
1765 if (err)
1766 goto out_dump;
1767
1768 /*
1769 * Search the inode node this entry refers to and the parent
1770 * inode node and insert them to the RB-tree of inodes.
1771 */
1772 inum = le64_to_cpu(dent->inum);
1773 fscki = read_add_inode(c, priv, inum);
1774 if (IS_ERR(fscki)) {
1775 err = PTR_ERR(fscki);
1776 ubifs_err("error %d while processing entry node and "
1777 "trying to find inode node %lu", err, inum);
1778 goto out_dump;
1779 }
1780
1781 /* Count how many direntries or xentries refers this inode */
1782 fscki->references += 1;
1783
1784 inum = key_inum_flash(c, &dent->key);
1785 fscki1 = read_add_inode(c, priv, inum);
1786 if (IS_ERR(fscki1)) {
1787 err = PTR_ERR(fscki);
1788 ubifs_err("error %d while processing entry node and "
1789 "trying to find parent inode node %lu",
1790 err, inum);
1791 goto out_dump;
1792 }
1793
1794 nlen = le16_to_cpu(dent->nlen);
1795 if (type == UBIFS_XENT_KEY) {
1796 fscki1->calc_xcnt += 1;
1797 fscki1->calc_xsz += CALC_DENT_SIZE(nlen);
1798 fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size);
1799 fscki1->calc_xnms += nlen;
1800 } else {
1801 fscki1->calc_sz += CALC_DENT_SIZE(nlen);
1802 if (dent->type == UBIFS_ITYPE_DIR)
1803 fscki1->calc_cnt += 1;
1804 }
1805 }
1806
1807out:
1808 kfree(node);
1809 return 0;
1810
1811out_dump:
1812 ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
1813 dbg_dump_node(c, node);
1814out_free:
1815 kfree(node);
1816 return err;
1817}
1818
1819/**
1820 * free_inodes - free RB-tree of inodes.
1821 * @fsckd: FS checking information
1822 */
1823static void free_inodes(struct fsck_data *fsckd)
1824{
1825 struct rb_node *this = fsckd->inodes.rb_node;
1826 struct fsck_inode *fscki;
1827
1828 while (this) {
1829 if (this->rb_left)
1830 this = this->rb_left;
1831 else if (this->rb_right)
1832 this = this->rb_right;
1833 else {
1834 fscki = rb_entry(this, struct fsck_inode, rb);
1835 this = rb_parent(this);
1836 if (this) {
1837 if (this->rb_left == &fscki->rb)
1838 this->rb_left = NULL;
1839 else
1840 this->rb_right = NULL;
1841 }
1842 kfree(fscki);
1843 }
1844 }
1845}
1846
1847/**
1848 * check_inodes - checks all inodes.
1849 * @c: UBIFS file-system description object
1850 * @fsckd: FS checking information
1851 *
1852 * This is a helper function for 'dbg_check_filesystem()' which walks the
1853 * RB-tree of inodes after the index scan has been finished, and checks that
1854 * inode nlink, size, etc are correct. Returns zero if inodes are fine,
1855 * %-EINVAL if not, and a negative error code in case of failure.
1856 */
1857static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
1858{
1859 int n, err;
1860 union ubifs_key key;
1861 struct ubifs_znode *znode;
1862 struct ubifs_zbranch *zbr;
1863 struct ubifs_ino_node *ino;
1864 struct fsck_inode *fscki;
1865 struct rb_node *this = rb_first(&fsckd->inodes);
1866
1867 while (this) {
1868 fscki = rb_entry(this, struct fsck_inode, rb);
1869 this = rb_next(this);
1870
1871 if (S_ISDIR(fscki->mode)) {
1872 /*
1873 * Directories have to have exactly one reference (they
1874 * cannot have hardlinks), although root inode is an
1875 * exception.
1876 */
1877 if (fscki->inum != UBIFS_ROOT_INO &&
1878 fscki->references != 1) {
1879 ubifs_err("directory inode %lu has %d "
1880 "direntries which refer it, but "
1881 "should be 1", fscki->inum,
1882 fscki->references);
1883 goto out_dump;
1884 }
1885 if (fscki->inum == UBIFS_ROOT_INO &&
1886 fscki->references != 0) {
1887 ubifs_err("root inode %lu has non-zero (%d) "
1888 "direntries which refer it",
1889 fscki->inum, fscki->references);
1890 goto out_dump;
1891 }
1892 if (fscki->calc_sz != fscki->size) {
1893 ubifs_err("directory inode %lu size is %lld, "
1894 "but calculated size is %lld",
1895 fscki->inum, fscki->size,
1896 fscki->calc_sz);
1897 goto out_dump;
1898 }
1899 if (fscki->calc_cnt != fscki->nlink) {
1900 ubifs_err("directory inode %lu nlink is %d, "
1901 "but calculated nlink is %d",
1902 fscki->inum, fscki->nlink,
1903 fscki->calc_cnt);
1904 goto out_dump;
1905 }
1906 } else {
1907 if (fscki->references != fscki->nlink) {
1908 ubifs_err("inode %lu nlink is %d, but "
1909 "calculated nlink is %d", fscki->inum,
1910 fscki->nlink, fscki->references);
1911 goto out_dump;
1912 }
1913 }
1914 if (fscki->xattr_sz != fscki->calc_xsz) {
1915 ubifs_err("inode %lu has xattr size %u, but "
1916 "calculated size is %lld",
1917 fscki->inum, fscki->xattr_sz,
1918 fscki->calc_xsz);
1919 goto out_dump;
1920 }
1921 if (fscki->xattr_cnt != fscki->calc_xcnt) {
1922 ubifs_err("inode %lu has %u xattrs, but "
1923 "calculated count is %lld", fscki->inum,
1924 fscki->xattr_cnt, fscki->calc_xcnt);
1925 goto out_dump;
1926 }
1927 if (fscki->xattr_nms != fscki->calc_xnms) {
1928 ubifs_err("inode %lu has xattr names' size %u, but "
1929 "calculated names' size is %lld",
1930 fscki->inum, fscki->xattr_nms,
1931 fscki->calc_xnms);
1932 goto out_dump;
1933 }
1934 }
1935
1936 return 0;
1937
1938out_dump:
1939 /* Read the bad inode and dump it */
1940 ino_key_init(c, &key, fscki->inum);
1941 err = ubifs_lookup_level0(c, &key, &znode, &n);
1942 if (!err) {
1943 ubifs_err("inode %lu not found in index", fscki->inum);
1944 return -ENOENT;
1945 } else if (err < 0) {
1946 ubifs_err("error %d while looking up inode %lu",
1947 err, fscki->inum);
1948 return err;
1949 }
1950
1951 zbr = &znode->zbranch[n];
1952 ino = kmalloc(zbr->len, GFP_NOFS);
1953 if (!ino)
1954 return -ENOMEM;
1955
1956 err = ubifs_tnc_read_node(c, zbr, ino);
1957 if (err) {
1958 ubifs_err("cannot read inode node at LEB %d:%d, error %d",
1959 zbr->lnum, zbr->offs, err);
1960 kfree(ino);
1961 return err;
1962 }
1963
1964 ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
1965 fscki->inum, zbr->lnum, zbr->offs);
1966 dbg_dump_node(c, ino);
1967 kfree(ino);
1968 return -EINVAL;
1969}
1970
1971/**
1972 * dbg_check_filesystem - check the file-system.
1973 * @c: UBIFS file-system description object
1974 *
1975 * This function checks the file system, namely:
1976 * o makes sure that all leaf nodes exist and their CRCs are correct;
1977 * o makes sure inode nlink, size, xattr size/count are correct (for all
1978 * inodes).
1979 *
1980 * The function reads whole indexing tree and all nodes, so it is pretty
1981 * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if
1982 * not, and a negative error code in case of failure.
1983 */
1984int dbg_check_filesystem(struct ubifs_info *c)
1985{
1986 int err;
1987 struct fsck_data fsckd;
1988
1989 if (!(ubifs_chk_flags & UBIFS_CHK_FS))
1990 return 0;
1991
1992 fsckd.inodes = RB_ROOT;
1993 err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
1994 if (err)
1995 goto out_free;
1996
1997 err = check_inodes(c, &fsckd);
1998 if (err)
1999 goto out_free;
2000
2001 free_inodes(&fsckd);
2002 return 0;
2003
2004out_free:
2005 ubifs_err("file-system check failed with error %d", err);
2006 dump_stack();
2007 free_inodes(&fsckd);
2008 return err;
2009}
2010
2011static int invocation_cnt;
2012
2013int dbg_force_in_the_gaps(void)
2014{
2015 if (!dbg_force_in_the_gaps_enabled)
2016 return 0;
2017 /* Force in-the-gaps every 8th commit */
2018 return !((invocation_cnt++) & 0x7);
2019}
2020
2021/* Failure mode for recovery testing */
2022
/*
 * True with a probability of roughly @n / @d: 'simple_rand()' yields values in
 * the range 0..32767, which are compared against @n * 32768 / @d.
 */
2023#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
2024
/*
 * An entry of 'fmi_list': one per file-system registered with
 * 'dbg_failure_mode_registration()'.
 */
2025struct failure_mode_info {
2026 struct list_head list; /* link in 'fmi_list' */
2027 struct ubifs_info *c; /* the registered UBIFS file-system description object */
2028};
2029
2030static LIST_HEAD(fmi_list); /* list of 'struct failure_mode_info' entries */
2031static DEFINE_SPINLOCK(fmi_lock); /* held by the functions below while they touch 'fmi_list' */
2032
2033static unsigned int next; /* state of 'simple_rand()'; zero means it has not been seeded yet */
2034
2035static int simple_rand(void)
2036{
2037 if (next == 0)
2038 next = current->pid;
2039 next = next * 1103515245 + 12345;
2040 return (next >> 16) & 32767;
2041}
2042
2043void dbg_failure_mode_registration(struct ubifs_info *c)
2044{
2045 struct failure_mode_info *fmi;
2046
2047 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
2048 if (!fmi) {
2049 dbg_err("Failed to register failure mode - no memory");
2050 return;
2051 }
2052 fmi->c = c;
2053 spin_lock(&fmi_lock);
2054 list_add_tail(&fmi->list, &fmi_list);
2055 spin_unlock(&fmi_lock);
2056}
2057
2058void dbg_failure_mode_deregistration(struct ubifs_info *c)
2059{
2060 struct failure_mode_info *fmi, *tmp;
2061
2062 spin_lock(&fmi_lock);
2063 list_for_each_entry_safe(fmi, tmp, &fmi_list, list)
2064 if (fmi->c == c) {
2065 list_del(&fmi->list);
2066 kfree(fmi);
2067 }
2068 spin_unlock(&fmi_lock);
2069}
2070
2071static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
2072{
2073 struct failure_mode_info *fmi;
2074
2075 spin_lock(&fmi_lock);
2076 list_for_each_entry(fmi, &fmi_list, list)
2077 if (fmi->c->ubi == desc) {
2078 struct ubifs_info *c = fmi->c;
2079
2080 spin_unlock(&fmi_lock);
2081 return c;
2082 }
2083 spin_unlock(&fmi_lock);
2084 return NULL;
2085}
2086
2087static int in_failure_mode(struct ubi_volume_desc *desc)
2088{
2089 struct ubifs_info *c = dbg_find_info(desc);
2090
2091 if (c && dbg_failure_mode)
2092 return c->failure_mode;
2093 return 0;
2094}
2095
/*
 * Decide whether the file-system which uses @desc has to start simulating a
 * failure while LEB @lnum is accessed. @write is non-zero for the write
 * operation; for some flash areas it selects a different probability.
 *
 * Returns 1 if the file-system is already in failure mode or has just been
 * switched to it, and 0 otherwise. In every 'chance(...)' test below a true
 * result means "do not fail this time".
 */
2096static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2097{
2098 struct ubifs_info *c = dbg_find_info(desc);
2099
2100 if (!c || !dbg_failure_mode)
2101 return 0; /* not a registered file-system, or failure mode is off */
2102 if (c->failure_mode)
2103 return 1; /* already failing */
2104 if (!c->fail_cnt) {
2105 /* First call - decide delay to failure */
2106 if (chance(1, 2)) {
2107 unsigned int delay = 1 << (simple_rand() >> 11); /* a power of two, 1..32768 */
2108
2109 if (chance(1, 2)) {
2110 c->fail_delay = 1; /* delay measured in time */
2111 c->fail_timeout = jiffies +
2112 msecs_to_jiffies(delay);
2113 dbg_rcvry("failing after %ums", delay);
2114 } else {
2115 c->fail_delay = 2; /* delay measured in calls of this function */
2116 c->fail_cnt_max = delay;
2117 dbg_rcvry("failing after %u calls", delay);
2118 }
2119 }
2120 c->fail_cnt += 1; /* non-zero from now on, so the delay is chosen only once */
2121 }
2122 /* Determine if failure delay has expired */
2123 if (c->fail_delay == 1) {
2124 if (time_before(jiffies, c->fail_timeout))
2125 return 0;
2126 } else if (c->fail_delay == 2)
2127 if (c->fail_cnt++ < c->fail_cnt_max)
2128 return 0;
/* The probability of failing depends on which flash area @lnum belongs to */
2129 if (lnum == UBIFS_SB_LNUM) {
2130 if (write) {
2131 if (chance(1, 2))
2132 return 0;
2133 } else if (chance(19, 20))
2134 return 0;
2135 dbg_rcvry("failing in super block LEB %d", lnum);
2136 } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
2137 if (chance(19, 20))
2138 return 0;
2139 dbg_rcvry("failing in master LEB %d", lnum);
2140 } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
2141 if (write) {
2142 if (chance(99, 100))
2143 return 0;
2144 } else if (chance(399, 400))
2145 return 0;
2146 dbg_rcvry("failing in log LEB %d", lnum);
2147 } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
2148 if (write) {
2149 if (chance(7, 8))
2150 return 0;
2151 } else if (chance(19, 20))
2152 return 0;
2153 dbg_rcvry("failing in LPT LEB %d", lnum);
2154 } else if (lnum >= c->orph_first && lnum <= c->orph_last) {
2155 if (write) {
2156 if (chance(1, 2))
2157 return 0;
2158 } else if (chance(9, 10))
2159 return 0;
2160 dbg_rcvry("failing in orphan LEB %d", lnum);
2161 } else if (lnum == c->ihead_lnum) {
2162 if (chance(99, 100))
2163 return 0;
2164 dbg_rcvry("failing in index head LEB %d", lnum);
2165 } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
2166 if (chance(9, 10))
2167 return 0;
2168 dbg_rcvry("failing in GC head LEB %d", lnum);
2169 } else if (write && !RB_EMPTY_ROOT(&c->buds) &&
2170 !ubifs_search_bud(c, lnum)) {
2171 if (chance(19, 20))
2172 return 0;
2173 dbg_rcvry("failing in non-bud LEB %d", lnum);
2174 } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
2175 c->cmt_state == COMMIT_RUNNING_REQUIRED) {
2176 if (chance(999, 1000))
2177 return 0;
2178 dbg_rcvry("failing in bud LEB %d commit running", lnum);
2179 } else {
2180 if (chance(9999, 10000))
2181 return 0;
2182 dbg_rcvry("failing in bud LEB %d commit not running", lnum);
2183 }
2184 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
2185 c->failure_mode = 1; /* from now on every call returns 1 */
2186 dump_stack();
2187 return 1;
2188}
2189
/*
 * Overwrite the tail of the @len bytes at @buf with 0xff bytes, starting at a
 * pseudo-random offset. Note that the buffer is modified in place although
 * @buf is declared const.
 */
static void cut_data(const void *buf, int len)
{
	unsigned char *bytes = (void *)buf;
	/* Scale the 15-bit random value to an offset within 0..@len */
	int pos = (len * (long long)simple_rand()) >> 15;

	while (pos < len)
		bytes[pos++] = 0xff;
}
2199
2200int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2201 int len, int check)
2202{
2203 if (in_failure_mode(desc))
2204 return -EIO;
2205 return ubi_leb_read(desc, lnum, buf, offset, len, check);
2206}
2207
2208int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2209 int offset, int len, int dtype)
2210{
2211 int err;
2212
2213 if (in_failure_mode(desc))
2214 return -EIO;
2215 if (do_fail(desc, lnum, 1))
2216 cut_data(buf, len);
2217 err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
2218 if (err)
2219 return err;
2220 if (in_failure_mode(desc))
2221 return -EIO;
2222 return 0;
2223}
2224
2225int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
2226 int len, int dtype)
2227{
2228 int err;
2229
2230 if (do_fail(desc, lnum, 1))
2231 return -EIO;
2232 err = ubi_leb_change(desc, lnum, buf, len, dtype);
2233 if (err)
2234 return err;
2235 if (do_fail(desc, lnum, 1))
2236 return -EIO;
2237 return 0;
2238}
2239
2240int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
2241{
2242 int err;
2243
2244 if (do_fail(desc, lnum, 0))
2245 return -EIO;
2246 err = ubi_leb_erase(desc, lnum);
2247 if (err)
2248 return err;
2249 if (do_fail(desc, lnum, 0))
2250 return -EIO;
2251 return 0;
2252}
2253
2254int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
2255{
2256 int err;
2257
2258 if (do_fail(desc, lnum, 0))
2259 return -EIO;
2260 err = ubi_leb_unmap(desc, lnum);
2261 if (err)
2262 return err;
2263 if (do_fail(desc, lnum, 0))
2264 return -EIO;
2265 return 0;
2266}
2267
2268int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
2269{
2270 if (in_failure_mode(desc))
2271 return -EIO;
2272 return ubi_is_mapped(desc, lnum);
2273}
2274
2275int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2276{
2277 int err;
2278
2279 if (do_fail(desc, lnum, 0))
2280 return -EIO;
2281 err = ubi_leb_map(desc, lnum, dtype);
2282 if (err)
2283 return err;
2284 if (do_fail(desc, lnum, 0))
2285 return -EIO;
2286 return 0;
2287}
2288
2289#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
new file mode 100644
index 000000000000..3c4f1e93c9e0
--- /dev/null
+++ b/fs/ubifs/debug.h
@@ -0,0 +1,403 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23#ifndef __UBIFS_DEBUG_H__
24#define __UBIFS_DEBUG_H__
25
26#ifdef CONFIG_UBIFS_FS_DEBUG
27
28#define UBIFS_DBG(op) op
29
30#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \
32 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
33 __func__, __LINE__, current->pid); \
34 dbg_dump_stack(); \
35 } \
36} while (0)
37
38#define ubifs_assert_cmt_locked(c) do { \
39 if (unlikely(down_write_trylock(&(c)->commit_sem))) { \
40 up_write(&(c)->commit_sem); \
41 printk(KERN_CRIT "commit lock is not locked!\n"); \
42 ubifs_assert(0); \
43 } \
44} while (0)
45
46#define dbg_dump_stack() do { \
47 if (!dbg_failure_mode) \
48 dump_stack(); \
49} while (0)
50
51/* Generic debugging messages */
52#define dbg_msg(fmt, ...) do { \
53 spin_lock(&dbg_lock); \
54 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
55 __func__, ##__VA_ARGS__); \
56 spin_unlock(&dbg_lock); \
57} while (0)
58
/*
 * Print a debugging message via 'dbg_msg()' only if message type @typ is
 * enabled in 'ubifs_msg_flags'. @typ is parenthesized so that a compound
 * argument such as 'A | B' is tested against the flags as a whole.
 */
#define dbg_do_msg(typ, fmt, ...) do {                       \
	if (ubifs_msg_flags & (typ))                         \
		dbg_msg(fmt, ##__VA_ARGS__);                 \
} while (0)
63
64#define dbg_err(fmt, ...) do { \
65 spin_lock(&dbg_lock); \
66 ubifs_err(fmt, ##__VA_ARGS__); \
67 spin_unlock(&dbg_lock); \
68} while (0)
69
70const char *dbg_key_str0(const struct ubifs_info *c,
71 const union ubifs_key *key);
72const char *dbg_key_str1(const struct ubifs_info *c,
73 const union ubifs_key *key);
74
75/*
76 * DBGKEY macros require dbg_lock to be held, which it is in the dbg message
77 * macros.
78 */
79#define DBGKEY(key) dbg_key_str0(c, (key))
80#define DBGKEY1(key) dbg_key_str1(c, (key))
81
82/* General messages */
83#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
84
85/* Additional journal messages */
86#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
87
88/* Additional TNC messages */
89#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
90
91/* Additional lprops messages */
92#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
93
94/* Additional LEB find messages */
95#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
96
97/* Additional mount messages */
98#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
99
100/* Additional I/O messages */
101#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
102
103/* Additional commit messages */
104#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
105
106/* Additional budgeting messages */
107#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
108
109/* Additional log messages */
110#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
111
112/* Additional gc messages */
113#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
114
115/* Additional scan messages */
116#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
117
118/* Additional recovery messages */
119#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
120
/*
 * Debugging message type flags (must match msg_type_names in debug.c).
 * Each type occupies one bit so that types can be OR-ed into a mask.
 *
 * UBIFS_MSG_GEN: general messages
 * UBIFS_MSG_JNL: journal messages
 * UBIFS_MSG_MNT: mount messages
 * UBIFS_MSG_CMT: commit messages
 * UBIFS_MSG_FIND: LEB find messages
 * UBIFS_MSG_BUDG: budgeting messages
 * UBIFS_MSG_GC: garbage collection messages
 * UBIFS_MSG_TNC: TNC messages
 * UBIFS_MSG_LP: lprops messages
 * UBIFS_MSG_IO: I/O messages
 * UBIFS_MSG_LOG: log messages
 * UBIFS_MSG_SCAN: scan messages
 * UBIFS_MSG_RCVRY: recovery messages
 */
enum {
	UBIFS_MSG_GEN   = 1 << 0,
	UBIFS_MSG_JNL   = 1 << 1,
	UBIFS_MSG_MNT   = 1 << 2,
	UBIFS_MSG_CMT   = 1 << 3,
	UBIFS_MSG_FIND  = 1 << 4,
	UBIFS_MSG_BUDG  = 1 << 5,
	UBIFS_MSG_GC    = 1 << 6,
	UBIFS_MSG_TNC   = 1 << 7,
	UBIFS_MSG_LP    = 1 << 8,
	UBIFS_MSG_IO    = 1 << 9,
	UBIFS_MSG_LOG   = 1 << 10,
	UBIFS_MSG_SCAN  = 1 << 11,
	UBIFS_MSG_RCVRY = 1 << 12,
};
153
154/* Debugging message type flags for each default debug message level */
155#define UBIFS_MSG_LVL_0 0
156#define UBIFS_MSG_LVL_1 0x1
157#define UBIFS_MSG_LVL_2 0x7f
158#define UBIFS_MSG_LVL_3 0xffff
159
/*
 * Debugging check flags (must match chk_names in debug.c).
 * Each check occupies one bit so that checks can be OR-ed into a mask.
 *
 * UBIFS_CHK_GEN: general checks
 * UBIFS_CHK_TNC: check TNC
 * UBIFS_CHK_IDX_SZ: check index size
 * UBIFS_CHK_ORPH: check orphans
 * UBIFS_CHK_OLD_IDX: check the old index
 * UBIFS_CHK_LPROPS: check lprops
 * UBIFS_CHK_FS: check the file-system
 */
enum {
	UBIFS_CHK_GEN     = 1 << 0,
	UBIFS_CHK_TNC     = 1 << 1,
	UBIFS_CHK_IDX_SZ  = 1 << 2,
	UBIFS_CHK_ORPH    = 1 << 3,
	UBIFS_CHK_OLD_IDX = 1 << 4,
	UBIFS_CHK_LPROPS  = 1 << 5,
	UBIFS_CHK_FS      = 1 << 6,
};
180
/*
 * Special testing flags (must match tst_names in debug.c).
 * Note that the lowest bit (0x1) is not assigned to any flag here.
 *
 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
 * UBIFS_TST_RCVRY: failure mode for recovery testing
 */
enum {
	UBIFS_TST_FORCE_IN_THE_GAPS = 1 << 1,
	UBIFS_TST_RCVRY             = 1 << 2,
};
191
192#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
193#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
194#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
195#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
196#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
197#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
198#else
199#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
200#endif
201
202#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
203#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
204#else
205#define UBIFS_CHK_FLAGS_DEFAULT 0
206#endif
207
208extern spinlock_t dbg_lock;
209
210extern unsigned int ubifs_msg_flags;
211extern unsigned int ubifs_chk_flags;
212extern unsigned int ubifs_tst_flags;
213
214/* Dump functions */
215
216const char *dbg_ntype(int type);
217const char *dbg_cstate(int cmt_state);
218const char *dbg_get_key_dump(const struct ubifs_info *c,
219 const union ubifs_key *key);
220void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
221void dbg_dump_node(const struct ubifs_info *c, const void *node);
222void dbg_dump_budget_req(const struct ubifs_budget_req *req);
223void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
224void dbg_dump_budg(struct ubifs_info *c);
225void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
226void dbg_dump_lprops(struct ubifs_info *c);
227void dbg_dump_leb(const struct ubifs_info *c, int lnum);
228void dbg_dump_znode(const struct ubifs_info *c,
229 const struct ubifs_znode *znode);
230void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
231void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
232 struct ubifs_nnode *parent, int iip);
233void dbg_dump_tnc(struct ubifs_info *c);
234void dbg_dump_index(struct ubifs_info *c);
235
236/* Checking helper functions */
237
238typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
239 struct ubifs_zbranch *zbr, void *priv);
240typedef int (*dbg_znode_callback)(struct ubifs_info *c,
241 struct ubifs_znode *znode, void *priv);
242
243int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
244 dbg_znode_callback znode_cb, void *priv);
245
246/* Checking functions */
247
248int dbg_check_lprops(struct ubifs_info *c);
249
250int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
251int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
252
253int dbg_check_cats(struct ubifs_info *c);
254
255int dbg_check_ltab(struct ubifs_info *c);
256
257int dbg_check_synced_i_size(struct inode *inode);
258
259int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
260
261int dbg_check_tnc(struct ubifs_info *c, int extra);
262
263int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
264
265int dbg_check_filesystem(struct ubifs_info *c);
266
267void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
268 int add_pos);
269
270int dbg_check_lprops(struct ubifs_info *c);
271int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
272 int row, int col);
273
274/* Force the use of in-the-gaps method for testing */
275
276#define dbg_force_in_the_gaps_enabled \
277 (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
278
279int dbg_force_in_the_gaps(void);
280
281/* Failure mode for recovery testing */
282
283#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
284
285void dbg_failure_mode_registration(struct ubifs_info *c);
286void dbg_failure_mode_deregistration(struct ubifs_info *c);
287
288#ifndef UBIFS_DBG_PRESERVE_UBI
289
290#define ubi_leb_read dbg_leb_read
291#define ubi_leb_write dbg_leb_write
292#define ubi_leb_change dbg_leb_change
293#define ubi_leb_erase dbg_leb_erase
294#define ubi_leb_unmap dbg_leb_unmap
295#define ubi_is_mapped dbg_is_mapped
296#define ubi_leb_map dbg_leb_map
297
298#endif
299
300int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
301 int len, int check);
302int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
303 int offset, int len, int dtype);
304int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
305 int len, int dtype);
306int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum);
307int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum);
308int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum);
309int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
310
/* Shorthand for 'dbg_leb_read()' with the @check argument set to zero */
static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf,
			   int offset, int len)
{
	const int check = 0;

	return dbg_leb_read(desc, lnum, buf, offset, len, check);
}
316
317static inline int dbg_write(struct ubi_volume_desc *desc, int lnum,
318 const void *buf, int offset, int len)
319{
320 return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
321}
322
323static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
324 const void *buf, int len)
325{
326 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
327}
328
329#else /* !CONFIG_UBIFS_FS_DEBUG */
330
331#define UBIFS_DBG(op)
332#define ubifs_assert(expr) ({})
333#define ubifs_assert_cmt_locked(c)
334#define dbg_dump_stack()
335#define dbg_err(fmt, ...) ({})
336#define dbg_msg(fmt, ...) ({})
337#define dbg_key(c, key, fmt, ...) ({})
338
339#define dbg_gen(fmt, ...) ({})
340#define dbg_jnl(fmt, ...) ({})
341#define dbg_tnc(fmt, ...) ({})
342#define dbg_lp(fmt, ...) ({})
343#define dbg_find(fmt, ...) ({})
344#define dbg_mnt(fmt, ...) ({})
345#define dbg_io(fmt, ...) ({})
346#define dbg_cmt(fmt, ...) ({})
347#define dbg_budg(fmt, ...) ({})
348#define dbg_log(fmt, ...) ({})
349#define dbg_gc(fmt, ...) ({})
350#define dbg_scan(fmt, ...) ({})
351#define dbg_rcvry(fmt, ...) ({})
352
353#define dbg_ntype(type) ""
354#define dbg_cstate(cmt_state) ""
355#define dbg_get_key_dump(c, key) ({})
356#define dbg_dump_inode(c, inode) ({})
357#define dbg_dump_node(c, node) ({})
358#define dbg_dump_budget_req(req) ({})
359#define dbg_dump_lstats(lst) ({})
360#define dbg_dump_budg(c) ({})
361#define dbg_dump_lprop(c, lp) ({})
362#define dbg_dump_lprops(c) ({})
363#define dbg_dump_leb(c, lnum) ({})
364#define dbg_dump_znode(c, znode) ({})
365#define dbg_dump_heap(c, heap, cat) ({})
366#define dbg_dump_pnode(c, pnode, parent, iip) ({})
367#define dbg_dump_tnc(c) ({})
368#define dbg_dump_index(c) ({})
369
370#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
371
372#define dbg_old_index_check_init(c, zroot) 0
373#define dbg_check_old_index(c, zroot) 0
374
375#define dbg_check_cats(c) 0
376
377#define dbg_check_ltab(c) 0
378
379#define dbg_check_synced_i_size(inode) 0
380
381#define dbg_check_dir_size(c, dir) 0
382
383#define dbg_check_tnc(c, x) 0
384
385#define dbg_check_idx_size(c, idx_size) 0
386
387#define dbg_check_filesystem(c) 0
388
389#define dbg_check_heap(c, heap, cat, add_pos) ({})
390
391#define dbg_check_lprops(c) 0
392#define dbg_check_lpt_nodes(c, cnode, row, col) 0
393
394#define dbg_force_in_the_gaps_enabled 0
395#define dbg_force_in_the_gaps() 0
396
397#define dbg_failure_mode 0
398#define dbg_failure_mode_registration(c) ({})
399#define dbg_failure_mode_deregistration(c) ({})
400
401#endif /* !CONFIG_UBIFS_FS_DEBUG */
402
403#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
new file mode 100644
index 000000000000..e90374be7d3b
--- /dev/null
+++ b/fs/ubifs/dir.c
@@ -0,0 +1,1240 @@
1/* * This file is part of UBIFS.
2 *
3 * Copyright (C) 2006-2008 Nokia Corporation.
4 * Copyright (C) 2006, 2007 University of Szeged, Hungary
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 * Zoltan Sogor
22 */
23
24/*
25 * This file implements directory operations.
26 *
27 * All FS operations in this file allocate budget before writing anything to the
28 * media. If they fail to allocate it, the error is returned. The only
29 * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
30 * if they are unable to allocate the budget, because deletion %-ENOSPC failure is
31 * not what users are usually ready to get. UBIFS budgeting subsystem has some
32 * space reserved for these purposes.
33 *
34 * All operations in this file write all inodes which they change straight
35 * away, instead of marking them dirty. For example, 'ubifs_link()' changes
36 * @i_size of the parent inode and writes the parent inode together with the
37 * target inode. This was done to simplify file-system recovery which would
38 * otherwise be very difficult to do. The only exception is rename, which only
39 * marks the re-named inode dirty (because its @i_ctime is updated) and does
40 * not write it straight away.
41 */
42
43#include "ubifs.h"
44
45/**
46 * inherit_flags - inherit flags of the parent inode.
47 * @dir: parent inode
48 * @mode: new inode mode flags
49 *
50 * This is a helper function for 'ubifs_new_inode()' which inherits flag of the
51 * parent directory inode @dir. UBIFS inodes inherit the following flags:
52 * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on
53 * sub-directory basis;
54 * o %UBIFS_SYNC_FL - useful for the same reasons;
55 * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
56 *
57 * This function returns the inherited flags.
58 */
59static int inherit_flags(const struct inode *dir, int mode)
60{
61 int flags;
62 const struct ubifs_inode *ui = ubifs_inode(dir);
63
64 if (!S_ISDIR(dir->i_mode))
65 /*
66 * The parent is not a directory, which means that an extended
67 * attribute inode is being created. No flags.
68 */
69 return 0;
70
71 flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
72 if (!S_ISDIR(mode))
73 /* The "DIRSYNC" flag only applies to directories */
74 flags &= ~UBIFS_DIRSYNC_FL;
75 return flags;
76}
77
78/**
79 * ubifs_new_inode - allocate new UBIFS inode object.
80 * @c: UBIFS file-system description object
81 * @dir: parent directory inode
82 * @mode: inode mode flags
83 *
84 * This function finds an unused inode number, allocates new inode and
85 * initializes it. Returns new inode in case of success and an error code in
86 * case of failure.
87 */
88struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
89 int mode)
90{
91 struct inode *inode;
92 struct ubifs_inode *ui;
93
94 inode = new_inode(c->vfs_sb);
95 ui = ubifs_inode(inode);
96 if (!inode)
97 return ERR_PTR(-ENOMEM);
98
99 /*
100 * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and
101 * marking them dirty in file write path (see 'file_update_time()').
102 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
103 * to make budgeting work.
104 */
105 inode->i_flags |= (S_NOCMTIME);
106
107 inode->i_uid = current->fsuid;
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current->fsgid;
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0;
118 /* Disable readahead */
119 inode->i_mapping->backing_dev_info = &c->bdi;
120
121 switch (mode & S_IFMT) {
122 case S_IFREG:
123 inode->i_mapping->a_ops = &ubifs_file_address_operations;
124 inode->i_op = &ubifs_file_inode_operations;
125 inode->i_fop = &ubifs_file_operations;
126 break;
127 case S_IFDIR:
128 inode->i_op = &ubifs_dir_inode_operations;
129 inode->i_fop = &ubifs_dir_operations;
130 inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
131 break;
132 case S_IFLNK:
133 inode->i_op = &ubifs_symlink_inode_operations;
134 break;
135 case S_IFSOCK:
136 case S_IFIFO:
137 case S_IFBLK:
138 case S_IFCHR:
139 inode->i_op = &ubifs_file_inode_operations;
140 break;
141 default:
142 BUG();
143 }
144
145 ui->flags = inherit_flags(dir, mode);
146 ubifs_set_inode_flags(inode);
147 if (S_ISREG(mode))
148 ui->compr_type = c->default_compr;
149 else
150 ui->compr_type = UBIFS_COMPR_NONE;
151 ui->synced_i_size = 0;
152
153 spin_lock(&c->cnt_lock);
154 /* Inode number overflow is currently not supported */
155 if (c->highest_inum >= INUM_WARN_WATERMARK) {
156 if (c->highest_inum >= INUM_WATERMARK) {
157 spin_unlock(&c->cnt_lock);
158 ubifs_err("out of inode numbers");
159 make_bad_inode(inode);
160 iput(inode);
161 return ERR_PTR(-EINVAL);
162 }
163 ubifs_warn("running out of inode numbers (current %lu, max %d)",
164 c->highest_inum, INUM_WATERMARK);
165 }
166
167 inode->i_ino = ++c->highest_inum;
168 inode->i_generation = ++c->vfs_gen;
169 /*
170 * The creation sequence number remains with this inode for its
171 * lifetime. All nodes for this inode have a greater sequence number,
172 * and so it is possible to distinguish obsolete nodes belonging to a
173 * previous incarnation of the same inode number - for example, for the
174 * purpose of rebuilding the index.
175 */
176 ui->creat_sqnum = ++c->max_sqnum;
177 spin_unlock(&c->cnt_lock);
178 return inode;
179}
180
181#ifdef CONFIG_UBIFS_FS_DEBUG
182
183static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
184{
185 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
186 return 0;
187 if (le16_to_cpu(dent->nlen) != nm->len)
188 return -EINVAL;
189 if (memcmp(dent->name, nm->name, nm->len))
190 return -EINVAL;
191 return 0;
192}
193
194#else
195
196#define dbg_check_name(dent, nm) 0
197
198#endif
199
200static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
201 struct nameidata *nd)
202{
203 int err;
204 union ubifs_key key;
205 struct inode *inode = NULL;
206 struct ubifs_dent_node *dent;
207 struct ubifs_info *c = dir->i_sb->s_fs_info;
208
209 dbg_gen("'%.*s' in dir ino %lu",
210 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
211
212 if (dentry->d_name.len > UBIFS_MAX_NLEN)
213 return ERR_PTR(-ENAMETOOLONG);
214
215 dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
216 if (!dent)
217 return ERR_PTR(-ENOMEM);
218
219 dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
220
221 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
222 if (err) {
223 /*
224 * Do not hash the direntry if parent 'i_nlink' is zero, because
225 * this has side-effects - '->delete_inode()' call will not be
226 * called for the parent orphan inode, because 'd_count' of its
227 * direntry will stay 1 (it'll be negative direntry I guess)
228 * and prevent 'iput_final()' until the dentry is destroyed due
229 * to unmount or memory pressure.
230 */
231 if (err == -ENOENT && dir->i_nlink != 0) {
232 dbg_gen("not found");
233 goto done;
234 }
235 goto out;
236 }
237
238 if (dbg_check_name(dent, &dentry->d_name)) {
239 err = -EINVAL;
240 goto out;
241 }
242
243 inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
244 if (IS_ERR(inode)) {
245 /*
246 * This should not happen. Probably the file-system needs
247 * checking.
248 */
249 err = PTR_ERR(inode);
250 ubifs_err("dead directory entry '%.*s', error %d",
251 dentry->d_name.len, dentry->d_name.name, err);
252 ubifs_ro_mode(c, err);
253 goto out;
254 }
255
256done:
257 kfree(dent);
258 /*
259 * Note, d_splice_alias() would be required instead if we supported
260 * NFS.
261 */
262 d_add(dentry, inode);
263 return NULL;
264
265out:
266 kfree(dent);
267 return ERR_PTR(err);
268}
269
270static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
271 struct nameidata *nd)
272{
273 struct inode *inode;
274 struct ubifs_info *c = dir->i_sb->s_fs_info;
275 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
276 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
277 .dirtied_ino = 1 };
278 struct ubifs_inode *dir_ui = ubifs_inode(dir);
279
280 /*
281 * Budget request settings: new inode, new direntry, changing the
282 * parent directory inode.
283 */
284
285 dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
286 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
287
288 err = ubifs_budget_space(c, &req);
289 if (err)
290 return err;
291
292 inode = ubifs_new_inode(c, dir, mode);
293 if (IS_ERR(inode)) {
294 err = PTR_ERR(inode);
295 goto out_budg;
296 }
297
298 mutex_lock(&dir_ui->ui_mutex);
299 dir->i_size += sz_change;
300 dir_ui->ui_size = dir->i_size;
301 dir->i_mtime = dir->i_ctime = inode->i_ctime;
302 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
303 if (err)
304 goto out_cancel;
305 mutex_unlock(&dir_ui->ui_mutex);
306
307 ubifs_release_budget(c, &req);
308 insert_inode_hash(inode);
309 d_instantiate(dentry, inode);
310 return 0;
311
312out_cancel:
313 dir->i_size -= sz_change;
314 dir_ui->ui_size = dir->i_size;
315 mutex_unlock(&dir_ui->ui_mutex);
316 make_bad_inode(inode);
317 iput(inode);
318out_budg:
319 ubifs_release_budget(c, &req);
320 ubifs_err("cannot create regular file, error %d", err);
321 return err;
322}
323
324/**
325 * vfs_dent_type - get VFS directory entry type.
326 * @type: UBIFS directory entry type
327 *
328 * This function converts UBIFS directory entry type into VFS directory entry
329 * type.
330 */
331static unsigned int vfs_dent_type(uint8_t type)
332{
333 switch (type) {
334 case UBIFS_ITYPE_REG:
335 return DT_REG;
336 case UBIFS_ITYPE_DIR:
337 return DT_DIR;
338 case UBIFS_ITYPE_LNK:
339 return DT_LNK;
340 case UBIFS_ITYPE_BLK:
341 return DT_BLK;
342 case UBIFS_ITYPE_CHR:
343 return DT_CHR;
344 case UBIFS_ITYPE_FIFO:
345 return DT_FIFO;
346 case UBIFS_ITYPE_SOCK:
347 return DT_SOCK;
348 default:
349 BUG();
350 }
351 return 0;
352}
353
354/*
355 * The classical Unix view for directory is that it is a linear array of
356 * (name, inode number) entries. Linux/VFS assumes this model as well.
357 * Particularly, 'readdir()' call wants us to return a directory entry offset
358 * which later may be used to continue 'readdir()'ing the directory or to
359 * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
360 * model because directory entries are identified by keys, which may collide.
361 *
362 * UBIFS uses directory entry hash value for directory offsets, so
363 * 'seekdir()'/'telldir()' may not always work because of possible key
364 * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
365 * properly by means of saving full directory entry name in the private field
366 * of the file description object.
367 *
368 * This means that UBIFS cannot support NFS which requires full
369 * 'seekdir()'/'telldir()' support.
370 */
371static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
372{
373 int err, over = 0;
374 struct qstr nm;
375 union ubifs_key key;
376 struct ubifs_dent_node *dent;
377 struct inode *dir = file->f_path.dentry->d_inode;
378 struct ubifs_info *c = dir->i_sb->s_fs_info;
379
380 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
381
382 if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
383 /*
384 * The directory was seek'ed to a senseless position or there
385 * are no more entries.
386 */
387 return 0;
388
389 /* File positions 0 and 1 correspond to "." and ".." */
390 if (file->f_pos == 0) {
391 ubifs_assert(!file->private_data);
392 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
393 if (over)
394 return 0;
395 file->f_pos = 1;
396 }
397
398 if (file->f_pos == 1) {
399 ubifs_assert(!file->private_data);
400 over = filldir(dirent, "..", 2, 1,
401 parent_ino(file->f_path.dentry), DT_DIR);
402 if (over)
403 return 0;
404
405 /* Find the first entry in TNC and save it */
406 lowest_dent_key(c, &key, dir->i_ino);
407 nm.name = NULL;
408 dent = ubifs_tnc_next_ent(c, &key, &nm);
409 if (IS_ERR(dent)) {
410 err = PTR_ERR(dent);
411 goto out;
412 }
413
414 file->f_pos = key_hash_flash(c, &dent->key);
415 file->private_data = dent;
416 }
417
418 dent = file->private_data;
419 if (!dent) {
420 /*
421 * The directory was seek'ed to and is now readdir'ed.
422 * Find the entry corresponding to @file->f_pos or the
423 * closest one.
424 */
425 dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
426 nm.name = NULL;
427 dent = ubifs_tnc_next_ent(c, &key, &nm);
428 if (IS_ERR(dent)) {
429 err = PTR_ERR(dent);
430 goto out;
431 }
432 file->f_pos = key_hash_flash(c, &dent->key);
433 file->private_data = dent;
434 }
435
436 while (1) {
437 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
438 dent->name, le64_to_cpu(dent->inum),
439 key_hash_flash(c, &dent->key));
440 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
441
442 nm.len = le16_to_cpu(dent->nlen);
443 over = filldir(dirent, dent->name, nm.len, file->f_pos,
444 le64_to_cpu(dent->inum),
445 vfs_dent_type(dent->type));
446 if (over)
447 return 0;
448
449 /* Switch to the next entry */
450 key_read(c, &dent->key, &key);
451 nm.name = dent->name;
452 dent = ubifs_tnc_next_ent(c, &key, &nm);
453 if (IS_ERR(dent)) {
454 err = PTR_ERR(dent);
455 goto out;
456 }
457
458 kfree(file->private_data);
459 file->f_pos = key_hash_flash(c, &dent->key);
460 file->private_data = dent;
461 cond_resched();
462 }
463
464out:
465 if (err != -ENOENT) {
466 ubifs_err("cannot find next direntry, error %d", err);
467 return err;
468 }
469
470 kfree(file->private_data);
471 file->private_data = NULL;
472 file->f_pos = 2;
473 return 0;
474}
475
476/* If a directory is seeked, we have to free saved readdir() state */
477static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
478{
479 kfree(file->private_data);
480 file->private_data = NULL;
481 return generic_file_llseek(file, offset, origin);
482}
483
484/* Free saved readdir() state when the directory is closed */
485static int ubifs_dir_release(struct inode *dir, struct file *file)
486{
487 kfree(file->private_data);
488 file->private_data = NULL;
489 return 0;
490}
491
492/**
493 * lock_2_inodes - lock two UBIFS inodes.
494 * @inode1: first inode
495 * @inode2: second inode
496 */
497static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
498{
499 if (inode1->i_ino < inode2->i_ino) {
500 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
501 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
502 } else {
503 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
504 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
505 }
506}
507
508/**
509 * unlock_2_inodes - unlock two UBIFS inodes inodes.
510 * @inode1: first inode
511 * @inode2: second inode
512 */
513static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
514{
515 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
516 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
517}
518
519static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
520 struct dentry *dentry)
521{
522 struct ubifs_info *c = dir->i_sb->s_fs_info;
523 struct inode *inode = old_dentry->d_inode;
524 struct ubifs_inode *ui = ubifs_inode(inode);
525 struct ubifs_inode *dir_ui = ubifs_inode(dir);
526 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
527 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
528 .dirtied_ino_d = ui->data_len };
529
530 /*
531 * Budget request settings: new direntry, changing the target inode,
532 * changing the parent inode.
533 */
534
535 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
536 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
537 inode->i_nlink, dir->i_ino);
538 err = dbg_check_synced_i_size(inode);
539 if (err)
540 return err;
541
542 err = ubifs_budget_space(c, &req);
543 if (err)
544 return err;
545
546 lock_2_inodes(dir, inode);
547 inc_nlink(inode);
548 atomic_inc(&inode->i_count);
549 inode->i_ctime = ubifs_current_time(inode);
550 dir->i_size += sz_change;
551 dir_ui->ui_size = dir->i_size;
552 dir->i_mtime = dir->i_ctime = inode->i_ctime;
553 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
554 if (err)
555 goto out_cancel;
556 unlock_2_inodes(dir, inode);
557
558 ubifs_release_budget(c, &req);
559 d_instantiate(dentry, inode);
560 return 0;
561
562out_cancel:
563 dir->i_size -= sz_change;
564 dir_ui->ui_size = dir->i_size;
565 drop_nlink(inode);
566 unlock_2_inodes(dir, inode);
567 ubifs_release_budget(c, &req);
568 iput(inode);
569 return err;
570}
571
572static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
573{
574 struct ubifs_info *c = dir->i_sb->s_fs_info;
575 struct inode *inode = dentry->d_inode;
576 struct ubifs_inode *dir_ui = ubifs_inode(dir);
577 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
578 int err, budgeted = 1;
579 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
580
581 /*
582 * Budget request settings: deletion direntry, deletion inode (+1 for
583 * @dirtied_ino), changing the parent directory inode. If budgeting
584 * fails, go ahead anyway because we have extra space reserved for
585 * deletions.
586 */
587
588 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
589 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
590 inode->i_nlink, dir->i_ino);
591 err = dbg_check_synced_i_size(inode);
592 if (err)
593 return err;
594
595 err = ubifs_budget_space(c, &req);
596 if (err) {
597 if (err != -ENOSPC)
598 return err;
599 err = 0;
600 budgeted = 0;
601 }
602
603 lock_2_inodes(dir, inode);
604 inode->i_ctime = ubifs_current_time(dir);
605 drop_nlink(inode);
606 dir->i_size -= sz_change;
607 dir_ui->ui_size = dir->i_size;
608 dir->i_mtime = dir->i_ctime = inode->i_ctime;
609 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
610 if (err)
611 goto out_cancel;
612 unlock_2_inodes(dir, inode);
613
614 if (budgeted)
615 ubifs_release_budget(c, &req);
616 else {
617 /* We've deleted something - clean the "no space" flags */
618 c->nospace = c->nospace_rp = 0;
619 smp_wmb();
620 }
621 return 0;
622
623out_cancel:
624 dir->i_size += sz_change;
625 dir_ui->ui_size = dir->i_size;
626 inc_nlink(inode);
627 unlock_2_inodes(dir, inode);
628 if (budgeted)
629 ubifs_release_budget(c, &req);
630 return err;
631}
632
633/**
634 * check_dir_empty - check if a directory is empty or not.
635 * @c: UBIFS file-system description object
636 * @dir: VFS inode object of the directory to check
637 *
638 * This function checks if directory @dir is empty. Returns zero if the
639 * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
640 * in case of of errors.
641 */
642static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
643{
644 struct qstr nm = { .name = NULL };
645 struct ubifs_dent_node *dent;
646 union ubifs_key key;
647 int err;
648
649 lowest_dent_key(c, &key, dir->i_ino);
650 dent = ubifs_tnc_next_ent(c, &key, &nm);
651 if (IS_ERR(dent)) {
652 err = PTR_ERR(dent);
653 if (err == -ENOENT)
654 err = 0;
655 } else {
656 kfree(dent);
657 err = -ENOTEMPTY;
658 }
659 return err;
660}
661
662static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
663{
664 struct ubifs_info *c = dir->i_sb->s_fs_info;
665 struct inode *inode = dentry->d_inode;
666 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
667 int err, budgeted = 1;
668 struct ubifs_inode *dir_ui = ubifs_inode(dir);
669 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
670
671 /*
672 * Budget request settings: deletion direntry, deletion inode and
673 * changing the parent inode. If budgeting fails, go ahead anyway
674 * because we have extra space reserved for deletions.
675 */
676
677 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
678 dentry->d_name.name, inode->i_ino, dir->i_ino);
679
680 err = check_dir_empty(c, dentry->d_inode);
681 if (err)
682 return err;
683
684 err = ubifs_budget_space(c, &req);
685 if (err) {
686 if (err != -ENOSPC)
687 return err;
688 budgeted = 0;
689 }
690
691 lock_2_inodes(dir, inode);
692 inode->i_ctime = ubifs_current_time(dir);
693 clear_nlink(inode);
694 drop_nlink(dir);
695 dir->i_size -= sz_change;
696 dir_ui->ui_size = dir->i_size;
697 dir->i_mtime = dir->i_ctime = inode->i_ctime;
698 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
699 if (err)
700 goto out_cancel;
701 unlock_2_inodes(dir, inode);
702
703 if (budgeted)
704 ubifs_release_budget(c, &req);
705 else {
706 /* We've deleted something - clean the "no space" flags */
707 c->nospace = c->nospace_rp = 0;
708 smp_wmb();
709 }
710 return 0;
711
712out_cancel:
713 dir->i_size += sz_change;
714 dir_ui->ui_size = dir->i_size;
715 inc_nlink(dir);
716 inc_nlink(inode);
717 inc_nlink(inode);
718 unlock_2_inodes(dir, inode);
719 if (budgeted)
720 ubifs_release_budget(c, &req);
721 return err;
722}
723
724static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
725{
726 struct inode *inode;
727 struct ubifs_inode *dir_ui = ubifs_inode(dir);
728 struct ubifs_info *c = dir->i_sb->s_fs_info;
729 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
730 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
731 .dirtied_ino_d = 1 };
732
733 /*
734 * Budget request settings: new inode, new direntry and changing parent
735 * directory inode.
736 */
737
738 dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
739 dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
740
741 err = ubifs_budget_space(c, &req);
742 if (err)
743 return err;
744
745 inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
746 if (IS_ERR(inode)) {
747 err = PTR_ERR(inode);
748 goto out_budg;
749 }
750
751 mutex_lock(&dir_ui->ui_mutex);
752 insert_inode_hash(inode);
753 inc_nlink(inode);
754 inc_nlink(dir);
755 dir->i_size += sz_change;
756 dir_ui->ui_size = dir->i_size;
757 dir->i_mtime = dir->i_ctime = inode->i_ctime;
758 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
759 if (err) {
760 ubifs_err("cannot create directory, error %d", err);
761 goto out_cancel;
762 }
763 mutex_unlock(&dir_ui->ui_mutex);
764
765 ubifs_release_budget(c, &req);
766 d_instantiate(dentry, inode);
767 return 0;
768
769out_cancel:
770 dir->i_size -= sz_change;
771 dir_ui->ui_size = dir->i_size;
772 drop_nlink(dir);
773 mutex_unlock(&dir_ui->ui_mutex);
774 make_bad_inode(inode);
775 iput(inode);
776out_budg:
777 ubifs_release_budget(c, &req);
778 return err;
779}
780
781static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
782 int mode, dev_t rdev)
783{
784 struct inode *inode;
785 struct ubifs_inode *ui;
786 struct ubifs_inode *dir_ui = ubifs_inode(dir);
787 struct ubifs_info *c = dir->i_sb->s_fs_info;
788 union ubifs_dev_desc *dev = NULL;
789 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
790 int err, devlen = 0;
791 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
792 .new_ino_d = devlen, .dirtied_ino = 1 };
793
794 /*
795 * Budget request settings: new inode, new direntry and changing parent
796 * directory inode.
797 */
798
799 dbg_gen("dent '%.*s' in dir ino %lu",
800 dentry->d_name.len, dentry->d_name.name, dir->i_ino);
801
802 if (!new_valid_dev(rdev))
803 return -EINVAL;
804
805 if (S_ISBLK(mode) || S_ISCHR(mode)) {
806 dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
807 if (!dev)
808 return -ENOMEM;
809 devlen = ubifs_encode_dev(dev, rdev);
810 }
811
812 err = ubifs_budget_space(c, &req);
813 if (err) {
814 kfree(dev);
815 return err;
816 }
817
818 inode = ubifs_new_inode(c, dir, mode);
819 if (IS_ERR(inode)) {
820 kfree(dev);
821 err = PTR_ERR(inode);
822 goto out_budg;
823 }
824
825 init_special_inode(inode, inode->i_mode, rdev);
826 inode->i_size = ubifs_inode(inode)->ui_size = devlen;
827 ui = ubifs_inode(inode);
828 ui->data = dev;
829 ui->data_len = devlen;
830
831 mutex_lock(&dir_ui->ui_mutex);
832 dir->i_size += sz_change;
833 dir_ui->ui_size = dir->i_size;
834 dir->i_mtime = dir->i_ctime = inode->i_ctime;
835 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
836 if (err)
837 goto out_cancel;
838 mutex_unlock(&dir_ui->ui_mutex);
839
840 ubifs_release_budget(c, &req);
841 insert_inode_hash(inode);
842 d_instantiate(dentry, inode);
843 return 0;
844
845out_cancel:
846 dir->i_size -= sz_change;
847 dir_ui->ui_size = dir->i_size;
848 mutex_unlock(&dir_ui->ui_mutex);
849 make_bad_inode(inode);
850 iput(inode);
851out_budg:
852 ubifs_release_budget(c, &req);
853 return err;
854}
855
856static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
857 const char *symname)
858{
859 struct inode *inode;
860 struct ubifs_inode *ui;
861 struct ubifs_inode *dir_ui = ubifs_inode(dir);
862 struct ubifs_info *c = dir->i_sb->s_fs_info;
863 int err, len = strlen(symname);
864 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
865 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
866 .new_ino_d = len, .dirtied_ino = 1 };
867
868 /*
869 * Budget request settings: new inode, new direntry and changing parent
870 * directory inode.
871 */
872
873 dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
874 dentry->d_name.name, symname, dir->i_ino);
875
876 if (len > UBIFS_MAX_INO_DATA)
877 return -ENAMETOOLONG;
878
879 err = ubifs_budget_space(c, &req);
880 if (err)
881 return err;
882
883 inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
884 if (IS_ERR(inode)) {
885 err = PTR_ERR(inode);
886 goto out_budg;
887 }
888
889 ui = ubifs_inode(inode);
890 ui->data = kmalloc(len + 1, GFP_NOFS);
891 if (!ui->data) {
892 err = -ENOMEM;
893 goto out_inode;
894 }
895
896 memcpy(ui->data, symname, len);
897 ((char *)ui->data)[len] = '\0';
898 /*
899 * The terminating zero byte is not written to the flash media and it
900 * is put just to make later in-memory string processing simpler. Thus,
901 * data length is @len, not @len + %1.
902 */
903 ui->data_len = len;
904 inode->i_size = ubifs_inode(inode)->ui_size = len;
905
906 mutex_lock(&dir_ui->ui_mutex);
907 dir->i_size += sz_change;
908 dir_ui->ui_size = dir->i_size;
909 dir->i_mtime = dir->i_ctime = inode->i_ctime;
910 err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
911 if (err)
912 goto out_cancel;
913 mutex_unlock(&dir_ui->ui_mutex);
914
915 ubifs_release_budget(c, &req);
916 insert_inode_hash(inode);
917 d_instantiate(dentry, inode);
918 return 0;
919
920out_cancel:
921 dir->i_size -= sz_change;
922 dir_ui->ui_size = dir->i_size;
923 mutex_unlock(&dir_ui->ui_mutex);
924out_inode:
925 make_bad_inode(inode);
926 iput(inode);
927out_budg:
928 ubifs_release_budget(c, &req);
929 return err;
930}
931
932/**
933 * lock_3_inodes - lock three UBIFS inodes for rename.
934 * @inode1: first inode
935 * @inode2: second inode
936 * @inode3: third inode
937 *
938 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
939 * be null.
940 */
941static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
942 struct inode *inode3)
943{
944 struct inode *i1, *i2, *i3;
945
946 if (!inode3) {
947 if (inode1 != inode2) {
948 lock_2_inodes(inode1, inode2);
949 return;
950 }
951 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
952 return;
953 }
954
955 if (inode1 == inode2) {
956 lock_2_inodes(inode1, inode3);
957 return;
958 }
959
960 /* 3 different inodes */
961 if (inode1 < inode2) {
962 i3 = inode2;
963 if (inode1 < inode3) {
964 i1 = inode1;
965 i2 = inode3;
966 } else {
967 i1 = inode3;
968 i2 = inode1;
969 }
970 } else {
971 i3 = inode1;
972 if (inode2 < inode3) {
973 i1 = inode2;
974 i2 = inode3;
975 } else {
976 i1 = inode3;
977 i2 = inode2;
978 }
979 }
980 mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
981 lock_2_inodes(i2, i3);
982}
983
984/**
985 * unlock_3_inodes - unlock three UBIFS inodes for rename.
986 * @inode1: first inode
987 * @inode2: second inode
988 * @inode3: third inode
989 */
990static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
991 struct inode *inode3)
992{
993 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
994 if (inode1 != inode2)
995 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
996 if (inode3)
997 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
998}
999
1000static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1001 struct inode *new_dir, struct dentry *new_dentry)
1002{
1003 struct ubifs_info *c = old_dir->i_sb->s_fs_info;
1004 struct inode *old_inode = old_dentry->d_inode;
1005 struct inode *new_inode = new_dentry->d_inode;
1006 struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
1007 int err, release, sync = 0, move = (new_dir != old_dir);
1008 int is_dir = S_ISDIR(old_inode->i_mode);
1009 int unlink = !!new_inode;
1010 int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
1011 int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
1012 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
1013 .dirtied_ino = 3 };
1014 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
1015 .dirtied_ino_d = old_inode_ui->data_len };
1016 struct timespec time;
1017
1018 /*
1019 * Budget request settings: deletion direntry, new direntry, removing
1020 * the old inode, and changing old and new parent directory inodes.
1021 *
1022 * However, this operation also marks the target inode as dirty and
1023 * does not write it, so we allocate budget for the target inode
1024 * separately.
1025 */
1026
1027 dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in "
1028 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
1029 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
1030 new_dentry->d_name.name, new_dir->i_ino);
1031
1032 if (unlink && is_dir) {
1033 err = check_dir_empty(c, new_inode);
1034 if (err)
1035 return err;
1036 }
1037
1038 err = ubifs_budget_space(c, &req);
1039 if (err)
1040 return err;
1041 err = ubifs_budget_space(c, &ino_req);
1042 if (err) {
1043 ubifs_release_budget(c, &req);
1044 return err;
1045 }
1046
1047 lock_3_inodes(old_dir, new_dir, new_inode);
1048
1049 /*
1050 * Like most other Unix systems, set the @i_ctime for inodes on a
1051 * rename.
1052 */
1053 time = ubifs_current_time(old_dir);
1054 old_inode->i_ctime = time;
1055
1056 /* We must adjust parent link count when renaming directories */
1057 if (is_dir) {
1058 if (move) {
1059 /*
1060 * @old_dir loses a link because we are moving
1061 * @old_inode to a different directory.
1062 */
1063 drop_nlink(old_dir);
1064 /*
1065 * @new_dir only gains a link if we are not also
1066 * overwriting an existing directory.
1067 */
1068 if (!unlink)
1069 inc_nlink(new_dir);
1070 } else {
1071 /*
1072 * @old_inode is not moving to a different directory,
1073 * but @old_dir still loses a link if we are
1074 * overwriting an existing directory.
1075 */
1076 if (unlink)
1077 drop_nlink(old_dir);
1078 }
1079 }
1080
1081 old_dir->i_size -= old_sz;
1082 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1083 old_dir->i_mtime = old_dir->i_ctime = time;
1084 new_dir->i_mtime = new_dir->i_ctime = time;
1085
1086 /*
1087 * And finally, if we unlinked a direntry which happened to have the
1088 * same name as the moved direntry, we have to decrement @i_nlink of
1089 * the unlinked inode and change its ctime.
1090 */
1091 if (unlink) {
1092 /*
1093 * Directories cannot have hard-links, so if this is a
1094 * directory, decrement its @i_nlink twice because an empty
1095 * directory has @i_nlink 2.
1096 */
1097 if (is_dir)
1098 drop_nlink(new_inode);
1099 new_inode->i_ctime = time;
1100 drop_nlink(new_inode);
1101 } else {
1102 new_dir->i_size += new_sz;
1103 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1104 }
1105
1106 /*
1107 * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
1108 * is dirty, because this will be done later on at the end of
1109 * 'ubifs_rename()'.
1110 */
1111 if (IS_SYNC(old_inode)) {
1112 sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
1113 if (unlink && IS_SYNC(new_inode))
1114 sync = 1;
1115 }
1116 err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
1117 sync);
1118 if (err)
1119 goto out_cancel;
1120
1121 unlock_3_inodes(old_dir, new_dir, new_inode);
1122 ubifs_release_budget(c, &req);
1123
1124 mutex_lock(&old_inode_ui->ui_mutex);
1125 release = old_inode_ui->dirty;
1126 mark_inode_dirty_sync(old_inode);
1127 mutex_unlock(&old_inode_ui->ui_mutex);
1128
1129 if (release)
1130 ubifs_release_budget(c, &ino_req);
1131 if (IS_SYNC(old_inode))
1132 err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
1133 return err;
1134
1135out_cancel:
1136 if (unlink) {
1137 if (is_dir)
1138 inc_nlink(new_inode);
1139 inc_nlink(new_inode);
1140 } else {
1141 new_dir->i_size -= new_sz;
1142 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
1143 }
1144 old_dir->i_size += old_sz;
1145 ubifs_inode(old_dir)->ui_size = old_dir->i_size;
1146 if (is_dir) {
1147 if (move) {
1148 inc_nlink(old_dir);
1149 if (!unlink)
1150 drop_nlink(new_dir);
1151 } else {
1152 if (unlink)
1153 inc_nlink(old_dir);
1154 }
1155 }
1156 unlock_3_inodes(old_dir, new_dir, new_inode);
1157 ubifs_release_budget(c, &ino_req);
1158 ubifs_release_budget(c, &req);
1159 return err;
1160}
1161
1162int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1163 struct kstat *stat)
1164{
1165 loff_t size;
1166 struct inode *inode = dentry->d_inode;
1167 struct ubifs_inode *ui = ubifs_inode(inode);
1168
1169 mutex_lock(&ui->ui_mutex);
1170 stat->dev = inode->i_sb->s_dev;
1171 stat->ino = inode->i_ino;
1172 stat->mode = inode->i_mode;
1173 stat->nlink = inode->i_nlink;
1174 stat->uid = inode->i_uid;
1175 stat->gid = inode->i_gid;
1176 stat->rdev = inode->i_rdev;
1177 stat->atime = inode->i_atime;
1178 stat->mtime = inode->i_mtime;
1179 stat->ctime = inode->i_ctime;
1180 stat->blksize = UBIFS_BLOCK_SIZE;
1181 stat->size = ui->ui_size;
1182
1183 /*
1184 * Unfortunately, the 'stat()' system call was designed for block
1185 * device based file systems, and it is not appropriate for UBIFS,
1186 * because UBIFS does not have notion of "block". For example, it is
1187 * difficult to tell how many block a directory takes - it actually
1188 * takes less than 300 bytes, but we have to round it to block size,
1189 * which introduces large mistake. This makes utilities like 'du' to
1190 * report completely senseless numbers. This is the reason why UBIFS
1191 * goes the same way as JFFS2 - it reports zero blocks for everything
1192 * but regular files, which makes more sense than reporting completely
1193 * wrong sizes.
1194 */
1195 if (S_ISREG(inode->i_mode)) {
1196 size = ui->xattr_size;
1197 size += stat->size;
1198 size = ALIGN(size, UBIFS_BLOCK_SIZE);
1199 /*
1200 * Note, user-space expects 512-byte blocks count irrespectively
1201 * of what was reported in @stat->size.
1202 */
1203 stat->blocks = size >> 9;
1204 } else
1205 stat->blocks = 0;
1206 mutex_unlock(&ui->ui_mutex);
1207 return 0;
1208}
1209
1210struct inode_operations ubifs_dir_inode_operations = {
1211 .lookup = ubifs_lookup,
1212 .create = ubifs_create,
1213 .link = ubifs_link,
1214 .symlink = ubifs_symlink,
1215 .unlink = ubifs_unlink,
1216 .mkdir = ubifs_mkdir,
1217 .rmdir = ubifs_rmdir,
1218 .mknod = ubifs_mknod,
1219 .rename = ubifs_rename,
1220 .setattr = ubifs_setattr,
1221 .getattr = ubifs_getattr,
1222#ifdef CONFIG_UBIFS_FS_XATTR
1223 .setxattr = ubifs_setxattr,
1224 .getxattr = ubifs_getxattr,
1225 .listxattr = ubifs_listxattr,
1226 .removexattr = ubifs_removexattr,
1227#endif
1228};
1229
1230struct file_operations ubifs_dir_operations = {
1231 .llseek = ubifs_dir_llseek,
1232 .release = ubifs_dir_release,
1233 .read = generic_read_dir,
1234 .readdir = ubifs_readdir,
1235 .fsync = ubifs_fsync,
1236 .unlocked_ioctl = ubifs_ioctl,
1237#ifdef CONFIG_COMPAT
1238 .compat_ioctl = ubifs_compat_ioctl,
1239#endif
1240};
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
new file mode 100644
index 000000000000..005a3b854d96
--- /dev/null
+++ b/fs/ubifs/file.c
@@ -0,0 +1,1275 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements VFS file and inode operations of regular files, device
25 * nodes and symlinks as well as address space operations.
26 *
27 * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
28 * page is dirty and is used for budgeting purposes - dirty pages should not be
29 * budgeted. The PG_checked flag is set if full budgeting is required for the
30 * page e.g., when it corresponds to a file hole or it is just beyond the file
31 * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
32 * fail in this function, and the budget is released in 'ubifs_write_end()'. So
33 * the PG_private and PG_checked flags carry the information about how the page
34 * was budgeted, to make it possible to release the budget properly.
35 *
36 * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
37 * we implement. However, this is not true for '->writepage()', which might be
38 * called with 'i_mutex' unlocked. For example, when pdflush is performing
39 * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
40 * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
41 * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
42 * path'. So, in '->writepage()' we are only guaranteed that the page is
43 * locked.
44 *
45 * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
46 * readahead path does not have it locked ("sys_read -> generic_file_aio_read
47 * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
48 * not set as well. However, UBIFS disables readahead.
49 *
50 * This, for example means that there might be 2 concurrent '->writepage()'
51 * calls for the same inode, but different inode dirty pages.
52 */
53
54#include "ubifs.h"
55#include <linux/mount.h>
56
57static int read_block(struct inode *inode, void *addr, unsigned int block,
58 struct ubifs_data_node *dn)
59{
60 struct ubifs_info *c = inode->i_sb->s_fs_info;
61 int err, len, out_len;
62 union ubifs_key key;
63 unsigned int dlen;
64
65 data_key_init(c, &key, inode->i_ino, block);
66 err = ubifs_tnc_lookup(c, &key, dn);
67 if (err) {
68 if (err == -ENOENT)
69 /* Not found, so it must be a hole */
70 memset(addr, 0, UBIFS_BLOCK_SIZE);
71 return err;
72 }
73
74 ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
75
76 len = le32_to_cpu(dn->size);
77 if (len <= 0 || len > UBIFS_BLOCK_SIZE)
78 goto dump;
79
80 dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
81 out_len = UBIFS_BLOCK_SIZE;
82 err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
83 le16_to_cpu(dn->compr_type));
84 if (err || len != out_len)
85 goto dump;
86
87 /*
88 * Data length can be less than a full block, even for blocks that are
89 * not the last in the file (e.g., as a result of making a hole and
90 * appending data). Ensure that the remainder is zeroed out.
91 */
92 if (len < UBIFS_BLOCK_SIZE)
93 memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
94
95 return 0;
96
97dump:
98 ubifs_err("bad data node (block %u, inode %lu)",
99 block, inode->i_ino);
100 dbg_dump_node(c, dn);
101 return -EINVAL;
102}
103
104static int do_readpage(struct page *page)
105{
106 void *addr;
107 int err = 0, i;
108 unsigned int block, beyond;
109 struct ubifs_data_node *dn;
110 struct inode *inode = page->mapping->host;
111 loff_t i_size = i_size_read(inode);
112
113 dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
114 inode->i_ino, page->index, i_size, page->flags);
115 ubifs_assert(!PageChecked(page));
116 ubifs_assert(!PagePrivate(page));
117
118 addr = kmap(page);
119
120 block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
121 beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
122 if (block >= beyond) {
123 /* Reading beyond inode */
124 SetPageChecked(page);
125 memset(addr, 0, PAGE_CACHE_SIZE);
126 goto out;
127 }
128
129 dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
130 if (!dn) {
131 err = -ENOMEM;
132 goto error;
133 }
134
135 i = 0;
136 while (1) {
137 int ret;
138
139 if (block >= beyond) {
140 /* Reading beyond inode */
141 err = -ENOENT;
142 memset(addr, 0, UBIFS_BLOCK_SIZE);
143 } else {
144 ret = read_block(inode, addr, block, dn);
145 if (ret) {
146 err = ret;
147 if (err != -ENOENT)
148 break;
149 }
150 }
151 if (++i >= UBIFS_BLOCKS_PER_PAGE)
152 break;
153 block += 1;
154 addr += UBIFS_BLOCK_SIZE;
155 }
156 if (err) {
157 if (err == -ENOENT) {
158 /* Not found, so it must be a hole */
159 SetPageChecked(page);
160 dbg_gen("hole");
161 goto out_free;
162 }
163 ubifs_err("cannot read page %lu of inode %lu, error %d",
164 page->index, inode->i_ino, err);
165 goto error;
166 }
167
168out_free:
169 kfree(dn);
170out:
171 SetPageUptodate(page);
172 ClearPageError(page);
173 flush_dcache_page(page);
174 kunmap(page);
175 return 0;
176
177error:
178 kfree(dn);
179 ClearPageUptodate(page);
180 SetPageError(page);
181 flush_dcache_page(page);
182 kunmap(page);
183 return err;
184}
185
186/**
187 * release_new_page_budget - release budget of a new page.
188 * @c: UBIFS file-system description object
189 *
190 * This is a helper function which releases budget corresponding to the budget
191 * of one new page of data.
192 */
193static void release_new_page_budget(struct ubifs_info *c)
194{
195 struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 };
196
197 ubifs_release_budget(c, &req);
198}
199
200/**
201 * release_existing_page_budget - release budget of an existing page.
202 * @c: UBIFS file-system description object
203 *
204 * This is a helper function which releases budget corresponding to the budget
205 * of changing one one page of data which already exists on the flash media.
206 */
207static void release_existing_page_budget(struct ubifs_info *c)
208{
209 struct ubifs_budget_req req = { .dd_growth = c->page_budget};
210
211 ubifs_release_budget(c, &req);
212}
213
214static int write_begin_slow(struct address_space *mapping,
215 loff_t pos, unsigned len, struct page **pagep)
216{
217 struct inode *inode = mapping->host;
218 struct ubifs_info *c = inode->i_sb->s_fs_info;
219 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
220 struct ubifs_budget_req req = { .new_page = 1 };
221 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
222 struct page *page;
223
224 dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
225 inode->i_ino, pos, len, inode->i_size);
226
227 /*
228 * At the slow path we have to budget before locking the page, because
229 * budgeting may force write-back, which would wait on locked pages and
230 * deadlock if we had the page locked. At this point we do not know
231 * anything about the page, so assume that this is a new page which is
232 * written to a hole. This corresponds to largest budget. Later the
233 * budget will be amended if this is not true.
234 */
235 if (appending)
236 /* We are appending data, budget for inode change */
237 req.dirtied_ino = 1;
238
239 err = ubifs_budget_space(c, &req);
240 if (unlikely(err))
241 return err;
242
243 page = __grab_cache_page(mapping, index);
244 if (unlikely(!page)) {
245 ubifs_release_budget(c, &req);
246 return -ENOMEM;
247 }
248
249 if (!PageUptodate(page)) {
250 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
251 SetPageChecked(page);
252 else {
253 err = do_readpage(page);
254 if (err) {
255 unlock_page(page);
256 page_cache_release(page);
257 return err;
258 }
259 }
260
261 SetPageUptodate(page);
262 ClearPageError(page);
263 }
264
265 if (PagePrivate(page))
266 /*
267 * The page is dirty, which means it was budgeted twice:
268 * o first time the budget was allocated by the task which
269 * made the page dirty and set the PG_private flag;
270 * o and then we budgeted for it for the second time at the
271 * very beginning of this function.
272 *
273 * So what we have to do is to release the page budget we
274 * allocated.
275 */
276 release_new_page_budget(c);
277 else if (!PageChecked(page))
278 /*
279 * We are changing a page which already exists on the media.
280 * This means that changing the page does not make the amount
281 * of indexing information larger, and this part of the budget
282 * which we have already acquired may be released.
283 */
284 ubifs_convert_page_budget(c);
285
286 if (appending) {
287 struct ubifs_inode *ui = ubifs_inode(inode);
288
289 /*
290 * 'ubifs_write_end()' is optimized from the fast-path part of
291 * 'ubifs_write_begin()' and expects the @ui_mutex to be locked
292 * if data is appended.
293 */
294 mutex_lock(&ui->ui_mutex);
295 if (ui->dirty)
296 /*
297 * The inode is dirty already, so we may free the
298 * budget we allocated.
299 */
300 ubifs_release_dirty_inode_budget(c, ui);
301 }
302
303 *pagep = page;
304 return 0;
305}
306
307/**
308 * allocate_budget - allocate budget for 'ubifs_write_begin()'.
309 * @c: UBIFS file-system description object
310 * @page: page to allocate budget for
311 * @ui: UBIFS inode object the page belongs to
312 * @appending: non-zero if the page is appended
313 *
314 * This is a helper function for 'ubifs_write_begin()' which allocates budget
315 * for the operation. The budget is allocated differently depending on whether
316 * this is appending, whether the page is dirty or not, and so on. This
317 * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
318 * in case of success and %-ENOSPC in case of failure.
319 */
320static int allocate_budget(struct ubifs_info *c, struct page *page,
321 struct ubifs_inode *ui, int appending)
322{
323 struct ubifs_budget_req req = { .fast = 1 };
324
325 if (PagePrivate(page)) {
326 if (!appending)
327 /*
328 * The page is dirty and we are not appending, which
329 * means no budget is needed at all.
330 */
331 return 0;
332
333 mutex_lock(&ui->ui_mutex);
334 if (ui->dirty)
335 /*
336 * The page is dirty and we are appending, so the inode
337 * has to be marked as dirty. However, it is already
338 * dirty, so we do not need any budget. We may return,
339 * but @ui->ui_mutex hast to be left locked because we
340 * should prevent write-back from flushing the inode
341 * and freeing the budget. The lock will be released in
342 * 'ubifs_write_end()'.
343 */
344 return 0;
345
346 /*
347 * The page is dirty, we are appending, the inode is clean, so
348 * we need to budget the inode change.
349 */
350 req.dirtied_ino = 1;
351 } else {
352 if (PageChecked(page))
353 /*
354 * The page corresponds to a hole and does not
355 * exist on the media. So changing it makes
356 * make the amount of indexing information
357 * larger, and we have to budget for a new
358 * page.
359 */
360 req.new_page = 1;
361 else
362 /*
363 * Not a hole, the change will not add any new
364 * indexing information, budget for page
365 * change.
366 */
367 req.dirtied_page = 1;
368
369 if (appending) {
370 mutex_lock(&ui->ui_mutex);
371 if (!ui->dirty)
372 /*
373 * The inode is clean but we will have to mark
374 * it as dirty because we are appending. This
375 * needs a budget.
376 */
377 req.dirtied_ino = 1;
378 }
379 }
380
381 return ubifs_budget_space(c, &req);
382}
383
384/*
385 * This function is called when a page of data is going to be written. Since
386 * the page of data will not necessarily go to the flash straight away, UBIFS
387 * has to reserve space on the media for it, which is done by means of
388 * budgeting.
389 *
390 * This is the hot-path of the file-system and we are trying to optimize it as
391 * much as possible. For this reasons it is split on 2 parts - slow and fast.
392 *
393 * There many budgeting cases:
394 * o a new page is appended - we have to budget for a new page and for
395 * changing the inode; however, if the inode is already dirty, there is
396 * no need to budget for it;
397 * o an existing clean page is changed - we have budget for it; if the page
398 * does not exist on the media (a hole), we have to budget for a new
399 * page; otherwise, we may budget for changing an existing page; the
400 * difference between these cases is that changing an existing page does
401 * not introduce anything new to the FS indexing information, so it does
402 * not grow, and smaller budget is acquired in this case;
403 * o an existing dirty page is changed - no need to budget at all, because
404 * the page budget has been acquired by earlier, when the page has been
405 * marked dirty.
406 *
407 * UBIFS budgeting sub-system may force write-back if it thinks there is no
408 * space to reserve. This imposes some locking restrictions and makes it
409 * impossible to take into account the above cases, and makes it impossible to
410 * optimize budgeting.
411 *
412 * The solution for this is that the fast path of 'ubifs_write_begin()' assumes
413 * there is a plenty of flash space and the budget will be acquired quickly,
414 * without forcing write-back. The slow path does not make this assumption.
415 */
416static int ubifs_write_begin(struct file *file, struct address_space *mapping,
417 loff_t pos, unsigned len, unsigned flags,
418 struct page **pagep, void **fsdata)
419{
420 struct inode *inode = mapping->host;
421 struct ubifs_info *c = inode->i_sb->s_fs_info;
422 struct ubifs_inode *ui = ubifs_inode(inode);
423 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
424 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
425 struct page *page;
426
427
428 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
429
430 if (unlikely(c->ro_media))
431 return -EROFS;
432
433 /* Try out the fast-path part first */
434 page = __grab_cache_page(mapping, index);
435 if (unlikely(!page))
436 return -ENOMEM;
437
438 if (!PageUptodate(page)) {
439 /* The page is not loaded from the flash */
440 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
441 /*
442 * We change whole page so no need to load it. But we
443 * have to set the @PG_checked flag to make the further
444 * code the page is new. This might be not true, but it
445 * is better to budget more that to read the page from
446 * the media.
447 */
448 SetPageChecked(page);
449 else {
450 err = do_readpage(page);
451 if (err) {
452 unlock_page(page);
453 page_cache_release(page);
454 return err;
455 }
456 }
457
458 SetPageUptodate(page);
459 ClearPageError(page);
460 }
461
462 err = allocate_budget(c, page, ui, appending);
463 if (unlikely(err)) {
464 ubifs_assert(err == -ENOSPC);
465 /*
466 * Budgeting failed which means it would have to force
467 * write-back but didn't, because we set the @fast flag in the
468 * request. Write-back cannot be done now, while we have the
469 * page locked, because it would deadlock. Unlock and free
470 * everything and fall-back to slow-path.
471 */
472 if (appending) {
473 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
474 mutex_unlock(&ui->ui_mutex);
475 }
476 unlock_page(page);
477 page_cache_release(page);
478
479 return write_begin_slow(mapping, pos, len, pagep);
480 }
481
482 /*
483 * Whee, we aquired budgeting quickly - without involving
484 * garbage-collection, committing or forceing write-back. We return
485 * with @ui->ui_mutex locked if we are appending pages, and unlocked
486 * otherwise. This is an optimization (slightly hacky though).
487 */
488 *pagep = page;
489 return 0;
490
491}
492
493/**
494 * cancel_budget - cancel budget.
495 * @c: UBIFS file-system description object
496 * @page: page to cancel budget for
497 * @ui: UBIFS inode object the page belongs to
498 * @appending: non-zero if the page is appended
499 *
500 * This is a helper function for a page write operation. It unlocks the
501 * @ui->ui_mutex in case of appending.
502 */
503static void cancel_budget(struct ubifs_info *c, struct page *page,
504 struct ubifs_inode *ui, int appending)
505{
506 if (appending) {
507 if (!ui->dirty)
508 ubifs_release_dirty_inode_budget(c, ui);
509 mutex_unlock(&ui->ui_mutex);
510 }
511 if (!PagePrivate(page)) {
512 if (PageChecked(page))
513 release_new_page_budget(c);
514 else
515 release_existing_page_budget(c);
516 }
517}
518
519static int ubifs_write_end(struct file *file, struct address_space *mapping,
520 loff_t pos, unsigned len, unsigned copied,
521 struct page *page, void *fsdata)
522{
523 struct inode *inode = mapping->host;
524 struct ubifs_inode *ui = ubifs_inode(inode);
525 struct ubifs_info *c = inode->i_sb->s_fs_info;
526 loff_t end_pos = pos + len;
527 int appending = !!(end_pos > inode->i_size);
528
529 dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
530 inode->i_ino, pos, page->index, len, copied, inode->i_size);
531
532 if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
533 /*
534 * VFS copied less data to the page that it intended and
535 * declared in its '->write_begin()' call via the @len
536 * argument. If the page was not up-to-date, and @len was
537 * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
538 * not load it from the media (for optimization reasons). This
539 * means that part of the page contains garbage. So read the
540 * page now.
541 */
542 dbg_gen("copied %d instead of %d, read page and repeat",
543 copied, len);
544 cancel_budget(c, page, ui, appending);
545
546 /*
547 * Return 0 to force VFS to repeat the whole operation, or the
548 * error code if 'do_readpage()' failes.
549 */
550 copied = do_readpage(page);
551 goto out;
552 }
553
554 if (!PagePrivate(page)) {
555 SetPagePrivate(page);
556 atomic_long_inc(&c->dirty_pg_cnt);
557 __set_page_dirty_nobuffers(page);
558 }
559
560 if (appending) {
561 i_size_write(inode, end_pos);
562 ui->ui_size = end_pos;
563 /*
564 * Note, we do not set @I_DIRTY_PAGES (which means that the
565 * inode has dirty pages), this has been done in
566 * '__set_page_dirty_nobuffers()'.
567 */
568 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
569 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
570 mutex_unlock(&ui->ui_mutex);
571 }
572
573out:
574 unlock_page(page);
575 page_cache_release(page);
576 return copied;
577}
578
/*
 * The '->readpage()' address space operation. The result of 'do_readpage()'
 * is deliberately not propagated: the page is unlocked and zero is returned
 * in every case.
 */
static int ubifs_readpage(struct file *file, struct page *page)
{
	(void)do_readpage(page);
	unlock_page(page);

	return 0;
}
585
586static int do_writepage(struct page *page, int len)
587{
588 int err = 0, i, blen;
589 unsigned int block;
590 void *addr;
591 union ubifs_key key;
592 struct inode *inode = page->mapping->host;
593 struct ubifs_info *c = inode->i_sb->s_fs_info;
594
595#ifdef UBIFS_DEBUG
596 spin_lock(&ui->ui_lock);
597 ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
598 spin_unlock(&ui->ui_lock);
599#endif
600
601 /* Update radix tree tags */
602 set_page_writeback(page);
603
604 addr = kmap(page);
605 block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
606 i = 0;
607 while (len) {
608 blen = min_t(int, len, UBIFS_BLOCK_SIZE);
609 data_key_init(c, &key, inode->i_ino, block);
610 err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
611 if (err)
612 break;
613 if (++i >= UBIFS_BLOCKS_PER_PAGE)
614 break;
615 block += 1;
616 addr += blen;
617 len -= blen;
618 }
619 if (err) {
620 SetPageError(page);
621 ubifs_err("cannot write page %lu of inode %lu, error %d",
622 page->index, inode->i_ino, err);
623 ubifs_ro_mode(c, err);
624 }
625
626 ubifs_assert(PagePrivate(page));
627 if (PageChecked(page))
628 release_new_page_budget(c);
629 else
630 release_existing_page_budget(c);
631
632 atomic_long_dec(&c->dirty_pg_cnt);
633 ClearPagePrivate(page);
634 ClearPageChecked(page);
635
636 kunmap(page);
637 unlock_page(page);
638 end_page_writeback(page);
639 return err;
640}
641
642/*
643 * When writing-back dirty inodes, VFS first writes-back pages belonging to the
644 * inode, then the inode itself. For UBIFS this may cause a problem. Consider a
645 * situation when we have an inode with size 0, then a megabyte of data is
646 * appended to the inode, then write-back starts and flushes some amount of the
647 * dirty pages, the journal becomes full, commit happens and finishes, and then
648 * an unclean reboot happens. When the file system is mounted next time, the
649 * inode size would still be 0, but there would be many pages which are beyond
650 * the inode size, they would be indexed and consume flash space. Because the
651 * journal has been committed, the replay would not be able to detect this
652 * situation and correct the inode size. This means UBIFS would have to scan
653 * whole index and correct all inode sizes, which is long and unacceptable.
654 *
655 * To prevent situations like this, UBIFS writes pages back only if they are
656 * within last synchronized inode size, i.e. the size which has been
657 * written to the flash media last time. Otherwise, UBIFS forces inode
658 * write-back, thus making sure the on-flash inode contains current inode size,
659 * and then keeps writing pages back.
660 *
661 * Some locking issues explanation. 'ubifs_writepage()' first is called with
662 * the page locked, and it locks @ui_mutex. However, write-back does not take inode
663 * @i_mutex, which means other VFS operations may be run on this inode at the
664 * same time. And the problematic one is truncation to smaller size, from where
665 * we have to call 'vmtruncate()', which first changes @inode->i_size, then
666 * drops the truncated pages. And while dropping the pages, it takes the page
667 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
668 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
669 * means that @inode->i_size is changed while @ui_mutex is unlocked.
670 *
671 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
672 * inode size. How do we do this if @inode->i_size may become smaller while we
673 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
674 * @ui->ui_size "shadow" field which UBIFS uses instead of @inode->i_size
675 * internally and updates it under @ui_mutex.
676 *
677 * Q: why we do not worry that if we race with truncation, we may end up with a
678 * situation when the inode is truncated while we are in the middle of
679 * 'do_writepage()', so we do write beyond inode size?
680 * A: If we are in the middle of 'do_writepage()', truncation would be locked
681 * on the page lock and it would not write the truncated inode node to the
682 * journal before we have finished.
683 */
684static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
685{
686 struct inode *inode = page->mapping->host;
687 struct ubifs_inode *ui = ubifs_inode(inode);
688 loff_t i_size = i_size_read(inode), synced_i_size;
689 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
690 int err, len = i_size & (PAGE_CACHE_SIZE - 1);
691 void *kaddr;
692
693 dbg_gen("ino %lu, pg %lu, pg flags %#lx",
694 inode->i_ino, page->index, page->flags);
695 ubifs_assert(PagePrivate(page));
696
697 /* Is the page fully outside @i_size? (truncate in progress) */
698 if (page->index > end_index || (page->index == end_index && !len)) {
699 err = 0;
700 goto out_unlock;
701 }
702
703 spin_lock(&ui->ui_lock);
704 synced_i_size = ui->synced_i_size;
705 spin_unlock(&ui->ui_lock);
706
707 /* Is the page fully inside @i_size? */
708 if (page->index < end_index) {
709 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
710 err = inode->i_sb->s_op->write_inode(inode, 1);
711 if (err)
712 goto out_unlock;
713 /*
714 * The inode has been written, but the write-buffer has
715 * not been synchronized, so in case of an unclean
716 * reboot we may end up with some pages beyond inode
717 * size, but they would be in the journal (because
718 * commit flushes write buffers) and recovery would deal
719 * with this.
720 */
721 }
722 return do_writepage(page, PAGE_CACHE_SIZE);
723 }
724
725 /*
726 * The page straddles @i_size. It must be zeroed out on each and every
727 * writepage invocation because it may be mmapped. "A file is mapped
728 * in multiples of the page size. For a file that is not a multiple of
729 * the page size, the remaining memory is zeroed when mapped, and
730 * writes to that region are not written out to the file."
731 */
732 kaddr = kmap_atomic(page, KM_USER0);
733 memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
734 flush_dcache_page(page);
735 kunmap_atomic(kaddr, KM_USER0);
736
737 if (i_size > synced_i_size) {
738 err = inode->i_sb->s_op->write_inode(inode, 1);
739 if (err)
740 goto out_unlock;
741 }
742
743 return do_writepage(page, len);
744
745out_unlock:
746 unlock_page(page);
747 return err;
748}
749
750/**
751 * do_attr_changes - change inode attributes.
752 * @inode: inode to change attributes for
753 * @attr: describes attributes to change
754 */
755static void do_attr_changes(struct inode *inode, const struct iattr *attr)
756{
757 if (attr->ia_valid & ATTR_UID)
758 inode->i_uid = attr->ia_uid;
759 if (attr->ia_valid & ATTR_GID)
760 inode->i_gid = attr->ia_gid;
761 if (attr->ia_valid & ATTR_ATIME)
762 inode->i_atime = timespec_trunc(attr->ia_atime,
763 inode->i_sb->s_time_gran);
764 if (attr->ia_valid & ATTR_MTIME)
765 inode->i_mtime = timespec_trunc(attr->ia_mtime,
766 inode->i_sb->s_time_gran);
767 if (attr->ia_valid & ATTR_CTIME)
768 inode->i_ctime = timespec_trunc(attr->ia_ctime,
769 inode->i_sb->s_time_gran);
770 if (attr->ia_valid & ATTR_MODE) {
771 umode_t mode = attr->ia_mode;
772
773 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
774 mode &= ~S_ISGID;
775 inode->i_mode = mode;
776 }
777}
778
779/**
780 * do_truncation - truncate an inode.
781 * @c: UBIFS file-system description object
782 * @inode: inode to truncate
783 * @attr: inode attribute changes description
784 *
785 * This function implements VFS '->setattr()' call when the inode is truncated
786 * to a smaller size. Returns zero in case of success and a negative error code
787 * in case of failure.
788 */
789static int do_truncation(struct ubifs_info *c, struct inode *inode,
790 const struct iattr *attr)
791{
792 int err;
793 struct ubifs_budget_req req;
794 loff_t old_size = inode->i_size, new_size = attr->ia_size;
795 int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
796 struct ubifs_inode *ui = ubifs_inode(inode);
797
798 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
799 memset(&req, 0, sizeof(struct ubifs_budget_req));
800
801 /*
802 * If this is truncation to a smaller size, and we do not truncate on a
803 * block boundary, budget for changing one data block, because the last
804 * block will be re-written.
805 */
806 if (new_size & (UBIFS_BLOCK_SIZE - 1))
807 req.dirtied_page = 1;
808
809 req.dirtied_ino = 1;
810 /* A funny way to budget for truncation node */
811 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
812 err = ubifs_budget_space(c, &req);
813 if (err)
814 return err;
815
816 err = vmtruncate(inode, new_size);
817 if (err)
818 goto out_budg;
819
820 if (offset) {
821 pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
822 struct page *page;
823
824 page = find_lock_page(inode->i_mapping, index);
825 if (page) {
826 if (PageDirty(page)) {
827 /*
828 * 'ubifs_jnl_truncate()' will try to truncate
829 * the last data node, but it contains
830 * out-of-date data because the page is dirty.
831 * Write the page now, so that
832 * 'ubifs_jnl_truncate()' will see an already
833 * truncated (and up to date) data node.
834 */
835 ubifs_assert(PagePrivate(page));
836
837 clear_page_dirty_for_io(page);
838 if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
839 offset = new_size &
840 (PAGE_CACHE_SIZE - 1);
841 err = do_writepage(page, offset);
842 page_cache_release(page);
843 if (err)
844 goto out_budg;
845 /*
846 * We could now tell 'ubifs_jnl_truncate()' not
847 * to read the last block.
848 */
849 } else {
850 /*
851 * We could 'kmap()' the page and pass the data
852 * to 'ubifs_jnl_truncate()' to save it from
853 * having to read it.
854 */
855 unlock_page(page);
856 page_cache_release(page);
857 }
858 }
859 }
860
861 mutex_lock(&ui->ui_mutex);
862 ui->ui_size = inode->i_size;
863 /* Truncation changes inode [mc]time */
864 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
865 /* The other attributes may be changed at the same time as well */
866 do_attr_changes(inode, attr);
867
868 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
869 mutex_unlock(&ui->ui_mutex);
870out_budg:
871 ubifs_release_budget(c, &req);
872 return err;
873}
874
875/**
876 * do_setattr - change inode attributes.
877 * @c: UBIFS file-system description object
878 * @inode: inode to change attributes for
879 * @attr: inode attribute changes description
880 *
881 * This function implements VFS '->setattr()' call for all cases except
882 * truncations to smaller size. Returns zero in case of success and a negative
883 * error code in case of failure.
884 */
885static int do_setattr(struct ubifs_info *c, struct inode *inode,
886 const struct iattr *attr)
887{
888 int err, release;
889 loff_t new_size = attr->ia_size;
890 struct ubifs_inode *ui = ubifs_inode(inode);
891 struct ubifs_budget_req req = { .dirtied_ino = 1,
892 .dirtied_ino_d = ui->data_len };
893
894 err = ubifs_budget_space(c, &req);
895 if (err)
896 return err;
897
898 if (attr->ia_valid & ATTR_SIZE) {
899 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
900 err = vmtruncate(inode, new_size);
901 if (err)
902 goto out;
903 }
904
905 mutex_lock(&ui->ui_mutex);
906 if (attr->ia_valid & ATTR_SIZE) {
907 /* Truncation changes inode [mc]time */
908 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
909 /* 'vmtruncate()' changed @i_size, update @ui_size */
910 ui->ui_size = inode->i_size;
911 }
912
913 do_attr_changes(inode, attr);
914
915 release = ui->dirty;
916 if (attr->ia_valid & ATTR_SIZE)
917 /*
918 * Inode length changed, so we have to make sure
919 * @I_DIRTY_DATASYNC is set.
920 */
921 __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
922 else
923 mark_inode_dirty_sync(inode);
924 mutex_unlock(&ui->ui_mutex);
925
926 if (release)
927 ubifs_release_budget(c, &req);
928 if (IS_SYNC(inode))
929 err = inode->i_sb->s_op->write_inode(inode, 1);
930 return err;
931
932out:
933 ubifs_release_budget(c, &req);
934 return err;
935}
936
937int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
938{
939 int err;
940 struct inode *inode = dentry->d_inode;
941 struct ubifs_info *c = inode->i_sb->s_fs_info;
942
943 dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid);
944 err = inode_change_ok(inode, attr);
945 if (err)
946 return err;
947
948 err = dbg_check_synced_i_size(inode);
949 if (err)
950 return err;
951
952 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size)
953 /* Truncation to a smaller size */
954 err = do_truncation(c, inode, attr);
955 else
956 err = do_setattr(c, inode, attr);
957
958 return err;
959}
960
961static void ubifs_invalidatepage(struct page *page, unsigned long offset)
962{
963 struct inode *inode = page->mapping->host;
964 struct ubifs_info *c = inode->i_sb->s_fs_info;
965
966 ubifs_assert(PagePrivate(page));
967 if (offset)
968 /* Partial page remains dirty */
969 return;
970
971 if (PageChecked(page))
972 release_new_page_budget(c);
973 else
974 release_existing_page_budget(c);
975
976 atomic_long_dec(&c->dirty_pg_cnt);
977 ClearPagePrivate(page);
978 ClearPageChecked(page);
979}
980
981static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
982{
983 struct ubifs_inode *ui = ubifs_inode(dentry->d_inode);
984
985 nd_set_link(nd, ui->data);
986 return NULL;
987}
988
989int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
990{
991 struct inode *inode = dentry->d_inode;
992 struct ubifs_info *c = inode->i_sb->s_fs_info;
993 int err;
994
995 dbg_gen("syncing inode %lu", inode->i_ino);
996
997 /*
998 * VFS has already synchronized dirty pages for this inode. Synchronize
999 * the inode unless this is a 'datasync()' call.
1000 */
1001 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1002 err = inode->i_sb->s_op->write_inode(inode, 1);
1003 if (err)
1004 return err;
1005 }
1006
1007 /*
1008 * Nodes related to this inode may still sit in a write-buffer. Flush
1009 * them.
1010 */
1011 err = ubifs_sync_wbufs_by_inode(c, inode);
1012 if (err)
1013 return err;
1014
1015 return 0;
1016}
1017
1018/**
1019 * mctime_update_needed - check if mtime or ctime update is needed.
1020 * @inode: the inode to do the check for
1021 * @now: current time
1022 *
1023 * This helper function checks if the inode mtime/ctime should be updated or
1024 * not. If current values of the time-stamps are within the UBIFS inode time
1025 * granularity, they are not updated. This is an optimization.
1026 */
1027static inline int mctime_update_needed(const struct inode *inode,
1028 const struct timespec *now)
1029{
1030 if (!timespec_equal(&inode->i_mtime, now) ||
1031 !timespec_equal(&inode->i_ctime, now))
1032 return 1;
1033 return 0;
1034}
1035
1036/**
1037 * update_ctime - update mtime and ctime of an inode.
1038 * @c: UBIFS file-system description object
1039 * @inode: inode to update
1040 *
1041 * This function updates mtime and ctime of the inode if it is not equivalent to
1042 * current time. Returns zero in case of success and a negative error code in
1043 * case of failure.
1044 */
1045static int update_mctime(struct ubifs_info *c, struct inode *inode)
1046{
1047 struct timespec now = ubifs_current_time(inode);
1048 struct ubifs_inode *ui = ubifs_inode(inode);
1049
1050 if (mctime_update_needed(inode, &now)) {
1051 int err, release;
1052 struct ubifs_budget_req req = { .dirtied_ino = 1,
1053 .dirtied_ino_d = ui->data_len };
1054
1055 err = ubifs_budget_space(c, &req);
1056 if (err)
1057 return err;
1058
1059 mutex_lock(&ui->ui_mutex);
1060 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1061 release = ui->dirty;
1062 mark_inode_dirty_sync(inode);
1063 mutex_unlock(&ui->ui_mutex);
1064 if (release)
1065 ubifs_release_budget(c, &req);
1066 }
1067
1068 return 0;
1069}
1070
1071static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1072 unsigned long nr_segs, loff_t pos)
1073{
1074 int err;
1075 ssize_t ret;
1076 struct inode *inode = iocb->ki_filp->f_mapping->host;
1077 struct ubifs_info *c = inode->i_sb->s_fs_info;
1078
1079 err = update_mctime(c, inode);
1080 if (err)
1081 return err;
1082
1083 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
1084 if (ret < 0)
1085 return ret;
1086
1087 if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
1088 err = ubifs_sync_wbufs_by_inode(c, inode);
1089 if (err)
1090 return err;
1091 }
1092
1093 return ret;
1094}
1095
/*
 * The '->set_page_dirty()' address space operation. Forwards to
 * '__set_page_dirty_nobuffers()' and asserts that it returned zero.
 */
static int ubifs_set_page_dirty(struct page *page)
{
	int newly_dirtied = __set_page_dirty_nobuffers(page);

	/*
	 * A non-zero result would be an attempt to dirty a page without
	 * budgeting for it - should not happen.
	 */
	ubifs_assert(newly_dirtied == 0);
	return newly_dirtied;
}
1108
1109static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1110{
1111 /*
1112 * An attempt to release a dirty page without budgeting for it - should
1113 * not happen.
1114 */
1115 if (PageWriteback(page))
1116 return 0;
1117 ubifs_assert(PagePrivate(page));
1118 ubifs_assert(0);
1119 ClearPagePrivate(page);
1120 ClearPageChecked(page);
1121 return 1;
1122}
1123
1124/*
1125 * mmap()d file has taken write protection fault and is being made
1126 * writable. UBIFS must ensure page is budgeted for.
1127 */
1128static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1129{
1130 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1131 struct ubifs_info *c = inode->i_sb->s_fs_info;
1132 struct timespec now = ubifs_current_time(inode);
1133 struct ubifs_budget_req req = { .new_page = 1 };
1134 int err, update_time;
1135
1136 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1137 i_size_read(inode));
1138 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
1139
1140 if (unlikely(c->ro_media))
1141 return -EROFS;
1142
1143 /*
1144 * We have not locked @page so far so we may budget for changing the
1145 * page. Note, we cannot do this after we locked the page, because
1146 * budgeting may cause write-back which would cause deadlock.
1147 *
1148 * At the moment we do not know whether the page is dirty or not, so we
1149 * assume that it is not and budget for a new page. We could look at
1150 * the @PG_private flag and figure this out, but we may race with write
1151 * back and the page state may change by the time we lock it, so this
1152 * would need additional care. We do not bother with this at the
1153 * moment, although it might be good idea to do. Instead, we allocate
1154 * budget for a new page and amend it later on if the page was in fact
1155 * dirty.
1156 *
1157 * The budgeting-related logic of this function is similar to what we
1158 * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there
1159 * for more comments.
1160 */
1161 update_time = mctime_update_needed(inode, &now);
1162 if (update_time)
1163 /*
1164 * We have to change inode time stamp which requires extra
1165 * budgeting.
1166 */
1167 req.dirtied_ino = 1;
1168
1169 err = ubifs_budget_space(c, &req);
1170 if (unlikely(err)) {
1171 if (err == -ENOSPC)
1172 ubifs_warn("out of space for mmapped file "
1173 "(inode number %lu)", inode->i_ino);
1174 return err;
1175 }
1176
1177 lock_page(page);
1178 if (unlikely(page->mapping != inode->i_mapping ||
1179 page_offset(page) > i_size_read(inode))) {
1180 /* Page got truncated out from underneath us */
1181 err = -EINVAL;
1182 goto out_unlock;
1183 }
1184
1185 if (PagePrivate(page))
1186 release_new_page_budget(c);
1187 else {
1188 if (!PageChecked(page))
1189 ubifs_convert_page_budget(c);
1190 SetPagePrivate(page);
1191 atomic_long_inc(&c->dirty_pg_cnt);
1192 __set_page_dirty_nobuffers(page);
1193 }
1194
1195 if (update_time) {
1196 int release;
1197 struct ubifs_inode *ui = ubifs_inode(inode);
1198
1199 mutex_lock(&ui->ui_mutex);
1200 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1201 release = ui->dirty;
1202 mark_inode_dirty_sync(inode);
1203 mutex_unlock(&ui->ui_mutex);
1204 if (release)
1205 ubifs_release_dirty_inode_budget(c, ui);
1206 }
1207
1208 unlock_page(page);
1209 return 0;
1210
1211out_unlock:
1212 unlock_page(page);
1213 ubifs_release_budget(c, &req);
1214 return err;
1215}
1216
1217static struct vm_operations_struct ubifs_file_vm_ops = {
1218 .fault = filemap_fault,
1219 .page_mkwrite = ubifs_vm_page_mkwrite,
1220};
1221
1222static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1223{
1224 int err;
1225
1226 /* 'generic_file_mmap()' takes care of NOMMU case */
1227 err = generic_file_mmap(file, vma);
1228 if (err)
1229 return err;
1230 vma->vm_ops = &ubifs_file_vm_ops;
1231 return 0;
1232}
1233
1234struct address_space_operations ubifs_file_address_operations = {
1235 .readpage = ubifs_readpage,
1236 .writepage = ubifs_writepage,
1237 .write_begin = ubifs_write_begin,
1238 .write_end = ubifs_write_end,
1239 .invalidatepage = ubifs_invalidatepage,
1240 .set_page_dirty = ubifs_set_page_dirty,
1241 .releasepage = ubifs_releasepage,
1242};
1243
1244struct inode_operations ubifs_file_inode_operations = {
1245 .setattr = ubifs_setattr,
1246 .getattr = ubifs_getattr,
1247#ifdef CONFIG_UBIFS_FS_XATTR
1248 .setxattr = ubifs_setxattr,
1249 .getxattr = ubifs_getxattr,
1250 .listxattr = ubifs_listxattr,
1251 .removexattr = ubifs_removexattr,
1252#endif
1253};
1254
1255struct inode_operations ubifs_symlink_inode_operations = {
1256 .readlink = generic_readlink,
1257 .follow_link = ubifs_follow_link,
1258 .setattr = ubifs_setattr,
1259 .getattr = ubifs_getattr,
1260};
1261
1262struct file_operations ubifs_file_operations = {
1263 .llseek = generic_file_llseek,
1264 .read = do_sync_read,
1265 .write = do_sync_write,
1266 .aio_read = generic_file_aio_read,
1267 .aio_write = ubifs_aio_write,
1268 .mmap = ubifs_file_mmap,
1269 .fsync = ubifs_fsync,
1270 .unlocked_ioctl = ubifs_ioctl,
1271 .splice_read = generic_file_splice_read,
1272#ifdef CONFIG_COMPAT
1273 .compat_ioctl = ubifs_compat_ioctl,
1274#endif
1275};
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
new file mode 100644
index 000000000000..10394c548367
--- /dev/null
+++ b/fs/ubifs/find.c
@@ -0,0 +1,975 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file contains functions for finding LEBs for various purposes e.g.
25 * garbage collection. In general, lprops category heaps and lists are used
26 * for fast access, falling back on scanning the LPT as a last resort.
27 */
28
29#include <linux/sort.h>
30#include "ubifs.h"
31
/**
 * struct scan_data - parameters and result shared with LPT scan callbacks.
 * @min_space: the scan looks for at least this many bytes
 * @pick_free: non-zero if empty LEBs are acceptable
 * @lnum: the callback stores the number of the LEB it selected here
 * @exclude_index: non-zero if index LEBs must be skipped
 */
struct scan_data {
	int min_space;		/* input */
	int pick_free;		/* input */
	int lnum;		/* output */
	int exclude_index;	/* input */
};
45
46/**
47 * valuable - determine whether LEB properties are valuable.
48 * @c: the UBIFS file-system description object
49 * @lprops: LEB properties
50 *
51 * This function return %1 if the LEB properties should be added to the LEB
52 * properties tree in memory. Otherwise %0 is returned.
53 */
54static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
55{
56 int n, cat = lprops->flags & LPROPS_CAT_MASK;
57 struct ubifs_lpt_heap *heap;
58
59 switch (cat) {
60 case LPROPS_DIRTY:
61 case LPROPS_DIRTY_IDX:
62 case LPROPS_FREE:
63 heap = &c->lpt_heap[cat - 1];
64 if (heap->cnt < heap->max_cnt)
65 return 1;
66 if (lprops->free + lprops->dirty >= c->dark_wm)
67 return 1;
68 return 0;
69 case LPROPS_EMPTY:
70 n = c->lst.empty_lebs + c->freeable_cnt -
71 c->lst.taken_empty_lebs;
72 if (n < c->lsave_cnt)
73 return 1;
74 return 0;
75 case LPROPS_FREEABLE:
76 return 1;
77 case LPROPS_FRDI_IDX:
78 return 1;
79 }
80 return 0;
81}
82
83/**
84 * scan_for_dirty_cb - dirty space scan callback.
85 * @c: the UBIFS file-system description object
86 * @lprops: LEB properties to scan
87 * @in_tree: whether the LEB properties are in main memory
88 * @data: information passed to and from the caller of the scan
89 *
90 * This function returns a code that indicates whether the scan should continue
91 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
92 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
93 * (%LPT_SCAN_STOP).
94 */
95static int scan_for_dirty_cb(struct ubifs_info *c,
96 const struct ubifs_lprops *lprops, int in_tree,
97 struct scan_data *data)
98{
99 int ret = LPT_SCAN_CONTINUE;
100
101 /* Exclude LEBs that are currently in use */
102 if (lprops->flags & LPROPS_TAKEN)
103 return LPT_SCAN_CONTINUE;
104 /* Determine whether to add these LEB properties to the tree */
105 if (!in_tree && valuable(c, lprops))
106 ret |= LPT_SCAN_ADD;
107 /* Exclude LEBs with too little space */
108 if (lprops->free + lprops->dirty < data->min_space)
109 return ret;
110 /* If specified, exclude index LEBs */
111 if (data->exclude_index && lprops->flags & LPROPS_INDEX)
112 return ret;
113 /* If specified, exclude empty or freeable LEBs */
114 if (lprops->free + lprops->dirty == c->leb_size) {
115 if (!data->pick_free)
116 return ret;
117 /* Exclude LEBs with too little dirty space (unless it is empty) */
118 } else if (lprops->dirty < c->dead_wm)
119 return ret;
120 /* Finally we found space */
121 data->lnum = lprops->lnum;
122 return LPT_SCAN_ADD | LPT_SCAN_STOP;
123}
124
125/**
126 * scan_for_dirty - find a data LEB with free space.
127 * @c: the UBIFS file-system description object
128 * @min_space: minimum amount free plus dirty space the returned LEB has to
129 * have
130 * @pick_free: if it is OK to return a free or freeable LEB
131 * @exclude_index: whether to exclude index LEBs
132 *
133 * This function returns a pointer to the LEB properties found or a negative
134 * error code.
135 */
136static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
137 int min_space, int pick_free,
138 int exclude_index)
139{
140 const struct ubifs_lprops *lprops;
141 struct ubifs_lpt_heap *heap;
142 struct scan_data data;
143 int err, i;
144
145 /* There may be an LEB with enough dirty space on the free heap */
146 heap = &c->lpt_heap[LPROPS_FREE - 1];
147 for (i = 0; i < heap->cnt; i++) {
148 lprops = heap->arr[i];
149 if (lprops->free + lprops->dirty < min_space)
150 continue;
151 if (lprops->dirty < c->dead_wm)
152 continue;
153 return lprops;
154 }
155 /*
156 * A LEB may have fallen off of the bottom of the dirty heap, and ended
157 * up as uncategorized even though it has enough dirty space for us now,
158 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
159 * can end up as uncategorized because they are kept on lists not
160 * finite-sized heaps.
161 */
162 list_for_each_entry(lprops, &c->uncat_list, list) {
163 if (lprops->flags & LPROPS_TAKEN)
164 continue;
165 if (lprops->free + lprops->dirty < min_space)
166 continue;
167 if (exclude_index && (lprops->flags & LPROPS_INDEX))
168 continue;
169 if (lprops->dirty < c->dead_wm)
170 continue;
171 return lprops;
172 }
173 /* We have looked everywhere in main memory, now scan the flash */
174 if (c->pnodes_have >= c->pnode_cnt)
175 /* All pnodes are in memory, so skip scan */
176 return ERR_PTR(-ENOSPC);
177 data.min_space = min_space;
178 data.pick_free = pick_free;
179 data.lnum = -1;
180 data.exclude_index = exclude_index;
181 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
182 (ubifs_lpt_scan_callback)scan_for_dirty_cb,
183 &data);
184 if (err)
185 return ERR_PTR(err);
186 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
187 c->lscan_lnum = data.lnum;
188 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
189 if (IS_ERR(lprops))
190 return lprops;
191 ubifs_assert(lprops->lnum == data.lnum);
192 ubifs_assert(lprops->free + lprops->dirty >= min_space);
193 ubifs_assert(lprops->dirty >= c->dead_wm ||
194 (pick_free &&
195 lprops->free + lprops->dirty == c->leb_size));
196 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
197 ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX));
198 return lprops;
199}
200
201/**
202 * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector.
203 * @c: the UBIFS file-system description object
204 * @ret_lp: LEB properties are returned here on exit
205 * @min_space: minimum amount free plus dirty space the returned LEB has to
206 * have
207 * @pick_free: controls whether it is OK to pick empty or index LEBs
208 *
209 * This function tries to find a dirty logical eraseblock which has at least
210 * @min_space free and dirty space. It prefers to take an LEB from the dirty or
211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
212 * or do not have an LEB which satisfies the @min_space criteria.
213 *
214 * Note:
215 * o LEBs which have less than dead watermark of dirty space are never picked
216 * by this function;
217 *
218 * Returns zero and the LEB properties of
219 * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
220 * negative error code in case of other failures. The returned LEB is marked as
221 * "taken".
222 *
223 * The additional @pick_free argument controls if this function has to return a
224 * free or freeable LEB if one is present. For example, GC must to set it to %1,
225 * when called from the journal space reservation function, because the
226 * appearance of free space may coincide with the loss of enough dirty space
227 * for GC to succeed anyway.
228 *
229 * In contrast, if the Garbage Collector is called from budgeting, it should
230 * just make free space, not return LEBs which are already free or freeable.
231 *
232 * In addition @pick_free is set to %2 by the recovery process in order to
233 * recover gc_lnum in which case an index LEB must not be returned.
234 */
235int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
236 int min_space, int pick_free)
237{
238 int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0;
239 const struct ubifs_lprops *lp = NULL, *idx_lp = NULL;
240 struct ubifs_lpt_heap *heap, *idx_heap;
241
242 ubifs_get_lprops(c);
243
244 if (pick_free) {
245 int lebs, rsvd_idx_lebs = 0;
246
247 spin_lock(&c->space_lock);
248 lebs = c->lst.empty_lebs;
249 lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
250
251 /*
252 * Note, the index may consume more LEBs than have been reserved
253 * for it. It is OK because it might be consolidated by GC.
254 * But if the index takes fewer LEBs than it is reserved for it,
255 * this function must avoid picking those reserved LEBs.
256 */
257 if (c->min_idx_lebs >= c->lst.idx_lebs) {
258 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
259 exclude_index = 1;
260 }
261 spin_unlock(&c->space_lock);
262
263 /* Check if there are enough free LEBs for the index */
264 if (rsvd_idx_lebs < lebs) {
265 /* OK, try to find an empty LEB */
266 lp = ubifs_fast_find_empty(c);
267 if (lp)
268 goto found;
269
270 /* Or a freeable LEB */
271 lp = ubifs_fast_find_freeable(c);
272 if (lp)
273 goto found;
274 } else
275 /*
276 * We cannot pick free/freeable LEBs in the below code.
277 */
278 pick_free = 0;
279 } else {
280 spin_lock(&c->space_lock);
281 exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
282 spin_unlock(&c->space_lock);
283 }
284
285 /* Look on the dirty and dirty index heaps */
286 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
287 idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
288
289 if (idx_heap->cnt && !exclude_index) {
290 idx_lp = idx_heap->arr[0];
291 sum = idx_lp->free + idx_lp->dirty;
292 /*
293 * Since we reserve twice as more space for the index than it
294 * actually takes, it does not make sense to pick indexing LEBs
295 * with less than half LEB of dirty space.
296 */
297 if (sum < min_space || sum < c->half_leb_size)
298 idx_lp = NULL;
299 }
300
301 if (heap->cnt) {
302 lp = heap->arr[0];
303 if (lp->dirty + lp->free < min_space)
304 lp = NULL;
305 }
306
307 /* Pick the LEB with most space */
308 if (idx_lp && lp) {
309 if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty)
310 lp = idx_lp;
311 } else if (idx_lp && !lp)
312 lp = idx_lp;
313
314 if (lp) {
315 ubifs_assert(lp->dirty >= c->dead_wm);
316 goto found;
317 }
318
319 /* Did not find a dirty LEB on the dirty heaps, have to scan */
320 dbg_find("scanning LPT for a dirty LEB");
321 lp = scan_for_dirty(c, min_space, pick_free, exclude_index);
322 if (IS_ERR(lp)) {
323 err = PTR_ERR(lp);
324 goto out;
325 }
326 ubifs_assert(lp->dirty >= c->dead_wm ||
327 (pick_free && lp->free + lp->dirty == c->leb_size));
328
329found:
330 dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
331 lp->lnum, lp->free, lp->dirty, lp->flags);
332
333 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
334 lp->flags | LPROPS_TAKEN, 0);
335 if (IS_ERR(lp)) {
336 err = PTR_ERR(lp);
337 goto out;
338 }
339
340 memcpy(ret_lp, lp, sizeof(struct ubifs_lprops));
341
342out:
343 ubifs_release_lprops(c);
344 return err;
345}
346
347/**
348 * scan_for_free_cb - free space scan callback.
349 * @c: the UBIFS file-system description object
350 * @lprops: LEB properties to scan
351 * @in_tree: whether the LEB properties are in main memory
352 * @data: information passed to and from the caller of the scan
353 *
354 * This function returns a code that indicates whether the scan should continue
355 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
356 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
357 * (%LPT_SCAN_STOP).
358 */
359static int scan_for_free_cb(struct ubifs_info *c,
360 const struct ubifs_lprops *lprops, int in_tree,
361 struct scan_data *data)
362{
363 int ret = LPT_SCAN_CONTINUE;
364
365 /* Exclude LEBs that are currently in use */
366 if (lprops->flags & LPROPS_TAKEN)
367 return LPT_SCAN_CONTINUE;
368 /* Determine whether to add these LEB properties to the tree */
369 if (!in_tree && valuable(c, lprops))
370 ret |= LPT_SCAN_ADD;
371 /* Exclude index LEBs */
372 if (lprops->flags & LPROPS_INDEX)
373 return ret;
374 /* Exclude LEBs with too little space */
375 if (lprops->free < data->min_space)
376 return ret;
377 /* If specified, exclude empty LEBs */
378 if (!data->pick_free && lprops->free == c->leb_size)
379 return ret;
380 /*
381 * LEBs that have only free and dirty space must not be allocated
382 * because they may have been unmapped already or they may have data
383 * that is obsolete only because of nodes that are still sitting in a
384 * wbuf.
385 */
386 if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0)
387 return ret;
388 /* Finally we found space */
389 data->lnum = lprops->lnum;
390 return LPT_SCAN_ADD | LPT_SCAN_STOP;
391}
392
393/**
394 * do_find_free_space - find a data LEB with free space.
395 * @c: the UBIFS file-system description object
396 * @min_space: minimum amount of free space required
397 * @pick_free: whether it is OK to scan for empty LEBs
398 * @squeeze: whether to try to find space in a non-empty LEB first
399 *
400 * This function returns a pointer to the LEB properties found or a negative
401 * error code.
402 */
403static
404const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
405 int min_space, int pick_free,
406 int squeeze)
407{
408 const struct ubifs_lprops *lprops;
409 struct ubifs_lpt_heap *heap;
410 struct scan_data data;
411 int err, i;
412
413 if (squeeze) {
414 lprops = ubifs_fast_find_free(c);
415 if (lprops && lprops->free >= min_space)
416 return lprops;
417 }
418 if (pick_free) {
419 lprops = ubifs_fast_find_empty(c);
420 if (lprops)
421 return lprops;
422 }
423 if (!squeeze) {
424 lprops = ubifs_fast_find_free(c);
425 if (lprops && lprops->free >= min_space)
426 return lprops;
427 }
428 /* There may be an LEB with enough free space on the dirty heap */
429 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
430 for (i = 0; i < heap->cnt; i++) {
431 lprops = heap->arr[i];
432 if (lprops->free >= min_space)
433 return lprops;
434 }
435 /*
436 * A LEB may have fallen off of the bottom of the free heap, and ended
437 * up as uncategorized even though it has enough free space for us now,
438 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
439 * can end up as uncategorized because they are kept on lists not
440 * finite-sized heaps.
441 */
442 list_for_each_entry(lprops, &c->uncat_list, list) {
443 if (lprops->flags & LPROPS_TAKEN)
444 continue;
445 if (lprops->flags & LPROPS_INDEX)
446 continue;
447 if (lprops->free >= min_space)
448 return lprops;
449 }
450 /* We have looked everywhere in main memory, now scan the flash */
451 if (c->pnodes_have >= c->pnode_cnt)
452 /* All pnodes are in memory, so skip scan */
453 return ERR_PTR(-ENOSPC);
454 data.min_space = min_space;
455 data.pick_free = pick_free;
456 data.lnum = -1;
457 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
458 (ubifs_lpt_scan_callback)scan_for_free_cb,
459 &data);
460 if (err)
461 return ERR_PTR(err);
462 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
463 c->lscan_lnum = data.lnum;
464 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
465 if (IS_ERR(lprops))
466 return lprops;
467 ubifs_assert(lprops->lnum == data.lnum);
468 ubifs_assert(lprops->free >= min_space);
469 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
470 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
471 return lprops;
472}
473
474/**
475 * ubifs_find_free_space - find a data LEB with free space.
476 * @c: the UBIFS file-system description object
477 * @min_space: minimum amount of required free space
478 * @free: contains amount of free space in the LEB on exit
479 * @squeeze: whether to try to find space in a non-empty LEB first
480 *
481 * This function looks for an LEB with at least @min_space bytes of free space.
482 * It tries to find an empty LEB if possible. If no empty LEBs are available,
483 * this function searches for a non-empty data LEB. The returned LEB is marked
484 * as "taken".
485 *
486 * This function returns found LEB number in case of success, %-ENOSPC if it
487 * failed to find a LEB with @min_space bytes of free space and other a negative
488 * error codes in case of failure.
489 */
490int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
491 int squeeze)
492{
493 const struct ubifs_lprops *lprops;
494 int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags;
495
496 dbg_find("min_space %d", min_space);
497 ubifs_get_lprops(c);
498
499 /* Check if there are enough empty LEBs for commit */
500 spin_lock(&c->space_lock);
501 if (c->min_idx_lebs > c->lst.idx_lebs)
502 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
503 else
504 rsvd_idx_lebs = 0;
505 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
506 c->lst.taken_empty_lebs;
507 ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
508 if (rsvd_idx_lebs < lebs)
509 /*
510 * OK to allocate an empty LEB, but we still don't want to go
511 * looking for one if there aren't any.
512 */
513 if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
514 pick_free = 1;
515 /*
516 * Because we release the space lock, we must account
517 * for this allocation here. After the LEB properties
518 * flags have been updated, we subtract one. Note, the
519 * result of this is that lprops also decreases
520 * @taken_empty_lebs in 'ubifs_change_lp()', so it is
521 * off by one for a short period of time which may
522 * introduce a small disturbance to budgeting
523 * calculations, but this is harmless because at the
524 * worst case this would make the budgeting subsystem
525 * be more pessimistic than needed.
526 *
527 * Fundamentally, this is about serialization of the
528 * budgeting and lprops subsystems. We could make the
529 * @space_lock a mutex and avoid dropping it before
530 * calling 'ubifs_change_lp()', but mutex is more
531 * heavy-weight, and we want budgeting to be as fast as
532 * possible.
533 */
534 c->lst.taken_empty_lebs += 1;
535 }
536 spin_unlock(&c->space_lock);
537
538 lprops = do_find_free_space(c, min_space, pick_free, squeeze);
539 if (IS_ERR(lprops)) {
540 err = PTR_ERR(lprops);
541 goto out;
542 }
543
544 lnum = lprops->lnum;
545 flags = lprops->flags | LPROPS_TAKEN;
546
547 lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0);
548 if (IS_ERR(lprops)) {
549 err = PTR_ERR(lprops);
550 goto out;
551 }
552
553 if (pick_free) {
554 spin_lock(&c->space_lock);
555 c->lst.taken_empty_lebs -= 1;
556 spin_unlock(&c->space_lock);
557 }
558
559 *free = lprops->free;
560 ubifs_release_lprops(c);
561
562 if (*free == c->leb_size) {
563 /*
564 * Ensure that empty LEBs have been unmapped. They may not have
565 * been, for example, because of an unclean unmount. Also
566 * LEBs that were freeable LEBs (free + dirty == leb_size) will
567 * not have been unmapped.
568 */
569 err = ubifs_leb_unmap(c, lnum);
570 if (err)
571 return err;
572 }
573
574 dbg_find("found LEB %d, free %d", lnum, *free);
575 ubifs_assert(*free >= min_space);
576 return lnum;
577
578out:
579 if (pick_free) {
580 spin_lock(&c->space_lock);
581 c->lst.taken_empty_lebs -= 1;
582 spin_unlock(&c->space_lock);
583 }
584 ubifs_release_lprops(c);
585 return err;
586}
587
588/**
589 * scan_for_idx_cb - callback used by the scan for a free LEB for the index.
590 * @c: the UBIFS file-system description object
591 * @lprops: LEB properties to scan
592 * @in_tree: whether the LEB properties are in main memory
593 * @data: information passed to and from the caller of the scan
594 *
595 * This function returns a code that indicates whether the scan should continue
596 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
597 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
598 * (%LPT_SCAN_STOP).
599 */
600static int scan_for_idx_cb(struct ubifs_info *c,
601 const struct ubifs_lprops *lprops, int in_tree,
602 struct scan_data *data)
603{
604 int ret = LPT_SCAN_CONTINUE;
605
606 /* Exclude LEBs that are currently in use */
607 if (lprops->flags & LPROPS_TAKEN)
608 return LPT_SCAN_CONTINUE;
609 /* Determine whether to add these LEB properties to the tree */
610 if (!in_tree && valuable(c, lprops))
611 ret |= LPT_SCAN_ADD;
612 /* Exclude index LEBS */
613 if (lprops->flags & LPROPS_INDEX)
614 return ret;
615 /* Exclude LEBs that cannot be made empty */
616 if (lprops->free + lprops->dirty != c->leb_size)
617 return ret;
618 /*
619 * We are allocating for the index so it is safe to allocate LEBs with
620 * only free and dirty space, because write buffers are sync'd at commit
621 * start.
622 */
623 data->lnum = lprops->lnum;
624 return LPT_SCAN_ADD | LPT_SCAN_STOP;
625}
626
627/**
628 * scan_for_leb_for_idx - scan for a free LEB for the index.
629 * @c: the UBIFS file-system description object
630 */
631static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
632{
633 struct ubifs_lprops *lprops;
634 struct scan_data data;
635 int err;
636
637 data.lnum = -1;
638 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
639 (ubifs_lpt_scan_callback)scan_for_idx_cb,
640 &data);
641 if (err)
642 return ERR_PTR(err);
643 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
644 c->lscan_lnum = data.lnum;
645 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
646 if (IS_ERR(lprops))
647 return lprops;
648 ubifs_assert(lprops->lnum == data.lnum);
649 ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
650 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
651 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
652 return lprops;
653}
654
655/**
656 * ubifs_find_free_leb_for_idx - find a free LEB for the index.
657 * @c: the UBIFS file-system description object
658 *
659 * This function looks for a free LEB and returns that LEB number. The returned
660 * LEB is marked as "taken", "index".
661 *
662 * Only empty LEBs are allocated. This is for two reasons. First, the commit
663 * calculates the number of LEBs to allocate based on the assumption that they
664 * will be empty. Secondly, free space at the end of an index LEB is not
665 * guaranteed to be empty because it may have been used by the in-the-gaps
666 * method prior to an unclean unmount.
667 *
668 * If no LEB is found %-ENOSPC is returned. For other failures another negative
669 * error code is returned.
670 */
671int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
672{
673 const struct ubifs_lprops *lprops;
674 int lnum = -1, err, flags;
675
676 ubifs_get_lprops(c);
677
678 lprops = ubifs_fast_find_empty(c);
679 if (!lprops) {
680 lprops = ubifs_fast_find_freeable(c);
681 if (!lprops) {
682 ubifs_assert(c->freeable_cnt == 0);
683 if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
684 lprops = scan_for_leb_for_idx(c);
685 if (IS_ERR(lprops)) {
686 err = PTR_ERR(lprops);
687 goto out;
688 }
689 }
690 }
691 }
692
693 if (!lprops) {
694 err = -ENOSPC;
695 goto out;
696 }
697
698 lnum = lprops->lnum;
699
700 dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
701 lnum, lprops->free, lprops->dirty, lprops->flags);
702
703 flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX;
704 lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0);
705 if (IS_ERR(lprops)) {
706 err = PTR_ERR(lprops);
707 goto out;
708 }
709
710 ubifs_release_lprops(c);
711
712 /*
713 * Ensure that empty LEBs have been unmapped. They may not have been,
714 * for example, because of an unclean unmount. Also LEBs that were
715 * freeable LEBs (free + dirty == leb_size) will not have been unmapped.
716 */
717 err = ubifs_leb_unmap(c, lnum);
718 if (err) {
719 ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
720 LPROPS_TAKEN | LPROPS_INDEX, 0);
721 return err;
722 }
723
724 return lnum;
725
726out:
727 ubifs_release_lprops(c);
728 return err;
729}
730
731static int cmp_dirty_idx(const struct ubifs_lprops **a,
732 const struct ubifs_lprops **b)
733{
734 const struct ubifs_lprops *lpa = *a;
735 const struct ubifs_lprops *lpb = *b;
736
737 return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
738}
739
/* 'sort()' swap callback: exchanges two array slots; @size is not used */
static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
			   int size)
{
	struct ubifs_lprops *second = *b;

	*b = *a;
	*a = second;
}
748
749/**
750 * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
751 * @c: the UBIFS file-system description object
752 *
753 * This function is called each commit to create an array of LEB numbers of
754 * dirty index LEBs sorted in order of dirty and free space. This is used by
755 * the in-the-gaps method of TNC commit.
756 */
757int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
758{
759 int i;
760
761 ubifs_get_lprops(c);
762 /* Copy the LPROPS_DIRTY_IDX heap */
763 c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt;
764 memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr,
765 sizeof(void *) * c->dirty_idx.cnt);
766 /* Sort it so that the dirtiest is now at the end */
767 sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
768 (int (*)(const void *, const void *))cmp_dirty_idx,
769 (void (*)(void *, void *, int))swap_dirty_idx);
770 dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
771 if (c->dirty_idx.cnt)
772 dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
773 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum,
774 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty,
775 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free);
776 /* Replace the lprops pointers with LEB numbers */
777 for (i = 0; i < c->dirty_idx.cnt; i++)
778 c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum;
779 ubifs_release_lprops(c);
780 return 0;
781}
782
783/**
784 * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB.
785 * @c: the UBIFS file-system description object
786 * @lprops: LEB properties to scan
787 * @in_tree: whether the LEB properties are in main memory
788 * @data: information passed to and from the caller of the scan
789 *
790 * This function returns a code that indicates whether the scan should continue
791 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
792 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
793 * (%LPT_SCAN_STOP).
794 */
795static int scan_dirty_idx_cb(struct ubifs_info *c,
796 const struct ubifs_lprops *lprops, int in_tree,
797 struct scan_data *data)
798{
799 int ret = LPT_SCAN_CONTINUE;
800
801 /* Exclude LEBs that are currently in use */
802 if (lprops->flags & LPROPS_TAKEN)
803 return LPT_SCAN_CONTINUE;
804 /* Determine whether to add these LEB properties to the tree */
805 if (!in_tree && valuable(c, lprops))
806 ret |= LPT_SCAN_ADD;
807 /* Exclude non-index LEBs */
808 if (!(lprops->flags & LPROPS_INDEX))
809 return ret;
810 /* Exclude LEBs with too little space */
811 if (lprops->free + lprops->dirty < c->min_idx_node_sz)
812 return ret;
813 /* Finally we found space */
814 data->lnum = lprops->lnum;
815 return LPT_SCAN_ADD | LPT_SCAN_STOP;
816}
817
818/**
819 * find_dirty_idx_leb - find a dirty index LEB.
820 * @c: the UBIFS file-system description object
821 *
822 * This function returns LEB number upon success and a negative error code upon
823 * failure. In particular, -ENOSPC is returned if a dirty index LEB is not
824 * found.
825 *
826 * Note that this function scans the entire LPT but it is called very rarely.
827 */
828static int find_dirty_idx_leb(struct ubifs_info *c)
829{
830 const struct ubifs_lprops *lprops;
831 struct ubifs_lpt_heap *heap;
832 struct scan_data data;
833 int err, i, ret;
834
835 /* Check all structures in memory first */
836 data.lnum = -1;
837 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
838 for (i = 0; i < heap->cnt; i++) {
839 lprops = heap->arr[i];
840 ret = scan_dirty_idx_cb(c, lprops, 1, &data);
841 if (ret & LPT_SCAN_STOP)
842 goto found;
843 }
844 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
845 ret = scan_dirty_idx_cb(c, lprops, 1, &data);
846 if (ret & LPT_SCAN_STOP)
847 goto found;
848 }
849 list_for_each_entry(lprops, &c->uncat_list, list) {
850 ret = scan_dirty_idx_cb(c, lprops, 1, &data);
851 if (ret & LPT_SCAN_STOP)
852 goto found;
853 }
854 if (c->pnodes_have >= c->pnode_cnt)
855 /* All pnodes are in memory, so skip scan */
856 return -ENOSPC;
857 err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
858 (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
859 &data);
860 if (err)
861 return err;
862found:
863 ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
864 c->lscan_lnum = data.lnum;
865 lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
866 if (IS_ERR(lprops))
867 return PTR_ERR(lprops);
868 ubifs_assert(lprops->lnum == data.lnum);
869 ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz);
870 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
871 ubifs_assert((lprops->flags & LPROPS_INDEX));
872
873 dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x",
874 lprops->lnum, lprops->free, lprops->dirty, lprops->flags);
875
876 lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC,
877 lprops->flags | LPROPS_TAKEN, 0);
878 if (IS_ERR(lprops))
879 return PTR_ERR(lprops);
880
881 return lprops->lnum;
882}
883
884/**
885 * get_idx_gc_leb - try to get a LEB number from trivial GC.
886 * @c: the UBIFS file-system description object
887 */
888static int get_idx_gc_leb(struct ubifs_info *c)
889{
890 const struct ubifs_lprops *lp;
891 int err, lnum;
892
893 err = ubifs_get_idx_gc_leb(c);
894 if (err < 0)
895 return err;
896 lnum = err;
897 /*
898 * The LEB was due to be unmapped after the commit but
899 * it is needed now for this commit.
900 */
901 lp = ubifs_lpt_lookup_dirty(c, lnum);
902 if (unlikely(IS_ERR(lp)))
903 return PTR_ERR(lp);
904 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
905 lp->flags | LPROPS_INDEX, -1);
906 if (unlikely(IS_ERR(lp)))
907 return PTR_ERR(lp);
908 dbg_find("LEB %d, dirty %d and free %d flags %#x",
909 lp->lnum, lp->dirty, lp->free, lp->flags);
910 return lnum;
911}
912
913/**
914 * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array.
915 * @c: the UBIFS file-system description object
916 */
917static int find_dirtiest_idx_leb(struct ubifs_info *c)
918{
919 const struct ubifs_lprops *lp;
920 int lnum;
921
922 while (1) {
923 if (!c->dirty_idx.cnt)
924 return -ENOSPC;
925 /* The lprops pointers were replaced by LEB numbers */
926 lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt];
927 lp = ubifs_lpt_lookup(c, lnum);
928 if (IS_ERR(lp))
929 return PTR_ERR(lp);
930 if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX))
931 continue;
932 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
933 lp->flags | LPROPS_TAKEN, 0);
934 if (IS_ERR(lp))
935 return PTR_ERR(lp);
936 break;
937 }
938 dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
939 lp->free, lp->flags);
940 ubifs_assert(lp->flags | LPROPS_TAKEN);
941 ubifs_assert(lp->flags | LPROPS_INDEX);
942 return lnum;
943}
944
945/**
946 * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit.
947 * @c: the UBIFS file-system description object
948 *
949 * This function attempts to find an untaken index LEB with the most free and
950 * dirty space that can be used without overwriting index nodes that were in the
951 * last index committed.
952 */
953int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
954{
955 int err;
956
957 ubifs_get_lprops(c);
958
959 /*
960 * We made an array of the dirtiest index LEB numbers as at the start of
961 * last commit. Try that array first.
962 */
963 err = find_dirtiest_idx_leb(c);
964
965 /* Next try scanning the entire LPT */
966 if (err == -ENOSPC)
967 err = find_dirty_idx_leb(c);
968
969 /* Finally take any index LEBs awaiting trivial GC */
970 if (err == -ENOSPC)
971 err = get_idx_gc_leb(c);
972
973 ubifs_release_lprops(c);
974 return err;
975}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
new file mode 100644
index 000000000000..d0f3dac29081
--- /dev/null
+++ b/fs/ubifs/gc.c
@@ -0,0 +1,773 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements garbage collection. The procedure for garbage collection
25 * is different depending on whether a LEB is an index LEB (contains index
26 * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
27 * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
28 * nodes to the journal, at which point the garbage-collected LEB is free to be
29 * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
30 * dirty in the TNC, and after the next commit, the garbage-collected LEB is
31 * to be reused. Garbage collection will cause the number of dirty index nodes
32 * to grow, however sufficient space is reserved for the index to ensure the
33 * commit will never run out of space.
34 */
35
36#include <linux/pagemap.h>
37#include "ubifs.h"
38
39/*
40 * GC tries to optimize the way it fits nodes to available space, and it sorts
41 * nodes a little. The below constants are watermarks which define "large",
42 * "medium", and "small" nodes.
43 */
44#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
46
47/*
48 * GC may need to move more than one LEB to make progress. The below constants
49 * define "soft" and "hard" limits on the number of LEBs the garbage collector
50 * may move.
51 */
52#define SOFT_LEBS_LIMIT 4
53#define HARD_LEBS_LIMIT 32
54
55/**
56 * switch_gc_head - switch the garbage collection journal head.
57 * @c: UBIFS file-system description object
58 * @buf: buffer to write
59 * @len: length of the buffer to write
60 * @lnum: LEB number written is returned here
61 * @offs: offset written is returned here
62 *
63 * This function switch the GC head to the next LEB which is reserved in
64 * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
65 * and other negative error code in case of failures.
66 */
67static int switch_gc_head(struct ubifs_info *c)
68{
69 int err, gc_lnum = c->gc_lnum;
70 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
71
72 ubifs_assert(gc_lnum != -1);
73 dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
74 wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
75 c->leb_size - wbuf->offs - wbuf->used);
76
77 err = ubifs_wbuf_sync_nolock(wbuf);
78 if (err)
79 return err;
80
81 /*
82 * The GC write-buffer was synchronized, we may safely unmap
83 * 'c->gc_lnum'.
84 */
85 err = ubifs_leb_unmap(c, gc_lnum);
86 if (err)
87 return err;
88
89 err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
90 if (err)
91 return err;
92
93 c->gc_lnum = -1;
94 err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
95 return err;
96}
97
98/**
99 * move_nodes - move nodes.
100 * @c: UBIFS file-system description object
101 * @sleb: describes nodes to move
102 *
103 * This function moves valid nodes from data LEB described by @sleb to the GC
104 * journal head. The obsolete nodes are dropped.
105 *
106 * When moving nodes we have to deal with classical bin-packing problem: the
107 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
108 * where the nodes in the @sleb->nodes list are the elements which should be
109 * fit optimally to the bins. This function uses the "first fit decreasing"
110 * strategy, although it does not really sort the nodes but just split them on
111 * 3 classes - large, medium, and small, so they are roughly sorted.
112 *
113 * This function returns zero in case of success, %-EAGAIN if commit is
114 * required, and other negative error codes in case of other failures.
115 */
116static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
117{
118 struct ubifs_scan_node *snod, *tmp;
119 struct list_head large, medium, small;
120 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
121 int avail, err, min = INT_MAX;
122
123 INIT_LIST_HEAD(&large);
124 INIT_LIST_HEAD(&medium);
125 INIT_LIST_HEAD(&small);
126
127 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
128 struct list_head *lst;
129
130 ubifs_assert(snod->type != UBIFS_IDX_NODE);
131 ubifs_assert(snod->type != UBIFS_REF_NODE);
132 ubifs_assert(snod->type != UBIFS_CS_NODE);
133
134 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
135 snod->offs, 0);
136 if (err < 0)
137 goto out;
138
139 lst = &snod->list;
140 list_del(lst);
141 if (!err) {
142 /* The node is obsolete, remove it from the list */
143 kfree(snod);
144 continue;
145 }
146
147 /*
148 * Sort the list of nodes so that large nodes go first, and
149 * small nodes go last.
150 */
151 if (snod->len > MEDIUM_NODE_WM)
152 list_add(lst, &large);
153 else if (snod->len > SMALL_NODE_WM)
154 list_add(lst, &medium);
155 else
156 list_add(lst, &small);
157
158 /* And find the smallest node */
159 if (snod->len < min)
160 min = snod->len;
161 }
162
163 /*
164 * Join the tree lists so that we'd have one roughly sorted list
165 * ('large' will be the head of the joined list).
166 */
167 list_splice(&medium, large.prev);
168 list_splice(&small, large.prev);
169
170 if (wbuf->lnum == -1) {
171 /*
172 * The GC journal head is not set, because it is the first GC
173 * invocation since mount.
174 */
175 err = switch_gc_head(c);
176 if (err)
177 goto out;
178 }
179
180 /* Write nodes to their new location. Use the first-fit strategy */
181 while (1) {
182 avail = c->leb_size - wbuf->offs - wbuf->used;
183 list_for_each_entry_safe(snod, tmp, &large, list) {
184 int new_lnum, new_offs;
185
186 if (avail < min)
187 break;
188
189 if (snod->len > avail)
190 /* This node does not fit */
191 continue;
192
193 cond_resched();
194
195 new_lnum = wbuf->lnum;
196 new_offs = wbuf->offs + wbuf->used;
197 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
198 snod->len);
199 if (err)
200 goto out;
201 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
202 snod->offs, new_lnum, new_offs,
203 snod->len);
204 if (err)
205 goto out;
206
207 avail = c->leb_size - wbuf->offs - wbuf->used;
208 list_del(&snod->list);
209 kfree(snod);
210 }
211
212 if (list_empty(&large))
213 break;
214
215 /*
216 * Waste the rest of the space in the LEB and switch to the
217 * next LEB.
218 */
219 err = switch_gc_head(c);
220 if (err)
221 goto out;
222 }
223
224 return 0;
225
226out:
227 list_for_each_entry_safe(snod, tmp, &large, list) {
228 list_del(&snod->list);
229 kfree(snod);
230 }
231 return err;
232}
233
234/**
235 * gc_sync_wbufs - sync write-buffers for GC.
236 * @c: UBIFS file-system description object
237 *
238 * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
239 * be in a write-buffer instead. That is, a node could be written to a
240 * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
241 * erased before the write-buffer is sync'd and then there is an unclean
242 * unmount, then an existing node is lost. To avoid this, we sync all
243 * write-buffers.
244 *
245 * This function returns %0 on success or a negative error code on failure.
246 */
247static int gc_sync_wbufs(struct ubifs_info *c)
248{
249 int err, i;
250
251 for (i = 0; i < c->jhead_cnt; i++) {
252 if (i == GCHD)
253 continue;
254 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
255 if (err)
256 return err;
257 }
258 return 0;
259}
260
261/**
262 * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
263 * @c: UBIFS file-system description object
264 * @lp: describes the LEB to garbage collect
265 *
266 * This function garbage-collects an LEB and returns one of the @LEB_FREED,
267 * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
268 * required, and other negative error codes in case of failures.
269 */
270int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
271{
272 struct ubifs_scan_leb *sleb;
273 struct ubifs_scan_node *snod;
274 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
275 int err = 0, lnum = lp->lnum;
276
277 ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
278 c->need_recovery);
279 ubifs_assert(c->gc_lnum != lnum);
280 ubifs_assert(wbuf->lnum != lnum);
281
282 /*
283 * We scan the entire LEB even though we only really need to scan up to
284 * (c->leb_size - lp->free).
285 */
286 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
287 if (IS_ERR(sleb))
288 return PTR_ERR(sleb);
289
290 ubifs_assert(!list_empty(&sleb->nodes));
291 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
292
293 if (snod->type == UBIFS_IDX_NODE) {
294 struct ubifs_gced_idx_leb *idx_gc;
295
296 dbg_gc("indexing LEB %d (free %d, dirty %d)",
297 lnum, lp->free, lp->dirty);
298 list_for_each_entry(snod, &sleb->nodes, list) {
299 struct ubifs_idx_node *idx = snod->node;
300 int level = le16_to_cpu(idx->level);
301
302 ubifs_assert(snod->type == UBIFS_IDX_NODE);
303 key_read(c, ubifs_idx_key(c, idx), &snod->key);
304 err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
305 snod->offs);
306 if (err)
307 goto out;
308 }
309
310 idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
311 if (!idx_gc) {
312 err = -ENOMEM;
313 goto out;
314 }
315
316 idx_gc->lnum = lnum;
317 idx_gc->unmap = 0;
318 list_add(&idx_gc->list, &c->idx_gc);
319
320 /*
321 * Don't release the LEB until after the next commit, because
322 * it may contain date which is needed for recovery. So
323 * although we freed this LEB, it will become usable only after
324 * the commit.
325 */
326 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
327 LPROPS_INDEX, 1);
328 if (err)
329 goto out;
330 err = LEB_FREED_IDX;
331 } else {
332 dbg_gc("data LEB %d (free %d, dirty %d)",
333 lnum, lp->free, lp->dirty);
334
335 err = move_nodes(c, sleb);
336 if (err)
337 goto out;
338
339 err = gc_sync_wbufs(c);
340 if (err)
341 goto out;
342
343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
344 if (err)
345 goto out;
346
347 if (c->gc_lnum == -1) {
348 c->gc_lnum = lnum;
349 err = LEB_RETAINED;
350 } else {
351 err = ubifs_wbuf_sync_nolock(wbuf);
352 if (err)
353 goto out;
354
355 err = ubifs_leb_unmap(c, lnum);
356 if (err)
357 goto out;
358
359 err = LEB_FREED;
360 }
361 }
362
363out:
364 ubifs_scan_destroy(sleb);
365 return err;
366}
367
368/**
369 * ubifs_garbage_collect - UBIFS garbage collector.
370 * @c: UBIFS file-system description object
371 * @anyway: do GC even if there are free LEBs
372 *
373 * This function does out-of-place garbage collection. The return codes are:
374 * o positive LEB number if the LEB has been freed and may be used;
375 * o %-EAGAIN if the caller has to run commit;
376 * o %-ENOSPC if GC failed to make any progress;
377 * o other negative error codes in case of other errors.
378 *
379 * Garbage collector writes data to the journal when GC'ing data LEBs, and just
380 * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
381 * commit may be required. But commit cannot be run from inside GC, because the
382 * caller might be holding the commit lock, so %-EAGAIN is returned instead;
383 * And this error code means that the caller has to run commit, and re-run GC
384 * if there is still no free space.
385 *
386 * There are many reasons why this function may return %-EAGAIN:
387 * o the log is full and there is no space to write an LEB reference for
388 * @c->gc_lnum;
389 * o the journal is too large and exceeds size limitations;
390 * o GC moved indexing LEBs, but they can be used only after the commit;
391 * o the shrinker fails to find clean znodes to free and requests the commit;
392 * o etc.
393 *
394 * Note, if the file-system is close to be full, this function may return
395 * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
396 * the function. E.g., this happens if the limits on the journal size are too
397 * tough and GC writes too much to the journal before an LEB is freed. This
398 * might also mean that the journal is too large, and the TNC becomes to big,
399 * so that the shrinker is constantly called, finds not clean znodes to free,
400 * and requests commit. Well, this may also happen if the journal is all right,
401 * but another kernel process consumes too much memory. Anyway, infinite
402 * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
403 */
404int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
405{
406 int i, err, ret, min_space = c->dead_wm;
407 struct ubifs_lprops lp;
408 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
409
410 ubifs_assert_cmt_locked(c);
411
412 if (ubifs_gc_should_commit(c))
413 return -EAGAIN;
414
415 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
416
417 if (c->ro_media) {
418 ret = -EROFS;
419 goto out_unlock;
420 }
421
422 /* We expect the write-buffer to be empty on entry */
423 ubifs_assert(!wbuf->used);
424
425 for (i = 0; ; i++) {
426 int space_before = c->leb_size - wbuf->offs - wbuf->used;
427 int space_after;
428
429 cond_resched();
430
431 /* Give the commit an opportunity to run */
432 if (ubifs_gc_should_commit(c)) {
433 ret = -EAGAIN;
434 break;
435 }
436
437 if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
438 /*
439 * We've done enough iterations. Indexing LEBs were
440 * moved and will be available after the commit.
441 */
442 dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
443 ubifs_commit_required(c);
444 ret = -EAGAIN;
445 break;
446 }
447
448 if (i > HARD_LEBS_LIMIT) {
449 /*
450 * We've moved too many LEBs and have not made
451 * progress, give up.
452 */
453 dbg_gc("hard limit, -ENOSPC");
454 ret = -ENOSPC;
455 break;
456 }
457
458 /*
459 * Empty and freeable LEBs can turn up while we waited for
460 * the wbuf lock, or while we have been running GC. In that
461 * case, we should just return one of those instead of
462 * continuing to GC dirty LEBs. Hence we request
463 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
464 */
465 ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
466 if (ret) {
467 if (ret == -ENOSPC)
468 dbg_gc("no more dirty LEBs");
469 break;
470 }
471
472 dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
473 "(min. space %d)", lp.lnum, lp.free, lp.dirty,
474 lp.free + lp.dirty, min_space);
475
476 if (lp.free + lp.dirty == c->leb_size) {
477 /* An empty LEB was returned */
478 dbg_gc("LEB %d is free, return it", lp.lnum);
479 /*
480 * ubifs_find_dirty_leb() doesn't return freeable index
481 * LEBs.
482 */
483 ubifs_assert(!(lp.flags & LPROPS_INDEX));
484 if (lp.free != c->leb_size) {
485 /*
486 * Write buffers must be sync'd before
487 * unmapping freeable LEBs, because one of them
488 * may contain data which obsoletes something
489 * in 'lp.pnum'.
490 */
491 ret = gc_sync_wbufs(c);
492 if (ret)
493 goto out;
494 ret = ubifs_change_one_lp(c, lp.lnum,
495 c->leb_size, 0, 0, 0,
496 0);
497 if (ret)
498 goto out;
499 }
500 ret = ubifs_leb_unmap(c, lp.lnum);
501 if (ret)
502 goto out;
503 ret = lp.lnum;
504 break;
505 }
506
507 space_before = c->leb_size - wbuf->offs - wbuf->used;
508 if (wbuf->lnum == -1)
509 space_before = 0;
510
511 ret = ubifs_garbage_collect_leb(c, &lp);
512 if (ret < 0) {
513 if (ret == -EAGAIN || ret == -ENOSPC) {
514 /*
515 * These codes are not errors, so we have to
516 * return the LEB to lprops. But if the
517 * 'ubifs_return_leb()' function fails, its
518 * failure code is propagated to the caller
519 * instead of the original '-EAGAIN' or
520 * '-ENOSPC'.
521 */
522 err = ubifs_return_leb(c, lp.lnum);
523 if (err)
524 ret = err;
525 break;
526 }
527 goto out;
528 }
529
530 if (ret == LEB_FREED) {
531 /* An LEB has been freed and is ready for use */
532 dbg_gc("LEB %d freed, return", lp.lnum);
533 ret = lp.lnum;
534 break;
535 }
536
537 if (ret == LEB_FREED_IDX) {
538 /*
539 * This was an indexing LEB and it cannot be
540 * immediately used. And instead of requesting the
541 * commit straight away, we try to garbage collect some
542 * more.
543 */
544 dbg_gc("indexing LEB %d freed, continue", lp.lnum);
545 continue;
546 }
547
548 ubifs_assert(ret == LEB_RETAINED);
549 space_after = c->leb_size - wbuf->offs - wbuf->used;
550 dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
551 space_after - space_before);
552
553 if (space_after > space_before) {
554 /* GC makes progress, keep working */
555 min_space >>= 1;
556 if (min_space < c->dead_wm)
557 min_space = c->dead_wm;
558 continue;
559 }
560
561 dbg_gc("did not make progress");
562
563 /*
564 * GC moved an LEB bud have not done any progress. This means
565 * that the previous GC head LEB contained too few free space
566 * and the LEB which was GC'ed contained only large nodes which
567 * did not fit that space.
568 *
569 * We can do 2 things:
570 * 1. pick another LEB in a hope it'll contain a small node
571 * which will fit the space we have at the end of current GC
572 * head LEB, but there is no guarantee, so we try this out
573 * unless we have already been working for too long;
574 * 2. request an LEB with more dirty space, which will force
575 * 'ubifs_find_dirty_leb()' to start scanning the lprops
576 * table, instead of just picking one from the heap
577 * (previously it already picked the dirtiest LEB).
578 */
579 if (i < SOFT_LEBS_LIMIT) {
580 dbg_gc("try again");
581 continue;
582 }
583
584 min_space <<= 1;
585 if (min_space > c->dark_wm)
586 min_space = c->dark_wm;
587 dbg_gc("set min. space to %d", min_space);
588 }
589
590 if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
591 dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
592 ubifs_commit_required(c);
593 ret = -EAGAIN;
594 }
595
596 err = ubifs_wbuf_sync_nolock(wbuf);
597 if (!err)
598 err = ubifs_leb_unmap(c, c->gc_lnum);
599 if (err) {
600 ret = err;
601 goto out;
602 }
603out_unlock:
604 mutex_unlock(&wbuf->io_mutex);
605 return ret;
606
607out:
608 ubifs_assert(ret < 0);
609 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
610 ubifs_ro_mode(c, ret);
611 ubifs_wbuf_sync_nolock(wbuf);
612 mutex_unlock(&wbuf->io_mutex);
613 ubifs_return_leb(c, lp.lnum);
614 return ret;
615}
616
617/**
618 * ubifs_gc_start_commit - garbage collection at start of commit.
619 * @c: UBIFS file-system description object
620 *
621 * If a LEB has only dirty and free space, then we may safely unmap it and make
622 * it free. Note, we cannot do this with indexing LEBs because dirty space may
623 * correspond index nodes that are required for recovery. In that case, the
624 * LEB cannot be unmapped until after the next commit.
625 *
626 * This function returns %0 upon success and a negative error code upon failure.
627 */
628int ubifs_gc_start_commit(struct ubifs_info *c)
629{
630 struct ubifs_gced_idx_leb *idx_gc;
631 const struct ubifs_lprops *lp;
632 int err = 0, flags;
633
634 ubifs_get_lprops(c);
635
636 /*
637 * Unmap (non-index) freeable LEBs. Note that recovery requires that all
638 * wbufs are sync'd before this, which is done in 'do_commit()'.
639 */
640 while (1) {
641 lp = ubifs_fast_find_freeable(c);
642 if (unlikely(IS_ERR(lp))) {
643 err = PTR_ERR(lp);
644 goto out;
645 }
646 if (!lp)
647 break;
648 ubifs_assert(!(lp->flags & LPROPS_TAKEN));
649 ubifs_assert(!(lp->flags & LPROPS_INDEX));
650 err = ubifs_leb_unmap(c, lp->lnum);
651 if (err)
652 goto out;
653 lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
654 if (unlikely(IS_ERR(lp))) {
655 err = PTR_ERR(lp);
656 goto out;
657 }
658 ubifs_assert(!(lp->flags & LPROPS_TAKEN));
659 ubifs_assert(!(lp->flags & LPROPS_INDEX));
660 }
661
662 /* Mark GC'd index LEBs OK to unmap after this commit finishes */
663 list_for_each_entry(idx_gc, &c->idx_gc, list)
664 idx_gc->unmap = 1;
665
666 /* Record index freeable LEBs for unmapping after commit */
667 while (1) {
668 lp = ubifs_fast_find_frdi_idx(c);
669 if (unlikely(IS_ERR(lp))) {
670 err = PTR_ERR(lp);
671 goto out;
672 }
673 if (!lp)
674 break;
675 idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
676 if (!idx_gc) {
677 err = -ENOMEM;
678 goto out;
679 }
680 ubifs_assert(!(lp->flags & LPROPS_TAKEN));
681 ubifs_assert(lp->flags & LPROPS_INDEX);
682 /* Don't release the LEB until after the next commit */
683 flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
684 lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
685 if (unlikely(IS_ERR(lp))) {
686 err = PTR_ERR(lp);
687 kfree(idx_gc);
688 goto out;
689 }
690 ubifs_assert(lp->flags & LPROPS_TAKEN);
691 ubifs_assert(!(lp->flags & LPROPS_INDEX));
692 idx_gc->lnum = lp->lnum;
693 idx_gc->unmap = 1;
694 list_add(&idx_gc->list, &c->idx_gc);
695 }
696out:
697 ubifs_release_lprops(c);
698 return err;
699}
700
701/**
702 * ubifs_gc_end_commit - garbage collection at end of commit.
703 * @c: UBIFS file-system description object
704 *
705 * This function completes out-of-place garbage collection of index LEBs.
706 */
707int ubifs_gc_end_commit(struct ubifs_info *c)
708{
709 struct ubifs_gced_idx_leb *idx_gc, *tmp;
710 struct ubifs_wbuf *wbuf;
711 int err = 0;
712
713 wbuf = &c->jheads[GCHD].wbuf;
714 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
715 list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
716 if (idx_gc->unmap) {
717 dbg_gc("LEB %d", idx_gc->lnum);
718 err = ubifs_leb_unmap(c, idx_gc->lnum);
719 if (err)
720 goto out;
721 err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
722 LPROPS_NC, 0, LPROPS_TAKEN, -1);
723 if (err)
724 goto out;
725 list_del(&idx_gc->list);
726 kfree(idx_gc);
727 }
728out:
729 mutex_unlock(&wbuf->io_mutex);
730 return err;
731}
732
733/**
734 * ubifs_destroy_idx_gc - destroy idx_gc list.
735 * @c: UBIFS file-system description object
736 *
737 * This function destroys the idx_gc list. It is called when unmounting or
738 * remounting read-only so locks are not needed.
739 */
740void ubifs_destroy_idx_gc(struct ubifs_info *c)
741{
742 while (!list_empty(&c->idx_gc)) {
743 struct ubifs_gced_idx_leb *idx_gc;
744
745 idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
746 list);
747 c->idx_gc_cnt -= 1;
748 list_del(&idx_gc->list);
749 kfree(idx_gc);
750 }
751
752}
753
754/**
755 * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
756 * @c: UBIFS file-system description object
757 *
758 * Called during start commit so locks are not needed.
759 */
760int ubifs_get_idx_gc_leb(struct ubifs_info *c)
761{
762 struct ubifs_gced_idx_leb *idx_gc;
763 int lnum;
764
765 if (list_empty(&c->idx_gc))
766 return -ENOSPC;
767 idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
768 lnum = idx_gc->lnum;
769 /* c->idx_gc_cnt is updated by the caller when lprops are updated */
770 list_del(&idx_gc->list);
771 kfree(idx_gc);
772 return lnum;
773}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
new file mode 100644
index 000000000000..3374f91b6709
--- /dev/null
+++ b/fs/ubifs/io.c
@@ -0,0 +1,914 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 * Copyright (C) 2006, 2007 University of Szeged, Hungary
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published by
9 * the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Authors: Artem Bityutskiy (Битюцкий Артём)
21 * Adrian Hunter
22 * Zoltan Sogor
23 */
24
25/*
26 * This file implements UBIFS I/O subsystem which provides various I/O-related
27 * helper functions (reading/writing/checking/validating nodes) and implements
28 * write-buffering support. Write buffers help to save space which otherwise
29 * would have been wasted for padding to the nearest minimal I/O unit boundary.
30 * Instead, data first goes to the write-buffer and is flushed when the
31 * buffer is full or when it is not used for some time (by timer). This is
32 * similar to the mechanism used by JFFS2.
33 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code
36 * has to lock the write-buffer (e.g. journal space reservation code), many
37 * functions related to write-buffers have "nolock" suffix which means that the
38 * caller has to lock the write-buffer before calling this function.
39 *
40 * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
41 * aligned, UBIFS starts the next node from the aligned address, and the padded
42 * bytes may contain any rubbish. In other words, UBIFS does not put padding
43 * bytes in those small gaps. Common headers of nodes store real node lengths,
44 * not aligned lengths. Indexing nodes also store real lengths in branches.
45 *
46 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
47 * uses padding nodes or padding bytes, if the padding node does not fit.
48 *
49 * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
50 * every time they are read from the flash media.
51 */
52
53#include <linux/crc32.h>
54#include "ubifs.h"
55
56/**
57 * ubifs_check_node - check node.
58 * @c: UBIFS file-system description object
59 * @buf: node to check
60 * @lnum: logical eraseblock number
61 * @offs: offset within the logical eraseblock
62 * @quiet: print no messages
63 *
64 * This function checks node magic number and CRC checksum. This function also
65 * validates node length to prevent UBIFS from becoming crazy when an attacker
66 * feeds it a file-system image with incorrect nodes. For example, too large
67 * node length in the common header could cause UBIFS to read memory outside of
68 * allocated buffer when checking the CRC checksum.
69 *
70 * This function returns zero in case of success %-EUCLEAN in case of bad CRC
71 * or magic.
72 */
73int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
74 int offs, int quiet)
75{
76 int err = -EINVAL, type, node_len;
77 uint32_t crc, node_crc, magic;
78 const struct ubifs_ch *ch = buf;
79
80 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
81 ubifs_assert(!(offs & 7) && offs < c->leb_size);
82
83 magic = le32_to_cpu(ch->magic);
84 if (magic != UBIFS_NODE_MAGIC) {
85 if (!quiet)
86 ubifs_err("bad magic %#08x, expected %#08x",
87 magic, UBIFS_NODE_MAGIC);
88 err = -EUCLEAN;
89 goto out;
90 }
91
92 type = ch->node_type;
93 if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
94 if (!quiet)
95 ubifs_err("bad node type %d", type);
96 goto out;
97 }
98
99 node_len = le32_to_cpu(ch->len);
100 if (node_len + offs > c->leb_size)
101 goto out_len;
102
103 if (c->ranges[type].max_len == 0) {
104 if (node_len != c->ranges[type].len)
105 goto out_len;
106 } else if (node_len < c->ranges[type].min_len ||
107 node_len > c->ranges[type].max_len)
108 goto out_len;
109
110 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
111 node_crc = le32_to_cpu(ch->crc);
112 if (crc != node_crc) {
113 if (!quiet)
114 ubifs_err("bad CRC: calculated %#08x, read %#08x",
115 crc, node_crc);
116 err = -EUCLEAN;
117 goto out;
118 }
119
120 return 0;
121
122out_len:
123 if (!quiet)
124 ubifs_err("bad node length %d", node_len);
125out:
126 if (!quiet) {
127 ubifs_err("bad node at LEB %d:%d", lnum, offs);
128 dbg_dump_node(c, buf);
129 dbg_dump_stack();
130 }
131 return err;
132}
133
134/**
135 * ubifs_pad - pad flash space.
136 * @c: UBIFS file-system description object
137 * @buf: buffer to put padding to
138 * @pad: how many bytes to pad
139 *
140 * The flash media obliges us to write only in chunks of %c->min_io_size and
141 * when we have to write less data we add padding node to the write-buffer and
142 * pad it to the next minimal I/O unit's boundary. Padding nodes help when the
143 * media is being scanned. If the amount of wasted space is not enough to fit a
144 * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes
145 * pattern (%UBIFS_PADDING_BYTE).
146 *
147 * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is
148 * used.
149 */
150void ubifs_pad(const struct ubifs_info *c, void *buf, int pad)
151{
152 uint32_t crc;
153
154 ubifs_assert(pad >= 0 && !(pad & 7));
155
156 if (pad >= UBIFS_PAD_NODE_SZ) {
157 struct ubifs_ch *ch = buf;
158 struct ubifs_pad_node *pad_node = buf;
159
160 ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
161 ch->node_type = UBIFS_PAD_NODE;
162 ch->group_type = UBIFS_NO_NODE_GROUP;
163 ch->padding[0] = ch->padding[1] = 0;
164 ch->sqnum = 0;
165 ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ);
166 pad -= UBIFS_PAD_NODE_SZ;
167 pad_node->pad_len = cpu_to_le32(pad);
168 crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8);
169 ch->crc = cpu_to_le32(crc);
170 memset(buf + UBIFS_PAD_NODE_SZ, 0, pad);
171 } else if (pad > 0)
172 /* Too little space, padding node won't fit */
173 memset(buf, UBIFS_PADDING_BYTE, pad);
174}
175
176/**
177 * next_sqnum - get next sequence number.
178 * @c: UBIFS file-system description object
179 */
180static unsigned long long next_sqnum(struct ubifs_info *c)
181{
182 unsigned long long sqnum;
183
184 spin_lock(&c->cnt_lock);
185 sqnum = ++c->max_sqnum;
186 spin_unlock(&c->cnt_lock);
187
188 if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) {
189 if (sqnum >= SQNUM_WATERMARK) {
190 ubifs_err("sequence number overflow %llu, end of life",
191 sqnum);
192 ubifs_ro_mode(c, -EINVAL);
193 }
194 ubifs_warn("running out of sequence numbers, end of life soon");
195 }
196
197 return sqnum;
198}
199
200/**
201 * ubifs_prepare_node - prepare node to be written to flash.
202 * @c: UBIFS file-system description object
203 * @node: the node to pad
204 * @len: node length
205 * @pad: if the buffer has to be padded
206 *
207 * This function prepares node at @node to be written to the media - it
208 * calculates node CRC, fills the common header, and adds proper padding up to
209 * the next minimum I/O unit if @pad is not zero.
210 */
211void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
212{
213 uint32_t crc;
214 struct ubifs_ch *ch = node;
215 unsigned long long sqnum = next_sqnum(c);
216
217 ubifs_assert(len >= UBIFS_CH_SZ);
218
219 ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
220 ch->len = cpu_to_le32(len);
221 ch->group_type = UBIFS_NO_NODE_GROUP;
222 ch->sqnum = cpu_to_le64(sqnum);
223 ch->padding[0] = ch->padding[1] = 0;
224 crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
225 ch->crc = cpu_to_le32(crc);
226
227 if (pad) {
228 len = ALIGN(len, 8);
229 pad = ALIGN(len, c->min_io_size) - len;
230 ubifs_pad(c, node + len, pad);
231 }
232}
233
234/**
235 * ubifs_prep_grp_node - prepare node of a group to be written to flash.
236 * @c: UBIFS file-system description object
237 * @node: the node to pad
238 * @len: node length
239 * @last: indicates the last node of the group
240 *
241 * This function prepares node at @node to be written to the media - it
242 * calculates node CRC and fills the common header.
243 */
244void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
245{
246 uint32_t crc;
247 struct ubifs_ch *ch = node;
248 unsigned long long sqnum = next_sqnum(c);
249
250 ubifs_assert(len >= UBIFS_CH_SZ);
251
252 ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
253 ch->len = cpu_to_le32(len);
254 if (last)
255 ch->group_type = UBIFS_LAST_OF_NODE_GROUP;
256 else
257 ch->group_type = UBIFS_IN_NODE_GROUP;
258 ch->sqnum = cpu_to_le64(sqnum);
259 ch->padding[0] = ch->padding[1] = 0;
260 crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
261 ch->crc = cpu_to_le32(crc);
262}
263
264/**
265 * wbuf_timer_callback - write-buffer timer callback function.
266 * @data: timer data (write-buffer descriptor)
267 *
268 * This function is called when the write-buffer timer expires.
269 */
270static void wbuf_timer_callback_nolock(unsigned long data)
271{
272 struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
273
274 wbuf->need_sync = 1;
275 wbuf->c->need_wbuf_sync = 1;
276 ubifs_wake_up_bgt(wbuf->c);
277}
278
279/**
280 * new_wbuf_timer - start new write-buffer timer.
281 * @wbuf: write-buffer descriptor
282 */
283static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
284{
285 ubifs_assert(!timer_pending(&wbuf->timer));
286
287 if (!wbuf->timeout)
288 return;
289
290 wbuf->timer.expires = jiffies + wbuf->timeout;
291 add_timer(&wbuf->timer);
292}
293
294/**
295 * cancel_wbuf_timer - cancel write-buffer timer.
296 * @wbuf: write-buffer descriptor
297 */
298static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
299{
300 /*
301 * If the syncer is waiting for the lock (from the background thread's
302 * context) and another task is changing write-buffer then the syncing
303 * should be canceled.
304 */
305 wbuf->need_sync = 0;
306 del_timer(&wbuf->timer);
307}
308
309/**
310 * ubifs_wbuf_sync_nolock - synchronize write-buffer.
311 * @wbuf: write-buffer to synchronize
312 *
313 * This function synchronizes write-buffer @buf and returns zero in case of
314 * success or a negative error code in case of failure.
315 */
316int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
317{
318 struct ubifs_info *c = wbuf->c;
319 int err, dirt;
320
321 cancel_wbuf_timer_nolock(wbuf);
322 if (!wbuf->used || wbuf->lnum == -1)
323 /* Write-buffer is empty or not seeked */
324 return 0;
325
326 dbg_io("LEB %d:%d, %d bytes",
327 wbuf->lnum, wbuf->offs, wbuf->used);
328 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
329 ubifs_assert(!(wbuf->avail & 7));
330 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
331
332 if (c->ro_media)
333 return -EROFS;
334
335 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
336 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
337 c->min_io_size, wbuf->dtype);
338 if (err) {
339 ubifs_err("cannot write %d bytes to LEB %d:%d",
340 c->min_io_size, wbuf->lnum, wbuf->offs);
341 dbg_dump_stack();
342 return err;
343 }
344
345 dirt = wbuf->avail;
346
347 spin_lock(&wbuf->lock);
348 wbuf->offs += c->min_io_size;
349 wbuf->avail = c->min_io_size;
350 wbuf->used = 0;
351 wbuf->next_ino = 0;
352 spin_unlock(&wbuf->lock);
353
354 if (wbuf->sync_callback)
355 err = wbuf->sync_callback(c, wbuf->lnum,
356 c->leb_size - wbuf->offs, dirt);
357 return err;
358}
359
360/**
361 * ubifs_wbuf_seek_nolock - seek write-buffer.
362 * @wbuf: write-buffer
363 * @lnum: logical eraseblock number to seek to
364 * @offs: logical eraseblock offset to seek to
365 * @dtype: data type
366 *
367 * This function targets the write buffer to logical eraseblock @lnum:@offs.
368 * The write-buffer is synchronized if it is not empty. Returns zero in case of
369 * success and a negative error code in case of failure.
370 */
371int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
372 int dtype)
373{
374 const struct ubifs_info *c = wbuf->c;
375
376 dbg_io("LEB %d:%d", lnum, offs);
377 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
378 ubifs_assert(offs >= 0 && offs <= c->leb_size);
379 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
380 ubifs_assert(lnum != wbuf->lnum);
381
382 if (wbuf->used > 0) {
383 int err = ubifs_wbuf_sync_nolock(wbuf);
384
385 if (err)
386 return err;
387 }
388
389 spin_lock(&wbuf->lock);
390 wbuf->lnum = lnum;
391 wbuf->offs = offs;
392 wbuf->avail = c->min_io_size;
393 wbuf->used = 0;
394 spin_unlock(&wbuf->lock);
395 wbuf->dtype = dtype;
396
397 return 0;
398}
399
400/**
401 * ubifs_bg_wbufs_sync - synchronize write-buffers.
402 * @c: UBIFS file-system description object
403 *
404 * This function is called by background thread to synchronize write-buffers.
405 * Returns zero in case of success and a negative error code in case of
406 * failure.
407 */
408int ubifs_bg_wbufs_sync(struct ubifs_info *c)
409{
410 int err, i;
411
412 if (!c->need_wbuf_sync)
413 return 0;
414 c->need_wbuf_sync = 0;
415
416 if (c->ro_media) {
417 err = -EROFS;
418 goto out_timers;
419 }
420
421 dbg_io("synchronize");
422 for (i = 0; i < c->jhead_cnt; i++) {
423 struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
424
425 cond_resched();
426
427 /*
428 * If the mutex is locked then wbuf is being changed, so
429 * synchronization is not necessary.
430 */
431 if (mutex_is_locked(&wbuf->io_mutex))
432 continue;
433
434 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
435 if (!wbuf->need_sync) {
436 mutex_unlock(&wbuf->io_mutex);
437 continue;
438 }
439
440 err = ubifs_wbuf_sync_nolock(wbuf);
441 mutex_unlock(&wbuf->io_mutex);
442 if (err) {
443 ubifs_err("cannot sync write-buffer, error %d", err);
444 ubifs_ro_mode(c, err);
445 goto out_timers;
446 }
447 }
448
449 return 0;
450
451out_timers:
452 /* Cancel all timers to prevent repeated errors */
453 for (i = 0; i < c->jhead_cnt; i++) {
454 struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
455
456 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
457 cancel_wbuf_timer_nolock(wbuf);
458 mutex_unlock(&wbuf->io_mutex);
459 }
460 return err;
461}
462
463/**
464 * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
465 * @wbuf: write-buffer
466 * @buf: node to write
467 * @len: node length
468 *
469 * This function writes data to flash via write-buffer @wbuf. This means that
470 * the last piece of the node won't reach the flash media immediately if it
471 * does not take whole minimal I/O unit. Instead, the node will sit in RAM
472 * until the write-buffer is synchronized (e.g., by timer).
473 *
474 * This function returns zero in case of success and a negative error code in
475 * case of failure. If the node cannot be written because there is no more
476 * space in this logical eraseblock, %-ENOSPC is returned.
477 */
478int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
479{
480 struct ubifs_info *c = wbuf->c;
481 int err, written, n, aligned_len = ALIGN(len, 8), offs;
482
483 dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
484 dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
485 wbuf->offs + wbuf->used);
486 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
487 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
488 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
489 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
490 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
491
492 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
493 err = -ENOSPC;
494 goto out;
495 }
496
497 cancel_wbuf_timer_nolock(wbuf);
498
499 if (c->ro_media)
500 return -EROFS;
501
502 if (aligned_len <= wbuf->avail) {
503 /*
504 * The node is not very large and fits entirely within
505 * write-buffer.
506 */
507 memcpy(wbuf->buf + wbuf->used, buf, len);
508
509 if (aligned_len == wbuf->avail) {
510 dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
511 wbuf->offs);
512 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
513 wbuf->offs, c->min_io_size,
514 wbuf->dtype);
515 if (err)
516 goto out;
517
518 spin_lock(&wbuf->lock);
519 wbuf->offs += c->min_io_size;
520 wbuf->avail = c->min_io_size;
521 wbuf->used = 0;
522 wbuf->next_ino = 0;
523 spin_unlock(&wbuf->lock);
524 } else {
525 spin_lock(&wbuf->lock);
526 wbuf->avail -= aligned_len;
527 wbuf->used += aligned_len;
528 spin_unlock(&wbuf->lock);
529 }
530
531 goto exit;
532 }
533
534 /*
535 * The node is large enough and does not fit entirely within current
536 * minimal I/O unit. We have to fill and flush write-buffer and switch
537 * to the next min. I/O unit.
538 */
539 dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
540 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
541 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
542 c->min_io_size, wbuf->dtype);
543 if (err)
544 goto out;
545
546 offs = wbuf->offs + c->min_io_size;
547 len -= wbuf->avail;
548 aligned_len -= wbuf->avail;
549 written = wbuf->avail;
550
551 /*
552 * The remaining data may take more whole min. I/O units, so write the
553 * remains multiple to min. I/O unit size directly to the flash media.
554 * We align node length to 8-byte boundary because we anyway flash wbuf
555 * if the remaining space is less than 8 bytes.
556 */
557 n = aligned_len >> c->min_io_shift;
558 if (n) {
559 n <<= c->min_io_shift;
560 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
561 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
562 wbuf->dtype);
563 if (err)
564 goto out;
565 offs += n;
566 aligned_len -= n;
567 len -= n;
568 written += n;
569 }
570
571 spin_lock(&wbuf->lock);
572 if (aligned_len)
573 /*
574 * And now we have what's left and what does not take whole
575 * min. I/O unit, so write it to the write-buffer and we are
576 * done.
577 */
578 memcpy(wbuf->buf, buf + written, len);
579
580 wbuf->offs = offs;
581 wbuf->used = aligned_len;
582 wbuf->avail = c->min_io_size - aligned_len;
583 wbuf->next_ino = 0;
584 spin_unlock(&wbuf->lock);
585
586exit:
587 if (wbuf->sync_callback) {
588 int free = c->leb_size - wbuf->offs - wbuf->used;
589
590 err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
591 if (err)
592 goto out;
593 }
594
595 if (wbuf->used)
596 new_wbuf_timer_nolock(wbuf);
597
598 return 0;
599
600out:
601 ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
602 len, wbuf->lnum, wbuf->offs, err);
603 dbg_dump_node(c, buf);
604 dbg_dump_stack();
605 dbg_dump_leb(c, wbuf->lnum);
606 return err;
607}
608
609/**
610 * ubifs_write_node - write node to the media.
611 * @c: UBIFS file-system description object
612 * @buf: the node to write
613 * @len: node length
614 * @lnum: logical eraseblock number
615 * @offs: offset within the logical eraseblock
616 * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
617 *
618 * This function automatically fills node magic number, assigns sequence
619 * number, and calculates node CRC checksum. The length of the @buf buffer has
620 * to be aligned to the minimal I/O unit size. This function automatically
621 * appends padding node and padding bytes if needed. Returns zero in case of
622 * success and a negative error code in case of failure.
623 */
624int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
625 int offs, int dtype)
626{
627 int err, buf_len = ALIGN(len, c->min_io_size);
628
629 dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
630 lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
631 buf_len);
632 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
633 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
634
635 if (c->ro_media)
636 return -EROFS;
637
638 ubifs_prepare_node(c, buf, len, 1);
639 err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype);
640 if (err) {
641 ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
642 buf_len, lnum, offs, err);
643 dbg_dump_node(c, buf);
644 dbg_dump_stack();
645 }
646
647 return err;
648}
649
650/**
651 * ubifs_read_node_wbuf - read node from the media or write-buffer.
652 * @wbuf: wbuf to check for un-written data
653 * @buf: buffer to read to
654 * @type: node type
655 * @len: node length
656 * @lnum: logical eraseblock number
657 * @offs: offset within the logical eraseblock
658 *
659 * This function reads a node of known type and length, checks it and stores
660 * in @buf. If the node partially or fully sits in the write-buffer, this
661 * function takes data from the buffer, otherwise it reads the flash media.
662 * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative
663 * error code in case of failure.
664 */
665int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
666 int lnum, int offs)
667{
668 const struct ubifs_info *c = wbuf->c;
669 int err, rlen, overlap;
670 struct ubifs_ch *ch = buf;
671
672 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
673 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
674 ubifs_assert(!(offs & 7) && offs < c->leb_size);
675 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
676
677 spin_lock(&wbuf->lock);
678 overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
679 if (!overlap) {
680 /* We may safely unlock the write-buffer and read the data */
681 spin_unlock(&wbuf->lock);
682 return ubifs_read_node(c, buf, type, len, lnum, offs);
683 }
684
685 /* Don't read under wbuf */
686 rlen = wbuf->offs - offs;
687 if (rlen < 0)
688 rlen = 0;
689
690 /* Copy the rest from the write-buffer */
691 memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
692 spin_unlock(&wbuf->lock);
693
694 if (rlen > 0) {
695 /* Read everything that goes before write-buffer */
696 err = ubi_read(c->ubi, lnum, buf, offs, rlen);
697 if (err && err != -EBADMSG) {
698 ubifs_err("failed to read node %d from LEB %d:%d, "
699 "error %d", type, lnum, offs, err);
700 dbg_dump_stack();
701 return err;
702 }
703 }
704
705 if (type != ch->node_type) {
706 ubifs_err("bad node type (%d but expected %d)",
707 ch->node_type, type);
708 goto out;
709 }
710
711 err = ubifs_check_node(c, buf, lnum, offs, 0);
712 if (err) {
713 ubifs_err("expected node type %d", type);
714 return err;
715 }
716
717 rlen = le32_to_cpu(ch->len);
718 if (rlen != len) {
719 ubifs_err("bad node length %d, expected %d", rlen, len);
720 goto out;
721 }
722
723 return 0;
724
725out:
726 ubifs_err("bad node at LEB %d:%d", lnum, offs);
727 dbg_dump_node(c, buf);
728 dbg_dump_stack();
729 return -EINVAL;
730}
731
732/**
733 * ubifs_read_node - read node.
734 * @c: UBIFS file-system description object
735 * @buf: buffer to read to
736 * @type: node type
737 * @len: node length (not aligned)
738 * @lnum: logical eraseblock number
739 * @offs: offset within the logical eraseblock
740 *
741 * This function reads a node of known type and and length, checks it and
742 * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched
743 * and a negative error code in case of failure.
744 */
745int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
746 int lnum, int offs)
747{
748 int err, l;
749 struct ubifs_ch *ch = buf;
750
751 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
752 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
753 ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size);
754 ubifs_assert(!(offs & 7) && offs < c->leb_size);
755 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
756
757 err = ubi_read(c->ubi, lnum, buf, offs, len);
758 if (err && err != -EBADMSG) {
759 ubifs_err("cannot read node %d from LEB %d:%d, error %d",
760 type, lnum, offs, err);
761 return err;
762 }
763
764 if (type != ch->node_type) {
765 ubifs_err("bad node type (%d but expected %d)",
766 ch->node_type, type);
767 goto out;
768 }
769
770 err = ubifs_check_node(c, buf, lnum, offs, 0);
771 if (err) {
772 ubifs_err("expected node type %d", type);
773 return err;
774 }
775
776 l = le32_to_cpu(ch->len);
777 if (l != len) {
778 ubifs_err("bad node length %d, expected %d", l, len);
779 goto out;
780 }
781
782 return 0;
783
784out:
785 ubifs_err("bad node at LEB %d:%d", lnum, offs);
786 dbg_dump_node(c, buf);
787 dbg_dump_stack();
788 return -EINVAL;
789}
790
791/**
792 * ubifs_wbuf_init - initialize write-buffer.
793 * @c: UBIFS file-system description object
794 * @wbuf: write-buffer to initialize
795 *
796 * This function initializes write buffer. Returns zero in case of success
797 * %-ENOMEM in case of failure.
798 */
799int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
800{
801 size_t size;
802
803 wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
804 if (!wbuf->buf)
805 return -ENOMEM;
806
807 size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
808 wbuf->inodes = kmalloc(size, GFP_KERNEL);
809 if (!wbuf->inodes) {
810 kfree(wbuf->buf);
811 wbuf->buf = NULL;
812 return -ENOMEM;
813 }
814
815 wbuf->used = 0;
816 wbuf->lnum = wbuf->offs = -1;
817 wbuf->avail = c->min_io_size;
818 wbuf->dtype = UBI_UNKNOWN;
819 wbuf->sync_callback = NULL;
820 mutex_init(&wbuf->io_mutex);
821 spin_lock_init(&wbuf->lock);
822
823 wbuf->c = c;
824 init_timer(&wbuf->timer);
825 wbuf->timer.function = wbuf_timer_callback_nolock;
826 wbuf->timer.data = (unsigned long)wbuf;
827 wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
828 wbuf->next_ino = 0;
829
830 return 0;
831}
832
833/**
834 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
835 * @wbuf: the write-buffer whereto add
836 * @inum: the inode number
837 *
838 * This function adds an inode number to the inode array of the write-buffer.
839 */
840void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum)
841{
842 if (!wbuf->buf)
843 /* NOR flash or something similar */
844 return;
845
846 spin_lock(&wbuf->lock);
847 if (wbuf->used)
848 wbuf->inodes[wbuf->next_ino++] = inum;
849 spin_unlock(&wbuf->lock);
850}
851
852/**
853 * wbuf_has_ino - returns if the wbuf contains data from the inode.
854 * @wbuf: the write-buffer
855 * @inum: the inode number
856 *
857 * This function returns with %1 if the write-buffer contains some data from the
858 * given inode otherwise it returns with %0.
859 */
860static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum)
861{
862 int i, ret = 0;
863
864 spin_lock(&wbuf->lock);
865 for (i = 0; i < wbuf->next_ino; i++)
866 if (inum == wbuf->inodes[i]) {
867 ret = 1;
868 break;
869 }
870 spin_unlock(&wbuf->lock);
871
872 return ret;
873}
874
875/**
876 * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode.
877 * @c: UBIFS file-system description object
878 * @inode: inode to synchronize
879 *
880 * This function synchronizes write-buffers which contain nodes belonging to
881 * @inode. Returns zero in case of success and a negative error code in case of
882 * failure.
883 */
884int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)
885{
886 int i, err = 0;
887
888 for (i = 0; i < c->jhead_cnt; i++) {
889 struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
890
891 if (i == GCHD)
892 /*
893 * GC head is special, do not look at it. Even if the
894 * head contains something related to this inode, it is
895 * a _copy_ of corresponding on-flash node which sits
896 * somewhere else.
897 */
898 continue;
899
900 if (!wbuf_has_ino(wbuf, inode->i_ino))
901 continue;
902
903 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
904 if (wbuf_has_ino(wbuf, inode->i_ino))
905 err = ubifs_wbuf_sync_nolock(wbuf);
906 mutex_unlock(&wbuf->io_mutex);
907
908 if (err) {
909 ubifs_ro_mode(c, err);
910 return err;
911 }
912 }
913 return 0;
914}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
new file mode 100644
index 000000000000..5e82cffe9695
--- /dev/null
+++ b/fs/ubifs/ioctl.c
@@ -0,0 +1,204 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 * Copyright (C) 2006, 2007 University of Szeged, Hungary
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published by
9 * the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Authors: Zoltan Sogor
21 * Artem Bityutskiy (Битюцкий Артём)
22 * Adrian Hunter
23 */
24
25/* This file implements EXT2-compatible extended attribute ioctl() calls */
26
27#include <linux/compat.h>
28#include <linux/smp_lock.h>
29#include <linux/mount.h>
30#include "ubifs.h"
31
32/**
33 * ubifs_set_inode_flags - set VFS inode flags.
34 * @inode: VFS inode to set flags for
35 *
36 * This function propagates flags from UBIFS inode object to VFS inode object.
37 */
38void ubifs_set_inode_flags(struct inode *inode)
39{
40 unsigned int flags = ubifs_inode(inode)->flags;
41
42 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
43 if (flags & UBIFS_SYNC_FL)
44 inode->i_flags |= S_SYNC;
45 if (flags & UBIFS_APPEND_FL)
46 inode->i_flags |= S_APPEND;
47 if (flags & UBIFS_IMMUTABLE_FL)
48 inode->i_flags |= S_IMMUTABLE;
49 if (flags & UBIFS_DIRSYNC_FL)
50 inode->i_flags |= S_DIRSYNC;
51}
52
53/*
54 * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
55 * @ioctl_flags: flags to convert
56 *
57 * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
58 * (@UBIFS_COMPR_FL, etc).
59 */
60static int ioctl2ubifs(int ioctl_flags)
61{
62 int ubifs_flags = 0;
63
64 if (ioctl_flags & FS_COMPR_FL)
65 ubifs_flags |= UBIFS_COMPR_FL;
66 if (ioctl_flags & FS_SYNC_FL)
67 ubifs_flags |= UBIFS_SYNC_FL;
68 if (ioctl_flags & FS_APPEND_FL)
69 ubifs_flags |= UBIFS_APPEND_FL;
70 if (ioctl_flags & FS_IMMUTABLE_FL)
71 ubifs_flags |= UBIFS_IMMUTABLE_FL;
72 if (ioctl_flags & FS_DIRSYNC_FL)
73 ubifs_flags |= UBIFS_DIRSYNC_FL;
74
75 return ubifs_flags;
76}
77
78/*
79 * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
80 * @ubifs_flags: flags to convert
81 *
82 * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags
83 * (@FS_COMPR_FL, etc).
84 */
85static int ubifs2ioctl(int ubifs_flags)
86{
87 int ioctl_flags = 0;
88
89 if (ubifs_flags & UBIFS_COMPR_FL)
90 ioctl_flags |= FS_COMPR_FL;
91 if (ubifs_flags & UBIFS_SYNC_FL)
92 ioctl_flags |= FS_SYNC_FL;
93 if (ubifs_flags & UBIFS_APPEND_FL)
94 ioctl_flags |= FS_APPEND_FL;
95 if (ubifs_flags & UBIFS_IMMUTABLE_FL)
96 ioctl_flags |= FS_IMMUTABLE_FL;
97 if (ubifs_flags & UBIFS_DIRSYNC_FL)
98 ioctl_flags |= FS_DIRSYNC_FL;
99
100 return ioctl_flags;
101}
102
103static int setflags(struct inode *inode, int flags)
104{
105 int oldflags, err, release;
106 struct ubifs_inode *ui = ubifs_inode(inode);
107 struct ubifs_info *c = inode->i_sb->s_fs_info;
108 struct ubifs_budget_req req = { .dirtied_ino = 1,
109 .dirtied_ino_d = ui->data_len };
110
111 err = ubifs_budget_space(c, &req);
112 if (err)
113 return err;
114
115 /*
116 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
117 * the relevant capability.
118 */
119 mutex_lock(&ui->ui_mutex);
120 oldflags = ubifs2ioctl(ui->flags);
121 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
122 if (!capable(CAP_LINUX_IMMUTABLE)) {
123 err = -EPERM;
124 goto out_unlock;
125 }
126 }
127
128 ui->flags = ioctl2ubifs(flags);
129 ubifs_set_inode_flags(inode);
130 inode->i_ctime = ubifs_current_time(inode);
131 release = ui->dirty;
132 mark_inode_dirty_sync(inode);
133 mutex_unlock(&ui->ui_mutex);
134
135 if (release)
136 ubifs_release_budget(c, &req);
137 if (IS_SYNC(inode))
138 err = write_inode_now(inode, 1);
139 return err;
140
141out_unlock:
142 ubifs_err("can't modify inode %lu attributes", inode->i_ino);
143 mutex_unlock(&ui->ui_mutex);
144 ubifs_release_budget(c, &req);
145 return err;
146}
147
148long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
149{
150 int flags, err;
151 struct inode *inode = file->f_path.dentry->d_inode;
152
153 switch (cmd) {
154 case FS_IOC_GETFLAGS:
155 flags = ubifs2ioctl(ubifs_inode(inode)->flags);
156
157 return put_user(flags, (int __user *) arg);
158
159 case FS_IOC_SETFLAGS: {
160 if (IS_RDONLY(inode))
161 return -EROFS;
162
163 if (!is_owner_or_cap(inode))
164 return -EACCES;
165
166 if (get_user(flags, (int __user *) arg))
167 return -EFAULT;
168
169 if (!S_ISDIR(inode->i_mode))
170 flags &= ~FS_DIRSYNC_FL;
171
172 /*
173 * Make sure the file-system is read-write and make sure it
174 * will not become read-only while we are changing the flags.
175 */
176 err = mnt_want_write(file->f_path.mnt);
177 if (err)
178 return err;
179 err = setflags(inode, flags);
180 mnt_drop_write(file->f_path.mnt);
181 return err;
182 }
183
184 default:
185 return -ENOTTY;
186 }
187}
188
#ifdef CONFIG_COMPAT
/*
 * ubifs_compat_ioctl - compat entry point for the inode flags ioctl() calls.
 * @file: file the ioctl was issued on
 * @cmd: compat ioctl command
 * @arg: compat user-space pointer
 *
 * This function translates the 32-bit flavors of the get/set flags commands
 * to the native ones and forwards them to 'ubifs_ioctl()'. Any other command
 * yields %-ENOIOCTLCMD.
 */
long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	if (cmd == FS_IOC32_GETFLAGS)
		cmd = FS_IOC_GETFLAGS;
	else if (cmd == FS_IOC32_SETFLAGS)
		cmd = FS_IOC_SETFLAGS;
	else
		return -ENOIOCTLCMD;

	return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
new file mode 100644
index 000000000000..283155abe5f5
--- /dev/null
+++ b/fs/ubifs/journal.c
@@ -0,0 +1,1387 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS journal.
25 *
26 * The journal consists of 2 parts - the log and bud LEBs. The log has fixed
27 * length and position, while a bud logical eraseblock is any LEB in the main
28 * area. Buds contain file system data - data nodes, inode nodes, etc. The log
29 * contains only references to buds and some other stuff like commit
30 * start node. The idea is that when we commit the journal, we do
31 * not copy the data, the buds just become indexed. Since after the commit the
32 * nodes in bud eraseblocks become leaf nodes of the file system index tree, we
33 * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will
34 * become leaves in the future.
35 *
36 * The journal is multi-headed because we want to write data to the journal as
37 * optimally as possible. It is nice to have nodes belonging to the same inode
38 * in one LEB, so we may write data owned by different inodes to different
39 * journal heads, although at present only one data head is used.
40 *
41 * For recovery reasons, the base head contains all inode nodes, all directory
42 * entry nodes and all truncate nodes. This means that the other heads contain
43 * only data nodes.
44 *
45 * Bud LEBs may be half-indexed. For example, if the bud was not full at the
46 * time of commit, the bud is retained to continue to be used in the journal,
47 * even though the "front" of the LEB is now indexed. In that case, the log
48 * reference contains the offset where the bud starts for the purposes of the
49 * journal.
50 *
51 * The journal size has to be limited, because the larger the journal is, the
52 * longer it takes to mount UBIFS (scanning the journal) and the more memory it
53 * takes (indexing in the TNC).
54 *
55 * All the journal write operations like 'ubifs_jnl_update()' here, which write
56 * multiple UBIFS nodes to the journal at one go, are atomic with respect to
57 * unclean reboots. Should the unclean reboot happen, the recovery code drops
58 * all the nodes.
59 */
60
61#include "ubifs.h"
62
63/**
64 * zero_ino_node_unused - zero out unused fields of an on-flash inode node.
65 * @ino: the inode to zero out
66 */
67static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
68{
69 memset(ino->padding1, 0, 4);
70 memset(ino->padding2, 0, 26);
71}
72
73/**
74 * zero_dent_node_unused - zero out unused fields of an on-flash directory
75 * entry node.
76 * @dent: the directory entry to zero out
77 */
78static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
79{
80 dent->padding1 = 0;
81 memset(dent->padding2, 0, 4);
82}
83
84/**
85 * zero_data_node_unused - zero out unused fields of an on-flash data node.
86 * @data: the data node to zero out
87 */
88static inline void zero_data_node_unused(struct ubifs_data_node *data)
89{
90 memset(data->padding, 0, 2);
91}
92
93/**
94 * zero_trun_node_unused - zero out unused fields of an on-flash truncation
95 * node.
96 * @trun: the truncation node to zero out
97 */
98static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
99{
100 memset(trun->padding, 0, 12);
101}
102
103/**
104 * reserve_space - reserve space in the journal.
105 * @c: UBIFS file-system description object
106 * @jhead: journal head number
107 * @len: node length
108 *
109 * This function reserves space in journal head @head. If the reservation
110 * succeeded, the journal head stays locked and later has to be unlocked using
111 * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock
112 * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and
113 * other negative error codes in case of other failures.
114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119
120 /*
121 * Typically, the base head has smaller nodes written to it, so it is
122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does.
124 */
125 squeeze = (jhead == BASEHD);
126again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128
129 if (c->ro_media) {
130 err = -EROFS;
131 goto out_unlock;
132 }
133
134 avail = c->leb_size - wbuf->offs - wbuf->used;
135 if (wbuf->lnum != -1 && avail >= len)
136 return 0;
137
138 /*
139 * Write buffer wasn't seek'ed or there is no enough space - look for an
140 * LEB with some empty space.
141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze);
143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err)
148 goto out_return;
149 /* A new bud was successfully allocated and added to the log */
150 goto out;
151 }
152
153 err = lnum;
154 if (err != -ENOSPC)
155 goto out_unlock;
156
157 /*
158 * No free space, we have to run garbage collector to make
159 * some. But the write-buffer mutex has to be unlocked because
160 * GC also takes it.
161 */
162 dbg_jnl("no free space jhead %d, run GC", jhead);
163 mutex_unlock(&wbuf->io_mutex);
164
165 lnum = ubifs_garbage_collect(c, 0);
166 if (lnum < 0) {
167 err = lnum;
168 if (err != -ENOSPC)
169 return err;
170
171 /*
172 * GC could not make a free LEB. But someone else may
173 * have allocated new bud for this journal head,
174 * because we dropped @wbuf->io_mutex, so try once
175 * again.
176 */
177 dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
178 if (retries++ < 2) {
179 dbg_jnl("retry (%d)", retries);
180 goto again;
181 }
182
183 dbg_jnl("return -ENOSPC");
184 return err;
185 }
186
187 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
188 dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
189 avail = c->leb_size - wbuf->offs - wbuf->used;
190
191 if (wbuf->lnum != -1 && avail >= len) {
192 /*
193 * Someone else has switched the journal head and we have
194 * enough space now. This happens when more then one process is
195 * trying to write to the same journal head at the same time.
196 */
197 dbg_jnl("return LEB %d back, already have LEB %d:%d",
198 lnum, wbuf->lnum, wbuf->offs + wbuf->used);
199 err = ubifs_return_leb(c, lnum);
200 if (err)
201 goto out_unlock;
202 return 0;
203 }
204
205 err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
206 if (err)
207 goto out_return;
208 offs = 0;
209
210out:
211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
212 if (err)
213 goto out_unlock;
214
215 return 0;
216
217out_unlock:
218 mutex_unlock(&wbuf->io_mutex);
219 return err;
220
221out_return:
222 /* An error occurred and the LEB has to be returned to lprops */
223 ubifs_assert(err < 0);
224 err1 = ubifs_return_leb(c, lnum);
225 if (err1 && err == -EAGAIN)
226 /*
227 * Return original error code only if it is not %-EAGAIN,
228 * which is not really an error. Otherwise, return the error
229 * code of 'ubifs_return_leb()'.
230 */
231 err = err1;
232 mutex_unlock(&wbuf->io_mutex);
233 return err;
234}
235
236/**
237 * write_node - write node to a journal head.
238 * @c: UBIFS file-system description object
239 * @jhead: journal head
240 * @node: node to write
241 * @len: node length
242 * @lnum: LEB number written is returned here
243 * @offs: offset written is returned here
244 *
245 * This function writes a node to reserved space of journal head @jhead.
246 * Returns zero in case of success and a negative error code in case of
247 * failure.
248 */
249static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
250 int *lnum, int *offs)
251{
252 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
253
254 ubifs_assert(jhead != GCHD);
255
256 *lnum = c->jheads[jhead].wbuf.lnum;
257 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
258
259 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
260 ubifs_prepare_node(c, node, len, 0);
261
262 return ubifs_wbuf_write_nolock(wbuf, node, len);
263}
264
265/**
266 * write_head - write data to a journal head.
267 * @c: UBIFS file-system description object
268 * @jhead: journal head
269 * @buf: buffer to write
270 * @len: length to write
271 * @lnum: LEB number written is returned here
272 * @offs: offset written is returned here
273 * @sync: non-zero if the write-buffer has to by synchronized
274 *
275 * This function is the same as 'write_node()' but it does not assume the
276 * buffer it is writing is a node, so it does not prepare it (which means
277 * initializing common header and calculating CRC).
278 */
279static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
280 int *lnum, int *offs, int sync)
281{
282 int err;
283 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
284
285 ubifs_assert(jhead != GCHD);
286
287 *lnum = c->jheads[jhead].wbuf.lnum;
288 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
289 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
290
291 err = ubifs_wbuf_write_nolock(wbuf, buf, len);
292 if (err)
293 return err;
294 if (sync)
295 err = ubifs_wbuf_sync_nolock(wbuf);
296 return err;
297}
298
299/**
300 * make_reservation - reserve journal space.
301 * @c: UBIFS file-system description object
302 * @jhead: journal head
303 * @len: how many bytes to reserve
304 *
305 * This function makes space reservation in journal head @jhead. The function
306 * takes the commit lock and locks the journal head, and the caller has to
307 * unlock the head and finish the reservation with 'finish_reservation()'.
308 * Returns zero in case of success and a negative error code in case of
309 * failure.
310 *
311 * Note, the journal head may be unlocked as soon as the data is written, while
312 * the commit lock has to be released after the data has been added to the
313 * TNC.
314 */
315static int make_reservation(struct ubifs_info *c, int jhead, int len)
316{
317 int err, cmt_retries = 0, nospc_retries = 0;
318
319again:
320 down_read(&c->commit_sem);
321 err = reserve_space(c, jhead, len);
322 if (!err)
323 return 0;
324 up_read(&c->commit_sem);
325
326 if (err == -ENOSPC) {
327 /*
328 * GC could not make any progress. We should try to commit
329 * once because it could make some dirty space and GC would
330 * make progress, so make the error -EAGAIN so that the below
331 * will commit and re-try.
332 */
333 if (nospc_retries++ < 2) {
334 dbg_jnl("no space, retry");
335 err = -EAGAIN;
336 }
337
338 /*
339 * This means that the budgeting is incorrect. We always have
340 * to be able to write to the media, because all operations are
341 * budgeted. Deletions are not budgeted, though, but we reserve
342 * an extra LEB for them.
343 */
344 }
345
346 if (err != -EAGAIN)
347 goto out;
348
349 /*
350 * -EAGAIN means that the journal is full or too large, or the above
351 * code wants to do one commit. Do this and re-try.
352 */
353 if (cmt_retries > 128) {
354 /*
355 * This should not happen unless the journal size limitations
356 * are too tough.
357 */
358 ubifs_err("stuck in space allocation");
359 err = -ENOSPC;
360 goto out;
361 } else if (cmt_retries > 32)
362 ubifs_warn("too many space allocation re-tries (%d)",
363 cmt_retries);
364
365 dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
366 cmt_retries);
367 cmt_retries += 1;
368
369 err = ubifs_run_commit(c);
370 if (err)
371 return err;
372 goto again;
373
374out:
375 ubifs_err("cannot reserve %d bytes in jhead %d, error %d",
376 len, jhead, err);
377 if (err == -ENOSPC) {
378 /* This are some budgeting problems, print useful information */
379 down_write(&c->commit_sem);
380 spin_lock(&c->space_lock);
381 dbg_dump_stack();
382 dbg_dump_budg(c);
383 spin_unlock(&c->space_lock);
384 dbg_dump_lprops(c);
385 cmt_retries = dbg_check_lprops(c);
386 up_write(&c->commit_sem);
387 }
388 return err;
389}
390
391/**
392 * release_head - release a journal head.
393 * @c: UBIFS file-system description object
394 * @jhead: journal head
395 *
396 * This function releases journal head @jhead which was locked by
397 * the 'make_reservation()' function. It has to be called after each successful
398 * 'make_reservation()' invocation.
399 */
400static inline void release_head(struct ubifs_info *c, int jhead)
401{
402 mutex_unlock(&c->jheads[jhead].wbuf.io_mutex);
403}
404
405/**
406 * finish_reservation - finish a reservation.
407 * @c: UBIFS file-system description object
408 *
409 * This function finishes journal space reservation. It must be called after
410 * 'make_reservation()'.
411 */
412static void finish_reservation(struct ubifs_info *c)
413{
414 up_read(&c->commit_sem);
415}
416
417/**
418 * get_dent_type - translate VFS inode mode to UBIFS directory entry type.
419 * @mode: inode mode
420 */
421static int get_dent_type(int mode)
422{
423 switch (mode & S_IFMT) {
424 case S_IFREG:
425 return UBIFS_ITYPE_REG;
426 case S_IFDIR:
427 return UBIFS_ITYPE_DIR;
428 case S_IFLNK:
429 return UBIFS_ITYPE_LNK;
430 case S_IFBLK:
431 return UBIFS_ITYPE_BLK;
432 case S_IFCHR:
433 return UBIFS_ITYPE_CHR;
434 case S_IFIFO:
435 return UBIFS_ITYPE_FIFO;
436 case S_IFSOCK:
437 return UBIFS_ITYPE_SOCK;
438 default:
439 BUG();
440 }
441 return 0;
442}
443
444/**
445 * pack_inode - pack an inode node.
446 * @c: UBIFS file-system description object
447 * @ino: buffer in which to pack inode node
448 * @inode: inode to pack
449 * @last: indicates the last node of the group
450 * @last_reference: non-zero if this is a deletion inode
451 */
452static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
453 const struct inode *inode, int last,
454 int last_reference)
455{
456 int data_len = 0;
457 struct ubifs_inode *ui = ubifs_inode(inode);
458
459 ino->ch.node_type = UBIFS_INO_NODE;
460 ino_key_init_flash(c, &ino->key, inode->i_ino);
461 ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
462 ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
463 ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
464 ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec);
465 ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
466 ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
467 ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
468 ino->uid = cpu_to_le32(inode->i_uid);
469 ino->gid = cpu_to_le32(inode->i_gid);
470 ino->mode = cpu_to_le32(inode->i_mode);
471 ino->flags = cpu_to_le32(ui->flags);
472 ino->size = cpu_to_le64(ui->ui_size);
473 ino->nlink = cpu_to_le32(inode->i_nlink);
474 ino->compr_type = cpu_to_le16(ui->compr_type);
475 ino->data_len = cpu_to_le32(ui->data_len);
476 ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt);
477 ino->xattr_size = cpu_to_le32(ui->xattr_size);
478 ino->xattr_names = cpu_to_le32(ui->xattr_names);
479 zero_ino_node_unused(ino);
480
481 /*
482 * Drop the attached data if this is a deletion inode, the data is not
483 * needed anymore.
484 */
485 if (!last_reference) {
486 memcpy(ino->data, ui->data, ui->data_len);
487 data_len = ui->data_len;
488 }
489
490 ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last);
491}
492
493/**
494 * mark_inode_clean - mark UBIFS inode as clean.
495 * @c: UBIFS file-system description object
496 * @ui: UBIFS inode to mark as clean
497 *
498 * This helper function marks UBIFS inode @ui as clean by cleaning the
499 * @ui->dirty flag and releasing its budget. Note, VFS may still treat the
500 * inode as dirty and try to write it back, but 'ubifs_write_inode()' would
501 * just do nothing.
502 */
503static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
504{
505 if (ui->dirty)
506 ubifs_release_dirty_inode_budget(c, ui);
507 ui->dirty = 0;
508}
509
510/**
511 * ubifs_jnl_update - update inode.
512 * @c: UBIFS file-system description object
513 * @dir: parent inode or host inode in case of extended attributes
514 * @nm: directory entry name
515 * @inode: inode to update
516 * @deletion: indicates a directory entry deletion i.e unlink or rmdir
517 * @xent: non-zero if the directory entry is an extended attribute entry
518 *
519 * This function updates an inode by writing a directory entry (or extended
520 * attribute entry), the inode itself, and the parent directory inode (or the
521 * host inode) to the journal.
522 *
523 * The function writes the host inode @dir last, which is important in case of
524 * extended attributes. Indeed, then we guarantee that if the host inode gets
525 * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed,
526 * the extended attribute inode gets flushed too. And this is exactly what the
527 * user expects - synchronizing the host inode synchronizes its extended
528 * attributes. Similarly, this guarantees that if @dir is synchronized, its
529 * directory entry corresponding to @nm gets synchronized too.
530 *
531 * If the inode (@inode) or the parent directory (@dir) are synchronous, this
532 * function synchronizes the write-buffer.
533 *
534 * This function marks the @dir and @inode inodes as clean and returns zero on
535 * success. In case of failure, a negative error code is returned.
536 */
537int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
538 const struct qstr *nm, const struct inode *inode,
539 int deletion, int xent)
540{
541 int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
542 int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
543 int last_reference = !!(deletion && inode->i_nlink == 0);
544 struct ubifs_inode *ui = ubifs_inode(inode);
545 struct ubifs_inode *dir_ui = ubifs_inode(dir);
546 struct ubifs_dent_node *dent;
547 struct ubifs_ino_node *ino;
548 union ubifs_key dent_key, ino_key;
549
550 dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
551 inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
552 ubifs_assert(dir_ui->data_len == 0);
553 ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));
554
555 dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
556 ilen = UBIFS_INO_NODE_SZ;
557
558 /*
559 * If the last reference to the inode is being deleted, then there is
560 * no need to attach and write inode data, it is being deleted anyway.
561 * And if the inode is being deleted, no need to synchronize
562 * write-buffer even if the inode is synchronous.
563 */
564 if (!last_reference) {
565 ilen += ui->data_len;
566 sync |= IS_SYNC(inode);
567 }
568
569 aligned_dlen = ALIGN(dlen, 8);
570 aligned_ilen = ALIGN(ilen, 8);
571 len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
572 dent = kmalloc(len, GFP_NOFS);
573 if (!dent)
574 return -ENOMEM;
575
576 /* Make reservation before allocating sequence numbers */
577 err = make_reservation(c, BASEHD, len);
578 if (err)
579 goto out_free;
580
581 if (!xent) {
582 dent->ch.node_type = UBIFS_DENT_NODE;
583 dent_key_init(c, &dent_key, dir->i_ino, nm);
584 } else {
585 dent->ch.node_type = UBIFS_XENT_NODE;
586 xent_key_init(c, &dent_key, dir->i_ino, nm);
587 }
588
589 key_write(c, &dent_key, dent->key);
590 dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
591 dent->type = get_dent_type(inode->i_mode);
592 dent->nlen = cpu_to_le16(nm->len);
593 memcpy(dent->name, nm->name, nm->len);
594 dent->name[nm->len] = '\0';
595 zero_dent_node_unused(dent);
596 ubifs_prep_grp_node(c, dent, dlen, 0);
597
598 ino = (void *)dent + aligned_dlen;
599 pack_inode(c, ino, inode, 0, last_reference);
600 ino = (void *)ino + aligned_ilen;
601 pack_inode(c, ino, dir, 1, 0);
602
603 if (last_reference) {
604 err = ubifs_add_orphan(c, inode->i_ino);
605 if (err) {
606 release_head(c, BASEHD);
607 goto out_finish;
608 }
609 }
610
611 err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
612 if (err)
613 goto out_release;
614 if (!sync) {
615 struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
616
617 ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
618 ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino);
619 }
620 release_head(c, BASEHD);
621 kfree(dent);
622
623 if (deletion) {
624 err = ubifs_tnc_remove_nm(c, &dent_key, nm);
625 if (err)
626 goto out_ro;
627 err = ubifs_add_dirt(c, lnum, dlen);
628 } else
629 err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
630 if (err)
631 goto out_ro;
632
633 /*
634 * Note, we do not remove the inode from TNC even if the last reference
635 * to it has just been deleted, because the inode may still be opened.
636 * Instead, the inode has been added to orphan lists and the orphan
637 * subsystem will take further care about it.
638 */
639 ino_key_init(c, &ino_key, inode->i_ino);
640 ino_offs = dent_offs + aligned_dlen;
641 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
642 if (err)
643 goto out_ro;
644
645 ino_key_init(c, &ino_key, dir->i_ino);
646 ino_offs += aligned_ilen;
647 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ);
648 if (err)
649 goto out_ro;
650
651 finish_reservation(c);
652 spin_lock(&ui->ui_lock);
653 ui->synced_i_size = ui->ui_size;
654 spin_unlock(&ui->ui_lock);
655 mark_inode_clean(c, ui);
656 mark_inode_clean(c, dir_ui);
657 return 0;
658
659out_finish:
660 finish_reservation(c);
661out_free:
662 kfree(dent);
663 return err;
664
665out_release:
666 release_head(c, BASEHD);
667out_ro:
668 ubifs_ro_mode(c, err);
669 if (last_reference)
670 ubifs_delete_orphan(c, inode->i_ino);
671 finish_reservation(c);
672 return err;
673}
674
675/**
676 * ubifs_jnl_write_data - write a data node to the journal.
677 * @c: UBIFS file-system description object
678 * @inode: inode the data node belongs to
679 * @key: node key
680 * @buf: buffer to write
681 * @len: data length (must not exceed %UBIFS_BLOCK_SIZE)
682 *
683 * This function writes a data node to the journal. Returns %0 if the data node
684 * was successfully written, and a negative error code in case of failure.
685 */
686int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
687 const union ubifs_key *key, const void *buf, int len)
688{
689 struct ubifs_data_node *data;
690 int err, lnum, offs, compr_type, out_len;
691 int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
692 struct ubifs_inode *ui = ubifs_inode(inode);
693
694 dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key),
695 key_block(c, key), len, DBGKEY(key));
696 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
697
698 data = kmalloc(dlen, GFP_NOFS);
699 if (!data)
700 return -ENOMEM;
701
702 data->ch.node_type = UBIFS_DATA_NODE;
703 key_write(c, key, &data->key);
704 data->size = cpu_to_le32(len);
705 zero_data_node_unused(data);
706
707 if (!(ui->flags && UBIFS_COMPR_FL))
708 /* Compression is disabled for this inode */
709 compr_type = UBIFS_COMPR_NONE;
710 else
711 compr_type = ui->compr_type;
712
713 out_len = dlen - UBIFS_DATA_NODE_SZ;
714 ubifs_compress(buf, len, &data->data, &out_len, &compr_type);
715 ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
716
717 dlen = UBIFS_DATA_NODE_SZ + out_len;
718 data->compr_type = cpu_to_le16(compr_type);
719
720 /* Make reservation before allocating sequence numbers */
721 err = make_reservation(c, DATAHD, dlen);
722 if (err)
723 goto out_free;
724
725 err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
726 if (err)
727 goto out_release;
728 ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
729 release_head(c, DATAHD);
730
731 err = ubifs_tnc_add(c, key, lnum, offs, dlen);
732 if (err)
733 goto out_ro;
734
735 finish_reservation(c);
736 kfree(data);
737 return 0;
738
739out_release:
740 release_head(c, DATAHD);
741out_ro:
742 ubifs_ro_mode(c, err);
743 finish_reservation(c);
744out_free:
745 kfree(data);
746 return err;
747}
748
749/**
750 * ubifs_jnl_write_inode - flush inode to the journal.
751 * @c: UBIFS file-system description object
752 * @inode: inode to flush
753 * @deletion: inode has been deleted
754 *
755 * This function writes inode @inode to the journal. If the inode is
756 * synchronous, it also synchronizes the write-buffer. Returns zero in case of
757 * success and a negative error code in case of failure.
758 */
759int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
760 int deletion)
761{
762 int err, len, lnum, offs, sync = 0;
763 struct ubifs_ino_node *ino;
764 struct ubifs_inode *ui = ubifs_inode(inode);
765
766 dbg_jnl("ino %lu%s", inode->i_ino,
767 deletion ? " (last reference)" : "");
768 if (deletion)
769 ubifs_assert(inode->i_nlink == 0);
770
771 len = UBIFS_INO_NODE_SZ;
772 /*
773 * If the inode is being deleted, do not write the attached data. No
774 * need to synchronize the write-buffer either.
775 */
776 if (!deletion) {
777 len += ui->data_len;
778 sync = IS_SYNC(inode);
779 }
780 ino = kmalloc(len, GFP_NOFS);
781 if (!ino)
782 return -ENOMEM;
783
784 /* Make reservation before allocating sequence numbers */
785 err = make_reservation(c, BASEHD, len);
786 if (err)
787 goto out_free;
788
789 pack_inode(c, ino, inode, 1, deletion);
790 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
791 if (err)
792 goto out_release;
793 if (!sync)
794 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
795 inode->i_ino);
796 release_head(c, BASEHD);
797
798 if (deletion) {
799 err = ubifs_tnc_remove_ino(c, inode->i_ino);
800 if (err)
801 goto out_ro;
802 ubifs_delete_orphan(c, inode->i_ino);
803 err = ubifs_add_dirt(c, lnum, len);
804 } else {
805 union ubifs_key key;
806
807 ino_key_init(c, &key, inode->i_ino);
808 err = ubifs_tnc_add(c, &key, lnum, offs, len);
809 }
810 if (err)
811 goto out_ro;
812
813 finish_reservation(c);
814 spin_lock(&ui->ui_lock);
815 ui->synced_i_size = ui->ui_size;
816 spin_unlock(&ui->ui_lock);
817 kfree(ino);
818 return 0;
819
820out_release:
821 release_head(c, BASEHD);
822out_ro:
823 ubifs_ro_mode(c, err);
824 finish_reservation(c);
825out_free:
826 kfree(ino);
827 return err;
828}
829
830/**
831 * ubifs_jnl_rename - rename a directory entry.
832 * @c: UBIFS file-system description object
833 * @old_dir: parent inode of directory entry to rename
834 * @old_dentry: directory entry to rename
835 * @new_dir: parent inode of directory entry to rename
836 * @new_dentry: new directory entry (or directory entry to replace)
837 * @sync: non-zero if the write-buffer has to be synchronized
838 *
839 * This function implements the re-name operation which may involve writing up
840 * to 3 inodes and 2 directory entries. It marks the written inodes as clean
841 * and returns zero on success. In case of failure, a negative error code is
842 * returned.
843 */
844int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
845 const struct dentry *old_dentry,
846 const struct inode *new_dir,
847 const struct dentry *new_dentry, int sync)
848{
849 void *p;
850 union ubifs_key key;
851 struct ubifs_dent_node *dent, *dent2;
852 int err, dlen1, dlen2, ilen, lnum, offs, len;
853 const struct inode *old_inode = old_dentry->d_inode;
854 const struct inode *new_inode = new_dentry->d_inode;
855 int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
856 int last_reference = !!(new_inode && new_inode->i_nlink == 0);
857 int move = (old_dir != new_dir);
858 struct ubifs_inode *uninitialized_var(new_ui);
859
860 dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
861 old_dentry->d_name.len, old_dentry->d_name.name,
862 old_dir->i_ino, new_dentry->d_name.len,
863 new_dentry->d_name.name, new_dir->i_ino);
864 ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
865 ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
866 ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
867 ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
868
869 dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
870 dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
871 if (new_inode) {
872 new_ui = ubifs_inode(new_inode);
873 ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
874 ilen = UBIFS_INO_NODE_SZ;
875 if (!last_reference)
876 ilen += new_ui->data_len;
877 } else
878 ilen = 0;
879
880 aligned_dlen1 = ALIGN(dlen1, 8);
881 aligned_dlen2 = ALIGN(dlen2, 8);
882 len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
883 if (old_dir != new_dir)
884 len += plen;
885 dent = kmalloc(len, GFP_NOFS);
886 if (!dent)
887 return -ENOMEM;
888
889 /* Make reservation before allocating sequence numbers */
890 err = make_reservation(c, BASEHD, len);
891 if (err)
892 goto out_free;
893
894 /* Make new dent */
895 dent->ch.node_type = UBIFS_DENT_NODE;
896 dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
897 dent->inum = cpu_to_le64(old_inode->i_ino);
898 dent->type = get_dent_type(old_inode->i_mode);
899 dent->nlen = cpu_to_le16(new_dentry->d_name.len);
900 memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
901 dent->name[new_dentry->d_name.len] = '\0';
902 zero_dent_node_unused(dent);
903 ubifs_prep_grp_node(c, dent, dlen1, 0);
904
905 /* Make deletion dent */
906 dent2 = (void *)dent + aligned_dlen1;
907 dent2->ch.node_type = UBIFS_DENT_NODE;
908 dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
909 &old_dentry->d_name);
910 dent2->inum = 0;
911 dent2->type = DT_UNKNOWN;
912 dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
913 memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
914 dent2->name[old_dentry->d_name.len] = '\0';
915 zero_dent_node_unused(dent2);
916 ubifs_prep_grp_node(c, dent2, dlen2, 0);
917
918 p = (void *)dent2 + aligned_dlen2;
919 if (new_inode) {
920 pack_inode(c, p, new_inode, 0, last_reference);
921 p += ALIGN(ilen, 8);
922 }
923
924 if (!move)
925 pack_inode(c, p, old_dir, 1, 0);
926 else {
927 pack_inode(c, p, old_dir, 0, 0);
928 p += ALIGN(plen, 8);
929 pack_inode(c, p, new_dir, 1, 0);
930 }
931
932 if (last_reference) {
933 err = ubifs_add_orphan(c, new_inode->i_ino);
934 if (err) {
935 release_head(c, BASEHD);
936 goto out_finish;
937 }
938 }
939
940 err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
941 if (err)
942 goto out_release;
943 if (!sync) {
944 struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
945
946 ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
947 ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
948 if (new_inode)
949 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
950 new_inode->i_ino);
951 }
952 release_head(c, BASEHD);
953
954 dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
955 err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
956 if (err)
957 goto out_ro;
958
959 err = ubifs_add_dirt(c, lnum, dlen2);
960 if (err)
961 goto out_ro;
962
963 dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
964 err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
965 if (err)
966 goto out_ro;
967
968 offs += aligned_dlen1 + aligned_dlen2;
969 if (new_inode) {
970 ino_key_init(c, &key, new_inode->i_ino);
971 err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
972 if (err)
973 goto out_ro;
974 offs += ALIGN(ilen, 8);
975 }
976
977 ino_key_init(c, &key, old_dir->i_ino);
978 err = ubifs_tnc_add(c, &key, lnum, offs, plen);
979 if (err)
980 goto out_ro;
981
982 if (old_dir != new_dir) {
983 offs += ALIGN(plen, 8);
984 ino_key_init(c, &key, new_dir->i_ino);
985 err = ubifs_tnc_add(c, &key, lnum, offs, plen);
986 if (err)
987 goto out_ro;
988 }
989
990 finish_reservation(c);
991 if (new_inode) {
992 mark_inode_clean(c, new_ui);
993 spin_lock(&new_ui->ui_lock);
994 new_ui->synced_i_size = new_ui->ui_size;
995 spin_unlock(&new_ui->ui_lock);
996 }
997 mark_inode_clean(c, ubifs_inode(old_dir));
998 if (move)
999 mark_inode_clean(c, ubifs_inode(new_dir));
1000 kfree(dent);
1001 return 0;
1002
1003out_release:
1004 release_head(c, BASEHD);
1005out_ro:
1006 ubifs_ro_mode(c, err);
1007 if (last_reference)
1008 ubifs_delete_orphan(c, new_inode->i_ino);
1009out_finish:
1010 finish_reservation(c);
1011out_free:
1012 kfree(dent);
1013 return err;
1014}
1015
1016/**
1017 * recomp_data_node - re-compress a truncated data node.
1018 * @dn: data node to re-compress
1019 * @new_len: new length
1020 *
1021 * This function is used when an inode is truncated and the last data node of
1022 * the inode has to be re-compressed and re-written.
1023 */
1024static int recomp_data_node(struct ubifs_data_node *dn, int *new_len)
1025{
1026 void *buf;
1027 int err, len, compr_type, out_len;
1028
1029 out_len = le32_to_cpu(dn->size);
1030 buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
1031 if (!buf)
1032 return -ENOMEM;
1033
1034 len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
1035 compr_type = le16_to_cpu(dn->compr_type);
1036 err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type);
1037 if (err)
1038 goto out;
1039
1040 ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type);
1041 ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
1042 dn->compr_type = cpu_to_le16(compr_type);
1043 dn->size = cpu_to_le32(*new_len);
1044 *new_len = UBIFS_DATA_NODE_SZ + out_len;
1045out:
1046 kfree(buf);
1047 return err;
1048}
1049
1050/**
1051 * ubifs_jnl_truncate - update the journal for a truncation.
1052 * @c: UBIFS file-system description object
1053 * @inode: inode to truncate
1054 * @old_size: old size
1055 * @new_size: new size
1056 *
1057 * When the size of a file decreases due to truncation, a truncation node is
1058 * written, the journal tree is updated, and the last data block is re-written
1059 * if it has been affected. The inode is also updated in order to synchronize
1060 * the new inode size.
1061 *
1062 * This function marks the inode as clean and returns zero on success. In case
1063 * of failure, a negative error code is returned.
1064 */
1065int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1066 loff_t old_size, loff_t new_size)
1067{
1068 union ubifs_key key, to_key;
1069 struct ubifs_ino_node *ino;
1070 struct ubifs_trun_node *trun;
1071 struct ubifs_data_node *uninitialized_var(dn);
1072 int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
1073 struct ubifs_inode *ui = ubifs_inode(inode);
1074 ino_t inum = inode->i_ino;
1075 unsigned int blk;
1076
1077 dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
1078 ubifs_assert(!ui->data_len);
1079 ubifs_assert(S_ISREG(inode->i_mode));
1080 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
1081
1082 sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
1083 UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
1084 ino = kmalloc(sz, GFP_NOFS);
1085 if (!ino)
1086 return -ENOMEM;
1087
1088 trun = (void *)ino + UBIFS_INO_NODE_SZ;
1089 trun->ch.node_type = UBIFS_TRUN_NODE;
1090 trun->inum = cpu_to_le32(inum);
1091 trun->old_size = cpu_to_le64(old_size);
1092 trun->new_size = cpu_to_le64(new_size);
1093 zero_trun_node_unused(trun);
1094
1095 dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
1096 if (dlen) {
1097 /* Get last data block so it can be truncated */
1098 dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
1099 blk = new_size >> UBIFS_BLOCK_SHIFT;
1100 data_key_init(c, &key, inum, blk);
1101 dbg_jnl("last block key %s", DBGKEY(&key));
1102 err = ubifs_tnc_lookup(c, &key, dn);
1103 if (err == -ENOENT)
1104 dlen = 0; /* Not found (so it is a hole) */
1105 else if (err)
1106 goto out_free;
1107 else {
1108 if (le32_to_cpu(dn->size) <= dlen)
1109 dlen = 0; /* Nothing to do */
1110 else {
1111 int compr_type = le16_to_cpu(dn->compr_type);
1112
1113 if (compr_type != UBIFS_COMPR_NONE) {
1114 err = recomp_data_node(dn, &dlen);
1115 if (err)
1116 goto out_free;
1117 } else {
1118 dn->size = cpu_to_le32(dlen);
1119 dlen += UBIFS_DATA_NODE_SZ;
1120 }
1121 zero_data_node_unused(dn);
1122 }
1123 }
1124 }
1125
1126 /* Must make reservation before allocating sequence numbers */
1127 len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
1128 if (dlen)
1129 len += dlen;
1130 err = make_reservation(c, BASEHD, len);
1131 if (err)
1132 goto out_free;
1133
1134 pack_inode(c, ino, inode, 0, 0);
1135 ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
1136 if (dlen)
1137 ubifs_prep_grp_node(c, dn, dlen, 1);
1138
1139 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
1140 if (err)
1141 goto out_release;
1142 if (!sync)
1143 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
1144 release_head(c, BASEHD);
1145
1146 if (dlen) {
1147 sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
1148 err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
1149 if (err)
1150 goto out_ro;
1151 }
1152
1153 ino_key_init(c, &key, inum);
1154 err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
1155 if (err)
1156 goto out_ro;
1157
1158 err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
1159 if (err)
1160 goto out_ro;
1161
1162 bit = new_size & (UBIFS_BLOCK_SIZE - 1);
1163 blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
1164 data_key_init(c, &key, inum, blk);
1165
1166 bit = old_size & (UBIFS_BLOCK_SIZE - 1);
1167 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
1168 data_key_init(c, &to_key, inum, blk);
1169
1170 err = ubifs_tnc_remove_range(c, &key, &to_key);
1171 if (err)
1172 goto out_ro;
1173
1174 finish_reservation(c);
1175 spin_lock(&ui->ui_lock);
1176 ui->synced_i_size = ui->ui_size;
1177 spin_unlock(&ui->ui_lock);
1178 mark_inode_clean(c, ui);
1179 kfree(ino);
1180 return 0;
1181
1182out_release:
1183 release_head(c, BASEHD);
1184out_ro:
1185 ubifs_ro_mode(c, err);
1186 finish_reservation(c);
1187out_free:
1188 kfree(ino);
1189 return err;
1190}
1191
1192#ifdef CONFIG_UBIFS_FS_XATTR
1193
1194/**
1195 * ubifs_jnl_delete_xattr - delete an extended attribute.
1196 * @c: UBIFS file-system description object
1197 * @host: host inode
1198 * @inode: extended attribute inode
1199 * @nm: extended attribute entry name
1200 *
1201 * This function delete an extended attribute which is very similar to
1202 * un-linking regular files - it writes a deletion xentry, a deletion inode and
1203 * updates the target inode. Returns zero in case of success and a negative
1204 * error code in case of failure.
1205 */
1206int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
1207 const struct inode *inode, const struct qstr *nm)
1208{
1209 int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
1210 struct ubifs_dent_node *xent;
1211 struct ubifs_ino_node *ino;
1212 union ubifs_key xent_key, key1, key2;
1213 int sync = IS_DIRSYNC(host);
1214 struct ubifs_inode *host_ui = ubifs_inode(host);
1215
1216 dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
1217 host->i_ino, inode->i_ino, nm->name,
1218 ubifs_inode(inode)->data_len);
1219 ubifs_assert(inode->i_nlink == 0);
1220 ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
1221
1222 /*
1223 * Since we are deleting the inode, we do not bother to attach any data
1224 * to it and assume its length is %UBIFS_INO_NODE_SZ.
1225 */
1226 xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
1227 aligned_xlen = ALIGN(xlen, 8);
1228 hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
1229 len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
1230
1231 xent = kmalloc(len, GFP_NOFS);
1232 if (!xent)
1233 return -ENOMEM;
1234
1235 /* Make reservation before allocating sequence numbers */
1236 err = make_reservation(c, BASEHD, len);
1237 if (err) {
1238 kfree(xent);
1239 return err;
1240 }
1241
1242 xent->ch.node_type = UBIFS_XENT_NODE;
1243 xent_key_init(c, &xent_key, host->i_ino, nm);
1244 key_write(c, &xent_key, xent->key);
1245 xent->inum = 0;
1246 xent->type = get_dent_type(inode->i_mode);
1247 xent->nlen = cpu_to_le16(nm->len);
1248 memcpy(xent->name, nm->name, nm->len);
1249 xent->name[nm->len] = '\0';
1250 zero_dent_node_unused(xent);
1251 ubifs_prep_grp_node(c, xent, xlen, 0);
1252
1253 ino = (void *)xent + aligned_xlen;
1254 pack_inode(c, ino, inode, 0, 1);
1255 ino = (void *)ino + UBIFS_INO_NODE_SZ;
1256 pack_inode(c, ino, host, 1, 0);
1257
1258 err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
1259 if (!sync && !err)
1260 ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
1261 release_head(c, BASEHD);
1262 kfree(xent);
1263 if (err)
1264 goto out_ro;
1265
1266 /* Remove the extended attribute entry from TNC */
1267 err = ubifs_tnc_remove_nm(c, &xent_key, nm);
1268 if (err)
1269 goto out_ro;
1270 err = ubifs_add_dirt(c, lnum, xlen);
1271 if (err)
1272 goto out_ro;
1273
1274 /*
1275 * Remove all nodes belonging to the extended attribute inode from TNC.
1276 * Well, there actually must be only one node - the inode itself.
1277 */
1278 lowest_ino_key(c, &key1, inode->i_ino);
1279 highest_ino_key(c, &key2, inode->i_ino);
1280 err = ubifs_tnc_remove_range(c, &key1, &key2);
1281 if (err)
1282 goto out_ro;
1283 err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
1284 if (err)
1285 goto out_ro;
1286
1287 /* And update TNC with the new host inode position */
1288 ino_key_init(c, &key1, host->i_ino);
1289 err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
1290 if (err)
1291 goto out_ro;
1292
1293 finish_reservation(c);
1294 spin_lock(&host_ui->ui_lock);
1295 host_ui->synced_i_size = host_ui->ui_size;
1296 spin_unlock(&host_ui->ui_lock);
1297 mark_inode_clean(c, host_ui);
1298 return 0;
1299
1300out_ro:
1301 ubifs_ro_mode(c, err);
1302 finish_reservation(c);
1303 return err;
1304}
1305
1306/**
1307 * ubifs_jnl_change_xattr - change an extended attribute.
1308 * @c: UBIFS file-system description object
1309 * @inode: extended attribute inode
1310 * @host: host inode
1311 *
1312 * This function writes the updated version of an extended attribute inode and
1313 * the host inode tho the journal (to the base head). The host inode is written
1314 * after the extended attribute inode in order to guarantee that the extended
1315 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1316 * consequently, the write-buffer is synchronized. This function returns zero
1317 * in case of success and a negative error code in case of failure.
1318 */
1319int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
1320 const struct inode *host)
1321{
1322 int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
1323 struct ubifs_inode *host_ui = ubifs_inode(inode);
1324 struct ubifs_ino_node *ino;
1325 union ubifs_key key;
1326 int sync = IS_DIRSYNC(host);
1327
1328 dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
1329 ubifs_assert(host->i_nlink > 0);
1330 ubifs_assert(inode->i_nlink > 0);
1331 ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
1332
1333 len1 = UBIFS_INO_NODE_SZ + host_ui->data_len;
1334 len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len;
1335 aligned_len1 = ALIGN(len1, 8);
1336 aligned_len = aligned_len1 + ALIGN(len2, 8);
1337
1338 ino = kmalloc(aligned_len, GFP_NOFS);
1339 if (!ino)
1340 return -ENOMEM;
1341
1342 /* Make reservation before allocating sequence numbers */
1343 err = make_reservation(c, BASEHD, aligned_len);
1344 if (err)
1345 goto out_free;
1346
1347 pack_inode(c, ino, host, 0, 0);
1348 pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0);
1349
1350 err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
1351 if (!sync && !err) {
1352 struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
1353
1354 ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino);
1355 ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
1356 }
1357 release_head(c, BASEHD);
1358 if (err)
1359 goto out_ro;
1360
1361 ino_key_init(c, &key, host->i_ino);
1362 err = ubifs_tnc_add(c, &key, lnum, offs, len1);
1363 if (err)
1364 goto out_ro;
1365
1366 ino_key_init(c, &key, inode->i_ino);
1367 err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
1368 if (err)
1369 goto out_ro;
1370
1371 finish_reservation(c);
1372 spin_lock(&host_ui->ui_lock);
1373 host_ui->synced_i_size = host_ui->ui_size;
1374 spin_unlock(&host_ui->ui_lock);
1375 mark_inode_clean(c, host_ui);
1376 kfree(ino);
1377 return 0;
1378
1379out_ro:
1380 ubifs_ro_mode(c, err);
1381 finish_reservation(c);
1382out_free:
1383 kfree(ino);
1384 return err;
1385}
1386
1387#endif /* CONFIG_UBIFS_FS_XATTR */
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
new file mode 100644
index 000000000000..8f7476007549
--- /dev/null
+++ b/fs/ubifs/key.h
@@ -0,0 +1,533 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This header contains various key-related definitions and helper functions.
25 * UBIFS allows several key schemes, so we access key fields only via these
26 * helpers. At the moment only one key scheme is supported.
27 *
28 * Simple key scheme
29 * ~~~~~~~~~~~~~~~~~
30 *
31 * Keys are 64-bits long. First 32-bits are inode number (parent inode number
32 * in case of direntry key). Next 3 bits are node type. The last 29 bits are
33 * 4KiB offset in case of inode node, and direntry hash in case of a direntry
34 * node. We use "r5" hash borrowed from reiserfs.
35 */
36
37#ifndef __UBIFS_KEY_H__
38#define __UBIFS_KEY_H__
39
40/**
41 * key_r5_hash - R5 hash function (borrowed from reiserfs).
42 * @s: direntry name
43 * @len: name length
44 */
45static inline uint32_t key_r5_hash(const char *s, int len)
46{
47 uint32_t a = 0;
48 const signed char *str = (const signed char *)s;
49
50 while (*str) {
51 a += *str << 4;
52 a += *str >> 4;
53 a *= 11;
54 str++;
55 }
56
57 a &= UBIFS_S_KEY_HASH_MASK;
58
59 /*
60 * We use hash values as offset in directories, so values %0 and %1 are
61 * reserved for "." and "..". %2 is reserved for "end of readdir"
62 * marker.
63 */
64 if (unlikely(a >= 0 && a <= 2))
65 a += 3;
66 return a;
67}
68
69/**
70 * key_test_hash - testing hash function.
71 * @str: direntry name
72 * @len: name length
73 */
74static inline uint32_t key_test_hash(const char *str, int len)
75{
76 uint32_t a = 0;
77
78 len = min_t(uint32_t, len, 4);
79 memcpy(&a, str, len);
80 a &= UBIFS_S_KEY_HASH_MASK;
81 if (unlikely(a >= 0 && a <= 2))
82 a += 3;
83 return a;
84}
85
86/**
87 * ino_key_init - initialize inode key.
88 * @c: UBIFS file-system description object
89 * @key: key to initialize
90 * @inum: inode number
91 */
92static inline void ino_key_init(const struct ubifs_info *c,
93 union ubifs_key *key, ino_t inum)
94{
95 key->u32[0] = inum;
96 key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS;
97}
98
99/**
100 * ino_key_init_flash - initialize on-flash inode key.
101 * @c: UBIFS file-system description object
102 * @k: key to initialize
103 * @inum: inode number
104 */
105static inline void ino_key_init_flash(const struct ubifs_info *c, void *k,
106 ino_t inum)
107{
108 union ubifs_key *key = k;
109
110 key->j32[0] = cpu_to_le32(inum);
111 key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS);
112 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
113}
114
115/**
116 * lowest_ino_key - get the lowest possible inode key.
117 * @c: UBIFS file-system description object
118 * @key: key to initialize
119 * @inum: inode number
120 */
121static inline void lowest_ino_key(const struct ubifs_info *c,
122 union ubifs_key *key, ino_t inum)
123{
124 key->u32[0] = inum;
125 key->u32[1] = 0;
126}
127
128/**
129 * highest_ino_key - get the highest possible inode key.
130 * @c: UBIFS file-system description object
131 * @key: key to initialize
132 * @inum: inode number
133 */
134static inline void highest_ino_key(const struct ubifs_info *c,
135 union ubifs_key *key, ino_t inum)
136{
137 key->u32[0] = inum;
138 key->u32[1] = 0xffffffff;
139}
140
141/**
142 * dent_key_init - initialize directory entry key.
143 * @c: UBIFS file-system description object
144 * @key: key to initialize
145 * @inum: parent inode number
146 * @nm: direntry name and length
147 */
148static inline void dent_key_init(const struct ubifs_info *c,
149 union ubifs_key *key, ino_t inum,
150 const struct qstr *nm)
151{
152 uint32_t hash = c->key_hash(nm->name, nm->len);
153
154 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
155 key->u32[0] = inum;
156 key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
157}
158
159/**
160 * dent_key_init_hash - initialize directory entry key without re-calculating
161 * hash function.
162 * @c: UBIFS file-system description object
163 * @key: key to initialize
164 * @inum: parent inode number
165 * @hash: direntry name hash
166 */
167static inline void dent_key_init_hash(const struct ubifs_info *c,
168 union ubifs_key *key, ino_t inum,
169 uint32_t hash)
170{
171 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
172 key->u32[0] = inum;
173 key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
174}
175
176/**
177 * dent_key_init_flash - initialize on-flash directory entry key.
178 * @c: UBIFS file-system description object
179 * @k: key to initialize
180 * @inum: parent inode number
181 * @nm: direntry name and length
182 */
183static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
184 ino_t inum, const struct qstr *nm)
185{
186 union ubifs_key *key = k;
187 uint32_t hash = c->key_hash(nm->name, nm->len);
188
189 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
190 key->j32[0] = cpu_to_le32(inum);
191 key->j32[1] = cpu_to_le32(hash |
192 (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS));
193 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
194}
195
196/**
197 * lowest_dent_key - get the lowest possible directory entry key.
198 * @c: UBIFS file-system description object
199 * @key: where to store the lowest key
200 * @inum: parent inode number
201 */
202static inline void lowest_dent_key(const struct ubifs_info *c,
203 union ubifs_key *key, ino_t inum)
204{
205 key->u32[0] = inum;
206 key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
207}
208
209/**
210 * xent_key_init - initialize extended attribute entry key.
211 * @c: UBIFS file-system description object
212 * @key: key to initialize
213 * @inum: host inode number
214 * @nm: extended attribute entry name and length
215 */
216static inline void xent_key_init(const struct ubifs_info *c,
217 union ubifs_key *key, ino_t inum,
218 const struct qstr *nm)
219{
220 uint32_t hash = c->key_hash(nm->name, nm->len);
221
222 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
223 key->u32[0] = inum;
224 key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
225}
226
227/**
228 * xent_key_init_hash - initialize extended attribute entry key without
229 * re-calculating hash function.
230 * @c: UBIFS file-system description object
231 * @key: key to initialize
232 * @inum: host inode number
233 * @hash: extended attribute entry name hash
234 */
235static inline void xent_key_init_hash(const struct ubifs_info *c,
236 union ubifs_key *key, ino_t inum,
237 uint32_t hash)
238{
239 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
240 key->u32[0] = inum;
241 key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
242}
243
244/**
245 * xent_key_init_flash - initialize on-flash extended attribute entry key.
246 * @c: UBIFS file-system description object
247 * @k: key to initialize
248 * @inum: host inode number
249 * @nm: extended attribute entry name and length
250 */
251static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
252 ino_t inum, const struct qstr *nm)
253{
254 union ubifs_key *key = k;
255 uint32_t hash = c->key_hash(nm->name, nm->len);
256
257 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
258 key->j32[0] = cpu_to_le32(inum);
259 key->j32[1] = cpu_to_le32(hash |
260 (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS));
261 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
262}
263
264/**
265 * lowest_xent_key - get the lowest possible extended attribute entry key.
266 * @c: UBIFS file-system description object
267 * @key: where to store the lowest key
268 * @inum: host inode number
269 */
270static inline void lowest_xent_key(const struct ubifs_info *c,
271 union ubifs_key *key, ino_t inum)
272{
273 key->u32[0] = inum;
274 key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
275}
276
277/**
278 * data_key_init - initialize data key.
279 * @c: UBIFS file-system description object
280 * @key: key to initialize
281 * @inum: inode number
282 * @block: block number
283 */
284static inline void data_key_init(const struct ubifs_info *c,
285 union ubifs_key *key, ino_t inum,
286 unsigned int block)
287{
288 ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
289 key->u32[0] = inum;
290 key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS);
291}
292
293/**
294 * data_key_init_flash - initialize on-flash data key.
295 * @c: UBIFS file-system description object
296 * @k: key to initialize
297 * @inum: inode number
298 * @block: block number
299 */
300static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
301 ino_t inum, unsigned int block)
302{
303 union ubifs_key *key = k;
304
305 ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
306 key->j32[0] = cpu_to_le32(inum);
307 key->j32[1] = cpu_to_le32(block |
308 (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
309 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
310}
311
312/**
313 * trun_key_init - initialize truncation node key.
314 * @c: UBIFS file-system description object
315 * @key: key to initialize
316 * @inum: inode number
317 *
318 * Note, UBIFS does not have truncation keys on the media and this function is
319 * only used for purposes of replay.
320 */
321static inline void trun_key_init(const struct ubifs_info *c,
322 union ubifs_key *key, ino_t inum)
323{
324 key->u32[0] = inum;
325 key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
326}
327
328/**
329 * key_type - get key type.
330 * @c: UBIFS file-system description object
331 * @key: key to get type of
332 */
333static inline int key_type(const struct ubifs_info *c,
334 const union ubifs_key *key)
335{
336 return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS;
337}
338
339/**
340 * key_type_flash - get type of a on-flash formatted key.
341 * @c: UBIFS file-system description object
342 * @k: key to get type of
343 */
344static inline int key_type_flash(const struct ubifs_info *c, const void *k)
345{
346 const union ubifs_key *key = k;
347
348 return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
349}
350
351/**
352 * key_inum - fetch inode number from key.
353 * @c: UBIFS file-system description object
354 * @k: key to fetch inode number from
355 */
356static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
357{
358 const union ubifs_key *key = k;
359
360 return key->u32[0];
361}
362
363/**
364 * key_inum_flash - fetch inode number from an on-flash formatted key.
365 * @c: UBIFS file-system description object
366 * @k: key to fetch inode number from
367 */
368static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
369{
370 const union ubifs_key *key = k;
371
372 return le32_to_cpu(key->j32[0]);
373}
374
375/**
376 * key_hash - get directory entry hash.
377 * @c: UBIFS file-system description object
378 * @key: the key to get hash from
379 */
380static inline int key_hash(const struct ubifs_info *c,
381 const union ubifs_key *key)
382{
383 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
384}
385
386/**
387 * key_hash_flash - get directory entry hash from an on-flash formatted key.
388 * @c: UBIFS file-system description object
389 * @k: the key to get hash from
390 */
391static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
392{
393 const union ubifs_key *key = k;
394
395 return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK;
396}
397
398/**
399 * key_block - get data block number.
400 * @c: UBIFS file-system description object
401 * @key: the key to get the block number from
402 */
403static inline unsigned int key_block(const struct ubifs_info *c,
404 const union ubifs_key *key)
405{
406 return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK;
407}
408
409/**
410 * key_block_flash - get data block number from an on-flash formatted key.
411 * @c: UBIFS file-system description object
412 * @k: the key to get the block number from
413 */
414static inline unsigned int key_block_flash(const struct ubifs_info *c,
415 const void *k)
416{
417 const union ubifs_key *key = k;
418
419 return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK;
420}
421
422/**
423 * key_read - transform a key to in-memory format.
424 * @c: UBIFS file-system description object
425 * @from: the key to transform
426 * @to: the key to store the result
427 */
428static inline void key_read(const struct ubifs_info *c, const void *from,
429 union ubifs_key *to)
430{
431 const union ubifs_key *f = from;
432
433 to->u32[0] = le32_to_cpu(f->j32[0]);
434 to->u32[1] = le32_to_cpu(f->j32[1]);
435}
436
437/**
438 * key_write - transform a key from in-memory format.
439 * @c: UBIFS file-system description object
440 * @from: the key to transform
441 * @to: the key to store the result
442 */
443static inline void key_write(const struct ubifs_info *c,
444 const union ubifs_key *from, void *to)
445{
446 union ubifs_key *t = to;
447
448 t->j32[0] = cpu_to_le32(from->u32[0]);
449 t->j32[1] = cpu_to_le32(from->u32[1]);
450 memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8);
451}
452
453/**
454 * key_write_idx - transform a key from in-memory format for the index.
455 * @c: UBIFS file-system description object
456 * @from: the key to transform
457 * @to: the key to store the result
458 */
459static inline void key_write_idx(const struct ubifs_info *c,
460 const union ubifs_key *from, void *to)
461{
462 union ubifs_key *t = to;
463
464 t->j32[0] = cpu_to_le32(from->u32[0]);
465 t->j32[1] = cpu_to_le32(from->u32[1]);
466}
467
468/**
469 * key_copy - copy a key.
470 * @c: UBIFS file-system description object
471 * @from: the key to copy from
472 * @to: the key to copy to
473 */
474static inline void key_copy(const struct ubifs_info *c,
475 const union ubifs_key *from, union ubifs_key *to)
476{
477 to->u64[0] = from->u64[0];
478}
479
480/**
481 * keys_cmp - compare keys.
482 * @c: UBIFS file-system description object
483 * @key1: the first key to compare
484 * @key2: the second key to compare
485 *
486 * This function compares 2 keys and returns %-1 if @key1 is less than
487 * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2.
488 */
489static inline int keys_cmp(const struct ubifs_info *c,
490 const union ubifs_key *key1,
491 const union ubifs_key *key2)
492{
493 if (key1->u32[0] < key2->u32[0])
494 return -1;
495 if (key1->u32[0] > key2->u32[0])
496 return 1;
497 if (key1->u32[1] < key2->u32[1])
498 return -1;
499 if (key1->u32[1] > key2->u32[1])
500 return 1;
501
502 return 0;
503}
504
505/**
506 * is_hash_key - is a key vulnerable to hash collisions.
507 * @c: UBIFS file-system description object
508 * @key: key
509 *
510 * This function returns %1 if @key is a hashed key or %0 otherwise.
511 */
512static inline int is_hash_key(const struct ubifs_info *c,
513 const union ubifs_key *key)
514{
515 int type = key_type(c, key);
516
517 return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY;
518}
519
520/**
521 * key_max_inode_size - get maximum file size allowed by current key format.
522 * @c: UBIFS file-system description object
523 */
524static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
525{
526 switch (c->key_fmt) {
527 case UBIFS_SIMPLE_KEY_FMT:
528 return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE;
529 default:
530 return 0;
531 }
532}
533#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
new file mode 100644
index 000000000000..36857b9ed59e
--- /dev/null
+++ b/fs/ubifs/log.c
@@ -0,0 +1,805 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file is a part of UBIFS journal implementation and contains various
25 * functions which manipulate the log. The log is a fixed area on the flash
26 * which does not contain any data but refers to buds. The log is a part of the
27 * journal.
28 */
29
30#include "ubifs.h"
31
32#ifdef CONFIG_UBIFS_FS_DEBUG
33static int dbg_check_bud_bytes(struct ubifs_info *c);
34#else
35#define dbg_check_bud_bytes(c) 0
36#endif
37
38/**
39 * ubifs_search_bud - search bud LEB.
40 * @c: UBIFS file-system description object
41 * @lnum: logical eraseblock number to search
42 *
43 * This function searches bud LEB @lnum. Returns bud description object in case
44 * of success and %NULL if there is no bud with this LEB number.
45 */
46struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum)
47{
48 struct rb_node *p;
49 struct ubifs_bud *bud;
50
51 spin_lock(&c->buds_lock);
52 p = c->buds.rb_node;
53 while (p) {
54 bud = rb_entry(p, struct ubifs_bud, rb);
55 if (lnum < bud->lnum)
56 p = p->rb_left;
57 else if (lnum > bud->lnum)
58 p = p->rb_right;
59 else {
60 spin_unlock(&c->buds_lock);
61 return bud;
62 }
63 }
64 spin_unlock(&c->buds_lock);
65 return NULL;
66}
67
68/**
69 * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one.
70 * @c: UBIFS file-system description object
71 * @lnum: logical eraseblock number to search
72 *
73 * This functions returns the wbuf for @lnum or %NULL if there is not one.
74 */
75struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
76{
77 struct rb_node *p;
78 struct ubifs_bud *bud;
79 int jhead;
80
81 if (!c->jheads)
82 return NULL;
83
84 spin_lock(&c->buds_lock);
85 p = c->buds.rb_node;
86 while (p) {
87 bud = rb_entry(p, struct ubifs_bud, rb);
88 if (lnum < bud->lnum)
89 p = p->rb_left;
90 else if (lnum > bud->lnum)
91 p = p->rb_right;
92 else {
93 jhead = bud->jhead;
94 spin_unlock(&c->buds_lock);
95 return &c->jheads[jhead].wbuf;
96 }
97 }
98 spin_unlock(&c->buds_lock);
99 return NULL;
100}
101
102/**
103 * next_log_lnum - switch to the next log LEB.
104 * @c: UBIFS file-system description object
105 * @lnum: current log LEB
106 */
107static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
108{
109 lnum += 1;
110 if (lnum > c->log_last)
111 lnum = UBIFS_LOG_LNUM;
112
113 return lnum;
114}
115
116/**
117 * empty_log_bytes - calculate amount of empty space in the log.
118 * @c: UBIFS file-system description object
119 */
120static inline long long empty_log_bytes(const struct ubifs_info *c)
121{
122 long long h, t;
123
124 h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
125 t = (long long)c->ltail_lnum * c->leb_size;
126
127 if (h >= t)
128 return c->log_bytes - h + t;
129 else
130 return t - h;
131}
132
133/**
134 * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list.
135 * @c: UBIFS file-system description object
136 * @bud: the bud to add
137 */
138void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
139{
140 struct rb_node **p, *parent = NULL;
141 struct ubifs_bud *b;
142 struct ubifs_jhead *jhead;
143
144 spin_lock(&c->buds_lock);
145 p = &c->buds.rb_node;
146 while (*p) {
147 parent = *p;
148 b = rb_entry(parent, struct ubifs_bud, rb);
149 ubifs_assert(bud->lnum != b->lnum);
150 if (bud->lnum < b->lnum)
151 p = &(*p)->rb_left;
152 else
153 p = &(*p)->rb_right;
154 }
155
156 rb_link_node(&bud->rb, parent, p);
157 rb_insert_color(&bud->rb, &c->buds);
158 if (c->jheads) {
159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
163
164 /*
165 * Note, although this is a new bud, we anyway account this space now,
166 * before any data has been written to it, because this is about to
167 * guarantee fixed mount time, and this bud will anyway be read and
168 * scanned.
169 */
170 c->bud_bytes += c->leb_size - bud->start;
171
172 dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
173 bud->start, bud->jhead, c->bud_bytes);
174 spin_unlock(&c->buds_lock);
175}
176
177/**
178 * ubifs_create_buds_lists - create journal head buds lists for remount rw.
179 * @c: UBIFS file-system description object
180 */
181void ubifs_create_buds_lists(struct ubifs_info *c)
182{
183 struct rb_node *p;
184
185 spin_lock(&c->buds_lock);
186 p = rb_first(&c->buds);
187 while (p) {
188 struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb);
189 struct ubifs_jhead *jhead = &c->jheads[bud->jhead];
190
191 list_add_tail(&bud->list, &jhead->buds_list);
192 p = rb_next(p);
193 }
194 spin_unlock(&c->buds_lock);
195}
196
/**
 * ubifs_add_bud_to_log - add a new bud to the log.
 * @c: UBIFS file-system description object
 * @jhead: journal head the bud belongs to
 * @lnum: LEB number of the bud
 * @offs: starting offset of the bud
 *
 * This function writes a reference node for the new bud LEB @lnum to the log,
 * and adds the bud to the buds tree. It also makes sure that the amount of
 * space in buds does not exceed the 'c->max_bud_bytes' limit. Returns zero in
 * case of success, %-EAGAIN if commit is required, and a negative error code
 * in case of failure.
 */
int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
{
	int err;
	struct ubifs_bud *bud;
	struct ubifs_ref_node *ref;

	/* Allocate everything before taking @c->log_mutex */
	bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
	if (!bud)
		return -ENOMEM;
	ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
	if (!ref) {
		kfree(bud);
		return -ENOMEM;
	}

	mutex_lock(&c->log_mutex);

	if (c->ro_media) {
		err = -EROFS;
		goto out_unlock;
	}

	/* Make sure we have enough space in the log */
	if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
		dbg_log("not enough log space - %lld, required %d",
			empty_log_bytes(c), c->min_log_bytes);
		ubifs_commit_required(c);
		err = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * Make sure the amount of space in buds will not exceed
	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
	 * limits.
	 *
	 * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
	 * because we are holding @c->log_mutex. All @c->bud_bytes changes take
	 * place when both @c->log_mutex and @c->buds_lock are locked.
	 */
	if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
		dbg_log("bud bytes %lld (%lld max), require commit",
			c->bud_bytes, c->max_bud_bytes);
		ubifs_commit_required(c);
		err = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If the journal is full enough - start background commit. Note, it is
	 * OK to read 'c->cmt_state' without spinlock because integer reads
	 * are atomic in the kernel.
	 */
	if (c->bud_bytes >= c->bg_bud_bytes &&
	    c->cmt_state == COMMIT_RESTING) {
		dbg_log("bud bytes %lld (%lld max), initiate BG commit",
			c->bud_bytes, c->max_bud_bytes);
		ubifs_request_bg_commit(c);
	}

	bud->lnum = lnum;
	bud->start = offs;
	bud->jhead = jhead;

	ref->ch.node_type = UBIFS_REF_NODE;
	ref->lnum = cpu_to_le32(bud->lnum);
	ref->offs = cpu_to_le32(bud->start);
	ref->jhead = cpu_to_le32(jhead);

	/* Switch to the next log LEB if the reference node does not fit */
	if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
		c->lhead_offs = 0;
	}

	if (c->lhead_offs == 0) {
		/* Must ensure next log LEB has been unmapped */
		err = ubifs_leb_unmap(c, c->lhead_lnum);
		if (err)
			goto out_unlock;
	}

	if (bud->start == 0) {
		/*
		 * Before writing the LEB reference which refers an empty LEB
		 * to the log, we have to make sure it is mapped, because
		 * otherwise we'd risk to refer an LEB with garbage in case of
		 * an unclean reboot, because the target LEB might have been
		 * unmapped, but not yet physically erased.
		 */
		err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
		if (err)
			goto out_unlock;
	}

	dbg_log("write ref LEB %d:%d",
		c->lhead_lnum, c->lhead_offs);
	err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
			       c->lhead_offs, UBI_SHORTTERM);
	if (err)
		goto out_unlock;

	c->lhead_offs += c->ref_node_alsz;

	/* The bud is referenced from the buds tree from now on - do not free */
	ubifs_add_bud(c, bud);

	mutex_unlock(&c->log_mutex);
	kfree(ref);
	return 0;

out_unlock:
	mutex_unlock(&c->log_mutex);
	kfree(ref);
	kfree(bud);
	return err;
}
325
326/**
327 * remove_buds - remove used buds.
328 * @c: UBIFS file-system description object
329 *
330 * This function removes use buds from the buds tree. It does not remove the
331 * buds which are pointed to by journal heads.
332 */
333static void remove_buds(struct ubifs_info *c)
334{
335 struct rb_node *p;
336
337 ubifs_assert(list_empty(&c->old_buds));
338 c->cmt_bud_bytes = 0;
339 spin_lock(&c->buds_lock);
340 p = rb_first(&c->buds);
341 while (p) {
342 struct rb_node *p1 = p;
343 struct ubifs_bud *bud;
344 struct ubifs_wbuf *wbuf;
345
346 p = rb_next(p);
347 bud = rb_entry(p1, struct ubifs_bud, rb);
348 wbuf = &c->jheads[bud->jhead].wbuf;
349
350 if (wbuf->lnum == bud->lnum) {
351 /*
352 * Do not remove buds which are pointed to by journal
353 * heads (non-closed buds).
354 */
355 c->cmt_bud_bytes += wbuf->offs - bud->start;
356 dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
357 "cmt_bud_bytes %lld", bud->lnum, bud->start,
358 bud->jhead, wbuf->offs - bud->start,
359 c->cmt_bud_bytes);
360 bud->start = wbuf->offs;
361 } else {
362 c->cmt_bud_bytes += c->leb_size - bud->start;
363 dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
364 "cmt_bud_bytes %lld", bud->lnum, bud->start,
365 bud->jhead, c->leb_size - bud->start,
366 c->cmt_bud_bytes);
367 rb_erase(p1, &c->buds);
368 list_del(&bud->list);
369 /*
370 * If the commit does not finish, the recovery will need
371 * to replay the journal, in which case the old buds
372 * must be unchanged. Do not release them until post
373 * commit i.e. do not allow them to be garbage
374 * collected.
375 */
376 list_add(&bud->list, &c->old_buds);
377 }
378 }
379 spin_unlock(&c->buds_lock);
380}
381
382/**
383 * ubifs_log_start_commit - start commit.
384 * @c: UBIFS file-system description object
385 * @ltail_lnum: return new log tail LEB number
386 *
387 * The commit operation starts with writing "commit start" node to the log and
388 * reference nodes for all journal heads which will define new journal after
389 * the commit has been finished. The commit start and reference nodes are
390 * written in one go to the nearest empty log LEB (hence, when commit is
391 * finished UBIFS may safely unmap all the previous log LEBs). This function
392 * returns zero in case of success and a negative error code in case of
393 * failure.
394 */
395int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
396{
397 void *buf;
398 struct ubifs_cs_node *cs;
399 struct ubifs_ref_node *ref;
400 int err, i, max_len, len;
401
402 err = dbg_check_bud_bytes(c);
403 if (err)
404 return err;
405
406 max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ;
407 max_len = ALIGN(max_len, c->min_io_size);
408 buf = cs = kmalloc(max_len, GFP_NOFS);
409 if (!buf)
410 return -ENOMEM;
411
412 cs->ch.node_type = UBIFS_CS_NODE;
413 cs->cmt_no = cpu_to_le64(c->cmt_no + 1);
414 ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
415
416 /*
417 * Note, we do not lock 'c->log_mutex' because this is the commit start
418 * phase and we are exclusively using the log. And we do not lock
419 * write-buffer because nobody can write to the file-system at this
420 * phase.
421 */
422
423 len = UBIFS_CS_NODE_SZ;
424 for (i = 0; i < c->jhead_cnt; i++) {
425 int lnum = c->jheads[i].wbuf.lnum;
426 int offs = c->jheads[i].wbuf.offs;
427
428 if (lnum == -1 || offs == c->leb_size)
429 continue;
430
431 dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
432 ref = buf + len;
433 ref->ch.node_type = UBIFS_REF_NODE;
434 ref->lnum = cpu_to_le32(lnum);
435 ref->offs = cpu_to_le32(offs);
436 ref->jhead = cpu_to_le32(i);
437
438 ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
439 len += UBIFS_REF_NODE_SZ;
440 }
441
442 ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
443
444 /* Switch to the next log LEB */
445 if (c->lhead_offs) {
446 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
447 c->lhead_offs = 0;
448 }
449
450 if (c->lhead_offs == 0) {
451 /* Must ensure next LEB has been unmapped */
452 err = ubifs_leb_unmap(c, c->lhead_lnum);
453 if (err)
454 goto out;
455 }
456
457 len = ALIGN(len, c->min_io_size);
458 dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
459 err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
460 if (err)
461 goto out;
462
463 *ltail_lnum = c->lhead_lnum;
464
465 c->lhead_offs += len;
466 if (c->lhead_offs == c->leb_size) {
467 c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
468 c->lhead_offs = 0;
469 }
470
471 remove_buds(c);
472
473 /*
474 * We have started the commit and now users may use the rest of the log
475 * for new writes.
476 */
477 c->min_log_bytes = 0;
478
479out:
480 kfree(buf);
481 return err;
482}
483
484/**
485 * ubifs_log_end_commit - end commit.
486 * @c: UBIFS file-system description object
487 * @ltail_lnum: new log tail LEB number
488 *
489 * This function is called on when the commit operation was finished. It
490 * moves log tail to new position and unmaps LEBs which contain obsolete data.
491 * Returns zero in case of success and a negative error code in case of
492 * failure.
493 */
494int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
495{
496 int err;
497
498 /*
499 * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
500 * writes during commit. Its only short "commit" start phase when
501 * writers are blocked.
502 */
503 mutex_lock(&c->log_mutex);
504
505 dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
506 c->ltail_lnum, ltail_lnum);
507
508 c->ltail_lnum = ltail_lnum;
509 /*
510 * The commit is finished and from now on it must be guaranteed that
511 * there is always enough space for the next commit.
512 */
513 c->min_log_bytes = c->leb_size;
514
515 spin_lock(&c->buds_lock);
516 c->bud_bytes -= c->cmt_bud_bytes;
517 spin_unlock(&c->buds_lock);
518
519 err = dbg_check_bud_bytes(c);
520
521 mutex_unlock(&c->log_mutex);
522 return err;
523}
524
525/**
526 * ubifs_log_post_commit - things to do after commit is completed.
527 * @c: UBIFS file-system description object
528 * @old_ltail_lnum: old log tail LEB number
529 *
530 * Release buds only after commit is completed, because they must be unchanged
531 * if recovery is needed.
532 *
533 * Unmap log LEBs only after commit is completed, because they may be needed for
534 * recovery.
535 *
536 * This function returns %0 on success and a negative error code on failure.
537 */
538int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
539{
540 int lnum, err = 0;
541
542 while (!list_empty(&c->old_buds)) {
543 struct ubifs_bud *bud;
544
545 bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
546 err = ubifs_return_leb(c, bud->lnum);
547 if (err)
548 return err;
549 list_del(&bud->list);
550 kfree(bud);
551 }
552 mutex_lock(&c->log_mutex);
553 for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
554 lnum = next_log_lnum(c, lnum)) {
555 dbg_log("unmap log LEB %d", lnum);
556 err = ubifs_leb_unmap(c, lnum);
557 if (err)
558 goto out;
559 }
560out:
561 mutex_unlock(&c->log_mutex);
562 return err;
563}
564
565/**
566 * struct done_ref - references that have been done.
567 * @rb: rb-tree node
568 * @lnum: LEB number
569 */
570struct done_ref {
571 struct rb_node rb;
572 int lnum;
573};
574
575/**
576 * done_already - determine if a reference has been done already.
577 * @done_tree: rb-tree to store references that have been done
578 * @lnum: LEB number of reference
579 *
580 * This function returns %1 if the reference has been done, %0 if not, otherwise
581 * a negative error code is returned.
582 */
583static int done_already(struct rb_root *done_tree, int lnum)
584{
585 struct rb_node **p = &done_tree->rb_node, *parent = NULL;
586 struct done_ref *dr;
587
588 while (*p) {
589 parent = *p;
590 dr = rb_entry(parent, struct done_ref, rb);
591 if (lnum < dr->lnum)
592 p = &(*p)->rb_left;
593 else if (lnum > dr->lnum)
594 p = &(*p)->rb_right;
595 else
596 return 1;
597 }
598
599 dr = kzalloc(sizeof(struct done_ref), GFP_NOFS);
600 if (!dr)
601 return -ENOMEM;
602
603 dr->lnum = lnum;
604
605 rb_link_node(&dr->rb, parent, p);
606 rb_insert_color(&dr->rb, done_tree);
607
608 return 0;
609}
610
611/**
612 * destroy_done_tree - destroy the done tree.
613 * @done_tree: done tree to destroy
614 */
615static void destroy_done_tree(struct rb_root *done_tree)
616{
617 struct rb_node *this = done_tree->rb_node;
618 struct done_ref *dr;
619
620 while (this) {
621 if (this->rb_left) {
622 this = this->rb_left;
623 continue;
624 } else if (this->rb_right) {
625 this = this->rb_right;
626 continue;
627 }
628 dr = rb_entry(this, struct done_ref, rb);
629 this = rb_parent(this);
630 if (this) {
631 if (this->rb_left == &dr->rb)
632 this->rb_left = NULL;
633 else
634 this->rb_right = NULL;
635 }
636 kfree(dr);
637 }
638}
639
640/**
641 * add_node - add a node to the consolidated log.
642 * @c: UBIFS file-system description object
643 * @buf: buffer to which to add
644 * @lnum: LEB number to which to write is passed and returned here
645 * @offs: offset to where to write is passed and returned here
646 * @node: node to add
647 *
648 * This function returns %0 on success and a negative error code on failure.
649 */
650static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
651 void *node)
652{
653 struct ubifs_ch *ch = node;
654 int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs;
655
656 if (len > remains) {
657 int sz = ALIGN(*offs, c->min_io_size), err;
658
659 ubifs_pad(c, buf + *offs, sz - *offs);
660 err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
661 if (err)
662 return err;
663 *lnum = next_log_lnum(c, *lnum);
664 *offs = 0;
665 }
666 memcpy(buf + *offs, node, len);
667 *offs += ALIGN(len, 8);
668 return 0;
669}
670
671/**
672 * ubifs_consolidate_log - consolidate the log.
673 * @c: UBIFS file-system description object
674 *
675 * Repeated failed commits could cause the log to be full, but at least 1 LEB is
676 * needed for commit. This function rewrites the reference nodes in the log
677 * omitting duplicates, and failed CS nodes, and leaving no gaps.
678 *
679 * This function returns %0 on success and a negative error code on failure.
680 */
681int ubifs_consolidate_log(struct ubifs_info *c)
682{
683 struct ubifs_scan_leb *sleb;
684 struct ubifs_scan_node *snod;
685 struct rb_root done_tree = RB_ROOT;
686 int lnum, err, first = 1, write_lnum, offs = 0;
687 void *buf;
688
689 dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
690 c->lhead_lnum);
691 buf = vmalloc(c->leb_size);
692 if (!buf)
693 return -ENOMEM;
694 lnum = c->ltail_lnum;
695 write_lnum = lnum;
696 while (1) {
697 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
698 if (IS_ERR(sleb)) {
699 err = PTR_ERR(sleb);
700 goto out_free;
701 }
702 list_for_each_entry(snod, &sleb->nodes, list) {
703 switch (snod->type) {
704 case UBIFS_REF_NODE: {
705 struct ubifs_ref_node *ref = snod->node;
706 int ref_lnum = le32_to_cpu(ref->lnum);
707
708 err = done_already(&done_tree, ref_lnum);
709 if (err < 0)
710 goto out_scan;
711 if (err != 1) {
712 err = add_node(c, buf, &write_lnum,
713 &offs, snod->node);
714 if (err)
715 goto out_scan;
716 }
717 break;
718 }
719 case UBIFS_CS_NODE:
720 if (!first)
721 break;
722 err = add_node(c, buf, &write_lnum, &offs,
723 snod->node);
724 if (err)
725 goto out_scan;
726 first = 0;
727 break;
728 }
729 }
730 ubifs_scan_destroy(sleb);
731 if (lnum == c->lhead_lnum)
732 break;
733 lnum = next_log_lnum(c, lnum);
734 }
735 if (offs) {
736 int sz = ALIGN(offs, c->min_io_size);
737
738 ubifs_pad(c, buf + offs, sz - offs);
739 err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
740 if (err)
741 goto out_free;
742 offs = ALIGN(offs, c->min_io_size);
743 }
744 destroy_done_tree(&done_tree);
745 vfree(buf);
746 if (write_lnum == c->lhead_lnum) {
747 ubifs_err("log is too full");
748 return -EINVAL;
749 }
750 /* Unmap remaining LEBs */
751 lnum = write_lnum;
752 do {
753 lnum = next_log_lnum(c, lnum);
754 err = ubifs_leb_unmap(c, lnum);
755 if (err)
756 return err;
757 } while (lnum != c->lhead_lnum);
758 c->lhead_lnum = write_lnum;
759 c->lhead_offs = offs;
760 dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
761 return 0;
762
763out_scan:
764 ubifs_scan_destroy(sleb);
765out_free:
766 destroy_done_tree(&done_tree);
767 vfree(buf);
768 return err;
769}
770
771#ifdef CONFIG_UBIFS_FS_DEBUG
772
773/**
774 * dbg_check_bud_bytes - make sure bud bytes calculation are all right.
775 * @c: UBIFS file-system description object
776 *
777 * This function makes sure the amount of flash space used by closed buds
778 * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in
779 * case of failure.
780 */
781static int dbg_check_bud_bytes(struct ubifs_info *c)
782{
783 int i, err = 0;
784 struct ubifs_bud *bud;
785 long long bud_bytes = 0;
786
787 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
788 return 0;
789
790 spin_lock(&c->buds_lock);
791 for (i = 0; i < c->jhead_cnt; i++)
792 list_for_each_entry(bud, &c->jheads[i].buds_list, list)
793 bud_bytes += c->leb_size - bud->start;
794
795 if (c->bud_bytes != bud_bytes) {
796 ubifs_err("bad bud_bytes %lld, calculated %lld",
797 c->bud_bytes, bud_bytes);
798 err = -EINVAL;
799 }
800 spin_unlock(&c->buds_lock);
801
802 return err;
803}
804
805#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
new file mode 100644
index 000000000000..2ba93da71b65
--- /dev/null
+++ b/fs/ubifs/lprops.c
@@ -0,0 +1,1357 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the functions that access LEB properties and their
25 * categories. LEBs are categorized based on the needs of UBIFS, and the
26 * categories are stored as either heaps or lists to provide a fast way of
27 * finding a LEB in a particular category. For example, UBIFS may need to find
28 * an empty LEB for the journal, or a very dirty LEB for garbage collection.
29 */
30
31#include "ubifs.h"
32
33/**
34 * get_heap_comp_val - get the LEB properties value for heap comparisons.
35 * @lprops: LEB properties
36 * @cat: LEB category
37 */
38static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat)
39{
40 switch (cat) {
41 case LPROPS_FREE:
42 return lprops->free;
43 case LPROPS_DIRTY_IDX:
44 return lprops->free + lprops->dirty;
45 default:
46 return lprops->dirty;
47 }
48}
49
50/**
51 * move_up_lpt_heap - move a new heap entry up as far as possible.
52 * @c: UBIFS file-system description object
53 * @heap: LEB category heap
54 * @lprops: LEB properties to move
55 * @cat: LEB category
56 *
57 * New entries to a heap are added at the bottom and then moved up until the
58 * parent's value is greater. In the case of LPT's category heaps, the value
59 * is either the amount of free space or the amount of dirty space, depending
60 * on the category.
61 */
62static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
63 struct ubifs_lprops *lprops, int cat)
64{
65 int val1, val2, hpos;
66
67 hpos = lprops->hpos;
68 if (!hpos)
69 return; /* Already top of the heap */
70 val1 = get_heap_comp_val(lprops, cat);
71 /* Compare to parent and, if greater, move up the heap */
72 do {
73 int ppos = (hpos - 1) / 2;
74
75 val2 = get_heap_comp_val(heap->arr[ppos], cat);
76 if (val2 >= val1)
77 return;
78 /* Greater than parent so move up */
79 heap->arr[ppos]->hpos = hpos;
80 heap->arr[hpos] = heap->arr[ppos];
81 heap->arr[ppos] = lprops;
82 lprops->hpos = ppos;
83 hpos = ppos;
84 } while (hpos);
85}
86
87/**
88 * adjust_lpt_heap - move a changed heap entry up or down the heap.
89 * @c: UBIFS file-system description object
90 * @heap: LEB category heap
91 * @lprops: LEB properties to move
92 * @hpos: heap position of @lprops
93 * @cat: LEB category
94 *
95 * Changed entries in a heap are moved up or down until the parent's value is
96 * greater. In the case of LPT's category heaps, the value is either the amount
97 * of free space or the amount of dirty space, depending on the category.
98 */
99static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
100 struct ubifs_lprops *lprops, int hpos, int cat)
101{
102 int val1, val2, val3, cpos;
103
104 val1 = get_heap_comp_val(lprops, cat);
105 /* Compare to parent and, if greater than parent, move up the heap */
106 if (hpos) {
107 int ppos = (hpos - 1) / 2;
108
109 val2 = get_heap_comp_val(heap->arr[ppos], cat);
110 if (val1 > val2) {
111 /* Greater than parent so move up */
112 while (1) {
113 heap->arr[ppos]->hpos = hpos;
114 heap->arr[hpos] = heap->arr[ppos];
115 heap->arr[ppos] = lprops;
116 lprops->hpos = ppos;
117 hpos = ppos;
118 if (!hpos)
119 return;
120 ppos = (hpos - 1) / 2;
121 val2 = get_heap_comp_val(heap->arr[ppos], cat);
122 if (val1 <= val2)
123 return;
124 /* Still greater than parent so keep going */
125 }
126 }
127 }
128 /* Not greater than parent, so compare to children */
129 while (1) {
130 /* Compare to left child */
131 cpos = hpos * 2 + 1;
132 if (cpos >= heap->cnt)
133 return;
134 val2 = get_heap_comp_val(heap->arr[cpos], cat);
135 if (val1 < val2) {
136 /* Less than left child, so promote biggest child */
137 if (cpos + 1 < heap->cnt) {
138 val3 = get_heap_comp_val(heap->arr[cpos + 1],
139 cat);
140 if (val3 > val2)
141 cpos += 1; /* Right child is bigger */
142 }
143 heap->arr[cpos]->hpos = hpos;
144 heap->arr[hpos] = heap->arr[cpos];
145 heap->arr[cpos] = lprops;
146 lprops->hpos = cpos;
147 hpos = cpos;
148 continue;
149 }
150 /* Compare to right child */
151 cpos += 1;
152 if (cpos >= heap->cnt)
153 return;
154 val3 = get_heap_comp_val(heap->arr[cpos], cat);
155 if (val1 < val3) {
156 /* Less than right child, so promote right child */
157 heap->arr[cpos]->hpos = hpos;
158 heap->arr[hpos] = heap->arr[cpos];
159 heap->arr[cpos] = lprops;
160 lprops->hpos = cpos;
161 hpos = cpos;
162 continue;
163 }
164 return;
165 }
166}
167
168/**
169 * add_to_lpt_heap - add LEB properties to a LEB category heap.
170 * @c: UBIFS file-system description object
171 * @lprops: LEB properties to add
172 * @cat: LEB category
173 *
174 * This function returns %1 if @lprops is added to the heap for LEB category
175 * @cat, otherwise %0 is returned because the heap is full.
176 */
177static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops,
178 int cat)
179{
180 struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
181
182 if (heap->cnt >= heap->max_cnt) {
183 const int b = LPT_HEAP_SZ / 2 - 1;
184 int cpos, val1, val2;
185
186 /* Compare to some other LEB on the bottom of heap */
187 /* Pick a position kind of randomly */
188 cpos = (((size_t)lprops >> 4) & b) + b;
189 ubifs_assert(cpos >= b);
190 ubifs_assert(cpos < LPT_HEAP_SZ);
191 ubifs_assert(cpos < heap->cnt);
192
193 val1 = get_heap_comp_val(lprops, cat);
194 val2 = get_heap_comp_val(heap->arr[cpos], cat);
195 if (val1 > val2) {
196 struct ubifs_lprops *lp;
197
198 lp = heap->arr[cpos];
199 lp->flags &= ~LPROPS_CAT_MASK;
200 lp->flags |= LPROPS_UNCAT;
201 list_add(&lp->list, &c->uncat_list);
202 lprops->hpos = cpos;
203 heap->arr[cpos] = lprops;
204 move_up_lpt_heap(c, heap, lprops, cat);
205 dbg_check_heap(c, heap, cat, lprops->hpos);
206 return 1; /* Added to heap */
207 }
208 dbg_check_heap(c, heap, cat, -1);
209 return 0; /* Not added to heap */
210 } else {
211 lprops->hpos = heap->cnt++;
212 heap->arr[lprops->hpos] = lprops;
213 move_up_lpt_heap(c, heap, lprops, cat);
214 dbg_check_heap(c, heap, cat, lprops->hpos);
215 return 1; /* Added to heap */
216 }
217}
218
219/**
220 * remove_from_lpt_heap - remove LEB properties from a LEB category heap.
221 * @c: UBIFS file-system description object
222 * @lprops: LEB properties to remove
223 * @cat: LEB category
224 */
225static void remove_from_lpt_heap(struct ubifs_info *c,
226 struct ubifs_lprops *lprops, int cat)
227{
228 struct ubifs_lpt_heap *heap;
229 int hpos = lprops->hpos;
230
231 heap = &c->lpt_heap[cat - 1];
232 ubifs_assert(hpos >= 0 && hpos < heap->cnt);
233 ubifs_assert(heap->arr[hpos] == lprops);
234 heap->cnt -= 1;
235 if (hpos < heap->cnt) {
236 heap->arr[hpos] = heap->arr[heap->cnt];
237 heap->arr[hpos]->hpos = hpos;
238 adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat);
239 }
240 dbg_check_heap(c, heap, cat, -1);
241}
242
243/**
244 * lpt_heap_replace - replace lprops in a category heap.
245 * @c: UBIFS file-system description object
246 * @old_lprops: LEB properties to replace
247 * @new_lprops: LEB properties with which to replace
248 * @cat: LEB category
249 *
250 * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
251 * and the lprops that the pnode contains. When that happens, references in
252 * the category heaps to those lprops must be updated to point to the new
253 * lprops. This function does that.
254 */
255static void lpt_heap_replace(struct ubifs_info *c,
256 struct ubifs_lprops *old_lprops,
257 struct ubifs_lprops *new_lprops, int cat)
258{
259 struct ubifs_lpt_heap *heap;
260 int hpos = new_lprops->hpos;
261
262 heap = &c->lpt_heap[cat - 1];
263 heap->arr[hpos] = new_lprops;
264}
265
266/**
267 * ubifs_add_to_cat - add LEB properties to a category list or heap.
268 * @c: UBIFS file-system description object
269 * @lprops: LEB properties to add
270 * @cat: LEB category to which to add
271 *
272 * LEB properties are categorized to enable fast find operations.
273 */
274void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
275 int cat)
276{
277 switch (cat) {
278 case LPROPS_DIRTY:
279 case LPROPS_DIRTY_IDX:
280 case LPROPS_FREE:
281 if (add_to_lpt_heap(c, lprops, cat))
282 break;
283 /* No more room on heap so make it uncategorized */
284 cat = LPROPS_UNCAT;
285 /* Fall through */
286 case LPROPS_UNCAT:
287 list_add(&lprops->list, &c->uncat_list);
288 break;
289 case LPROPS_EMPTY:
290 list_add(&lprops->list, &c->empty_list);
291 break;
292 case LPROPS_FREEABLE:
293 list_add(&lprops->list, &c->freeable_list);
294 c->freeable_cnt += 1;
295 break;
296 case LPROPS_FRDI_IDX:
297 list_add(&lprops->list, &c->frdi_idx_list);
298 break;
299 default:
300 ubifs_assert(0);
301 }
302 lprops->flags &= ~LPROPS_CAT_MASK;
303 lprops->flags |= cat;
304}
305
306/**
307 * ubifs_remove_from_cat - remove LEB properties from a category list or heap.
308 * @c: UBIFS file-system description object
309 * @lprops: LEB properties to remove
310 * @cat: LEB category from which to remove
311 *
312 * LEB properties are categorized to enable fast find operations.
313 */
314static void ubifs_remove_from_cat(struct ubifs_info *c,
315 struct ubifs_lprops *lprops, int cat)
316{
317 switch (cat) {
318 case LPROPS_DIRTY:
319 case LPROPS_DIRTY_IDX:
320 case LPROPS_FREE:
321 remove_from_lpt_heap(c, lprops, cat);
322 break;
323 case LPROPS_FREEABLE:
324 c->freeable_cnt -= 1;
325 ubifs_assert(c->freeable_cnt >= 0);
326 /* Fall through */
327 case LPROPS_UNCAT:
328 case LPROPS_EMPTY:
329 case LPROPS_FRDI_IDX:
330 ubifs_assert(!list_empty(&lprops->list));
331 list_del(&lprops->list);
332 break;
333 default:
334 ubifs_assert(0);
335 }
336}
337
338/**
339 * ubifs_replace_cat - replace lprops in a category list or heap.
340 * @c: UBIFS file-system description object
341 * @old_lprops: LEB properties to replace
342 * @new_lprops: LEB properties with which to replace
343 *
344 * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
345 * and the lprops that the pnode contains. When that happens, references in
346 * category lists and heaps must be replaced. This function does that.
347 */
348void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
349 struct ubifs_lprops *new_lprops)
350{
351 int cat;
352
353 cat = new_lprops->flags & LPROPS_CAT_MASK;
354 switch (cat) {
355 case LPROPS_DIRTY:
356 case LPROPS_DIRTY_IDX:
357 case LPROPS_FREE:
358 lpt_heap_replace(c, old_lprops, new_lprops, cat);
359 break;
360 case LPROPS_UNCAT:
361 case LPROPS_EMPTY:
362 case LPROPS_FREEABLE:
363 case LPROPS_FRDI_IDX:
364 list_replace(&old_lprops->list, &new_lprops->list);
365 break;
366 default:
367 ubifs_assert(0);
368 }
369}
370
371/**
372 * ubifs_ensure_cat - ensure LEB properties are categorized.
373 * @c: UBIFS file-system description object
374 * @lprops: LEB properties
375 *
376 * A LEB may have fallen off of the bottom of a heap, and ended up as
377 * uncategorized even though it has enough space for us now. If that is the case
378 * this function will put the LEB back onto a heap.
379 */
380void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
381{
382 int cat = lprops->flags & LPROPS_CAT_MASK;
383
384 if (cat != LPROPS_UNCAT)
385 return;
386 cat = ubifs_categorize_lprops(c, lprops);
387 if (cat == LPROPS_UNCAT)
388 return;
389 ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT);
390 ubifs_add_to_cat(c, lprops, cat);
391}
392
393/**
394 * ubifs_categorize_lprops - categorize LEB properties.
395 * @c: UBIFS file-system description object
396 * @lprops: LEB properties to categorize
397 *
398 * LEB properties are categorized to enable fast find operations. This function
399 * returns the LEB category to which the LEB properties belong. Note however
400 * that if the LEB category is stored as a heap and the heap is full, the
401 * LEB properties may have their category changed to %LPROPS_UNCAT.
402 */
403int ubifs_categorize_lprops(const struct ubifs_info *c,
404 const struct ubifs_lprops *lprops)
405{
406 if (lprops->flags & LPROPS_TAKEN)
407 return LPROPS_UNCAT;
408
409 if (lprops->free == c->leb_size) {
410 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
411 return LPROPS_EMPTY;
412 }
413
414 if (lprops->free + lprops->dirty == c->leb_size) {
415 if (lprops->flags & LPROPS_INDEX)
416 return LPROPS_FRDI_IDX;
417 else
418 return LPROPS_FREEABLE;
419 }
420
421 if (lprops->flags & LPROPS_INDEX) {
422 if (lprops->dirty + lprops->free >= c->min_idx_node_sz)
423 return LPROPS_DIRTY_IDX;
424 } else {
425 if (lprops->dirty >= c->dead_wm &&
426 lprops->dirty > lprops->free)
427 return LPROPS_DIRTY;
428 if (lprops->free > 0)
429 return LPROPS_FREE;
430 }
431
432 return LPROPS_UNCAT;
433}
434
435/**
436 * change_category - change LEB properties category.
437 * @c: UBIFS file-system description object
438 * @lprops: LEB properties to recategorize
439 *
440 * LEB properties are categorized to enable fast find operations. When the LEB
441 * properties change they must be recategorized.
442 */
443static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
444{
445 int old_cat = lprops->flags & LPROPS_CAT_MASK;
446 int new_cat = ubifs_categorize_lprops(c, lprops);
447
448 if (old_cat == new_cat) {
449 struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1];
450
451 /* lprops on a heap now must be moved up or down */
452 if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
453 return; /* Not on a heap */
454 heap = &c->lpt_heap[new_cat - 1];
455 adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat);
456 } else {
457 ubifs_remove_from_cat(c, lprops, old_cat);
458 ubifs_add_to_cat(c, lprops, new_cat);
459 }
460}
461
462/**
463 * ubifs_get_lprops - get reference to LEB properties.
464 * @c: the UBIFS file-system description object
465 *
466 * This function locks lprops. Lprops have to be unlocked by
467 * 'ubifs_release_lprops()'.
468 */
469void ubifs_get_lprops(struct ubifs_info *c)
470{
471 mutex_lock(&c->lp_mutex);
472}
473
474/**
475 * calc_dark - calculate LEB dark space size.
476 * @c: the UBIFS file-system description object
477 * @spc: amount of free and dirty space in the LEB
478 *
479 * This function calculates amount of dark space in an LEB which has @spc bytes
480 * of free and dirty space. Returns the calculations result.
481 *
482 * Dark space is the space which is not always usable - it depends on which
483 * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
484 * it is dark space, because it cannot fit a large data node. So UBIFS cannot
485 * count on this LEB and treat these 512 bytes as usable because it is not true
486 * if, for example, only big chunks of uncompressible data will be written to
487 * the FS.
488 */
489static int calc_dark(struct ubifs_info *c, int spc)
490{
491 ubifs_assert(!(spc & 7));
492
493 if (spc < c->dark_wm)
494 return spc;
495
496 /*
497 * If we have slightly more space then the dark space watermark, we can
498 * anyway safely assume it we'll be able to write a node of the
499 * smallest size there.
500 */
501 if (spc - c->dark_wm < MIN_WRITE_SZ)
502 return spc - MIN_WRITE_SZ;
503
504 return c->dark_wm;
505}
506
507/**
508 * is_lprops_dirty - determine if LEB properties are dirty.
509 * @c: the UBIFS file-system description object
510 * @lprops: LEB properties to test
511 */
512static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
513{
514 struct ubifs_pnode *pnode;
515 int pos;
516
517 pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
518 pnode = (struct ubifs_pnode *)container_of(lprops - pos,
519 struct ubifs_pnode,
520 lprops[0]);
521 return !test_bit(COW_ZNODE, &pnode->flags) &&
522 test_bit(DIRTY_CNODE, &pnode->flags);
523}
524
525/**
526 * ubifs_change_lp - change LEB properties.
527 * @c: the UBIFS file-system description object
528 * @lp: LEB properties to change
529 * @free: new free space amount
530 * @dirty: new dirty space amount
531 * @flags: new flags
532 * @idx_gc_cnt: change to the count of idx_gc list
533 *
534 * This function changes LEB properties. This function does not change a LEB
535 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
536 *
537 * This function returns a pointer to the updated LEB properties on success
538 * and a negative error code on failure. N.B. the LEB properties may have had to
539 * be copied (due to COW) and consequently the pointer returned may not be the
540 * same as the pointer passed.
541 */
542const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
543 const struct ubifs_lprops *lp,
544 int free, int dirty, int flags,
545 int idx_gc_cnt)
546{
547 /*
548 * This is the only function that is allowed to change lprops, so we
549 * discard the const qualifier.
550 */
551 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
552
553 dbg_lp("LEB %d, free %d, dirty %d, flags %d",
554 lprops->lnum, free, dirty, flags);
555
556 ubifs_assert(mutex_is_locked(&c->lp_mutex));
557 ubifs_assert(c->lst.empty_lebs >= 0 &&
558 c->lst.empty_lebs <= c->main_lebs);
559 ubifs_assert(c->freeable_cnt >= 0);
560 ubifs_assert(c->freeable_cnt <= c->main_lebs);
561 ubifs_assert(c->lst.taken_empty_lebs >= 0);
562 ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs);
563 ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7));
564 ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7));
565 ubifs_assert(!(c->lst.total_used & 7));
566 ubifs_assert(free == LPROPS_NC || free >= 0);
567 ubifs_assert(dirty == LPROPS_NC || dirty >= 0);
568
569 if (!is_lprops_dirty(c, lprops)) {
570 lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum);
571 if (IS_ERR(lprops))
572 return lprops;
573 } else
574 ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum));
575
576 ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
577
578 spin_lock(&c->space_lock);
579
580 if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
581 c->lst.taken_empty_lebs -= 1;
582
583 if (!(lprops->flags & LPROPS_INDEX)) {
584 int old_spc;
585
586 old_spc = lprops->free + lprops->dirty;
587 if (old_spc < c->dead_wm)
588 c->lst.total_dead -= old_spc;
589 else
590 c->lst.total_dark -= calc_dark(c, old_spc);
591
592 c->lst.total_used -= c->leb_size - old_spc;
593 }
594
595 if (free != LPROPS_NC) {
596 free = ALIGN(free, 8);
597 c->lst.total_free += free - lprops->free;
598
599 /* Increase or decrease empty LEBs counter if needed */
600 if (free == c->leb_size) {
601 if (lprops->free != c->leb_size)
602 c->lst.empty_lebs += 1;
603 } else if (lprops->free == c->leb_size)
604 c->lst.empty_lebs -= 1;
605 lprops->free = free;
606 }
607
608 if (dirty != LPROPS_NC) {
609 dirty = ALIGN(dirty, 8);
610 c->lst.total_dirty += dirty - lprops->dirty;
611 lprops->dirty = dirty;
612 }
613
614 if (flags != LPROPS_NC) {
615 /* Take care about indexing LEBs counter if needed */
616 if ((lprops->flags & LPROPS_INDEX)) {
617 if (!(flags & LPROPS_INDEX))
618 c->lst.idx_lebs -= 1;
619 } else if (flags & LPROPS_INDEX)
620 c->lst.idx_lebs += 1;
621 lprops->flags = flags;
622 }
623
624 if (!(lprops->flags & LPROPS_INDEX)) {
625 int new_spc;
626
627 new_spc = lprops->free + lprops->dirty;
628 if (new_spc < c->dead_wm)
629 c->lst.total_dead += new_spc;
630 else
631 c->lst.total_dark += calc_dark(c, new_spc);
632
633 c->lst.total_used += c->leb_size - new_spc;
634 }
635
636 if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
637 c->lst.taken_empty_lebs += 1;
638
639 change_category(c, lprops);
640
641 c->idx_gc_cnt += idx_gc_cnt;
642
643 spin_unlock(&c->space_lock);
644
645 return lprops;
646}
647
648/**
649 * ubifs_release_lprops - release lprops lock.
650 * @c: the UBIFS file-system description object
651 *
652 * This function has to be called after each 'ubifs_get_lprops()' call to
653 * unlock lprops.
654 */
655void ubifs_release_lprops(struct ubifs_info *c)
656{
657 ubifs_assert(mutex_is_locked(&c->lp_mutex));
658 ubifs_assert(c->lst.empty_lebs >= 0 &&
659 c->lst.empty_lebs <= c->main_lebs);
660
661 mutex_unlock(&c->lp_mutex);
662}
663
664/**
665 * ubifs_get_lp_stats - get lprops statistics.
666 * @c: UBIFS file-system description object
667 * @st: return statistics
668 */
669void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
670{
671 spin_lock(&c->space_lock);
672 memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
673 spin_unlock(&c->space_lock);
674}
675
676/**
677 * ubifs_change_one_lp - change LEB properties.
678 * @c: the UBIFS file-system description object
679 * @lnum: LEB to change properties for
680 * @free: amount of free space
681 * @dirty: amount of dirty space
682 * @flags_set: flags to set
683 * @flags_clean: flags to clean
684 * @idx_gc_cnt: change to the count of idx_gc list
685 *
686 * This function changes properties of LEB @lnum. It is a helper wrapper over
687 * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the
688 * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and
689 * a negative error code in case of failure.
690 */
691int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
692 int flags_set, int flags_clean, int idx_gc_cnt)
693{
694 int err = 0, flags;
695 const struct ubifs_lprops *lp;
696
697 ubifs_get_lprops(c);
698
699 lp = ubifs_lpt_lookup_dirty(c, lnum);
700 if (IS_ERR(lp)) {
701 err = PTR_ERR(lp);
702 goto out;
703 }
704
705 flags = (lp->flags | flags_set) & ~flags_clean;
706 lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt);
707 if (IS_ERR(lp))
708 err = PTR_ERR(lp);
709
710out:
711 ubifs_release_lprops(c);
712 return err;
713}
714
715/**
716 * ubifs_update_one_lp - update LEB properties.
717 * @c: the UBIFS file-system description object
718 * @lnum: LEB to change properties for
719 * @free: amount of free space
720 * @dirty: amount of dirty space to add
721 * @flags_set: flags to set
722 * @flags_clean: flags to clean
723 *
724 * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to
725 * current dirty space, not substitutes it.
726 */
727int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
728 int flags_set, int flags_clean)
729{
730 int err = 0, flags;
731 const struct ubifs_lprops *lp;
732
733 ubifs_get_lprops(c);
734
735 lp = ubifs_lpt_lookup_dirty(c, lnum);
736 if (IS_ERR(lp)) {
737 err = PTR_ERR(lp);
738 goto out;
739 }
740
741 flags = (lp->flags | flags_set) & ~flags_clean;
742 lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0);
743 if (IS_ERR(lp))
744 err = PTR_ERR(lp);
745
746out:
747 ubifs_release_lprops(c);
748 return err;
749}
750
751/**
752 * ubifs_read_one_lp - read LEB properties.
753 * @c: the UBIFS file-system description object
754 * @lnum: LEB to read properties for
755 * @lp: where to store read properties
756 *
757 * This helper function reads properties of a LEB @lnum and stores them in @lp.
758 * Returns zero in case of success and a negative error code in case of
759 * failure.
760 */
761int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
762{
763 int err = 0;
764 const struct ubifs_lprops *lpp;
765
766 ubifs_get_lprops(c);
767
768 lpp = ubifs_lpt_lookup(c, lnum);
769 if (IS_ERR(lpp)) {
770 err = PTR_ERR(lpp);
771 goto out;
772 }
773
774 memcpy(lp, lpp, sizeof(struct ubifs_lprops));
775
776out:
777 ubifs_release_lprops(c);
778 return err;
779}
780
781/**
782 * ubifs_fast_find_free - try to find a LEB with free space quickly.
783 * @c: the UBIFS file-system description object
784 *
785 * This function returns LEB properties for a LEB with free space or %NULL if
786 * the function is unable to find a LEB quickly.
787 */
788const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c)
789{
790 struct ubifs_lprops *lprops;
791 struct ubifs_lpt_heap *heap;
792
793 ubifs_assert(mutex_is_locked(&c->lp_mutex));
794
795 heap = &c->lpt_heap[LPROPS_FREE - 1];
796 if (heap->cnt == 0)
797 return NULL;
798
799 lprops = heap->arr[0];
800 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
801 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
802 return lprops;
803}
804
805/**
806 * ubifs_fast_find_empty - try to find an empty LEB quickly.
807 * @c: the UBIFS file-system description object
808 *
809 * This function returns LEB properties for an empty LEB or %NULL if the
810 * function is unable to find an empty LEB quickly.
811 */
812const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c)
813{
814 struct ubifs_lprops *lprops;
815
816 ubifs_assert(mutex_is_locked(&c->lp_mutex));
817
818 if (list_empty(&c->empty_list))
819 return NULL;
820
821 lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list);
822 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
823 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
824 ubifs_assert(lprops->free == c->leb_size);
825 return lprops;
826}
827
828/**
829 * ubifs_fast_find_freeable - try to find a freeable LEB quickly.
830 * @c: the UBIFS file-system description object
831 *
832 * This function returns LEB properties for a freeable LEB or %NULL if the
833 * function is unable to find a freeable LEB quickly.
834 */
835const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c)
836{
837 struct ubifs_lprops *lprops;
838
839 ubifs_assert(mutex_is_locked(&c->lp_mutex));
840
841 if (list_empty(&c->freeable_list))
842 return NULL;
843
844 lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list);
845 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
846 ubifs_assert(!(lprops->flags & LPROPS_INDEX));
847 ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
848 ubifs_assert(c->freeable_cnt > 0);
849 return lprops;
850}
851
852/**
853 * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly.
854 * @c: the UBIFS file-system description object
855 *
856 * This function returns LEB properties for a freeable index LEB or %NULL if the
857 * function is unable to find a freeable index LEB quickly.
858 */
859const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c)
860{
861 struct ubifs_lprops *lprops;
862
863 ubifs_assert(mutex_is_locked(&c->lp_mutex));
864
865 if (list_empty(&c->frdi_idx_list))
866 return NULL;
867
868 lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list);
869 ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
870 ubifs_assert((lprops->flags & LPROPS_INDEX));
871 ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
872 return lprops;
873}
874
875#ifdef CONFIG_UBIFS_FS_DEBUG
876
877/**
878 * dbg_check_cats - check category heaps and lists.
879 * @c: UBIFS file-system description object
880 *
881 * This function returns %0 on success and a negative error code on failure.
882 */
883int dbg_check_cats(struct ubifs_info *c)
884{
885 struct ubifs_lprops *lprops;
886 struct list_head *pos;
887 int i, cat;
888
889 if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
890 return 0;
891
892 list_for_each_entry(lprops, &c->empty_list, list) {
893 if (lprops->free != c->leb_size) {
894 ubifs_err("non-empty LEB %d on empty list "
895 "(free %d dirty %d flags %d)", lprops->lnum,
896 lprops->free, lprops->dirty, lprops->flags);
897 return -EINVAL;
898 }
899 if (lprops->flags & LPROPS_TAKEN) {
900 ubifs_err("taken LEB %d on empty list "
901 "(free %d dirty %d flags %d)", lprops->lnum,
902 lprops->free, lprops->dirty, lprops->flags);
903 return -EINVAL;
904 }
905 }
906
907 i = 0;
908 list_for_each_entry(lprops, &c->freeable_list, list) {
909 if (lprops->free + lprops->dirty != c->leb_size) {
910 ubifs_err("non-freeable LEB %d on freeable list "
911 "(free %d dirty %d flags %d)", lprops->lnum,
912 lprops->free, lprops->dirty, lprops->flags);
913 return -EINVAL;
914 }
915 if (lprops->flags & LPROPS_TAKEN) {
916 ubifs_err("taken LEB %d on freeable list "
917 "(free %d dirty %d flags %d)", lprops->lnum,
918 lprops->free, lprops->dirty, lprops->flags);
919 return -EINVAL;
920 }
921 i += 1;
922 }
923 if (i != c->freeable_cnt) {
924 ubifs_err("freeable list count %d expected %d", i,
925 c->freeable_cnt);
926 return -EINVAL;
927 }
928
929 i = 0;
930 list_for_each(pos, &c->idx_gc)
931 i += 1;
932 if (i != c->idx_gc_cnt) {
933 ubifs_err("idx_gc list count %d expected %d", i,
934 c->idx_gc_cnt);
935 return -EINVAL;
936 }
937
938 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
939 if (lprops->free + lprops->dirty != c->leb_size) {
940 ubifs_err("non-freeable LEB %d on frdi_idx list "
941 "(free %d dirty %d flags %d)", lprops->lnum,
942 lprops->free, lprops->dirty, lprops->flags);
943 return -EINVAL;
944 }
945 if (lprops->flags & LPROPS_TAKEN) {
946 ubifs_err("taken LEB %d on frdi_idx list "
947 "(free %d dirty %d flags %d)", lprops->lnum,
948 lprops->free, lprops->dirty, lprops->flags);
949 return -EINVAL;
950 }
951 if (!(lprops->flags & LPROPS_INDEX)) {
952 ubifs_err("non-index LEB %d on frdi_idx list "
953 "(free %d dirty %d flags %d)", lprops->lnum,
954 lprops->free, lprops->dirty, lprops->flags);
955 return -EINVAL;
956 }
957 }
958
959 for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) {
960 struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
961
962 for (i = 0; i < heap->cnt; i++) {
963 lprops = heap->arr[i];
964 if (!lprops) {
965 ubifs_err("null ptr in LPT heap cat %d", cat);
966 return -EINVAL;
967 }
968 if (lprops->hpos != i) {
969 ubifs_err("bad ptr in LPT heap cat %d", cat);
970 return -EINVAL;
971 }
972 if (lprops->flags & LPROPS_TAKEN) {
973 ubifs_err("taken LEB in LPT heap cat %d", cat);
974 return -EINVAL;
975 }
976 }
977 }
978
979 return 0;
980}
981
982void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
983 int add_pos)
984{
985 int i = 0, j, err = 0;
986
987 if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
988 return;
989
990 for (i = 0; i < heap->cnt; i++) {
991 struct ubifs_lprops *lprops = heap->arr[i];
992 struct ubifs_lprops *lp;
993
994 if (i != add_pos)
995 if ((lprops->flags & LPROPS_CAT_MASK) != cat) {
996 err = 1;
997 goto out;
998 }
999 if (lprops->hpos != i) {
1000 err = 2;
1001 goto out;
1002 }
1003 lp = ubifs_lpt_lookup(c, lprops->lnum);
1004 if (IS_ERR(lp)) {
1005 err = 3;
1006 goto out;
1007 }
1008 if (lprops != lp) {
1009 dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
1010 (size_t)lprops, (size_t)lp, lprops->lnum,
1011 lp->lnum);
1012 err = 4;
1013 goto out;
1014 }
1015 for (j = 0; j < i; j++) {
1016 lp = heap->arr[j];
1017 if (lp == lprops) {
1018 err = 5;
1019 goto out;
1020 }
1021 if (lp->lnum == lprops->lnum) {
1022 err = 6;
1023 goto out;
1024 }
1025 }
1026 }
1027out:
1028 if (err) {
1029 dbg_msg("failed cat %d hpos %d err %d", cat, i, err);
1030 dbg_dump_stack();
1031 dbg_dump_heap(c, heap, cat);
1032 }
1033}
1034
/**
 * struct scan_check_data - data provided to scan callback function.
 * @lst: LEB properties statistics accumulated by the callback while scanning,
 *       later compared against the statistics kept in the file-system
 *       description object
 * @err: error code set by the callback when it stops the scan, zero otherwise
 */
struct scan_check_data {
	struct ubifs_lp_stats lst;
	int err;
};
1044
1045/**
1046 * scan_check_cb - scan callback.
1047 * @c: the UBIFS file-system description object
1048 * @lp: LEB properties to scan
1049 * @in_tree: whether the LEB properties are in main memory
1050 * @data: information passed to and from the caller of the scan
1051 *
1052 * This function returns a code that indicates whether the scan should continue
1053 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
1054 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
1055 * (%LPT_SCAN_STOP).
1056 */
1057static int scan_check_cb(struct ubifs_info *c,
1058 const struct ubifs_lprops *lp, int in_tree,
1059 struct scan_check_data *data)
1060{
1061 struct ubifs_scan_leb *sleb;
1062 struct ubifs_scan_node *snod;
1063 struct ubifs_lp_stats *lst = &data->lst;
1064 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;
1065
1066 cat = lp->flags & LPROPS_CAT_MASK;
1067 if (cat != LPROPS_UNCAT) {
1068 cat = ubifs_categorize_lprops(c, lp);
1069 if (cat != (lp->flags & LPROPS_CAT_MASK)) {
1070 ubifs_err("bad LEB category %d expected %d",
1071 (lp->flags & LPROPS_CAT_MASK), cat);
1072 goto out;
1073 }
1074 }
1075
1076 /* Check lp is on its category list (if it has one) */
1077 if (in_tree) {
1078 struct list_head *list = NULL;
1079
1080 switch (cat) {
1081 case LPROPS_EMPTY:
1082 list = &c->empty_list;
1083 break;
1084 case LPROPS_FREEABLE:
1085 list = &c->freeable_list;
1086 break;
1087 case LPROPS_FRDI_IDX:
1088 list = &c->frdi_idx_list;
1089 break;
1090 case LPROPS_UNCAT:
1091 list = &c->uncat_list;
1092 break;
1093 }
1094 if (list) {
1095 struct ubifs_lprops *lprops;
1096 int found = 0;
1097
1098 list_for_each_entry(lprops, list, list) {
1099 if (lprops == lp) {
1100 found = 1;
1101 break;
1102 }
1103 }
1104 if (!found) {
1105 ubifs_err("bad LPT list (category %d)", cat);
1106 goto out;
1107 }
1108 }
1109 }
1110
1111 /* Check lp is on its category heap (if it has one) */
1112 if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) {
1113 struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
1114
1115 if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
1116 lp != heap->arr[lp->hpos]) {
1117 ubifs_err("bad LPT heap (category %d)", cat);
1118 goto out;
1119 }
1120 }
1121
1122 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
1123 if (IS_ERR(sleb)) {
1124 /*
1125 * After an unclean unmount, empty and freeable LEBs
1126 * may contain garbage.
1127 */
1128 if (lp->free == c->leb_size) {
1129 ubifs_err("scan errors were in empty LEB "
1130 "- continuing checking");
1131 lst->empty_lebs += 1;
1132 lst->total_free += c->leb_size;
1133 lst->total_dark += calc_dark(c, c->leb_size);
1134 return LPT_SCAN_CONTINUE;
1135 }
1136
1137 if (lp->free + lp->dirty == c->leb_size &&
1138 !(lp->flags & LPROPS_INDEX)) {
1139 ubifs_err("scan errors were in freeable LEB "
1140 "- continuing checking");
1141 lst->total_free += lp->free;
1142 lst->total_dirty += lp->dirty;
1143 lst->total_dark += calc_dark(c, c->leb_size);
1144 return LPT_SCAN_CONTINUE;
1145 }
1146 data->err = PTR_ERR(sleb);
1147 return LPT_SCAN_STOP;
1148 }
1149
1150 is_idx = -1;
1151 list_for_each_entry(snod, &sleb->nodes, list) {
1152 int found, level = 0;
1153
1154 cond_resched();
1155
1156 if (is_idx == -1)
1157 is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;
1158
1159 if (is_idx && snod->type != UBIFS_IDX_NODE) {
1160 ubifs_err("indexing node in data LEB %d:%d",
1161 lnum, snod->offs);
1162 goto out_destroy;
1163 }
1164
1165 if (snod->type == UBIFS_IDX_NODE) {
1166 struct ubifs_idx_node *idx = snod->node;
1167
1168 key_read(c, ubifs_idx_key(c, idx), &snod->key);
1169 level = le16_to_cpu(idx->level);
1170 }
1171
1172 found = ubifs_tnc_has_node(c, &snod->key, level, lnum,
1173 snod->offs, is_idx);
1174 if (found) {
1175 if (found < 0)
1176 goto out_destroy;
1177 used += ALIGN(snod->len, 8);
1178 }
1179 }
1180
1181 free = c->leb_size - sleb->endpt;
1182 dirty = sleb->endpt - used;
1183
1184 if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
1185 dirty < 0) {
1186 ubifs_err("bad calculated accounting for LEB %d: "
1187 "free %d, dirty %d", lnum, free, dirty);
1188 goto out_destroy;
1189 }
1190
1191 if (lp->free + lp->dirty == c->leb_size &&
1192 free + dirty == c->leb_size)
1193 if ((is_idx && !(lp->flags & LPROPS_INDEX)) ||
1194 (!is_idx && free == c->leb_size) ||
1195 lp->free == c->leb_size) {
1196 /*
1197 * Empty or freeable LEBs could contain index
1198 * nodes from an uncompleted commit due to an
1199 * unclean unmount. Or they could be empty for
1200 * the same reason. Or it may simply not have been
1201 * unmapped.
1202 */
1203 free = lp->free;
1204 dirty = lp->dirty;
1205 is_idx = 0;
1206 }
1207
1208 if (is_idx && lp->free + lp->dirty == free + dirty &&
1209 lnum != c->ihead_lnum) {
1210 /*
1211 * After an unclean unmount, an index LEB could have a different
1212 * amount of free space than the value recorded by lprops. That
1213 * is because the in-the-gaps method may use free space or
1214 * create free space (as a side-effect of using ubi_leb_change
1215 * and not writing the whole LEB). The incorrect free space
1216 * value is not a problem because the index is only ever
1217 * allocated empty LEBs, so there will never be an attempt to
1218 * write to the free space at the end of an index LEB - except
1219 * by the in-the-gaps method for which it is not a problem.
1220 */
1221 free = lp->free;
1222 dirty = lp->dirty;
1223 }
1224
1225 if (lp->free != free || lp->dirty != dirty)
1226 goto out_print;
1227
1228 if (is_idx && !(lp->flags & LPROPS_INDEX)) {
1229 if (free == c->leb_size)
1230 /* Free but not unmapped LEB, it's fine */
1231 is_idx = 0;
1232 else {
1233 ubifs_err("indexing node without indexing "
1234 "flag");
1235 goto out_print;
1236 }
1237 }
1238
1239 if (!is_idx && (lp->flags & LPROPS_INDEX)) {
1240 ubifs_err("data node with indexing flag");
1241 goto out_print;
1242 }
1243
1244 if (free == c->leb_size)
1245 lst->empty_lebs += 1;
1246
1247 if (is_idx)
1248 lst->idx_lebs += 1;
1249
1250 if (!(lp->flags & LPROPS_INDEX))
1251 lst->total_used += c->leb_size - free - dirty;
1252 lst->total_free += free;
1253 lst->total_dirty += dirty;
1254
1255 if (!(lp->flags & LPROPS_INDEX)) {
1256 int spc = free + dirty;
1257
1258 if (spc < c->dead_wm)
1259 lst->total_dead += spc;
1260 else
1261 lst->total_dark += calc_dark(c, spc);
1262 }
1263
1264 ubifs_scan_destroy(sleb);
1265
1266 return LPT_SCAN_CONTINUE;
1267
1268out_print:
1269 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
1270 "should be free %d, dirty %d",
1271 lnum, lp->free, lp->dirty, lp->flags, free, dirty);
1272 dbg_dump_leb(c, lnum);
1273out_destroy:
1274 ubifs_scan_destroy(sleb);
1275out:
1276 data->err = -EINVAL;
1277 return LPT_SCAN_STOP;
1278}
1279
1280/**
1281 * dbg_check_lprops - check all LEB properties.
1282 * @c: UBIFS file-system description object
1283 *
1284 * This function checks all LEB properties and makes sure they are all correct.
1285 * It returns zero if everything is fine, %-EINVAL if there is an inconsistency
1286 * and other negative error codes in case of other errors. This function is
1287 * called while the file system is locked (because of commit start), so no
1288 * additional locking is required. Note that locking the LPT mutex would cause
1289 * a circular lock dependency with the TNC mutex.
1290 */
1291int dbg_check_lprops(struct ubifs_info *c)
1292{
1293 int i, err;
1294 struct scan_check_data data;
1295 struct ubifs_lp_stats *lst = &data.lst;
1296
1297 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1298 return 0;
1299
1300 /*
1301 * As we are going to scan the media, the write buffers have to be
1302 * synchronized.
1303 */
1304 for (i = 0; i < c->jhead_cnt; i++) {
1305 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
1306 if (err)
1307 return err;
1308 }
1309
1310 memset(lst, 0, sizeof(struct ubifs_lp_stats));
1311
1312 data.err = 0;
1313 err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
1314 (ubifs_lpt_scan_callback)scan_check_cb,
1315 &data);
1316 if (err && err != -ENOSPC)
1317 goto out;
1318 if (data.err) {
1319 err = data.err;
1320 goto out;
1321 }
1322
1323 if (lst->empty_lebs != c->lst.empty_lebs ||
1324 lst->idx_lebs != c->lst.idx_lebs ||
1325 lst->total_free != c->lst.total_free ||
1326 lst->total_dirty != c->lst.total_dirty ||
1327 lst->total_used != c->lst.total_used) {
1328 ubifs_err("bad overall accounting");
1329 ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
1330 "total_free %lld, total_dirty %lld, total_used %lld",
1331 lst->empty_lebs, lst->idx_lebs, lst->total_free,
1332 lst->total_dirty, lst->total_used);
1333 ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
1334 "total_free %lld, total_dirty %lld, total_used %lld",
1335 c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
1336 c->lst.total_dirty, c->lst.total_used);
1337 err = -EINVAL;
1338 goto out;
1339 }
1340
1341 if (lst->total_dead != c->lst.total_dead ||
1342 lst->total_dark != c->lst.total_dark) {
1343 ubifs_err("bad dead/dark space accounting");
1344 ubifs_err("calculated: total_dead %lld, total_dark %lld",
1345 lst->total_dead, lst->total_dark);
1346 ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
1347 c->lst.total_dead, c->lst.total_dark);
1348 err = -EINVAL;
1349 goto out;
1350 }
1351
1352 err = dbg_check_cats(c);
1353out:
1354 return err;
1355}
1356
1357#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
new file mode 100644
index 000000000000..9ff2463177e5
--- /dev/null
+++ b/fs/ubifs/lpt.c
@@ -0,0 +1,2243 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the LEB properties tree (LPT) area. The LPT area
25 * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and
26 * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits
27 * between the log and the orphan area.
28 *
29 * The LPT area is like a miniature self-contained file system. It is required
30 * that it never runs out of space, is fast to access and update, and scales
31 * logarithmically. The LEB properties tree is implemented as a wandering tree
32 * much like the TNC, and the LPT area has its own garbage collection.
33 *
34 * The LPT has two slightly different forms called the "small model" and the
35 * "big model". The small model is used when the entire LEB properties table
36 * can be written into a single eraseblock. In that case, garbage collection
37 * consists of just writing the whole table, which therefore makes all other
38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
39 * selected for garbage collection, which consists of marking the nodes in
40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
41 * the case of the big model, a table of LEB numbers is saved so that the entire
42 * LPT does not need to be scanned looking for empty eraseblocks when UBIFS is first
43 * mounted.
44 */
45
46#include <linux/crc16.h>
47#include "ubifs.h"
48
49/**
50 * do_calc_lpt_geom - calculate sizes for the LPT area.
51 * @c: the UBIFS file-system description object
52 *
53 * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
54 * properties of the flash and whether LPT is "big" (c->big_lpt).
55 */
56static void do_calc_lpt_geom(struct ubifs_info *c)
57{
58 int i, n, bits, per_leb_wastage, max_pnode_cnt;
59 long long sz, tot_wastage;
60
61 n = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
62 max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
63
64 c->lpt_hght = 1;
65 n = UBIFS_LPT_FANOUT;
66 while (n < max_pnode_cnt) {
67 c->lpt_hght += 1;
68 n <<= UBIFS_LPT_FANOUT_SHIFT;
69 }
70
71 c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
72
73 n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
74 c->nnode_cnt = n;
75 for (i = 1; i < c->lpt_hght; i++) {
76 n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
77 c->nnode_cnt += n;
78 }
79
80 c->space_bits = fls(c->leb_size) - 3;
81 c->lpt_lnum_bits = fls(c->lpt_lebs);
82 c->lpt_offs_bits = fls(c->leb_size - 1);
83 c->lpt_spc_bits = fls(c->leb_size);
84
85 n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
86 c->pcnt_bits = fls(n - 1);
87
88 c->lnum_bits = fls(c->max_leb_cnt - 1);
89
90 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
91 (c->big_lpt ? c->pcnt_bits : 0) +
92 (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
93 c->pnode_sz = (bits + 7) / 8;
94
95 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
96 (c->big_lpt ? c->pcnt_bits : 0) +
97 (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
98 c->nnode_sz = (bits + 7) / 8;
99
100 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
101 c->lpt_lebs * c->lpt_spc_bits * 2;
102 c->ltab_sz = (bits + 7) / 8;
103
104 bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
105 c->lnum_bits * c->lsave_cnt;
106 c->lsave_sz = (bits + 7) / 8;
107
108 /* Calculate the minimum LPT size */
109 c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
110 c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
111 c->lpt_sz += c->ltab_sz;
112 c->lpt_sz += c->lsave_sz;
113
114 /* Add wastage */
115 sz = c->lpt_sz;
116 per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz);
117 sz += per_leb_wastage;
118 tot_wastage = per_leb_wastage;
119 while (sz > c->leb_size) {
120 sz += per_leb_wastage;
121 sz -= c->leb_size;
122 tot_wastage += per_leb_wastage;
123 }
124 tot_wastage += ALIGN(sz, c->min_io_size) - sz;
125 c->lpt_sz += tot_wastage;
126}
127
128/**
129 * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
130 * @c: the UBIFS file-system description object
131 *
132 * This function returns %0 on success and a negative error code on failure.
133 */
134int ubifs_calc_lpt_geom(struct ubifs_info *c)
135{
136 int lebs_needed;
137 uint64_t sz;
138
139 do_calc_lpt_geom(c);
140
141 /* Verify that lpt_lebs is big enough */
142 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
143 sz += c->leb_size - 1;
144 do_div(sz, c->leb_size);
145 lebs_needed = sz;
146 if (lebs_needed > c->lpt_lebs) {
147 ubifs_err("too few LPT LEBs");
148 return -EINVAL;
149 }
150
151 /* Verify that ltab fits in a single LEB (since ltab is a single node */
152 if (c->ltab_sz > c->leb_size) {
153 ubifs_err("LPT ltab too big");
154 return -EINVAL;
155 }
156
157 c->check_lpt_free = c->big_lpt;
158
159 return 0;
160}
161
162/**
163 * calc_dflt_lpt_geom - calculate default LPT geometry.
164 * @c: the UBIFS file-system description object
165 * @main_lebs: number of main area LEBs is passed and returned here
166 * @big_lpt: whether the LPT area is "big" is returned here
167 *
168 * The size of the LPT area depends on parameters that themselves are dependent
169 * on the size of the LPT area. This function, successively recalculates the LPT
170 * area geometry until the parameters and resultant geometry are consistent.
171 *
172 * This function returns %0 on success and a negative error code on failure.
173 */
174static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
175 int *big_lpt)
176{
177 int i, lebs_needed;
178 uint64_t sz;
179
180 /* Start by assuming the minimum number of LPT LEBs */
181 c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
182 c->main_lebs = *main_lebs - c->lpt_lebs;
183 if (c->main_lebs <= 0)
184 return -EINVAL;
185
186 /* And assume we will use the small LPT model */
187 c->big_lpt = 0;
188
189 /*
190 * Calculate the geometry based on assumptions above and then see if it
191 * makes sense
192 */
193 do_calc_lpt_geom(c);
194
195 /* Small LPT model must have lpt_sz < leb_size */
196 if (c->lpt_sz > c->leb_size) {
197 /* Nope, so try again using big LPT model */
198 c->big_lpt = 1;
199 do_calc_lpt_geom(c);
200 }
201
202 /* Now check there are enough LPT LEBs */
203 for (i = 0; i < 64 ; i++) {
204 sz = c->lpt_sz * 4; /* Allow 4 times the size */
205 sz += c->leb_size - 1;
206 do_div(sz, c->leb_size);
207 lebs_needed = sz;
208 if (lebs_needed > c->lpt_lebs) {
209 /* Not enough LPT LEBs so try again with more */
210 c->lpt_lebs = lebs_needed;
211 c->main_lebs = *main_lebs - c->lpt_lebs;
212 if (c->main_lebs <= 0)
213 return -EINVAL;
214 do_calc_lpt_geom(c);
215 continue;
216 }
217 if (c->ltab_sz > c->leb_size) {
218 ubifs_err("LPT ltab too big");
219 return -EINVAL;
220 }
221 *main_lebs = c->main_lebs;
222 *big_lpt = c->big_lpt;
223 return 0;
224 }
225 return -EINVAL;
226}
227
/**
 * pack_bits - pack bit fields end-to-end.
 * @addr: address at which to pack (passed and next address returned)
 * @pos: bit position at which to pack (passed and next position returned)
 * @val: value to pack
 * @nrbits: number of bits of value to pack (1-32)
 *
 * The value is stored least-significant bits first, starting at bit @pos of
 * the byte at *@addr. When @pos is non-zero the first byte is OR-ed into, so
 * its bits at and above @pos must be zero on entry; all following bytes are
 * overwritten. On return *@addr and *@pos identify the first bit after the
 * packed field.
 */
static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
{
	uint8_t *p = *addr;
	int b = *pos;
	int done;

	ubifs_assert(nrbits > 0);
	ubifs_assert(nrbits <= 32);
	ubifs_assert(*pos >= 0);
	ubifs_assert(*pos < 8);
	/*
	 * Test for the full-width case first: shifting a 32-bit value by 32
	 * is undefined behaviour, so the shift must not be evaluated then.
	 */
	ubifs_assert(nrbits == 32 || (val >> nrbits) == 0);

	if (b) {
		/* Share the partially filled byte with the previous field */
		*p |= ((uint8_t)val) << b;
		nrbits += b;
		val >>= (8 - b);
	} else {
		*p = (uint8_t)val;
		val >>= 8;
	}

	/* Emit the remaining bytes; 'done' counts bits covered so far */
	for (done = 8; done < nrbits; done += 8) {
		*++p = (uint8_t)val;
		val >>= 8;
	}

	/* If the field ended on a byte boundary, start on a fresh byte */
	b = nrbits & 7;
	if (b == 0)
		p++;
	*addr = p;
	*pos = b;
}
276
/**
 * ubifs_unpack_bits - unpack bit fields.
 * @addr: address at which to unpack (passed and next address returned)
 * @pos: bit position at which to unpack (passed and next position returned)
 * @nrbits: number of bits of value to unpack (1-32)
 *
 * Reads a field of @nrbits bits that starts at bit @pos of the byte at *@addr
 * and is stored least-significant bits first. Only the bytes that the field
 * actually spans are read, so a field ending at the very end of a buffer does
 * not cause a read beyond it. On return *@addr and *@pos identify the first
 * bit after the field.
 *
 * This functions returns the value unpacked.
 */
uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
{
	const int k = 32 - nrbits;
	uint8_t *p = *addr;
	int b = *pos;
	uint32_t val = 0;
	int i, bytes;

	ubifs_assert(nrbits > 0);
	ubifs_assert(nrbits <= 32);
	ubifs_assert(*pos >= 0);
	ubifs_assert(*pos < 8);

	/* Number of bytes touched by the field, including a partial first */
	bytes = (nrbits + b + 7) >> 3;

	if (b) {
		/* Bytes after the first one supply the high-order part */
		for (i = 1; i < bytes; i++)
			val |= (uint32_t)p[i] << (8 * (i - 1));
		val <<= (8 - b);
		val |= *p >> b;
		nrbits += b;
	} else {
		for (i = 0; i < bytes; i++)
			val |= (uint32_t)p[i] << (8 * i);
	}

	/* Discard everything above the requested number of bits */
	val <<= k;
	val >>= k;

	b = nrbits & 7;
	p += nrbits / 8;
	*addr = p;
	*pos = b;
	/*
	 * The result must fit in the requested width. A full 32-bit field
	 * (k == 0) always fits and must not be shifted by 32, which would be
	 * undefined behaviour.
	 */
	ubifs_assert(k == 0 || (val >> (32 - k)) == 0);
	return val;
}
314
315/**
316 * ubifs_pack_pnode - pack all the bit fields of a pnode.
317 * @c: UBIFS file-system description object
318 * @buf: buffer into which to pack
319 * @pnode: pnode to pack
320 */
321void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
322 struct ubifs_pnode *pnode)
323{
324 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
325 int i, pos = 0;
326 uint16_t crc;
327
328 pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS);
329 if (c->big_lpt)
330 pack_bits(&addr, &pos, pnode->num, c->pcnt_bits);
331 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
332 pack_bits(&addr, &pos, pnode->lprops[i].free >> 3,
333 c->space_bits);
334 pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3,
335 c->space_bits);
336 if (pnode->lprops[i].flags & LPROPS_INDEX)
337 pack_bits(&addr, &pos, 1, 1);
338 else
339 pack_bits(&addr, &pos, 0, 1);
340 }
341 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
342 c->pnode_sz - UBIFS_LPT_CRC_BYTES);
343 addr = buf;
344 pos = 0;
345 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
346}
347
348/**
349 * ubifs_pack_nnode - pack all the bit fields of a nnode.
350 * @c: UBIFS file-system description object
351 * @buf: buffer into which to pack
352 * @nnode: nnode to pack
353 */
354void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
355 struct ubifs_nnode *nnode)
356{
357 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
358 int i, pos = 0;
359 uint16_t crc;
360
361 pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS);
362 if (c->big_lpt)
363 pack_bits(&addr, &pos, nnode->num, c->pcnt_bits);
364 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
365 int lnum = nnode->nbranch[i].lnum;
366
367 if (lnum == 0)
368 lnum = c->lpt_last + 1;
369 pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits);
370 pack_bits(&addr, &pos, nnode->nbranch[i].offs,
371 c->lpt_offs_bits);
372 }
373 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
374 c->nnode_sz - UBIFS_LPT_CRC_BYTES);
375 addr = buf;
376 pos = 0;
377 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
378}
379
380/**
381 * ubifs_pack_ltab - pack the LPT's own lprops table.
382 * @c: UBIFS file-system description object
383 * @buf: buffer into which to pack
384 * @ltab: LPT's own lprops table to pack
385 */
386void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
387 struct ubifs_lpt_lprops *ltab)
388{
389 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
390 int i, pos = 0;
391 uint16_t crc;
392
393 pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS);
394 for (i = 0; i < c->lpt_lebs; i++) {
395 pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits);
396 pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits);
397 }
398 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
399 c->ltab_sz - UBIFS_LPT_CRC_BYTES);
400 addr = buf;
401 pos = 0;
402 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
403}
404
405/**
406 * ubifs_pack_lsave - pack the LPT's save table.
407 * @c: UBIFS file-system description object
408 * @buf: buffer into which to pack
409 * @lsave: LPT's save table to pack
410 */
411void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave)
412{
413 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
414 int i, pos = 0;
415 uint16_t crc;
416
417 pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS);
418 for (i = 0; i < c->lsave_cnt; i++)
419 pack_bits(&addr, &pos, lsave[i], c->lnum_bits);
420 crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
421 c->lsave_sz - UBIFS_LPT_CRC_BYTES);
422 addr = buf;
423 pos = 0;
424 pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
425}
426
427/**
428 * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties.
429 * @c: UBIFS file-system description object
430 * @lnum: LEB number to which to add dirty space
431 * @dirty: amount of dirty space to add
432 */
433void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty)
434{
435 if (!dirty || !lnum)
436 return;
437 dbg_lp("LEB %d add %d to %d",
438 lnum, dirty, c->ltab[lnum - c->lpt_first].dirty);
439 ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
440 c->ltab[lnum - c->lpt_first].dirty += dirty;
441}
442
443/**
444 * set_ltab - set LPT LEB properties.
445 * @c: UBIFS file-system description object
446 * @lnum: LEB number
447 * @free: amount of free space
448 * @dirty: amount of dirty space
449 */
450static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
451{
452 dbg_lp("LEB %d free %d dirty %d to %d %d",
453 lnum, c->ltab[lnum - c->lpt_first].free,
454 c->ltab[lnum - c->lpt_first].dirty, free, dirty);
455 ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
456 c->ltab[lnum - c->lpt_first].free = free;
457 c->ltab[lnum - c->lpt_first].dirty = dirty;
458}
459
460/**
461 * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties.
462 * @c: UBIFS file-system description object
463 * @nnode: nnode for which to add dirt
464 */
465void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode)
466{
467 struct ubifs_nnode *np = nnode->parent;
468
469 if (np)
470 ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum,
471 c->nnode_sz);
472 else {
473 ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz);
474 if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
475 c->lpt_drty_flgs |= LTAB_DIRTY;
476 ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
477 }
478 }
479}
480
481/**
482 * add_pnode_dirt - add dirty space to LPT LEB properties.
483 * @c: UBIFS file-system description object
484 * @pnode: pnode for which to add dirt
485 */
486static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
487{
488 ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
489 c->pnode_sz);
490}
491
492/**
493 * calc_nnode_num - calculate nnode number.
494 * @row: the row in the tree (root is zero)
495 * @col: the column in the row (leftmost is zero)
496 *
497 * The nnode number is a number that uniquely identifies a nnode and can be used
498 * easily to traverse the tree from the root to that nnode.
499 *
500 * This function calculates and returns the nnode number for the nnode at @row
501 * and @col.
502 */
503static int calc_nnode_num(int row, int col)
504{
505 int num, bits;
506
507 num = 1;
508 while (row--) {
509 bits = (col & (UBIFS_LPT_FANOUT - 1));
510 col >>= UBIFS_LPT_FANOUT_SHIFT;
511 num <<= UBIFS_LPT_FANOUT_SHIFT;
512 num |= bits;
513 }
514 return num;
515}
516
517/**
518 * calc_nnode_num_from_parent - calculate nnode number.
519 * @c: UBIFS file-system description object
520 * @parent: parent nnode
521 * @iip: index in parent
522 *
523 * The nnode number is a number that uniquely identifies a nnode and can be used
524 * easily to traverse the tree from the root to that nnode.
525 *
526 * This function calculates and returns the nnode number based on the parent's
527 * nnode number and the index in parent.
528 */
529static int calc_nnode_num_from_parent(struct ubifs_info *c,
530 struct ubifs_nnode *parent, int iip)
531{
532 int num, shft;
533
534 if (!parent)
535 return 1;
536 shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT;
537 num = parent->num ^ (1 << shft);
538 num |= (UBIFS_LPT_FANOUT + iip) << shft;
539 return num;
540}
541
542/**
543 * calc_pnode_num_from_parent - calculate pnode number.
544 * @c: UBIFS file-system description object
545 * @parent: parent nnode
546 * @iip: index in parent
547 *
548 * The pnode number is a number that uniquely identifies a pnode and can be used
549 * easily to traverse the tree from the root to that pnode.
550 *
551 * This function calculates and returns the pnode number based on the parent's
552 * nnode number and the index in parent.
553 */
554static int calc_pnode_num_from_parent(struct ubifs_info *c,
555 struct ubifs_nnode *parent, int iip)
556{
557 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
558
559 for (i = 0; i < n; i++) {
560 num <<= UBIFS_LPT_FANOUT_SHIFT;
561 num |= pnum & (UBIFS_LPT_FANOUT - 1);
562 pnum >>= UBIFS_LPT_FANOUT_SHIFT;
563 }
564 num <<= UBIFS_LPT_FANOUT_SHIFT;
565 num |= iip;
566 return num;
567}
568
569/**
570 * ubifs_create_dflt_lpt - create default LPT.
571 * @c: UBIFS file-system description object
572 * @main_lebs: number of main area LEBs is passed and returned here
573 * @lpt_first: LEB number of first LPT LEB
574 * @lpt_lebs: number of LEBs for LPT is passed and returned here
575 * @big_lpt: use big LPT model is passed and returned here
576 *
577 * This function returns %0 on success and a negative error code on failure.
578 */
579int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
580 int *lpt_lebs, int *big_lpt)
581{
582 int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
583 int blnum, boffs, bsz, bcnt;
584 struct ubifs_pnode *pnode = NULL;
585 struct ubifs_nnode *nnode = NULL;
586 void *buf = NULL, *p;
587 struct ubifs_lpt_lprops *ltab = NULL;
588 int *lsave = NULL;
589
590 err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
591 if (err)
592 return err;
593 *lpt_lebs = c->lpt_lebs;
594
595 /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */
596 c->lpt_first = lpt_first;
597 /* Needed by 'set_ltab()' */
598 c->lpt_last = lpt_first + c->lpt_lebs - 1;
599 /* Needed by 'ubifs_pack_lsave()' */
600 c->main_first = c->leb_cnt - *main_lebs;
601
602 lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL);
603 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
604 nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
605 buf = vmalloc(c->leb_size);
606 ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
607 if (!pnode || !nnode || !buf || !ltab || !lsave) {
608 err = -ENOMEM;
609 goto out;
610 }
611
612 ubifs_assert(!c->ltab);
613 c->ltab = ltab; /* Needed by set_ltab */
614
615 /* Initialize LPT's own lprops */
616 for (i = 0; i < c->lpt_lebs; i++) {
617 ltab[i].free = c->leb_size;
618 ltab[i].dirty = 0;
619 ltab[i].tgc = 0;
620 ltab[i].cmt = 0;
621 }
622
623 lnum = lpt_first;
624 p = buf;
625 /* Number of leaf nodes (pnodes) */
626 cnt = c->pnode_cnt;
627
628 /*
629 * The first pnode contains the LEB properties for the LEBs that contain
630 * the root inode node and the root index node of the index tree.
631 */
632 node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8);
633 iopos = ALIGN(node_sz, c->min_io_size);
634 pnode->lprops[0].free = c->leb_size - iopos;
635 pnode->lprops[0].dirty = iopos - node_sz;
636 pnode->lprops[0].flags = LPROPS_INDEX;
637
638 node_sz = UBIFS_INO_NODE_SZ;
639 iopos = ALIGN(node_sz, c->min_io_size);
640 pnode->lprops[1].free = c->leb_size - iopos;
641 pnode->lprops[1].dirty = iopos - node_sz;
642
643 for (i = 2; i < UBIFS_LPT_FANOUT; i++)
644 pnode->lprops[i].free = c->leb_size;
645
646 /* Add first pnode */
647 ubifs_pack_pnode(c, p, pnode);
648 p += c->pnode_sz;
649 len = c->pnode_sz;
650 pnode->num += 1;
651
652 /* Reset pnode values for remaining pnodes */
653 pnode->lprops[0].free = c->leb_size;
654 pnode->lprops[0].dirty = 0;
655 pnode->lprops[0].flags = 0;
656
657 pnode->lprops[1].free = c->leb_size;
658 pnode->lprops[1].dirty = 0;
659
660 /*
661 * To calculate the internal node branches, we keep information about
662 * the level below.
663 */
664 blnum = lnum; /* LEB number of level below */
665 boffs = 0; /* Offset of level below */
666 bcnt = cnt; /* Number of nodes in level below */
667 bsz = c->pnode_sz; /* Size of nodes in level below */
668
669 /* Add all remaining pnodes */
670 for (i = 1; i < cnt; i++) {
671 if (len + c->pnode_sz > c->leb_size) {
672 alen = ALIGN(len, c->min_io_size);
673 set_ltab(c, lnum, c->leb_size - alen, alen - len);
674 memset(p, 0xff, alen - len);
675 err = ubi_leb_change(c->ubi, lnum++, buf, alen,
676 UBI_SHORTTERM);
677 if (err)
678 goto out;
679 p = buf;
680 len = 0;
681 }
682 ubifs_pack_pnode(c, p, pnode);
683 p += c->pnode_sz;
684 len += c->pnode_sz;
685 /*
686 * pnodes are simply numbered left to right starting at zero,
687 * which means the pnode number can be used easily to traverse
688 * down the tree to the corresponding pnode.
689 */
690 pnode->num += 1;
691 }
692
693 row = 0;
694 for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT)
695 row += 1;
696 /* Add all nnodes, one level at a time */
697 while (1) {
698 /* Number of internal nodes (nnodes) at next level */
699 cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
700 for (i = 0; i < cnt; i++) {
701 if (len + c->nnode_sz > c->leb_size) {
702 alen = ALIGN(len, c->min_io_size);
703 set_ltab(c, lnum, c->leb_size - alen,
704 alen - len);
705 memset(p, 0xff, alen - len);
706 err = ubi_leb_change(c->ubi, lnum++, buf, alen,
707 UBI_SHORTTERM);
708 if (err)
709 goto out;
710 p = buf;
711 len = 0;
712 }
713 /* Only 1 nnode at this level, so it is the root */
714 if (cnt == 1) {
715 c->lpt_lnum = lnum;
716 c->lpt_offs = len;
717 }
718 /* Set branches to the level below */
719 for (j = 0; j < UBIFS_LPT_FANOUT; j++) {
720 if (bcnt) {
721 if (boffs + bsz > c->leb_size) {
722 blnum += 1;
723 boffs = 0;
724 }
725 nnode->nbranch[j].lnum = blnum;
726 nnode->nbranch[j].offs = boffs;
727 boffs += bsz;
728 bcnt--;
729 } else {
730 nnode->nbranch[j].lnum = 0;
731 nnode->nbranch[j].offs = 0;
732 }
733 }
734 nnode->num = calc_nnode_num(row, i);
735 ubifs_pack_nnode(c, p, nnode);
736 p += c->nnode_sz;
737 len += c->nnode_sz;
738 }
739 /* Only 1 nnode at this level, so it is the root */
740 if (cnt == 1)
741 break;
742 /* Update the information about the level below */
743 bcnt = cnt;
744 bsz = c->nnode_sz;
745 row -= 1;
746 }
747
748 if (*big_lpt) {
749 /* Need to add LPT's save table */
750 if (len + c->lsave_sz > c->leb_size) {
751 alen = ALIGN(len, c->min_io_size);
752 set_ltab(c, lnum, c->leb_size - alen, alen - len);
753 memset(p, 0xff, alen - len);
754 err = ubi_leb_change(c->ubi, lnum++, buf, alen,
755 UBI_SHORTTERM);
756 if (err)
757 goto out;
758 p = buf;
759 len = 0;
760 }
761
762 c->lsave_lnum = lnum;
763 c->lsave_offs = len;
764
765 for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++)
766 lsave[i] = c->main_first + i;
767 for (; i < c->lsave_cnt; i++)
768 lsave[i] = c->main_first;
769
770 ubifs_pack_lsave(c, p, lsave);
771 p += c->lsave_sz;
772 len += c->lsave_sz;
773 }
774
775 /* Need to add LPT's own LEB properties table */
776 if (len + c->ltab_sz > c->leb_size) {
777 alen = ALIGN(len, c->min_io_size);
778 set_ltab(c, lnum, c->leb_size - alen, alen - len);
779 memset(p, 0xff, alen - len);
780 err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM);
781 if (err)
782 goto out;
783 p = buf;
784 len = 0;
785 }
786
787 c->ltab_lnum = lnum;
788 c->ltab_offs = len;
789
790 /* Update ltab before packing it */
791 len += c->ltab_sz;
792 alen = ALIGN(len, c->min_io_size);
793 set_ltab(c, lnum, c->leb_size - alen, alen - len);
794
795 ubifs_pack_ltab(c, p, ltab);
796 p += c->ltab_sz;
797
798 /* Write remaining buffer */
799 memset(p, 0xff, alen - len);
800 err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM);
801 if (err)
802 goto out;
803
804 c->nhead_lnum = lnum;
805 c->nhead_offs = ALIGN(len, c->min_io_size);
806
807 dbg_lp("space_bits %d", c->space_bits);
808 dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
809 dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
810 dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
811 dbg_lp("pcnt_bits %d", c->pcnt_bits);
812 dbg_lp("lnum_bits %d", c->lnum_bits);
813 dbg_lp("pnode_sz %d", c->pnode_sz);
814 dbg_lp("nnode_sz %d", c->nnode_sz);
815 dbg_lp("ltab_sz %d", c->ltab_sz);
816 dbg_lp("lsave_sz %d", c->lsave_sz);
817 dbg_lp("lsave_cnt %d", c->lsave_cnt);
818 dbg_lp("lpt_hght %d", c->lpt_hght);
819 dbg_lp("big_lpt %d", c->big_lpt);
820 dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
821 dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
822 dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
823 if (c->big_lpt)
824 dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
825out:
826 c->ltab = NULL;
827 kfree(lsave);
828 vfree(ltab);
829 vfree(buf);
830 kfree(nnode);
831 kfree(pnode);
832 return err;
833}
834
835/**
836 * update_cats - add LEB properties of a pnode to LEB category lists and heaps.
837 * @c: UBIFS file-system description object
838 * @pnode: pnode
839 *
840 * When a pnode is loaded into memory, the LEB properties it contains are added,
841 * by this function, to the LEB category lists and heaps.
842 */
843static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode)
844{
845 int i;
846
847 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
848 int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK;
849 int lnum = pnode->lprops[i].lnum;
850
851 if (!lnum)
852 return;
853 ubifs_add_to_cat(c, &pnode->lprops[i], cat);
854 }
855}
856
857/**
858 * replace_cats - add LEB properties of a pnode to LEB category lists and heaps.
859 * @c: UBIFS file-system description object
860 * @old_pnode: pnode copied
861 * @new_pnode: pnode copy
862 *
863 * During commit it is sometimes necessary to copy a pnode
864 * (see dirty_cow_pnode). When that happens, references in
865 * category lists and heaps must be replaced. This function does that.
866 */
867static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
868 struct ubifs_pnode *new_pnode)
869{
870 int i;
871
872 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
873 if (!new_pnode->lprops[i].lnum)
874 return;
875 ubifs_replace_cat(c, &old_pnode->lprops[i],
876 &new_pnode->lprops[i]);
877 }
878}
879
880/**
881 * check_lpt_crc - check LPT node crc is correct.
882 * @c: UBIFS file-system description object
883 * @buf: buffer containing node
884 * @len: length of node
885 *
886 * This function returns %0 on success and a negative error code on failure.
887 */
888static int check_lpt_crc(void *buf, int len)
889{
890 int pos = 0;
891 uint8_t *addr = buf;
892 uint16_t crc, calc_crc;
893
894 crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
895 calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
896 len - UBIFS_LPT_CRC_BYTES);
897 if (crc != calc_crc) {
898 ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc,
899 calc_crc);
900 dbg_dump_stack();
901 return -EINVAL;
902 }
903 return 0;
904}
905
906/**
907 * check_lpt_type - check LPT node type is correct.
908 * @c: UBIFS file-system description object
909 * @addr: address of type bit field is passed and returned updated here
910 * @pos: position of type bit field is passed and returned updated here
911 * @type: expected type
912 *
913 * This function returns %0 on success and a negative error code on failure.
914 */
915static int check_lpt_type(uint8_t **addr, int *pos, int type)
916{
917 int node_type;
918
919 node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
920 if (node_type != type) {
921 ubifs_err("invalid type (%d) in LPT node type %d", node_type,
922 type);
923 dbg_dump_stack();
924 return -EINVAL;
925 }
926 return 0;
927}
928
929/**
930 * unpack_pnode - unpack a pnode.
931 * @c: UBIFS file-system description object
932 * @buf: buffer containing packed pnode to unpack
933 * @pnode: pnode structure to fill
934 *
935 * This function returns %0 on success and a negative error code on failure.
936 */
937static int unpack_pnode(struct ubifs_info *c, void *buf,
938 struct ubifs_pnode *pnode)
939{
940 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
941 int i, pos = 0, err;
942
943 err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE);
944 if (err)
945 return err;
946 if (c->big_lpt)
947 pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
948 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
949 struct ubifs_lprops * const lprops = &pnode->lprops[i];
950
951 lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits);
952 lprops->free <<= 3;
953 lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits);
954 lprops->dirty <<= 3;
955
956 if (ubifs_unpack_bits(&addr, &pos, 1))
957 lprops->flags = LPROPS_INDEX;
958 else
959 lprops->flags = 0;
960 lprops->flags |= ubifs_categorize_lprops(c, lprops);
961 }
962 err = check_lpt_crc(buf, c->pnode_sz);
963 return err;
964}
965
966/**
967 * unpack_nnode - unpack a nnode.
968 * @c: UBIFS file-system description object
969 * @buf: buffer containing packed nnode to unpack
970 * @nnode: nnode structure to fill
971 *
972 * This function returns %0 on success and a negative error code on failure.
973 */
974static int unpack_nnode(struct ubifs_info *c, void *buf,
975 struct ubifs_nnode *nnode)
976{
977 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
978 int i, pos = 0, err;
979
980 err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE);
981 if (err)
982 return err;
983 if (c->big_lpt)
984 nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
985 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
986 int lnum;
987
988 lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) +
989 c->lpt_first;
990 if (lnum == c->lpt_last + 1)
991 lnum = 0;
992 nnode->nbranch[i].lnum = lnum;
993 nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos,
994 c->lpt_offs_bits);
995 }
996 err = check_lpt_crc(buf, c->nnode_sz);
997 return err;
998}
999
1000/**
1001 * unpack_ltab - unpack the LPT's own lprops table.
1002 * @c: UBIFS file-system description object
1003 * @buf: buffer from which to unpack
1004 *
1005 * This function returns %0 on success and a negative error code on failure.
1006 */
1007static int unpack_ltab(struct ubifs_info *c, void *buf)
1008{
1009 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1010 int i, pos = 0, err;
1011
1012 err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB);
1013 if (err)
1014 return err;
1015 for (i = 0; i < c->lpt_lebs; i++) {
1016 int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
1017 int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
1018
1019 if (free < 0 || free > c->leb_size || dirty < 0 ||
1020 dirty > c->leb_size || free + dirty > c->leb_size)
1021 return -EINVAL;
1022
1023 c->ltab[i].free = free;
1024 c->ltab[i].dirty = dirty;
1025 c->ltab[i].tgc = 0;
1026 c->ltab[i].cmt = 0;
1027 }
1028 err = check_lpt_crc(buf, c->ltab_sz);
1029 return err;
1030}
1031
1032/**
1033 * unpack_lsave - unpack the LPT's save table.
1034 * @c: UBIFS file-system description object
1035 * @buf: buffer from which to unpack
1036 *
1037 * This function returns %0 on success and a negative error code on failure.
1038 */
1039static int unpack_lsave(struct ubifs_info *c, void *buf)
1040{
1041 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1042 int i, pos = 0, err;
1043
1044 err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE);
1045 if (err)
1046 return err;
1047 for (i = 0; i < c->lsave_cnt; i++) {
1048 int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits);
1049
1050 if (lnum < c->main_first || lnum >= c->leb_cnt)
1051 return -EINVAL;
1052 c->lsave[i] = lnum;
1053 }
1054 err = check_lpt_crc(buf, c->lsave_sz);
1055 return err;
1056}
1057
1058/**
1059 * validate_nnode - validate a nnode.
1060 * @c: UBIFS file-system description object
1061 * @nnode: nnode to validate
1062 * @parent: parent nnode (or NULL for the root nnode)
1063 * @iip: index in parent
1064 *
1065 * This function returns %0 on success and a negative error code on failure.
1066 */
1067static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
1068 struct ubifs_nnode *parent, int iip)
1069{
1070 int i, lvl, max_offs;
1071
1072 if (c->big_lpt) {
1073 int num = calc_nnode_num_from_parent(c, parent, iip);
1074
1075 if (nnode->num != num)
1076 return -EINVAL;
1077 }
1078 lvl = parent ? parent->level - 1 : c->lpt_hght;
1079 if (lvl < 1)
1080 return -EINVAL;
1081 if (lvl == 1)
1082 max_offs = c->leb_size - c->pnode_sz;
1083 else
1084 max_offs = c->leb_size - c->nnode_sz;
1085 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1086 int lnum = nnode->nbranch[i].lnum;
1087 int offs = nnode->nbranch[i].offs;
1088
1089 if (lnum == 0) {
1090 if (offs != 0)
1091 return -EINVAL;
1092 continue;
1093 }
1094 if (lnum < c->lpt_first || lnum > c->lpt_last)
1095 return -EINVAL;
1096 if (offs < 0 || offs > max_offs)
1097 return -EINVAL;
1098 }
1099 return 0;
1100}
1101
1102/**
1103 * validate_pnode - validate a pnode.
1104 * @c: UBIFS file-system description object
1105 * @pnode: pnode to validate
1106 * @parent: parent nnode
1107 * @iip: index in parent
1108 *
1109 * This function returns %0 on success and a negative error code on failure.
1110 */
1111static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
1112 struct ubifs_nnode *parent, int iip)
1113{
1114 int i;
1115
1116 if (c->big_lpt) {
1117 int num = calc_pnode_num_from_parent(c, parent, iip);
1118
1119 if (pnode->num != num)
1120 return -EINVAL;
1121 }
1122 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1123 int free = pnode->lprops[i].free;
1124 int dirty = pnode->lprops[i].dirty;
1125
1126 if (free < 0 || free > c->leb_size || free % c->min_io_size ||
1127 (free & 7))
1128 return -EINVAL;
1129 if (dirty < 0 || dirty > c->leb_size || (dirty & 7))
1130 return -EINVAL;
1131 if (dirty + free > c->leb_size)
1132 return -EINVAL;
1133 }
1134 return 0;
1135}
1136
1137/**
1138 * set_pnode_lnum - set LEB numbers on a pnode.
1139 * @c: UBIFS file-system description object
1140 * @pnode: pnode to update
1141 *
1142 * This function calculates the LEB numbers for the LEB properties it contains
1143 * based on the pnode number.
1144 */
1145static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
1146{
1147 int i, lnum;
1148
1149 lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first;
1150 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1151 if (lnum >= c->leb_cnt)
1152 return;
1153 pnode->lprops[i].lnum = lnum++;
1154 }
1155}
1156
1157/**
1158 * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory.
1159 * @c: UBIFS file-system description object
1160 * @parent: parent nnode (or NULL for the root)
1161 * @iip: index in parent
1162 *
1163 * This function returns %0 on success and a negative error code on failure.
1164 */
1165int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1166{
1167 struct ubifs_nbranch *branch = NULL;
1168 struct ubifs_nnode *nnode = NULL;
1169 void *buf = c->lpt_nod_buf;
1170 int err, lnum, offs;
1171
1172 if (parent) {
1173 branch = &parent->nbranch[iip];
1174 lnum = branch->lnum;
1175 offs = branch->offs;
1176 } else {
1177 lnum = c->lpt_lnum;
1178 offs = c->lpt_offs;
1179 }
1180 nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
1181 if (!nnode) {
1182 err = -ENOMEM;
1183 goto out;
1184 }
1185 if (lnum == 0) {
1186 /*
1187 * This nnode was not written which just means that the LEB
1188 * properties in the subtree below it describe empty LEBs. We
1189 * make the nnode as though we had read it, which in fact means
1190 * doing almost nothing.
1191 */
1192 if (c->big_lpt)
1193 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1194 } else {
1195 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
1196 if (err)
1197 goto out;
1198 err = unpack_nnode(c, buf, nnode);
1199 if (err)
1200 goto out;
1201 }
1202 err = validate_nnode(c, nnode, parent, iip);
1203 if (err)
1204 goto out;
1205 if (!c->big_lpt)
1206 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1207 if (parent) {
1208 branch->nnode = nnode;
1209 nnode->level = parent->level - 1;
1210 } else {
1211 c->nroot = nnode;
1212 nnode->level = c->lpt_hght;
1213 }
1214 nnode->parent = parent;
1215 nnode->iip = iip;
1216 return 0;
1217
1218out:
1219 ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
1220 kfree(nnode);
1221 return err;
1222}
1223
1224/**
1225 * read_pnode - read a pnode from flash and link it to the tree in memory.
1226 * @c: UBIFS file-system description object
1227 * @parent: parent nnode
1228 * @iip: index in parent
1229 *
1230 * This function returns %0 on success and a negative error code on failure.
1231 */
1232static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1233{
1234 struct ubifs_nbranch *branch;
1235 struct ubifs_pnode *pnode = NULL;
1236 void *buf = c->lpt_nod_buf;
1237 int err, lnum, offs;
1238
1239 branch = &parent->nbranch[iip];
1240 lnum = branch->lnum;
1241 offs = branch->offs;
1242 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
1243 if (!pnode) {
1244 err = -ENOMEM;
1245 goto out;
1246 }
1247 if (lnum == 0) {
1248 /*
1249 * This pnode was not written which just means that the LEB
1250 * properties in it describe empty LEBs. We make the pnode as
1251 * though we had read it.
1252 */
1253 int i;
1254
1255 if (c->big_lpt)
1256 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1257 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1258 struct ubifs_lprops * const lprops = &pnode->lprops[i];
1259
1260 lprops->free = c->leb_size;
1261 lprops->flags = ubifs_categorize_lprops(c, lprops);
1262 }
1263 } else {
1264 err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
1265 if (err)
1266 goto out;
1267 err = unpack_pnode(c, buf, pnode);
1268 if (err)
1269 goto out;
1270 }
1271 err = validate_pnode(c, pnode, parent, iip);
1272 if (err)
1273 goto out;
1274 if (!c->big_lpt)
1275 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1276 branch->pnode = pnode;
1277 pnode->parent = parent;
1278 pnode->iip = iip;
1279 set_pnode_lnum(c, pnode);
1280 c->pnodes_have += 1;
1281 return 0;
1282
1283out:
1284 ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
1285 dbg_dump_pnode(c, pnode, parent, iip);
1286 dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
1287 kfree(pnode);
1288 return err;
1289}
1290
1291/**
1292 * read_ltab - read LPT's own lprops table.
1293 * @c: UBIFS file-system description object
1294 *
1295 * This function returns %0 on success and a negative error code on failure.
1296 */
1297static int read_ltab(struct ubifs_info *c)
1298{
1299 int err;
1300 void *buf;
1301
1302 buf = vmalloc(c->ltab_sz);
1303 if (!buf)
1304 return -ENOMEM;
1305 err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz);
1306 if (err)
1307 goto out;
1308 err = unpack_ltab(c, buf);
1309out:
1310 vfree(buf);
1311 return err;
1312}
1313
1314/**
1315 * read_lsave - read LPT's save table.
1316 * @c: UBIFS file-system description object
1317 *
1318 * This function returns %0 on success and a negative error code on failure.
1319 */
1320static int read_lsave(struct ubifs_info *c)
1321{
1322 int err, i;
1323 void *buf;
1324
1325 buf = vmalloc(c->lsave_sz);
1326 if (!buf)
1327 return -ENOMEM;
1328 err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
1329 if (err)
1330 goto out;
1331 err = unpack_lsave(c, buf);
1332 if (err)
1333 goto out;
1334 for (i = 0; i < c->lsave_cnt; i++) {
1335 int lnum = c->lsave[i];
1336
1337 /*
1338 * Due to automatic resizing, the values in the lsave table
1339 * could be beyond the volume size - just ignore them.
1340 */
1341 if (lnum >= c->leb_cnt)
1342 continue;
1343 ubifs_lpt_lookup(c, lnum);
1344 }
1345out:
1346 vfree(buf);
1347 return err;
1348}
1349
1350/**
1351 * ubifs_get_nnode - get a nnode.
1352 * @c: UBIFS file-system description object
1353 * @parent: parent nnode (or NULL for the root)
1354 * @iip: index in parent
1355 *
1356 * This function returns a pointer to the nnode on success or a negative error
1357 * code on failure.
1358 */
1359struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
1360 struct ubifs_nnode *parent, int iip)
1361{
1362 struct ubifs_nbranch *branch;
1363 struct ubifs_nnode *nnode;
1364 int err;
1365
1366 branch = &parent->nbranch[iip];
1367 nnode = branch->nnode;
1368 if (nnode)
1369 return nnode;
1370 err = ubifs_read_nnode(c, parent, iip);
1371 if (err)
1372 return ERR_PTR(err);
1373 return branch->nnode;
1374}
1375
1376/**
1377 * ubifs_get_pnode - get a pnode.
1378 * @c: UBIFS file-system description object
1379 * @parent: parent nnode
1380 * @iip: index in parent
1381 *
1382 * This function returns a pointer to the pnode on success or a negative error
1383 * code on failure.
1384 */
1385struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
1386 struct ubifs_nnode *parent, int iip)
1387{
1388 struct ubifs_nbranch *branch;
1389 struct ubifs_pnode *pnode;
1390 int err;
1391
1392 branch = &parent->nbranch[iip];
1393 pnode = branch->pnode;
1394 if (pnode)
1395 return pnode;
1396 err = read_pnode(c, parent, iip);
1397 if (err)
1398 return ERR_PTR(err);
1399 update_cats(c, branch->pnode);
1400 return branch->pnode;
1401}
1402
1403/**
1404 * ubifs_lpt_lookup - lookup LEB properties in the LPT.
1405 * @c: UBIFS file-system description object
1406 * @lnum: LEB number to lookup
1407 *
1408 * This function returns a pointer to the LEB properties on success or a
1409 * negative error code on failure.
1410 */
1411struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
1412{
1413 int err, i, h, iip, shft;
1414 struct ubifs_nnode *nnode;
1415 struct ubifs_pnode *pnode;
1416
1417 if (!c->nroot) {
1418 err = ubifs_read_nnode(c, NULL, 0);
1419 if (err)
1420 return ERR_PTR(err);
1421 }
1422 nnode = c->nroot;
1423 i = lnum - c->main_first;
1424 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
1425 for (h = 1; h < c->lpt_hght; h++) {
1426 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1427 shft -= UBIFS_LPT_FANOUT_SHIFT;
1428 nnode = ubifs_get_nnode(c, nnode, iip);
1429 if (IS_ERR(nnode))
1430 return ERR_PTR(PTR_ERR(nnode));
1431 }
1432 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1433 shft -= UBIFS_LPT_FANOUT_SHIFT;
1434 pnode = ubifs_get_pnode(c, nnode, iip);
1435 if (IS_ERR(pnode))
1436 return ERR_PTR(PTR_ERR(pnode));
1437 iip = (i & (UBIFS_LPT_FANOUT - 1));
1438 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
1439 pnode->lprops[iip].free, pnode->lprops[iip].dirty,
1440 pnode->lprops[iip].flags);
1441 return &pnode->lprops[iip];
1442}
1443
1444/**
1445 * dirty_cow_nnode - ensure a nnode is not being committed.
1446 * @c: UBIFS file-system description object
1447 * @nnode: nnode to check
1448 *
1449 * Returns dirtied nnode on success or negative error code on failure.
1450 */
1451static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
1452 struct ubifs_nnode *nnode)
1453{
1454 struct ubifs_nnode *n;
1455 int i;
1456
1457 if (!test_bit(COW_CNODE, &nnode->flags)) {
1458 /* nnode is not being committed */
1459 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
1460 c->dirty_nn_cnt += 1;
1461 ubifs_add_nnode_dirt(c, nnode);
1462 }
1463 return nnode;
1464 }
1465
1466 /* nnode is being committed, so copy it */
1467 n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
1468 if (unlikely(!n))
1469 return ERR_PTR(-ENOMEM);
1470
1471 memcpy(n, nnode, sizeof(struct ubifs_nnode));
1472 n->cnext = NULL;
1473 __set_bit(DIRTY_CNODE, &n->flags);
1474 __clear_bit(COW_CNODE, &n->flags);
1475
1476 /* The children now have new parent */
1477 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1478 struct ubifs_nbranch *branch = &n->nbranch[i];
1479
1480 if (branch->cnode)
1481 branch->cnode->parent = n;
1482 }
1483
1484 ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
1485 __set_bit(OBSOLETE_CNODE, &nnode->flags);
1486
1487 c->dirty_nn_cnt += 1;
1488 ubifs_add_nnode_dirt(c, nnode);
1489 if (nnode->parent)
1490 nnode->parent->nbranch[n->iip].nnode = n;
1491 else
1492 c->nroot = n;
1493 return n;
1494}
1495
1496/**
1497 * dirty_cow_pnode - ensure a pnode is not being committed.
1498 * @c: UBIFS file-system description object
1499 * @pnode: pnode to check
1500 *
1501 * Returns dirtied pnode on success or negative error code on failure.
1502 */
1503static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
1504 struct ubifs_pnode *pnode)
1505{
1506 struct ubifs_pnode *p;
1507
1508 if (!test_bit(COW_CNODE, &pnode->flags)) {
1509 /* pnode is not being committed */
1510 if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
1511 c->dirty_pn_cnt += 1;
1512 add_pnode_dirt(c, pnode);
1513 }
1514 return pnode;
1515 }
1516
1517 /* pnode is being committed, so copy it */
1518 p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
1519 if (unlikely(!p))
1520 return ERR_PTR(-ENOMEM);
1521
1522 memcpy(p, pnode, sizeof(struct ubifs_pnode));
1523 p->cnext = NULL;
1524 __set_bit(DIRTY_CNODE, &p->flags);
1525 __clear_bit(COW_CNODE, &p->flags);
1526 replace_cats(c, pnode, p);
1527
1528 ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
1529 __set_bit(OBSOLETE_CNODE, &pnode->flags);
1530
1531 c->dirty_pn_cnt += 1;
1532 add_pnode_dirt(c, pnode);
1533 pnode->parent->nbranch[p->iip].pnode = p;
1534 return p;
1535}
1536
1537/**
1538 * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT.
1539 * @c: UBIFS file-system description object
1540 * @lnum: LEB number to lookup
1541 *
1542 * This function returns a pointer to the LEB properties on success or a
1543 * negative error code on failure.
1544 */
1545struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
1546{
1547 int err, i, h, iip, shft;
1548 struct ubifs_nnode *nnode;
1549 struct ubifs_pnode *pnode;
1550
1551 if (!c->nroot) {
1552 err = ubifs_read_nnode(c, NULL, 0);
1553 if (err)
1554 return ERR_PTR(err);
1555 }
1556 nnode = c->nroot;
1557 nnode = dirty_cow_nnode(c, nnode);
1558 if (IS_ERR(nnode))
1559 return ERR_PTR(PTR_ERR(nnode));
1560 i = lnum - c->main_first;
1561 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
1562 for (h = 1; h < c->lpt_hght; h++) {
1563 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1564 shft -= UBIFS_LPT_FANOUT_SHIFT;
1565 nnode = ubifs_get_nnode(c, nnode, iip);
1566 if (IS_ERR(nnode))
1567 return ERR_PTR(PTR_ERR(nnode));
1568 nnode = dirty_cow_nnode(c, nnode);
1569 if (IS_ERR(nnode))
1570 return ERR_PTR(PTR_ERR(nnode));
1571 }
1572 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1573 shft -= UBIFS_LPT_FANOUT_SHIFT;
1574 pnode = ubifs_get_pnode(c, nnode, iip);
1575 if (IS_ERR(pnode))
1576 return ERR_PTR(PTR_ERR(pnode));
1577 pnode = dirty_cow_pnode(c, pnode);
1578 if (IS_ERR(pnode))
1579 return ERR_PTR(PTR_ERR(pnode));
1580 iip = (i & (UBIFS_LPT_FANOUT - 1));
1581 dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
1582 pnode->lprops[iip].free, pnode->lprops[iip].dirty,
1583 pnode->lprops[iip].flags);
1584 ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags));
1585 return &pnode->lprops[iip];
1586}
1587
1588/**
1589 * lpt_init_rd - initialize the LPT for reading.
1590 * @c: UBIFS file-system description object
1591 *
1592 * This function returns %0 on success and a negative error code on failure.
1593 */
1594static int lpt_init_rd(struct ubifs_info *c)
1595{
1596 int err, i;
1597
1598 c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
1599 if (!c->ltab)
1600 return -ENOMEM;
1601
1602 i = max_t(int, c->nnode_sz, c->pnode_sz);
1603 c->lpt_nod_buf = kmalloc(i, GFP_KERNEL);
1604 if (!c->lpt_nod_buf)
1605 return -ENOMEM;
1606
1607 for (i = 0; i < LPROPS_HEAP_CNT; i++) {
1608 c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ,
1609 GFP_KERNEL);
1610 if (!c->lpt_heap[i].arr)
1611 return -ENOMEM;
1612 c->lpt_heap[i].cnt = 0;
1613 c->lpt_heap[i].max_cnt = LPT_HEAP_SZ;
1614 }
1615
1616 c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
1617 if (!c->dirty_idx.arr)
1618 return -ENOMEM;
1619 c->dirty_idx.cnt = 0;
1620 c->dirty_idx.max_cnt = LPT_HEAP_SZ;
1621
1622 err = read_ltab(c);
1623 if (err)
1624 return err;
1625
1626 dbg_lp("space_bits %d", c->space_bits);
1627 dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
1628 dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
1629 dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
1630 dbg_lp("pcnt_bits %d", c->pcnt_bits);
1631 dbg_lp("lnum_bits %d", c->lnum_bits);
1632 dbg_lp("pnode_sz %d", c->pnode_sz);
1633 dbg_lp("nnode_sz %d", c->nnode_sz);
1634 dbg_lp("ltab_sz %d", c->ltab_sz);
1635 dbg_lp("lsave_sz %d", c->lsave_sz);
1636 dbg_lp("lsave_cnt %d", c->lsave_cnt);
1637 dbg_lp("lpt_hght %d", c->lpt_hght);
1638 dbg_lp("big_lpt %d", c->big_lpt);
1639 dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
1640 dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
1641 dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
1642 if (c->big_lpt)
1643 dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
1644
1645 return 0;
1646}
1647
1648/**
1649 * lpt_init_wr - initialize the LPT for writing.
1650 * @c: UBIFS file-system description object
1651 *
1652 * 'lpt_init_rd()' must have been called already.
1653 *
1654 * This function returns %0 on success and a negative error code on failure.
1655 */
1656static int lpt_init_wr(struct ubifs_info *c)
1657{
1658 int err, i;
1659
1660 c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
1661 if (!c->ltab_cmt)
1662 return -ENOMEM;
1663
1664 c->lpt_buf = vmalloc(c->leb_size);
1665 if (!c->lpt_buf)
1666 return -ENOMEM;
1667
1668 if (c->big_lpt) {
1669 c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
1670 if (!c->lsave)
1671 return -ENOMEM;
1672 err = read_lsave(c);
1673 if (err)
1674 return err;
1675 }
1676
1677 for (i = 0; i < c->lpt_lebs; i++)
1678 if (c->ltab[i].free == c->leb_size) {
1679 err = ubifs_leb_unmap(c, i + c->lpt_first);
1680 if (err)
1681 return err;
1682 }
1683
1684 return 0;
1685}
1686
/**
 * ubifs_lpt_init - initialize the LPT.
 * @c: UBIFS file-system description object
 * @rd: whether to initialize lpt for reading
 * @wr: whether to initialize lpt for writing
 *
 * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
 * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
 * true.
 *
 * The read part is always set up before the write part, and the write part
 * is not attempted if the read part failed.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
{
	int err = 0;

	if (rd)
		err = lpt_init_rd(c);
	if (!err && wr)
		err = lpt_init_wr(c);
	return err;
}
1717
1718/**
1719 * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
1720 * @nnode: where to keep a nnode
1721 * @pnode: where to keep a pnode
1722 * @cnode: where to keep a cnode
1723 * @in_tree: is the node in the tree in memory
1724 * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
1725 * the tree
1726 * @ptr.pnode: ditto for pnode
1727 * @ptr.cnode: ditto for cnode
1728 */
1729struct lpt_scan_node {
1730 union {
1731 struct ubifs_nnode nnode;
1732 struct ubifs_pnode pnode;
1733 struct ubifs_cnode cnode;
1734 };
1735 int in_tree;
1736 union {
1737 struct ubifs_nnode *nnode;
1738 struct ubifs_pnode *pnode;
1739 struct ubifs_cnode *cnode;
1740 } ptr;
1741};
1742
1743/**
1744 * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
1745 * @c: the UBIFS file-system description object
1746 * @path: where to put the nnode
1747 * @parent: parent of the nnode
1748 * @iip: index in parent of the nnode
1749 *
1750 * This function returns a pointer to the nnode on success or a negative error
1751 * code on failure.
1752 */
1753static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
1754 struct lpt_scan_node *path,
1755 struct ubifs_nnode *parent, int iip)
1756{
1757 struct ubifs_nbranch *branch;
1758 struct ubifs_nnode *nnode;
1759 void *buf = c->lpt_nod_buf;
1760 int err;
1761
1762 branch = &parent->nbranch[iip];
1763 nnode = branch->nnode;
1764 if (nnode) {
1765 path->in_tree = 1;
1766 path->ptr.nnode = nnode;
1767 return nnode;
1768 }
1769 nnode = &path->nnode;
1770 path->in_tree = 0;
1771 path->ptr.nnode = nnode;
1772 memset(nnode, 0, sizeof(struct ubifs_nnode));
1773 if (branch->lnum == 0) {
1774 /*
1775 * This nnode was not written which just means that the LEB
1776 * properties in the subtree below it describe empty LEBs. We
1777 * make the nnode as though we had read it, which in fact means
1778 * doing almost nothing.
1779 */
1780 if (c->big_lpt)
1781 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1782 } else {
1783 err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
1784 c->nnode_sz);
1785 if (err)
1786 return ERR_PTR(err);
1787 err = unpack_nnode(c, buf, nnode);
1788 if (err)
1789 return ERR_PTR(err);
1790 }
1791 err = validate_nnode(c, nnode, parent, iip);
1792 if (err)
1793 return ERR_PTR(err);
1794 if (!c->big_lpt)
1795 nnode->num = calc_nnode_num_from_parent(c, parent, iip);
1796 nnode->level = parent->level - 1;
1797 nnode->parent = parent;
1798 nnode->iip = iip;
1799 return nnode;
1800}
1801
1802/**
1803 * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
1804 * @c: the UBIFS file-system description object
1805 * @path: where to put the pnode
1806 * @parent: parent of the pnode
1807 * @iip: index in parent of the pnode
1808 *
1809 * This function returns a pointer to the pnode on success or a negative error
1810 * code on failure.
1811 */
1812static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
1813 struct lpt_scan_node *path,
1814 struct ubifs_nnode *parent, int iip)
1815{
1816 struct ubifs_nbranch *branch;
1817 struct ubifs_pnode *pnode;
1818 void *buf = c->lpt_nod_buf;
1819 int err;
1820
1821 branch = &parent->nbranch[iip];
1822 pnode = branch->pnode;
1823 if (pnode) {
1824 path->in_tree = 1;
1825 path->ptr.pnode = pnode;
1826 return pnode;
1827 }
1828 pnode = &path->pnode;
1829 path->in_tree = 0;
1830 path->ptr.pnode = pnode;
1831 memset(pnode, 0, sizeof(struct ubifs_pnode));
1832 if (branch->lnum == 0) {
1833 /*
1834 * This pnode was not written which just means that the LEB
1835 * properties in it describe empty LEBs. We make the pnode as
1836 * though we had read it.
1837 */
1838 int i;
1839
1840 if (c->big_lpt)
1841 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1842 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1843 struct ubifs_lprops * const lprops = &pnode->lprops[i];
1844
1845 lprops->free = c->leb_size;
1846 lprops->flags = ubifs_categorize_lprops(c, lprops);
1847 }
1848 } else {
1849 ubifs_assert(branch->lnum >= c->lpt_first &&
1850 branch->lnum <= c->lpt_last);
1851 ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
1852 err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
1853 c->pnode_sz);
1854 if (err)
1855 return ERR_PTR(err);
1856 err = unpack_pnode(c, buf, pnode);
1857 if (err)
1858 return ERR_PTR(err);
1859 }
1860 err = validate_pnode(c, pnode, parent, iip);
1861 if (err)
1862 return ERR_PTR(err);
1863 if (!c->big_lpt)
1864 pnode->num = calc_pnode_num_from_parent(c, parent, iip);
1865 pnode->parent = parent;
1866 pnode->iip = iip;
1867 set_pnode_lnum(c, pnode);
1868 return pnode;
1869}
1870
1871/**
1872 * ubifs_lpt_scan_nolock - scan the LPT.
1873 * @c: the UBIFS file-system description object
1874 * @start_lnum: LEB number from which to start scanning
 *              (%-1 means: start at the LEB following @end_lnum, or at
 *              @c->main_first if that would be past the last LEB)
1875 * @end_lnum: LEB number at which to stop scanning
1876 * @scan_cb: callback function called for each lprops
1877 * @data: data to be passed to the callback function
1878 *
 * The scan visits LEB properties in increasing LEB number order, wrapping
 * from the last LEB back to @c->main_first. Nodes which are not in the
 * in-memory tree are read into a temporary path array instead of being added
 * to the tree.
 *
 * @scan_cb is called as scan_cb(c, lprops, in_tree, data), where in_tree
 * tells whether the pnode holding @lprops is in the in-memory tree. Its
 * return value is interpreted as follows:
 *   o a negative value aborts the scan and is returned as the error code;
 *   o if %LPT_SCAN_ADD is set, every node on the current path which is not
 *     yet in the in-memory tree is copied into it;
 *   o if %LPT_SCAN_STOP is set, the scan ends and %0 is returned (this is
 *     checked after %LPT_SCAN_ADD has been handled).
 *
 * If the lprops for @end_lnum has been passed to @scan_cb without the scan
 * being stopped, %-ENOSPC is returned.
 *
1879 * This function returns %0 on success and a negative error code on failure.
1880 */
1881int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
1882 ubifs_lpt_scan_callback scan_cb, void *data)
1883{
1884 int err = 0, i, h, iip, shft;
1885 struct ubifs_nnode *nnode;
1886 struct ubifs_pnode *pnode;
1887 struct lpt_scan_node *path;
1888
1889 if (start_lnum == -1) {
1890 start_lnum = end_lnum + 1;
1891 if (start_lnum >= c->leb_cnt)
1892 start_lnum = c->main_first;
1893 }
1894
1895 ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
1896 ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);
1897
1898 if (!c->nroot) {
1899 err = ubifs_read_nnode(c, NULL, 0);
1900 if (err)
1901 return err;
1902 }
1903
 /*
  * One entry per tree level: path[0] is the root, path[1] to
  * path[lpt_hght - 1] are the nnodes below it and path[lpt_hght] is the
  * pnode.
  */
1904 path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
1905 GFP_NOFS);
1906 if (!path)
1907 return -ENOMEM;
1908
1909 path[0].ptr.nnode = c->nroot;
1910 path[0].in_tree = 1;
1911again:
1912 /* Descend to the pnode containing start_lnum */
1913 nnode = c->nroot;
1914 i = start_lnum - c->main_first;
1915 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
1916 for (h = 1; h < c->lpt_hght; h++) {
1917 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1918 shft -= UBIFS_LPT_FANOUT_SHIFT;
1919 nnode = scan_get_nnode(c, path + h, nnode, iip);
1920 if (IS_ERR(nnode)) {
1921 err = PTR_ERR(nnode);
1922 goto out;
1923 }
1924 }
 /* Here h == c->lpt_hght, i.e. path + h is the slot for the pnode */
1925 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1926 shft -= UBIFS_LPT_FANOUT_SHIFT;
1927 pnode = scan_get_pnode(c, path + h, nnode, iip);
1928 if (IS_ERR(pnode)) {
1929 err = PTR_ERR(pnode);
1930 goto out;
1931 }
 /* From now on iip is the index of the current lprops inside the pnode */
1932 iip = (i & (UBIFS_LPT_FANOUT - 1));
1933
1934 /* Loop for each lprops */
1935 while (1) {
1936 struct ubifs_lprops *lprops = &pnode->lprops[iip];
1937 int ret, lnum = lprops->lnum;
1938
1939 ret = scan_cb(c, lprops, path[h].in_tree, data);
1940 if (ret < 0) {
1941 err = ret;
1942 goto out;
1943 }
1944 if (ret & LPT_SCAN_ADD) {
1945 /* Add all the nodes in path to the tree in memory */
1946 for (h = 1; h < c->lpt_hght; h++) {
1947 const size_t sz = sizeof(struct ubifs_nnode);
1948 struct ubifs_nnode *parent;
1949
1950 if (path[h].in_tree)
1951 continue;
1952 nnode = kmalloc(sz, GFP_NOFS);
1953 if (!nnode) {
1954 err = -ENOMEM;
1955 goto out;
1956 }
1957 memcpy(nnode, &path[h].nnode, sz);
1958 parent = nnode->parent;
1959 parent->nbranch[nnode->iip].nnode = nnode;
1960 path[h].ptr.nnode = nnode;
1961 path[h].in_tree = 1;
 /*
  * The node one level down still points at the temporary copy in
  * path[h] as its parent; redirect it to the allocated nnode.
  */
1962 path[h + 1].cnode.parent = nnode;
1963 }
 /* The loop above leaves h == c->lpt_hght, the pnode's slot */
1964 if (path[h].in_tree)
1965 ubifs_ensure_cat(c, lprops);
1966 else {
1967 const size_t sz = sizeof(struct ubifs_pnode);
1968 struct ubifs_nnode *parent;
1969
1970 pnode = kmalloc(sz, GFP_NOFS);
1971 if (!pnode) {
1972 err = -ENOMEM;
1973 goto out;
1974 }
1975 memcpy(pnode, &path[h].pnode, sz);
1976 parent = pnode->parent;
1977 parent->nbranch[pnode->iip].pnode = pnode;
1978 path[h].ptr.pnode = pnode;
1979 path[h].in_tree = 1;
1980 update_cats(c, pnode);
1981 c->pnodes_have += 1;
1982 }
1983 err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
1984 c->nroot, 0, 0);
1985 if (err)
1986 goto out;
1987 err = dbg_check_cats(c);
1988 if (err)
1989 goto out;
1990 }
1991 if (ret & LPT_SCAN_STOP) {
1992 err = 0;
1993 break;
1994 }
1995 /* Get the next lprops */
1996 if (lnum == end_lnum) {
1997 /*
1998 * We got to the end without finding what we were
1999 * looking for
2000 */
2001 err = -ENOSPC;
2002 goto out;
2003 }
2004 if (lnum + 1 >= c->leb_cnt) {
2005 /* Wrap-around to the beginning */
2006 start_lnum = c->main_first;
2007 goto again;
2008 }
2009 if (iip + 1 < UBIFS_LPT_FANOUT) {
2010 /* Next lprops is in the same pnode */
2011 iip += 1;
2012 continue;
2013 }
2014 /* We need to get the next pnode. Go up until we can go right */
2015 iip = pnode->iip;
2016 while (1) {
2017 h -= 1;
2018 ubifs_assert(h >= 0);
2019 nnode = path[h].ptr.nnode;
2020 if (iip + 1 < UBIFS_LPT_FANOUT)
2021 break;
2022 iip = nnode->iip;
2023 }
2024 /* Go right */
2025 iip += 1;
2026 /* Descend to the pnode */
2027 h += 1;
2028 for (; h < c->lpt_hght; h++) {
2029 nnode = scan_get_nnode(c, path + h, nnode, iip);
2030 if (IS_ERR(nnode)) {
2031 err = PTR_ERR(nnode);
2032 goto out;
2033 }
 /* Below the point where we went right, always take the leftmost child */
2034 iip = 0;
2035 }
2036 pnode = scan_get_pnode(c, path + h, nnode, iip);
2037 if (IS_ERR(pnode)) {
2038 err = PTR_ERR(pnode);
2039 goto out;
2040 }
2041 iip = 0;
2042 }
2043out:
 /* Only the path array is freed; nodes added to the tree stay there */
2044 kfree(path);
2045 return err;
2046}
2047
2048#ifdef CONFIG_UBIFS_FS_DEBUG
2049
2050/**
2051 * dbg_chk_pnode - check a pnode.
2052 * @c: the UBIFS file-system description object
2053 * @pnode: pnode to check
2054 * @col: pnode column
2055 *
 * The pnode number must equal @col. Then, for every LEB properties entry of
 * the pnode whose LEB number is below @c->leb_cnt, the following is checked:
 *   o the stored LEB number matches the one derived from the pnode number;
 *   o a taken LEB is uncategorized (nothing more is checked for it);
 *   o the category is one that is allowed for an index / non-index LEB;
 *   o the entry is present in the heap or list of its category;
 *   o free and dirty space are consistent with the category.
 * The first violation is reported with 'dbg_err()'.
 *
2056 * This function returns %0 on success and a negative error code on failure.
2057 */
2058static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
2059 int col)
2060{
2061 int i;
2062
2063 if (pnode->num != col) {
2064 dbg_err("pnode num %d expected %d parent num %d iip %d",
2065 pnode->num, col, pnode->parent->num, pnode->iip);
2066 return -EINVAL;
2067 }
2068 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
2069 struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
2070 int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
2071 c->main_first;
2072 int found, cat = lprops->flags & LPROPS_CAT_MASK;
2073 struct ubifs_lpt_heap *heap;
2074 struct list_head *list = NULL;
2075
 /* Entries past the last LEB are not checked */
2076 if (lnum >= c->leb_cnt)
2077 continue;
2078 if (lprops->lnum != lnum) {
2079 dbg_err("bad LEB number %d expected %d",
2080 lprops->lnum, lnum);
2081 return -EINVAL;
2082 }
2083 if (lprops->flags & LPROPS_TAKEN) {
2084 if (cat != LPROPS_UNCAT) {
2085 dbg_err("LEB %d taken but not uncat %d",
2086 lprops->lnum, cat);
2087 return -EINVAL;
2088 }
2089 continue;
2090 }
2091 if (lprops->flags & LPROPS_INDEX) {
2092 switch (cat) {
2093 case LPROPS_UNCAT:
2094 case LPROPS_DIRTY_IDX:
2095 case LPROPS_FRDI_IDX:
2096 break;
2097 default:
2098 dbg_err("LEB %d index but cat %d",
2099 lprops->lnum, cat);
2100 return -EINVAL;
2101 }
2102 } else {
2103 switch (cat) {
2104 case LPROPS_UNCAT:
2105 case LPROPS_DIRTY:
2106 case LPROPS_FREE:
2107 case LPROPS_EMPTY:
2108 case LPROPS_FREEABLE:
2109 break;
2110 default:
2111 dbg_err("LEB %d not index but cat %d",
2112 lprops->lnum, cat);
2113 return -EINVAL;
2114 }
2115 }
 /* Pick the list for list-based categories; it stays NULL for the others */
2116 switch (cat) {
2117 case LPROPS_UNCAT:
2118 list = &c->uncat_list;
2119 break;
2120 case LPROPS_EMPTY:
2121 list = &c->empty_list;
2122 break;
2123 case LPROPS_FREEABLE:
2124 list = &c->freeable_list;
2125 break;
2126 case LPROPS_FRDI_IDX:
2127 list = &c->frdi_idx_list;
2128 break;
2129 }
2130 found = 0;
2131 switch (cat) {
2132 case LPROPS_DIRTY:
2133 case LPROPS_DIRTY_IDX:
2134 case LPROPS_FREE:
 /* NOTE(review): 'cat - 1' assumes the heap categories are numbered from 1 - confirm */
2135 heap = &c->lpt_heap[cat - 1];
2136 if (lprops->hpos < heap->cnt &&
2137 heap->arr[lprops->hpos] == lprops)
2138 found = 1;
2139 break;
2140 case LPROPS_UNCAT:
2141 case LPROPS_EMPTY:
2142 case LPROPS_FREEABLE:
2143 case LPROPS_FRDI_IDX:
2144 list_for_each_entry(lp, list, list)
2145 if (lprops == lp) {
2146 found = 1;
2147 break;
2148 }
2149 break;
2150 }
2151 if (!found) {
2152 dbg_err("LEB %d cat %d not found in cat heap/list",
2153 lprops->lnum, cat);
2154 return -EINVAL;
2155 }
2156 switch (cat) {
2157 case LPROPS_EMPTY:
2158 if (lprops->free != c->leb_size) {
2159 dbg_err("LEB %d cat %d free %d dirty %d",
2160 lprops->lnum, cat, lprops->free,
2161 lprops->dirty);
2162 return -EINVAL;
2163 }
 /*
  * NOTE(review): there is no break here, so an empty LEB also goes
  * through the free + dirty check below - presumably intentional,
  * confirm.
  */
2164 case LPROPS_FREEABLE:
2165 case LPROPS_FRDI_IDX:
2166 if (lprops->free + lprops->dirty != c->leb_size) {
2167 dbg_err("LEB %d cat %d free %d dirty %d",
2168 lprops->lnum, cat, lprops->free,
2169 lprops->dirty);
2170 return -EINVAL;
2171 }
2172 }
2173 }
2174 return 0;
2175}
2176
2177/**
2178 * dbg_check_lpt_nodes - check nnodes and pnodes.
2179 * @c: the UBIFS file-system description object
2180 * @cnode: next cnode (nnode or pnode) to check
2181 * @row: row of cnode (root is zero)
2182 * @col: column of cnode (leftmost is zero)
2183 *
 * Nothing is checked unless %UBIFS_CHK_LPROPS is set in 'ubifs_chk_flags'.
 *
 * The nodes that are present in memory below @cnode are walked iteratively,
 * following the parent pointers to get back up; branches without an
 * in-memory child are skipped. The number of every nnode is compared with
 * 'calc_nnode_num(row, col)' and every pnode is passed to 'dbg_chk_pnode()'.
 *
2184 * This function returns %0 on success and a negative error code on failure.
2185 */
2186int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
2187 int row, int col)
2188{
2189 struct ubifs_nnode *nnode, *nn;
2190 struct ubifs_cnode *cn;
2191 int num, iip = 0, err;
2192
2193 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
2194 return 0;
2195
 /* The walk ends when going up from a node that has no parent */
2196 while (cnode) {
2197 ubifs_assert(row >= 0);
 /* Remember the parent now; it is where we return to when going up */
2198 nnode = cnode->parent;
2199 if (cnode->level) {
2200 /* cnode is a nnode */
2201 num = calc_nnode_num(row, col);
2202 if (cnode->num != num) {
2203 dbg_err("nnode num %d expected %d "
2204 "parent num %d iip %d", cnode->num, num,
2205 (nnode ? nnode->num : 0), cnode->iip);
2206 return -EINVAL;
2207 }
2208 nn = (struct ubifs_nnode *)cnode;
 /* Resume at branch iip and look for the next child that is in memory */
2209 while (iip < UBIFS_LPT_FANOUT) {
2210 cn = nn->nbranch[iip].cnode;
2211 if (cn) {
2212 /* Go down */
2213 row += 1;
2214 col <<= UBIFS_LPT_FANOUT_SHIFT;
2215 col += iip;
2216 iip = 0;
2217 cnode = cn;
2218 break;
2219 }
2220 /* Go right */
2221 iip += 1;
2222 }
 /* A child was found above: process it in the next iteration */
2223 if (iip < UBIFS_LPT_FANOUT)
2224 continue;
2225 } else {
2226 struct ubifs_pnode *pnode;
2227
2228 /* cnode is a pnode */
2229 pnode = (struct ubifs_pnode *)cnode;
2230 err = dbg_chk_pnode(c, pnode, col);
2231 if (err)
2232 return err;
2233 }
2234 /* Go up and to the right */
2235 row -= 1;
2236 col >>= UBIFS_LPT_FANOUT_SHIFT;
2237 iip = cnode->iip + 1;
2238 cnode = (struct ubifs_cnode *)nnode;
2239 }
2240 return 0;
2241}
2242
2243#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
new file mode 100644
index 000000000000..5f0b83e20af6
--- /dev/null
+++ b/fs/ubifs/lpt_commit.c
@@ -0,0 +1,1648 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements commit-related functionality of the LEB properties
25 * subsystem.
26 */
27
28#include <linux/crc16.h>
29#include "ubifs.h"
30
31/**
32 * first_dirty_cnode - find first dirty cnode.
33 * @c: UBIFS file-system description object
34 * @nnode: nnode at which to start
35 *
36 * This function returns the first dirty cnode or %NULL if there is not one.
37 */
38static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode)
39{
40 ubifs_assert(nnode);
41 while (1) {
42 int i, cont = 0;
43
44 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
45 struct ubifs_cnode *cnode;
46
47 cnode = nnode->nbranch[i].cnode;
48 if (cnode &&
49 test_bit(DIRTY_CNODE, &cnode->flags)) {
50 if (cnode->level == 0)
51 return cnode;
52 nnode = (struct ubifs_nnode *)cnode;
53 cont = 1;
54 break;
55 }
56 }
57 if (!cont)
58 return (struct ubifs_cnode *)nnode;
59 }
60}
61
62/**
63 * next_dirty_cnode - find next dirty cnode.
64 * @cnode: cnode from which to begin searching
65 *
66 * This function returns the next dirty cnode or %NULL if there is not one.
67 */
68static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode)
69{
70 struct ubifs_nnode *nnode;
71 int i;
72
73 ubifs_assert(cnode);
74 nnode = cnode->parent;
75 if (!nnode)
76 return NULL;
77 for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) {
78 cnode = nnode->nbranch[i].cnode;
79 if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) {
80 if (cnode->level == 0)
81 return cnode; /* cnode is a pnode */
82 /* cnode is a nnode */
83 return first_dirty_cnode((struct ubifs_nnode *)cnode);
84 }
85 }
86 return (struct ubifs_cnode *)nnode;
87}
88
89/**
90 * get_cnodes_to_commit - create list of dirty cnodes to commit.
91 * @c: UBIFS file-system description object
92 *
93 * This function returns the number of cnodes to commit.
94 */
95static int get_cnodes_to_commit(struct ubifs_info *c)
96{
97 struct ubifs_cnode *cnode, *cnext;
98 int cnt = 0;
99
100 if (!c->nroot)
101 return 0;
102
103 if (!test_bit(DIRTY_CNODE, &c->nroot->flags))
104 return 0;
105
106 c->lpt_cnext = first_dirty_cnode(c->nroot);
107 cnode = c->lpt_cnext;
108 if (!cnode)
109 return 0;
110 cnt += 1;
111 while (1) {
112 ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags));
113 __set_bit(COW_ZNODE, &cnode->flags);
114 cnext = next_dirty_cnode(cnode);
115 if (!cnext) {
116 cnode->cnext = c->lpt_cnext;
117 break;
118 }
119 cnode->cnext = cnext;
120 cnode = cnext;
121 cnt += 1;
122 }
123 dbg_cmt("committing %d cnodes", cnt);
124 dbg_lp("committing %d cnodes", cnt);
125 ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt);
126 return cnt;
127}
128
129/**
130 * upd_ltab - update LPT LEB properties.
131 * @c: UBIFS file-system description object
132 * @lnum: LEB number
133 * @free: amount of free space
134 * @dirty: amount of dirty space to add
135 */
136static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
137{
138 dbg_lp("LEB %d free %d dirty %d to %d +%d",
139 lnum, c->ltab[lnum - c->lpt_first].free,
140 c->ltab[lnum - c->lpt_first].dirty, free, dirty);
141 ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
142 c->ltab[lnum - c->lpt_first].free = free;
143 c->ltab[lnum - c->lpt_first].dirty += dirty;
144}
145
146/**
147 * alloc_lpt_leb - allocate an LPT LEB that is empty.
148 * @c: UBIFS file-system description object
149 * @lnum: LEB number is passed and returned here
150 *
151 * This function finds the next empty LEB in the ltab starting from @lnum. If a
152 * an empty LEB is found it is returned in @lnum and the function returns %0.
153 * Otherwise the function returns -ENOSPC. Note however, that LPT is designed
154 * never to run out of space.
155 */
156static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
157{
158 int i, n;
159
160 n = *lnum - c->lpt_first + 1;
161 for (i = n; i < c->lpt_lebs; i++) {
162 if (c->ltab[i].tgc || c->ltab[i].cmt)
163 continue;
164 if (c->ltab[i].free == c->leb_size) {
165 c->ltab[i].cmt = 1;
166 *lnum = i + c->lpt_first;
167 return 0;
168 }
169 }
170
171 for (i = 0; i < n; i++) {
172 if (c->ltab[i].tgc || c->ltab[i].cmt)
173 continue;
174 if (c->ltab[i].free == c->leb_size) {
175 c->ltab[i].cmt = 1;
176 *lnum = i + c->lpt_first;
177 return 0;
178 }
179 }
180 dbg_err("last LEB %d", *lnum);
181 dump_stack();
182 return -ENOSPC;
183}
184
185/**
186 * layout_cnodes - layout cnodes for commit.
187 * @c: UBIFS file-system description object
188 *
 * Starting at the LPT head (@c->nhead_lnum:@c->nhead_offs), this function
 * assigns a position to every cnode on the @c->lpt_cnext list, to the LPT's
 * own lprops table (ltab) and, for the big LPT model, to the save table
 * (lsave). Nothing is written here: positions are recorded in the parent's
 * branch (or in @c->lpt_lnum and @c->lpt_offs for a cnode without a parent),
 * in @c->ltab_lnum / @c->ltab_offs and in @c->lsave_lnum / @c->lsave_offs.
 *
 * Whenever a LEB is left, its free and dirty space are updated with
 * 'upd_ltab()' and the next LEB is obtained from 'alloc_lpt_leb()'. The dirty
 * nnode and pnode counters are decremented for every cnode laid out.
 *
189 * This function returns %0 on success and a negative error code on failure.
190 */
191static int layout_cnodes(struct ubifs_info *c)
192{
193 int lnum, offs, len, alen, done_lsave, done_ltab, err;
194 struct ubifs_cnode *cnode;
195
196 cnode = c->lpt_cnext;
197 if (!cnode)
198 return 0;
199 lnum = c->nhead_lnum;
200 offs = c->nhead_offs;
201 /* Try to place lsave and ltab nicely */
 /* There is no lsave to place unless this is the big LPT model */
202 done_lsave = !c->big_lpt;
203 done_ltab = 0;
204 if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
205 done_lsave = 1;
206 c->lsave_lnum = lnum;
207 c->lsave_offs = offs;
208 offs += c->lsave_sz;
209 }
210
211 if (offs + c->ltab_sz <= c->leb_size) {
212 done_ltab = 1;
213 c->ltab_lnum = lnum;
214 c->ltab_offs = offs;
215 offs += c->ltab_sz;
216 }
217
218 do {
219 if (cnode->level) {
220 len = c->nnode_sz;
221 c->dirty_nn_cnt -= 1;
222 } else {
223 len = c->pnode_sz;
224 c->dirty_pn_cnt -= 1;
225 }
 /* The cnode does not fit into what is left of this LEB */
226 while (offs + len > c->leb_size) {
 /*
  * Account for the LEB being left: free space starts at the next
  * min. I/O unit boundary and the padding up to it is added to dirty.
  */
227 alen = ALIGN(offs, c->min_io_size);
228 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
229 err = alloc_lpt_leb(c, &lnum);
230 if (err)
231 return err;
232 offs = 0;
233 ubifs_assert(lnum >= c->lpt_first &&
234 lnum <= c->lpt_last);
235 /* Try to place lsave and ltab nicely */
 /*
  * After placing a table, 'continue' re-checks whether the cnode still
  * fits behind it in the new LEB.
  */
236 if (!done_lsave) {
237 done_lsave = 1;
238 c->lsave_lnum = lnum;
239 c->lsave_offs = offs;
240 offs += c->lsave_sz;
241 continue;
242 }
243 if (!done_ltab) {
244 done_ltab = 1;
245 c->ltab_lnum = lnum;
246 c->ltab_offs = offs;
247 offs += c->ltab_sz;
248 continue;
249 }
250 break;
251 }
 /* Record where the cnode goes; a cnode without a parent is the LPT root */
252 if (cnode->parent) {
253 cnode->parent->nbranch[cnode->iip].lnum = lnum;
254 cnode->parent->nbranch[cnode->iip].offs = offs;
255 } else {
256 c->lpt_lnum = lnum;
257 c->lpt_offs = offs;
258 }
259 offs += len;
260 cnode = cnode->cnext;
 /* Stop when the list ends or comes back round to its first cnode */
261 } while (cnode && cnode != c->lpt_cnext);
262
263 /* Make sure to place LPT's save table */
264 if (!done_lsave) {
265 if (offs + c->lsave_sz > c->leb_size) {
266 alen = ALIGN(offs, c->min_io_size);
267 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
268 err = alloc_lpt_leb(c, &lnum);
269 if (err)
270 return err;
271 offs = 0;
272 ubifs_assert(lnum >= c->lpt_first &&
273 lnum <= c->lpt_last);
274 }
275 done_lsave = 1;
276 c->lsave_lnum = lnum;
277 c->lsave_offs = offs;
278 offs += c->lsave_sz;
279 }
280
281 /* Make sure to place LPT's own lprops table */
282 if (!done_ltab) {
283 if (offs + c->ltab_sz > c->leb_size) {
284 alen = ALIGN(offs, c->min_io_size);
285 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
286 err = alloc_lpt_leb(c, &lnum);
287 if (err)
288 return err;
289 offs = 0;
290 ubifs_assert(lnum >= c->lpt_first &&
291 lnum <= c->lpt_last);
292 }
293 done_ltab = 1;
294 c->ltab_lnum = lnum;
295 c->ltab_offs = offs;
296 offs += c->ltab_sz;
297 }
298
 /* Finally account for the LEB in which the layout ended */
299 alen = ALIGN(offs, c->min_io_size);
300 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
301 return 0;
302}
303
304/**
305 * realloc_lpt_leb - allocate an LPT LEB that is empty.
306 * @c: UBIFS file-system description object
307 * @lnum: LEB number is passed and returned here
308 *
309 * This function duplicates exactly the results of the function alloc_lpt_leb.
310 * It is used during end commit to reallocate the same LEB numbers that were
311 * allocated by alloc_lpt_leb during start commit.
312 *
313 * This function finds the next LEB that was allocated by the alloc_lpt_leb
314 * function starting from @lnum. If a LEB is found it is returned in @lnum and
315 * the function returns %0. Otherwise the function returns -ENOSPC.
316 * Note however, that LPT is designed never to run out of space.
317 */
318static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
319{
320 int i, n;
321
322 n = *lnum - c->lpt_first + 1;
323 for (i = n; i < c->lpt_lebs; i++)
324 if (c->ltab[i].cmt) {
325 c->ltab[i].cmt = 0;
326 *lnum = i + c->lpt_first;
327 return 0;
328 }
329
330 for (i = 0; i < n; i++)
331 if (c->ltab[i].cmt) {
332 c->ltab[i].cmt = 0;
333 *lnum = i + c->lpt_first;
334 return 0;
335 }
336 dbg_err("last LEB %d", *lnum);
337 dump_stack();
338 return -ENOSPC;
339}
340
341/**
342 * write_cnodes - write cnodes for commit.
343 * @c: UBIFS file-system description object
344 *
345 * This function returns %0 on success and a negative error code on failure.
346 */
347static int write_cnodes(struct ubifs_info *c)
348{
349 int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
350 struct ubifs_cnode *cnode;
351 void *buf = c->lpt_buf;
352
353 cnode = c->lpt_cnext;
354 if (!cnode)
355 return 0;
356 lnum = c->nhead_lnum;
357 offs = c->nhead_offs;
358 from = offs;
359 /* Ensure empty LEB is unmapped */
360 if (offs == 0) {
361 err = ubifs_leb_unmap(c, lnum);
362 if (err)
363 return err;
364 }
365 /* Try to place lsave and ltab nicely */
366 done_lsave = !c->big_lpt;
367 done_ltab = 0;
368 if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
369 done_lsave = 1;
370 ubifs_pack_lsave(c, buf + offs, c->lsave);
371 offs += c->lsave_sz;
372 }
373
374 if (offs + c->ltab_sz <= c->leb_size) {
375 done_ltab = 1;
376 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
377 offs += c->ltab_sz;
378 }
379
380 /* Loop for each cnode */
381 do {
382 if (cnode->level)
383 len = c->nnode_sz;
384 else
385 len = c->pnode_sz;
386 while (offs + len > c->leb_size) {
387 wlen = offs - from;
388 if (wlen) {
389 alen = ALIGN(wlen, c->min_io_size);
390 memset(buf + offs, 0xff, alen - wlen);
391 err = ubifs_leb_write(c, lnum, buf + from, from,
392 alen, UBI_SHORTTERM);
393 if (err)
394 return err;
395 }
396 err = realloc_lpt_leb(c, &lnum);
397 if (err)
398 return err;
399 offs = 0;
400 from = 0;
401 ubifs_assert(lnum >= c->lpt_first &&
402 lnum <= c->lpt_last);
403 err = ubifs_leb_unmap(c, lnum);
404 if (err)
405 return err;
406 /* Try to place lsave and ltab nicely */
407 if (!done_lsave) {
408 done_lsave = 1;
409 ubifs_pack_lsave(c, buf + offs, c->lsave);
410 offs += c->lsave_sz;
411 continue;
412 }
413 if (!done_ltab) {
414 done_ltab = 1;
415 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
416 offs += c->ltab_sz;
417 continue;
418 }
419 break;
420 }
421 if (cnode->level)
422 ubifs_pack_nnode(c, buf + offs,
423 (struct ubifs_nnode *)cnode);
424 else
425 ubifs_pack_pnode(c, buf + offs,
426 (struct ubifs_pnode *)cnode);
427 /*
428 * The reason for the barriers is the same as in case of TNC.
429 * See comment in 'write_index()'. 'dirty_cow_nnode()' and
430 * 'dirty_cow_pnode()' are the functions for which this is
431 * important.
432 */
433 clear_bit(DIRTY_CNODE, &cnode->flags);
434 smp_mb__before_clear_bit();
435 clear_bit(COW_ZNODE, &cnode->flags);
436 smp_mb__after_clear_bit();
437 offs += len;
438 cnode = cnode->cnext;
439 } while (cnode && cnode != c->lpt_cnext);
440
441 /* Make sure to place LPT's save table */
442 if (!done_lsave) {
443 if (offs + c->lsave_sz > c->leb_size) {
444 wlen = offs - from;
445 alen = ALIGN(wlen, c->min_io_size);
446 memset(buf + offs, 0xff, alen - wlen);
447 err = ubifs_leb_write(c, lnum, buf + from, from, alen,
448 UBI_SHORTTERM);
449 if (err)
450 return err;
451 err = realloc_lpt_leb(c, &lnum);
452 if (err)
453 return err;
454 offs = 0;
455 ubifs_assert(lnum >= c->lpt_first &&
456 lnum <= c->lpt_last);
457 err = ubifs_leb_unmap(c, lnum);
458 if (err)
459 return err;
460 }
461 done_lsave = 1;
462 ubifs_pack_lsave(c, buf + offs, c->lsave);
463 offs += c->lsave_sz;
464 }
465
466 /* Make sure to place LPT's own lprops table */
467 if (!done_ltab) {
468 if (offs + c->ltab_sz > c->leb_size) {
469 wlen = offs - from;
470 alen = ALIGN(wlen, c->min_io_size);
471 memset(buf + offs, 0xff, alen - wlen);
472 err = ubifs_leb_write(c, lnum, buf + from, from, alen,
473 UBI_SHORTTERM);
474 if (err)
475 return err;
476 err = realloc_lpt_leb(c, &lnum);
477 if (err)
478 return err;
479 offs = 0;
480 ubifs_assert(lnum >= c->lpt_first &&
481 lnum <= c->lpt_last);
482 err = ubifs_leb_unmap(c, lnum);
483 if (err)
484 return err;
485 }
486 done_ltab = 1;
487 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
488 offs += c->ltab_sz;
489 }
490
491 /* Write remaining data in buffer */
492 wlen = offs - from;
493 alen = ALIGN(wlen, c->min_io_size);
494 memset(buf + offs, 0xff, alen - wlen);
495 err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
496 if (err)
497 return err;
498 c->nhead_lnum = lnum;
499 c->nhead_offs = ALIGN(offs, c->min_io_size);
500
501 dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
502 dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
503 dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
504 if (c->big_lpt)
505 dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
506 return 0;
507}
508
509/**
510 * next_pnode - find next pnode.
511 * @c: UBIFS file-system description object
512 * @pnode: pnode
513 *
514 * This function returns the next pnode or %NULL if there are no more pnodes.
515 */
516static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
517 struct ubifs_pnode *pnode)
518{
519 struct ubifs_nnode *nnode;
520 int iip;
521
522 /* Try to go right */
523 nnode = pnode->parent;
524 iip = pnode->iip + 1;
525 if (iip < UBIFS_LPT_FANOUT) {
526 /* We assume here that LEB zero is never an LPT LEB */
527 if (nnode->nbranch[iip].lnum)
528 return ubifs_get_pnode(c, nnode, iip);
529 else
530 return NULL;
531 }
532
533 /* Go up while can't go right */
534 do {
535 iip = nnode->iip + 1;
536 nnode = nnode->parent;
537 if (!nnode)
538 return NULL;
539 /* We assume here that LEB zero is never an LPT LEB */
540 } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
541
542 /* Go right */
543 nnode = ubifs_get_nnode(c, nnode, iip);
544 if (IS_ERR(nnode))
545 return (void *)nnode;
546
547 /* Go down to level 1 */
548 while (nnode->level > 1) {
549 nnode = ubifs_get_nnode(c, nnode, 0);
550 if (IS_ERR(nnode))
551 return (void *)nnode;
552 }
553
554 return ubifs_get_pnode(c, nnode, 0);
555}
556
557/**
558 * pnode_lookup - lookup a pnode in the LPT.
559 * @c: UBIFS file-system description object
560 * @i: pnode number (0 to main_lebs - 1)
561 *
562 * This function returns a pointer to the pnode on success or a negative
563 * error code on failure.
564 */
565static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
566{
567 int err, h, iip, shft;
568 struct ubifs_nnode *nnode;
569
570 if (!c->nroot) {
571 err = ubifs_read_nnode(c, NULL, 0);
572 if (err)
573 return ERR_PTR(err);
574 }
575 i <<= UBIFS_LPT_FANOUT_SHIFT;
576 nnode = c->nroot;
577 shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
578 for (h = 1; h < c->lpt_hght; h++) {
579 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
580 shft -= UBIFS_LPT_FANOUT_SHIFT;
581 nnode = ubifs_get_nnode(c, nnode, iip);
582 if (IS_ERR(nnode))
583 return ERR_PTR(PTR_ERR(nnode));
584 }
585 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
586 return ubifs_get_pnode(c, nnode, iip);
587}
588
589/**
590 * add_pnode_dirt - add dirty space to LPT LEB properties.
591 * @c: UBIFS file-system description object
592 * @pnode: pnode for which to add dirt
593 */
594static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
595{
596 ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
597 c->pnode_sz);
598}
599
600/**
601 * do_make_pnode_dirty - mark a pnode dirty.
602 * @c: UBIFS file-system description object
603 * @pnode: pnode to mark dirty
604 */
605static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode)
606{
607 /* Assumes cnext list is empty i.e. not called during commit */
608 if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
609 struct ubifs_nnode *nnode;
610
611 c->dirty_pn_cnt += 1;
612 add_pnode_dirt(c, pnode);
613 /* Mark parent and ancestors dirty too */
614 nnode = pnode->parent;
615 while (nnode) {
616 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
617 c->dirty_nn_cnt += 1;
618 ubifs_add_nnode_dirt(c, nnode);
619 nnode = nnode->parent;
620 } else
621 break;
622 }
623 }
624}
625
626/**
627 * make_tree_dirty - mark the entire LEB properties tree dirty.
628 * @c: UBIFS file-system description object
629 *
630 * This function is used by the "small" LPT model to cause the entire LEB
631 * properties tree to be written. The "small" LPT model does not use LPT
632 * garbage collection because it is more efficient to write the entire tree
633 * (because it is small).
634 *
635 * This function returns %0 on success and a negative error code on failure.
636 */
637static int make_tree_dirty(struct ubifs_info *c)
638{
639 struct ubifs_pnode *pnode;
640
641 pnode = pnode_lookup(c, 0);
642 while (pnode) {
643 do_make_pnode_dirty(c, pnode);
644 pnode = next_pnode(c, pnode);
645 if (IS_ERR(pnode))
646 return PTR_ERR(pnode);
647 }
648 return 0;
649}
650
651/**
652 * need_write_all - determine if the LPT area is running out of free space.
653 * @c: UBIFS file-system description object
654 *
655 * This function returns %1 if the LPT area is running out of free space and %0
656 * if it is not.
657 */
658static int need_write_all(struct ubifs_info *c)
659{
660 long long free = 0;
661 int i;
662
663 for (i = 0; i < c->lpt_lebs; i++) {
664 if (i + c->lpt_first == c->nhead_lnum)
665 free += c->leb_size - c->nhead_offs;
666 else if (c->ltab[i].free == c->leb_size)
667 free += c->leb_size;
668 else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
669 free += c->leb_size;
670 }
671 /* Less than twice the size left */
672 if (free <= c->lpt_sz * 2)
673 return 1;
674 return 0;
675}
676
677/**
678 * lpt_tgc_start - start trivial garbage collection of LPT LEBs.
679 * @c: UBIFS file-system description object
680 *
681 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
682 * free space and so may be reused as soon as the next commit is completed.
683 * This function is called during start commit to mark LPT LEBs for trivial GC.
684 */
685static void lpt_tgc_start(struct ubifs_info *c)
686{
687 int i;
688
689 for (i = 0; i < c->lpt_lebs; i++) {
690 if (i + c->lpt_first == c->nhead_lnum)
691 continue;
692 if (c->ltab[i].dirty > 0 &&
693 c->ltab[i].free + c->ltab[i].dirty == c->leb_size) {
694 c->ltab[i].tgc = 1;
695 c->ltab[i].free = c->leb_size;
696 c->ltab[i].dirty = 0;
697 dbg_lp("LEB %d", i + c->lpt_first);
698 }
699 }
700}
701
702/**
703 * lpt_tgc_end - end trivial garbage collection of LPT LEBs.
704 * @c: UBIFS file-system description object
705 *
706 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
707 * free space and so may be reused as soon as the next commit is completed.
708 * This function is called after the commit is completed (master node has been
709 * written) and unmaps LPT LEBs that were marked for trivial GC.
710 */
711static int lpt_tgc_end(struct ubifs_info *c)
712{
713 int i, err;
714
715 for (i = 0; i < c->lpt_lebs; i++)
716 if (c->ltab[i].tgc) {
717 err = ubifs_leb_unmap(c, i + c->lpt_first);
718 if (err)
719 return err;
720 c->ltab[i].tgc = 0;
721 dbg_lp("LEB %d", i + c->lpt_first);
722 }
723 return 0;
724}
725
726/**
727 * populate_lsave - fill the lsave array with important LEB numbers.
728 * @c: the UBIFS file-system description object
729 *
730 * This function is only called for the "big" model. It records a small number
731 * of LEB numbers of important LEBs. Important LEBs are ones that are (from
732 * most important to least important): empty, freeable, freeable index, dirty
733 * index, dirty or free. Upon mount, we read this list of LEB numbers and bring
734 * their pnodes into memory. That will stop us from having to scan the LPT
735 * straight away. For the "small" model we assume that scanning the LPT is no
736 * big deal.
737 */
738static void populate_lsave(struct ubifs_info *c)
739{
740 struct ubifs_lprops *lprops;
741 struct ubifs_lpt_heap *heap;
742 int i, cnt = 0;
743
744 ubifs_assert(c->big_lpt);
745 if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
746 c->lpt_drty_flgs |= LSAVE_DIRTY;
747 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
748 }
749 list_for_each_entry(lprops, &c->empty_list, list) {
750 c->lsave[cnt++] = lprops->lnum;
751 if (cnt >= c->lsave_cnt)
752 return;
753 }
754 list_for_each_entry(lprops, &c->freeable_list, list) {
755 c->lsave[cnt++] = lprops->lnum;
756 if (cnt >= c->lsave_cnt)
757 return;
758 }
759 list_for_each_entry(lprops, &c->frdi_idx_list, list) {
760 c->lsave[cnt++] = lprops->lnum;
761 if (cnt >= c->lsave_cnt)
762 return;
763 }
764 heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
765 for (i = 0; i < heap->cnt; i++) {
766 c->lsave[cnt++] = heap->arr[i]->lnum;
767 if (cnt >= c->lsave_cnt)
768 return;
769 }
770 heap = &c->lpt_heap[LPROPS_DIRTY - 1];
771 for (i = 0; i < heap->cnt; i++) {
772 c->lsave[cnt++] = heap->arr[i]->lnum;
773 if (cnt >= c->lsave_cnt)
774 return;
775 }
776 heap = &c->lpt_heap[LPROPS_FREE - 1];
777 for (i = 0; i < heap->cnt; i++) {
778 c->lsave[cnt++] = heap->arr[i]->lnum;
779 if (cnt >= c->lsave_cnt)
780 return;
781 }
782 /* Fill it up completely */
783 while (cnt < c->lsave_cnt)
784 c->lsave[cnt++] = c->main_first;
785}
786
787/**
788 * nnode_lookup - lookup a nnode in the LPT.
789 * @c: UBIFS file-system description object
790 * @i: nnode number
791 *
792 * This function returns a pointer to the nnode on success or a negative
793 * error code on failure.
794 */
795static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i)
796{
797 int err, iip;
798 struct ubifs_nnode *nnode;
799
800 if (!c->nroot) {
801 err = ubifs_read_nnode(c, NULL, 0);
802 if (err)
803 return ERR_PTR(err);
804 }
805 nnode = c->nroot;
806 while (1) {
807 iip = i & (UBIFS_LPT_FANOUT - 1);
808 i >>= UBIFS_LPT_FANOUT_SHIFT;
809 if (!i)
810 break;
811 nnode = ubifs_get_nnode(c, nnode, iip);
812 if (IS_ERR(nnode))
813 return nnode;
814 }
815 return nnode;
816}
817
818/**
819 * make_nnode_dirty - find a nnode and, if found, make it dirty.
820 * @c: UBIFS file-system description object
821 * @node_num: nnode number of nnode to make dirty
822 * @lnum: LEB number where nnode was written
823 * @offs: offset where nnode was written
824 *
825 * This function is used by LPT garbage collection. LPT garbage collection is
826 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
827 * simply involves marking all the nodes in the LEB being garbage-collected as
828 * dirty. The dirty nodes are written next commit, after which the LEB is free
829 * to be reused.
830 *
831 * This function returns %0 on success and a negative error code on failure.
832 */
833static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum,
834 int offs)
835{
836 struct ubifs_nnode *nnode;
837
838 nnode = nnode_lookup(c, node_num);
839 if (IS_ERR(nnode))
840 return PTR_ERR(nnode);
841 if (nnode->parent) {
842 struct ubifs_nbranch *branch;
843
844 branch = &nnode->parent->nbranch[nnode->iip];
845 if (branch->lnum != lnum || branch->offs != offs)
846 return 0; /* nnode is obsolete */
847 } else if (c->lpt_lnum != lnum || c->lpt_offs != offs)
848 return 0; /* nnode is obsolete */
849 /* Assumes cnext list is empty i.e. not called during commit */
850 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
851 c->dirty_nn_cnt += 1;
852 ubifs_add_nnode_dirt(c, nnode);
853 /* Mark parent and ancestors dirty too */
854 nnode = nnode->parent;
855 while (nnode) {
856 if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
857 c->dirty_nn_cnt += 1;
858 ubifs_add_nnode_dirt(c, nnode);
859 nnode = nnode->parent;
860 } else
861 break;
862 }
863 }
864 return 0;
865}
866
867/**
868 * make_pnode_dirty - find a pnode and, if found, make it dirty.
869 * @c: UBIFS file-system description object
870 * @node_num: pnode number of pnode to make dirty
871 * @lnum: LEB number where pnode was written
872 * @offs: offset where pnode was written
873 *
874 * This function is used by LPT garbage collection. LPT garbage collection is
875 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
876 * simply involves marking all the nodes in the LEB being garbage-collected as
877 * dirty. The dirty nodes are written next commit, after which the LEB is free
878 * to be reused.
879 *
880 * This function returns %0 on success and a negative error code on failure.
881 */
882static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum,
883 int offs)
884{
885 struct ubifs_pnode *pnode;
886 struct ubifs_nbranch *branch;
887
888 pnode = pnode_lookup(c, node_num);
889 if (IS_ERR(pnode))
890 return PTR_ERR(pnode);
891 branch = &pnode->parent->nbranch[pnode->iip];
892 if (branch->lnum != lnum || branch->offs != offs)
893 return 0;
894 do_make_pnode_dirty(c, pnode);
895 return 0;
896}
897
898/**
899 * make_ltab_dirty - make ltab node dirty.
900 * @c: UBIFS file-system description object
901 * @lnum: LEB number where ltab was written
902 * @offs: offset where ltab was written
903 *
904 * This function is used by LPT garbage collection. LPT garbage collection is
905 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
906 * simply involves marking all the nodes in the LEB being garbage-collected as
907 * dirty. The dirty nodes are written next commit, after which the LEB is free
908 * to be reused.
909 *
910 * This function returns %0 on success and a negative error code on failure.
911 */
912static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
913{
914 if (lnum != c->ltab_lnum || offs != c->ltab_offs)
915 return 0; /* This ltab node is obsolete */
916 if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
917 c->lpt_drty_flgs |= LTAB_DIRTY;
918 ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
919 }
920 return 0;
921}
922
923/**
924 * make_lsave_dirty - make lsave node dirty.
925 * @c: UBIFS file-system description object
926 * @lnum: LEB number where lsave was written
927 * @offs: offset where lsave was written
928 *
929 * This function is used by LPT garbage collection. LPT garbage collection is
930 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
931 * simply involves marking all the nodes in the LEB being garbage-collected as
932 * dirty. The dirty nodes are written next commit, after which the LEB is free
933 * to be reused.
934 *
935 * This function returns %0 on success and a negative error code on failure.
936 */
937static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
938{
939 if (lnum != c->lsave_lnum || offs != c->lsave_offs)
940 return 0; /* This lsave node is obsolete */
941 if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
942 c->lpt_drty_flgs |= LSAVE_DIRTY;
943 ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
944 }
945 return 0;
946}
947
948/**
949 * make_node_dirty - make node dirty.
950 * @c: UBIFS file-system description object
951 * @node_type: LPT node type
952 * @node_num: node number
953 * @lnum: LEB number where node was written
954 * @offs: offset where node was written
955 *
956 * This function is used by LPT garbage collection. LPT garbage collection is
957 * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
958 * simply involves marking all the nodes in the LEB being garbage-collected as
959 * dirty. The dirty nodes are written next commit, after which the LEB is free
960 * to be reused.
961 *
962 * This function returns %0 on success and a negative error code on failure.
963 */
964static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
965 int lnum, int offs)
966{
967 switch (node_type) {
968 case UBIFS_LPT_NNODE:
969 return make_nnode_dirty(c, node_num, lnum, offs);
970 case UBIFS_LPT_PNODE:
971 return make_pnode_dirty(c, node_num, lnum, offs);
972 case UBIFS_LPT_LTAB:
973 return make_ltab_dirty(c, lnum, offs);
974 case UBIFS_LPT_LSAVE:
975 return make_lsave_dirty(c, lnum, offs);
976 }
977 return -EINVAL;
978}
979
980/**
981 * get_lpt_node_len - return the length of a node based on its type.
982 * @c: UBIFS file-system description object
983 * @node_type: LPT node type
984 */
985static int get_lpt_node_len(struct ubifs_info *c, int node_type)
986{
987 switch (node_type) {
988 case UBIFS_LPT_NNODE:
989 return c->nnode_sz;
990 case UBIFS_LPT_PNODE:
991 return c->pnode_sz;
992 case UBIFS_LPT_LTAB:
993 return c->ltab_sz;
994 case UBIFS_LPT_LSAVE:
995 return c->lsave_sz;
996 }
997 return 0;
998}
999
1000/**
1001 * get_pad_len - return the length of padding in a buffer.
1002 * @c: UBIFS file-system description object
1003 * @buf: buffer
1004 * @len: length of buffer
1005 */
1006static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
1007{
1008 int offs, pad_len;
1009
1010 if (c->min_io_size == 1)
1011 return 0;
1012 offs = c->leb_size - len;
1013 pad_len = ALIGN(offs, c->min_io_size) - offs;
1014 return pad_len;
1015}
1016
1017/**
1018 * get_lpt_node_type - return type (and node number) of a node in a buffer.
1019 * @c: UBIFS file-system description object
1020 * @buf: buffer
1021 * @node_num: node number is returned here
1022 */
1023static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
1024{
1025 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1026 int pos = 0, node_type;
1027
1028 node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
1029 *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
1030 return node_type;
1031}
1032
1033/**
1034 * is_a_node - determine if a buffer contains a node.
1035 * @c: UBIFS file-system description object
1036 * @buf: buffer
1037 * @len: length of buffer
1038 *
1039 * This function returns %1 if the buffer contains a node or %0 if it does not.
1040 */
1041static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
1042{
1043 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1044 int pos = 0, node_type, node_len;
1045 uint16_t crc, calc_crc;
1046
1047 node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
1048 if (node_type == UBIFS_LPT_NOT_A_NODE)
1049 return 0;
1050 node_len = get_lpt_node_len(c, node_type);
1051 if (!node_len || node_len > len)
1052 return 0;
1053 pos = 0;
1054 addr = buf;
1055 crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
1056 calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
1057 node_len - UBIFS_LPT_CRC_BYTES);
1058 if (crc != calc_crc)
1059 return 0;
1060 return 1;
1061}
1062
1063
1064/**
1065 * lpt_gc_lnum - garbage collect a LPT LEB.
1066 * @c: UBIFS file-system description object
1067 * @lnum: LEB number to garbage collect
1068 *
1069 * LPT garbage collection is used only for the "big" LPT model
1070 * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes
1071 * in the LEB being garbage-collected as dirty. The dirty nodes are written
1072 * next commit, after which the LEB is free to be reused.
1073 *
1074 * This function returns %0 on success and a negative error code on failure.
1075 */
1076static int lpt_gc_lnum(struct ubifs_info *c, int lnum)
1077{
1078 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1079 void *buf = c->lpt_buf;
1080
1081 dbg_lp("LEB %d", lnum);
1082 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1083 if (err) {
1084 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1085 return err;
1086 }
1087 while (1) {
1088 if (!is_a_node(c, buf, len)) {
1089 int pad_len;
1090
1091 pad_len = get_pad_len(c, buf, len);
1092 if (pad_len) {
1093 buf += pad_len;
1094 len -= pad_len;
1095 continue;
1096 }
1097 return 0;
1098 }
1099 node_type = get_lpt_node_type(c, buf, &node_num);
1100 node_len = get_lpt_node_len(c, node_type);
1101 offs = c->leb_size - len;
1102 ubifs_assert(node_len != 0);
1103 mutex_lock(&c->lp_mutex);
1104 err = make_node_dirty(c, node_type, node_num, lnum, offs);
1105 mutex_unlock(&c->lp_mutex);
1106 if (err)
1107 return err;
1108 buf += node_len;
1109 len -= node_len;
1110 }
1111 return 0;
1112}
1113
1114/**
1115 * lpt_gc - LPT garbage collection.
1116 * @c: UBIFS file-system description object
1117 *
1118 * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'.
1119 * Returns %0 on success and a negative error code on failure.
1120 */
1121static int lpt_gc(struct ubifs_info *c)
1122{
1123 int i, lnum = -1, dirty = 0;
1124
1125 mutex_lock(&c->lp_mutex);
1126 for (i = 0; i < c->lpt_lebs; i++) {
1127 ubifs_assert(!c->ltab[i].tgc);
1128 if (i + c->lpt_first == c->nhead_lnum ||
1129 c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
1130 continue;
1131 if (c->ltab[i].dirty > dirty) {
1132 dirty = c->ltab[i].dirty;
1133 lnum = i + c->lpt_first;
1134 }
1135 }
1136 mutex_unlock(&c->lp_mutex);
1137 if (lnum == -1)
1138 return -ENOSPC;
1139 return lpt_gc_lnum(c, lnum);
1140}
1141
1142/**
1143 * ubifs_lpt_start_commit - UBIFS commit starts.
1144 * @c: the UBIFS file-system description object
1145 *
1146 * This function has to be called when UBIFS starts the commit operation.
1147 * This function "freezes" all currently dirty LEB properties and does not
1148 * change them anymore. Further changes are saved and tracked separately
1149 * because they are not part of this commit. This function returns zero in case
1150 * of success and a negative error code in case of failure.
1151 */
1152int ubifs_lpt_start_commit(struct ubifs_info *c)
1153{
1154 int err, cnt;
1155
1156 dbg_lp("");
1157
1158 mutex_lock(&c->lp_mutex);
1159 err = dbg_check_ltab(c);
1160 if (err)
1161 goto out;
1162
1163 if (c->check_lpt_free) {
1164 /*
1165 * We ensure there is enough free space in
1166 * ubifs_lpt_post_commit() by marking nodes dirty. That
1167 * information is lost when we unmount, so we also need
1168 * to check free space once after mounting also.
1169 */
1170 c->check_lpt_free = 0;
1171 while (need_write_all(c)) {
1172 mutex_unlock(&c->lp_mutex);
1173 err = lpt_gc(c);
1174 if (err)
1175 return err;
1176 mutex_lock(&c->lp_mutex);
1177 }
1178 }
1179
1180 lpt_tgc_start(c);
1181
1182 if (!c->dirty_pn_cnt) {
1183 dbg_cmt("no cnodes to commit");
1184 err = 0;
1185 goto out;
1186 }
1187
1188 if (!c->big_lpt && need_write_all(c)) {
1189 /* If needed, write everything */
1190 err = make_tree_dirty(c);
1191 if (err)
1192 goto out;
1193 lpt_tgc_start(c);
1194 }
1195
1196 if (c->big_lpt)
1197 populate_lsave(c);
1198
1199 cnt = get_cnodes_to_commit(c);
1200 ubifs_assert(cnt != 0);
1201
1202 err = layout_cnodes(c);
1203 if (err)
1204 goto out;
1205
1206 /* Copy the LPT's own lprops for end commit to write */
1207 memcpy(c->ltab_cmt, c->ltab,
1208 sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
1209 c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY);
1210
1211out:
1212 mutex_unlock(&c->lp_mutex);
1213 return err;
1214}
1215
1216/**
1217 * free_obsolete_cnodes - free obsolete cnodes for commit end.
1218 * @c: UBIFS file-system description object
1219 */
1220static void free_obsolete_cnodes(struct ubifs_info *c)
1221{
1222 struct ubifs_cnode *cnode, *cnext;
1223
1224 cnext = c->lpt_cnext;
1225 if (!cnext)
1226 return;
1227 do {
1228 cnode = cnext;
1229 cnext = cnode->cnext;
1230 if (test_bit(OBSOLETE_CNODE, &cnode->flags))
1231 kfree(cnode);
1232 else
1233 cnode->cnext = NULL;
1234 } while (cnext != c->lpt_cnext);
1235 c->lpt_cnext = NULL;
1236}
1237
1238/**
1239 * ubifs_lpt_end_commit - finish the commit operation.
1240 * @c: the UBIFS file-system description object
1241 *
1242 * This function has to be called when the commit operation finishes. It
1243 * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to
1244 * the media. Returns zero in case of success and a negative error code in case
1245 * of failure.
1246 */
1247int ubifs_lpt_end_commit(struct ubifs_info *c)
1248{
1249 int err;
1250
1251 dbg_lp("");
1252
1253 if (!c->lpt_cnext)
1254 return 0;
1255
1256 err = write_cnodes(c);
1257 if (err)
1258 return err;
1259
1260 mutex_lock(&c->lp_mutex);
1261 free_obsolete_cnodes(c);
1262 mutex_unlock(&c->lp_mutex);
1263
1264 return 0;
1265}
1266
1267/**
1268 * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC.
1269 * @c: UBIFS file-system description object
1270 *
1271 * LPT trivial GC is completed after a commit. Also LPT GC is done after a
1272 * commit for the "big" LPT model.
1273 */
1274int ubifs_lpt_post_commit(struct ubifs_info *c)
1275{
1276 int err;
1277
1278 mutex_lock(&c->lp_mutex);
1279 err = lpt_tgc_end(c);
1280 if (err)
1281 goto out;
1282 if (c->big_lpt)
1283 while (need_write_all(c)) {
1284 mutex_unlock(&c->lp_mutex);
1285 err = lpt_gc(c);
1286 if (err)
1287 return err;
1288 mutex_lock(&c->lp_mutex);
1289 }
1290out:
1291 mutex_unlock(&c->lp_mutex);
1292 return err;
1293}
1294
1295/**
1296 * first_nnode - find the first nnode in memory.
1297 * @c: UBIFS file-system description object
1298 * @hght: height of tree where nnode found is returned here
1299 *
1300 * This function returns a pointer to the nnode found or %NULL if no nnode is
1301 * found. This function is a helper to 'ubifs_lpt_free()'.
1302 */
1303static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght)
1304{
1305 struct ubifs_nnode *nnode;
1306 int h, i, found;
1307
1308 nnode = c->nroot;
1309 *hght = 0;
1310 if (!nnode)
1311 return NULL;
1312 for (h = 1; h < c->lpt_hght; h++) {
1313 found = 0;
1314 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1315 if (nnode->nbranch[i].nnode) {
1316 found = 1;
1317 nnode = nnode->nbranch[i].nnode;
1318 *hght = h;
1319 break;
1320 }
1321 }
1322 if (!found)
1323 break;
1324 }
1325 return nnode;
1326}
1327
1328/**
1329 * next_nnode - find the next nnode in memory.
1330 * @c: UBIFS file-system description object
1331 * @nnode: nnode from which to start.
1332 * @hght: height of tree where nnode is, is passed and returned here
1333 *
1334 * This function returns a pointer to the nnode found or %NULL if no nnode is
1335 * found. This function is a helper to 'ubifs_lpt_free()'.
1336 */
1337static struct ubifs_nnode *next_nnode(struct ubifs_info *c,
1338 struct ubifs_nnode *nnode, int *hght)
1339{
1340 struct ubifs_nnode *parent;
1341 int iip, h, i, found;
1342
1343 parent = nnode->parent;
1344 if (!parent)
1345 return NULL;
1346 if (nnode->iip == UBIFS_LPT_FANOUT - 1) {
1347 *hght -= 1;
1348 return parent;
1349 }
1350 for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
1351 nnode = parent->nbranch[iip].nnode;
1352 if (nnode)
1353 break;
1354 }
1355 if (!nnode) {
1356 *hght -= 1;
1357 return parent;
1358 }
1359 for (h = *hght + 1; h < c->lpt_hght; h++) {
1360 found = 0;
1361 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1362 if (nnode->nbranch[i].nnode) {
1363 found = 1;
1364 nnode = nnode->nbranch[i].nnode;
1365 *hght = h;
1366 break;
1367 }
1368 }
1369 if (!found)
1370 break;
1371 }
1372 return nnode;
1373}
1374
1375/**
1376 * ubifs_lpt_free - free resources owned by the LPT.
1377 * @c: UBIFS file-system description object
1378 * @wr_only: free only resources used for writing
1379 */
1380void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
1381{
1382 struct ubifs_nnode *nnode;
1383 int i, hght;
1384
1385 /* Free write-only things first */
1386
1387 free_obsolete_cnodes(c); /* Leftover from a failed commit */
1388
1389 vfree(c->ltab_cmt);
1390 c->ltab_cmt = NULL;
1391 vfree(c->lpt_buf);
1392 c->lpt_buf = NULL;
1393 kfree(c->lsave);
1394 c->lsave = NULL;
1395
1396 if (wr_only)
1397 return;
1398
1399 /* Now free the rest */
1400
1401 nnode = first_nnode(c, &hght);
1402 while (nnode) {
1403 for (i = 0; i < UBIFS_LPT_FANOUT; i++)
1404 kfree(nnode->nbranch[i].nnode);
1405 nnode = next_nnode(c, nnode, &hght);
1406 }
1407 for (i = 0; i < LPROPS_HEAP_CNT; i++)
1408 kfree(c->lpt_heap[i].arr);
1409 kfree(c->dirty_idx.arr);
1410 kfree(c->nroot);
1411 vfree(c->ltab);
1412 kfree(c->lpt_nod_buf);
1413}
1414
1415#ifdef CONFIG_UBIFS_FS_DEBUG
1416
1417/**
1418 * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
1419 * @buf: buffer
1420 * @len: buffer length
1421 */
1422static int dbg_is_all_ff(uint8_t *buf, int len)
1423{
1424 int i;
1425
1426 for (i = 0; i < len; i++)
1427 if (buf[i] != 0xff)
1428 return 0;
1429 return 1;
1430}
1431
1432/**
1433 * dbg_is_nnode_dirty - determine if a nnode is dirty.
1434 * @c: the UBIFS file-system description object
1435 * @lnum: LEB number where nnode was written
1436 * @offs: offset where nnode was written
1437 */
1438static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
1439{
1440 struct ubifs_nnode *nnode;
1441 int hght;
1442
1443 /* Entire tree is in memory so first_nnode / next_nnode are ok */
1444 nnode = first_nnode(c, &hght);
1445 for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
1446 struct ubifs_nbranch *branch;
1447
1448 cond_resched();
1449 if (nnode->parent) {
1450 branch = &nnode->parent->nbranch[nnode->iip];
1451 if (branch->lnum != lnum || branch->offs != offs)
1452 continue;
1453 if (test_bit(DIRTY_CNODE, &nnode->flags))
1454 return 1;
1455 return 0;
1456 } else {
1457 if (c->lpt_lnum != lnum || c->lpt_offs != offs)
1458 continue;
1459 if (test_bit(DIRTY_CNODE, &nnode->flags))
1460 return 1;
1461 return 0;
1462 }
1463 }
1464 return 1;
1465}
1466
1467/**
1468 * dbg_is_pnode_dirty - determine if a pnode is dirty.
1469 * @c: the UBIFS file-system description object
1470 * @lnum: LEB number where pnode was written
1471 * @offs: offset where pnode was written
1472 */
1473static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs)
1474{
1475 int i, cnt;
1476
1477 cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
1478 for (i = 0; i < cnt; i++) {
1479 struct ubifs_pnode *pnode;
1480 struct ubifs_nbranch *branch;
1481
1482 cond_resched();
1483 pnode = pnode_lookup(c, i);
1484 if (IS_ERR(pnode))
1485 return PTR_ERR(pnode);
1486 branch = &pnode->parent->nbranch[pnode->iip];
1487 if (branch->lnum != lnum || branch->offs != offs)
1488 continue;
1489 if (test_bit(DIRTY_CNODE, &pnode->flags))
1490 return 1;
1491 return 0;
1492 }
1493 return 1;
1494}
1495
1496/**
1497 * dbg_is_ltab_dirty - determine if a ltab node is dirty.
1498 * @c: the UBIFS file-system description object
1499 * @lnum: LEB number where ltab node was written
1500 * @offs: offset where ltab node was written
1501 */
1502static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
1503{
1504 if (lnum != c->ltab_lnum || offs != c->ltab_offs)
1505 return 1;
1506 return (c->lpt_drty_flgs & LTAB_DIRTY) != 0;
1507}
1508
1509/**
1510 * dbg_is_lsave_dirty - determine if a lsave node is dirty.
1511 * @c: the UBIFS file-system description object
1512 * @lnum: LEB number where lsave node was written
1513 * @offs: offset where lsave node was written
1514 */
1515static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
1516{
1517 if (lnum != c->lsave_lnum || offs != c->lsave_offs)
1518 return 1;
1519 return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0;
1520}
1521
1522/**
1523 * dbg_is_node_dirty - determine if a node is dirty.
1524 * @c: the UBIFS file-system description object
1525 * @node_type: node type
1526 * @lnum: LEB number where node was written
1527 * @offs: offset where node was written
1528 */
1529static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum,
1530 int offs)
1531{
1532 switch (node_type) {
1533 case UBIFS_LPT_NNODE:
1534 return dbg_is_nnode_dirty(c, lnum, offs);
1535 case UBIFS_LPT_PNODE:
1536 return dbg_is_pnode_dirty(c, lnum, offs);
1537 case UBIFS_LPT_LTAB:
1538 return dbg_is_ltab_dirty(c, lnum, offs);
1539 case UBIFS_LPT_LSAVE:
1540 return dbg_is_lsave_dirty(c, lnum, offs);
1541 }
1542 return 1;
1543}
1544
1545/**
1546 * dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
1547 * @c: the UBIFS file-system description object
1548 * @lnum: LEB number where node was written
1549 * @offs: offset where node was written
1550 *
1551 * This function returns %0 on success and a negative error code on failure.
1552 */
1553static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1554{
1555 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1556 int ret;
1557 void *buf = c->dbg_buf;
1558
1559 dbg_lp("LEB %d", lnum);
1560 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1561 if (err) {
1562 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
1563 return err;
1564 }
1565 while (1) {
1566 if (!is_a_node(c, buf, len)) {
1567 int i, pad_len;
1568
1569 pad_len = get_pad_len(c, buf, len);
1570 if (pad_len) {
1571 buf += pad_len;
1572 len -= pad_len;
1573 dirty += pad_len;
1574 continue;
1575 }
1576 if (!dbg_is_all_ff(buf, len)) {
1577 dbg_msg("invalid empty space in LEB %d at %d",
1578 lnum, c->leb_size - len);
1579 err = -EINVAL;
1580 }
1581 i = lnum - c->lpt_first;
1582 if (len != c->ltab[i].free) {
1583 dbg_msg("invalid free space in LEB %d "
1584 "(free %d, expected %d)",
1585 lnum, len, c->ltab[i].free);
1586 err = -EINVAL;
1587 }
1588 if (dirty != c->ltab[i].dirty) {
1589 dbg_msg("invalid dirty space in LEB %d "
1590 "(dirty %d, expected %d)",
1591 lnum, dirty, c->ltab[i].dirty);
1592 err = -EINVAL;
1593 }
1594 return err;
1595 }
1596 node_type = get_lpt_node_type(c, buf, &node_num);
1597 node_len = get_lpt_node_len(c, node_type);
1598 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
1599 if (ret == 1)
1600 dirty += node_len;
1601 buf += node_len;
1602 len -= node_len;
1603 }
1604}
1605
1606/**
1607 * dbg_check_ltab - check the free and dirty space in the ltab.
1608 * @c: the UBIFS file-system description object
1609 *
1610 * This function returns %0 on success and a negative error code on failure.
1611 */
1612int dbg_check_ltab(struct ubifs_info *c)
1613{
1614 int lnum, err, i, cnt;
1615
1616 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1617 return 0;
1618
1619 /* Bring the entire tree into memory */
1620 cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
1621 for (i = 0; i < cnt; i++) {
1622 struct ubifs_pnode *pnode;
1623
1624 pnode = pnode_lookup(c, i);
1625 if (IS_ERR(pnode))
1626 return PTR_ERR(pnode);
1627 cond_resched();
1628 }
1629
1630 /* Check nodes */
1631 err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0);
1632 if (err)
1633 return err;
1634
1635 /* Check each LEB */
1636 for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
1637 err = dbg_check_ltab_lnum(c, lnum);
1638 if (err) {
1639 dbg_err("failed at LEB %d", lnum);
1640 return err;
1641 }
1642 }
1643
1644 dbg_lp("succeeded");
1645 return 0;
1646}
1647
1648#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
new file mode 100644
index 000000000000..71d5493bf565
--- /dev/null
+++ b/fs/ubifs/master.c
@@ -0,0 +1,387 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/* This file implements reading and writing the master node */
24
25#include "ubifs.h"
26
27/**
28 * scan_for_master - search the valid master node.
29 * @c: UBIFS file-system description object
30 *
31 * This function scans the master node LEBs and search for the latest master
32 * node. Returns zero in case of success and a negative error code in case of
33 * failure.
34 */
35static int scan_for_master(struct ubifs_info *c)
36{
37 struct ubifs_scan_leb *sleb;
38 struct ubifs_scan_node *snod;
39 int lnum, offs = 0, nodes_cnt;
40
41 lnum = UBIFS_MST_LNUM;
42
43 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
44 if (IS_ERR(sleb))
45 return PTR_ERR(sleb);
46 nodes_cnt = sleb->nodes_cnt;
47 if (nodes_cnt > 0) {
48 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
49 list);
50 if (snod->type != UBIFS_MST_NODE)
51 goto out;
52 memcpy(c->mst_node, snod->node, snod->len);
53 offs = snod->offs;
54 }
55 ubifs_scan_destroy(sleb);
56
57 lnum += 1;
58
59 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
60 if (IS_ERR(sleb))
61 return PTR_ERR(sleb);
62 if (sleb->nodes_cnt != nodes_cnt)
63 goto out;
64 if (!sleb->nodes_cnt)
65 goto out;
66 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
67 if (snod->type != UBIFS_MST_NODE)
68 goto out;
69 if (snod->offs != offs)
70 goto out;
71 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
72 (void *)snod->node + UBIFS_CH_SZ,
73 UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
74 goto out;
75 c->mst_offs = offs;
76 ubifs_scan_destroy(sleb);
77 return 0;
78
79out:
80 ubifs_scan_destroy(sleb);
81 return -EINVAL;
82}
83
84/**
85 * validate_master - validate master node.
86 * @c: UBIFS file-system description object
87 *
88 * This function validates data which was read from master node. Returns zero
89 * if the data is all right and %-EINVAL if not.
90 */
91static int validate_master(const struct ubifs_info *c)
92{
93 long long main_sz;
94 int err;
95
96 if (c->max_sqnum >= SQNUM_WATERMARK) {
97 err = 1;
98 goto out;
99 }
100
101 if (c->cmt_no >= c->max_sqnum) {
102 err = 2;
103 goto out;
104 }
105
106 if (c->highest_inum >= INUM_WATERMARK) {
107 err = 3;
108 goto out;
109 }
110
111 if (c->lhead_lnum < UBIFS_LOG_LNUM ||
112 c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs ||
113 c->lhead_offs < 0 || c->lhead_offs >= c->leb_size ||
114 c->lhead_offs & (c->min_io_size - 1)) {
115 err = 4;
116 goto out;
117 }
118
119 if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first ||
120 c->zroot.offs >= c->leb_size || c->zroot.offs & 7) {
121 err = 5;
122 goto out;
123 }
124
125 if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len ||
126 c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) {
127 err = 6;
128 goto out;
129 }
130
131 if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) {
132 err = 7;
133 goto out;
134 }
135
136 if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first ||
137 c->ihead_offs % c->min_io_size || c->ihead_offs < 0 ||
138 c->ihead_offs > c->leb_size || c->ihead_offs & 7) {
139 err = 8;
140 goto out;
141 }
142
143 main_sz = (long long)c->main_lebs * c->leb_size;
144 if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
145 err = 9;
146 goto out;
147 }
148
149 if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last ||
150 c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) {
151 err = 10;
152 goto out;
153 }
154
155 if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last ||
156 c->nhead_offs < 0 || c->nhead_offs % c->min_io_size ||
157 c->nhead_offs > c->leb_size) {
158 err = 11;
159 goto out;
160 }
161
162 if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last ||
163 c->ltab_offs < 0 ||
164 c->ltab_offs + c->ltab_sz > c->leb_size) {
165 err = 12;
166 goto out;
167 }
168
169 if (c->big_lpt && (c->lsave_lnum < c->lpt_first ||
170 c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 ||
171 c->lsave_offs + c->lsave_sz > c->leb_size)) {
172 err = 13;
173 goto out;
174 }
175
176 if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) {
177 err = 14;
178 goto out;
179 }
180
181 if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) {
182 err = 15;
183 goto out;
184 }
185
186 if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) {
187 err = 16;
188 goto out;
189 }
190
191 if (c->lst.total_free < 0 || c->lst.total_free > main_sz ||
192 c->lst.total_free & 7) {
193 err = 17;
194 goto out;
195 }
196
197 if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) {
198 err = 18;
199 goto out;
200 }
201
202 if (c->lst.total_used < 0 || (c->lst.total_used & 7)) {
203 err = 19;
204 goto out;
205 }
206
207 if (c->lst.total_free + c->lst.total_dirty +
208 c->lst.total_used > main_sz) {
209 err = 20;
210 goto out;
211 }
212
213 if (c->lst.total_dead + c->lst.total_dark +
214 c->lst.total_used + c->old_idx_sz > main_sz) {
215 err = 21;
216 goto out;
217 }
218
219 if (c->lst.total_dead < 0 ||
220 c->lst.total_dead > c->lst.total_free + c->lst.total_dirty ||
221 c->lst.total_dead & 7) {
222 err = 22;
223 goto out;
224 }
225
226 if (c->lst.total_dark < 0 ||
227 c->lst.total_dark > c->lst.total_free + c->lst.total_dirty ||
228 c->lst.total_dark & 7) {
229 err = 23;
230 goto out;
231 }
232
233 return 0;
234
235out:
236 ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
237 dbg_dump_node(c, c->mst_node);
238 return -EINVAL;
239}
240
241/**
242 * ubifs_read_master - read master node.
243 * @c: UBIFS file-system description object
244 *
245 * This function finds and reads the master node during file-system mount. If
246 * the flash is empty, it creates default master node as well. Returns zero in
247 * case of success and a negative error code in case of failure.
248 */
249int ubifs_read_master(struct ubifs_info *c)
250{
251 int err, old_leb_cnt;
252
253 c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
254 if (!c->mst_node)
255 return -ENOMEM;
256
257 err = scan_for_master(c);
258 if (err) {
259 err = ubifs_recover_master_node(c);
260 if (err)
261 /*
262 * Note, we do not free 'c->mst_node' here because the
263 * unmount routine will take care of this.
264 */
265 return err;
266 }
267
268 /* Make sure that the recovery flag is clear */
269 c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);
270
271 c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum);
272 c->highest_inum = le64_to_cpu(c->mst_node->highest_inum);
273 c->cmt_no = le64_to_cpu(c->mst_node->cmt_no);
274 c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum);
275 c->zroot.offs = le32_to_cpu(c->mst_node->root_offs);
276 c->zroot.len = le32_to_cpu(c->mst_node->root_len);
277 c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum);
278 c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
279 c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
280 c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
281 c->old_idx_sz = le64_to_cpu(c->mst_node->index_size);
282 c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
283 c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
284 c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
285 c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs);
286 c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum);
287 c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs);
288 c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum);
289 c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs);
290 c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum);
291 c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs);
292 c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs);
293 old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt);
294 c->lst.total_free = le64_to_cpu(c->mst_node->total_free);
295 c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
296 c->lst.total_used = le64_to_cpu(c->mst_node->total_used);
297 c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
298 c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
299
300 c->calc_idx_sz = c->old_idx_sz;
301
302 if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
303 c->no_orphs = 1;
304
305 if (old_leb_cnt != c->leb_cnt) {
306 /* The file system has been resized */
307 int growth = c->leb_cnt - old_leb_cnt;
308
309 if (c->leb_cnt < old_leb_cnt ||
310 c->leb_cnt < UBIFS_MIN_LEB_CNT) {
311 ubifs_err("bad leb_cnt on master node");
312 dbg_dump_node(c, c->mst_node);
313 return -EINVAL;
314 }
315
316 dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",
317 old_leb_cnt, c->leb_cnt);
318 c->lst.empty_lebs += growth;
319 c->lst.total_free += growth * (long long)c->leb_size;
320 c->lst.total_dark += growth * (long long)c->dark_wm;
321
322 /*
323 * Reflect changes back onto the master node. N.B. the master
324 * node gets written immediately whenever mounting (or
325 * remounting) in read-write mode, so we do not need to write it
326 * here.
327 */
328 c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
329 c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
330 c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
331 c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
332 }
333
334 err = validate_master(c);
335 if (err)
336 return err;
337
338 err = dbg_old_index_check_init(c, &c->zroot);
339
340 return err;
341}
342
343/**
344 * ubifs_write_master - write master node.
345 * @c: UBIFS file-system description object
346 *
347 * This function writes the master node. The caller has to take the
348 * @c->mst_mutex lock before calling this function. Returns zero in case of
349 * success and a negative error code in case of failure. The master node is
350 * written twice to enable recovery.
351 */
352int ubifs_write_master(struct ubifs_info *c)
353{
354 int err, lnum, offs, len;
355
356 if (c->ro_media)
357 return -EINVAL;
358
359 lnum = UBIFS_MST_LNUM;
360 offs = c->mst_offs + c->mst_node_alsz;
361 len = UBIFS_MST_NODE_SZ;
362
363 if (offs + UBIFS_MST_NODE_SZ > c->leb_size) {
364 err = ubifs_leb_unmap(c, lnum);
365 if (err)
366 return err;
367 offs = 0;
368 }
369
370 c->mst_offs = offs;
371 c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
372
373 err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
374 if (err)
375 return err;
376
377 lnum += 1;
378
379 if (offs == 0) {
380 err = ubifs_leb_unmap(c, lnum);
381 if (err)
382 return err;
383 }
384 err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
385
386 return err;
387}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
new file mode 100644
index 000000000000..4beccfc256d2
--- /dev/null
+++ b/fs/ubifs/misc.h
@@ -0,0 +1,342 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file contains miscellaneous helper functions.
25 */
26
27#ifndef __UBIFS_MISC_H__
28#define __UBIFS_MISC_H__
29
30/**
31 * ubifs_zn_dirty - check if znode is dirty.
32 * @znode: znode to check
33 *
34 * This helper function returns %1 if @znode is dirty and %0 otherwise.
35 */
36static inline int ubifs_zn_dirty(const struct ubifs_znode *znode)
37{
38 return !!test_bit(DIRTY_ZNODE, &znode->flags);
39}
40
41/**
42 * ubifs_wake_up_bgt - wake up background thread.
43 * @c: UBIFS file-system description object
44 */
45static inline void ubifs_wake_up_bgt(struct ubifs_info *c)
46{
47 if (c->bgt && !c->need_bgt) {
48 c->need_bgt = 1;
49 wake_up_process(c->bgt);
50 }
51}
52
53/**
54 * ubifs_tnc_find_child - find next child in znode.
55 * @znode: znode to search at
56 * @start: the zbranch index to start at
57 *
58 * This helper function looks for znode child starting at index @start. Returns
59 * the child or %NULL if no children were found.
60 */
61static inline struct ubifs_znode *
62ubifs_tnc_find_child(struct ubifs_znode *znode, int start)
63{
64 while (start < znode->child_cnt) {
65 if (znode->zbranch[start].znode)
66 return znode->zbranch[start].znode;
67 start += 1;
68 }
69
70 return NULL;
71}
72
73/**
74 * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object.
75 * @inode: the VFS 'struct inode' pointer
76 */
77static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
78{
79 return container_of(inode, struct ubifs_inode, vfs_inode);
80}
81
82/**
83 * ubifs_ro_mode - switch UBIFS to read read-only mode.
84 * @c: UBIFS file-system description object
85 * @err: error code which is the reason of switching to R/O mode
86 */
87static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
88{
89 if (!c->ro_media) {
90 c->ro_media = 1;
91 ubifs_warn("switched to read-only mode, error %d", err);
92 dbg_dump_stack();
93 }
94}
95
96/**
97 * ubifs_compr_present - check if compressor was compiled in.
98 * @compr_type: compressor type to check
99 *
100 * This function returns %1 of compressor of type @compr_type is present, and
101 * %0 if not.
102 */
103static inline int ubifs_compr_present(int compr_type)
104{
105 ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
106 return !!ubifs_compressors[compr_type]->capi_name;
107}
108
109/**
110 * ubifs_compr_name - get compressor name string by its type.
111 * @compr_type: compressor type
112 *
113 * This function returns compressor type string.
114 */
115static inline const char *ubifs_compr_name(int compr_type)
116{
117 ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
118 return ubifs_compressors[compr_type]->name;
119}
120
121/**
122 * ubifs_wbuf_sync - synchronize write-buffer.
123 * @wbuf: write-buffer to synchronize
124 *
125 * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume
126 * that the write-buffer is already locked.
127 */
128static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
129{
130 int err;
131
132 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
133 err = ubifs_wbuf_sync_nolock(wbuf);
134 mutex_unlock(&wbuf->io_mutex);
135 return err;
136}
137
138/**
139 * ubifs_leb_unmap - unmap an LEB.
140 * @c: UBIFS file-system description object
141 * @lnum: LEB number to unmap
142 *
143 * This function returns %0 on success and a negative error code on failure.
144 */
145static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
146{
147 int err;
148
149 if (c->ro_media)
150 return -EROFS;
151 err = ubi_leb_unmap(c->ubi, lnum);
152 if (err) {
153 ubifs_err("unmap LEB %d failed, error %d", lnum, err);
154 return err;
155 }
156
157 return 0;
158}
159
160/**
161 * ubifs_leb_write - write to a LEB.
162 * @c: UBIFS file-system description object
163 * @lnum: LEB number to write
164 * @buf: buffer to write from
165 * @offs: offset within LEB to write to
166 * @len: length to write
167 * @dtype: data type
168 *
169 * This function returns %0 on success and a negative error code on failure.
170 */
171static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
172 const void *buf, int offs, int len, int dtype)
173{
174 int err;
175
176 if (c->ro_media)
177 return -EROFS;
178 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
179 if (err) {
180 ubifs_err("writing %d bytes at %d:%d, error %d",
181 len, lnum, offs, err);
182 return err;
183 }
184
185 return 0;
186}
187
188/**
189 * ubifs_leb_change - atomic LEB change.
190 * @c: UBIFS file-system description object
191 * @lnum: LEB number to write
192 * @buf: buffer to write from
193 * @len: length to write
194 * @dtype: data type
195 *
196 * This function returns %0 on success and a negative error code on failure.
197 */
198static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
199 const void *buf, int len, int dtype)
200{
201 int err;
202
203 if (c->ro_media)
204 return -EROFS;
205 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
206 if (err) {
207 ubifs_err("changing %d bytes in LEB %d, error %d",
208 len, lnum, err);
209 return err;
210 }
211
212 return 0;
213}
214
215/**
216 * ubifs_encode_dev - encode device node IDs.
217 * @dev: UBIFS device node information
218 * @rdev: device IDs to encode
219 *
220 * This is a helper function which encodes major/minor numbers of a device node
221 * into UBIFS device node description. We use standard Linux "new" and "huge"
222 * encodings.
223 */
224static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
225{
226 if (new_valid_dev(rdev)) {
227 dev->new = cpu_to_le32(new_encode_dev(rdev));
228 return sizeof(dev->new);
229 } else {
230 dev->huge = cpu_to_le64(huge_encode_dev(rdev));
231 return sizeof(dev->huge);
232 }
233}
234
235/**
236 * ubifs_add_dirt - add dirty space to LEB properties.
237 * @c: the UBIFS file-system description object
238 * @lnum: LEB to add dirty space for
239 * @dirty: dirty space to add
240 *
241 * This is a helper function which increased amount of dirty LEB space. Returns
242 * zero in case of success and a negative error code in case of failure.
243 */
244static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty)
245{
246 return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0);
247}
248
249/**
250 * ubifs_return_leb - return LEB to lprops.
251 * @c: the UBIFS file-system description object
252 * @lnum: LEB to return
253 *
254 * This helper function cleans the "taken" flag of a logical eraseblock in the
255 * lprops. Returns zero in case of success and a negative error code in case of
256 * failure.
257 */
258static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
259{
260 return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
261 LPROPS_TAKEN, 0);
262}
263
264/**
265 * ubifs_idx_node_sz - return index node size.
266 * @c: the UBIFS file-system description object
267 * @child_cnt: number of children of this index node
268 */
269static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
270{
271 return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
272}
273
274/**
275 * ubifs_idx_branch - return pointer to an index branch.
276 * @c: the UBIFS file-system description object
277 * @idx: index node
278 * @bnum: branch number
279 */
280static inline
281struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
282 const struct ubifs_idx_node *idx,
283 int bnum)
284{
285 return (struct ubifs_branch *)((void *)idx->branches +
286 (UBIFS_BRANCH_SZ + c->key_len) * bnum);
287}
288
289/**
290 * ubifs_idx_key - return pointer to an index key.
291 * @c: the UBIFS file-system description object
292 * @idx: index node
293 */
294static inline void *ubifs_idx_key(const struct ubifs_info *c,
295 const struct ubifs_idx_node *idx)
296{
297 return (void *)((struct ubifs_branch *)idx->branches)->key;
298}
299
300/**
301 * ubifs_reported_space - calculate reported free space.
302 * @c: the UBIFS file-system description object
303 * @free: amount of free space
304 *
305 * This function calculates amount of free space which will be reported to
306 * user-space. User-space application tend to expect that if the file-system
307 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
308 * are able to write a file of size N. UBIFS attaches node headers to each data
309 * node and it has to write indexind nodes as well. This introduces additional
310 * overhead, and UBIFS it has to report sligtly less free space to meet the
311 * above expectetion.
312 *
313 * This function assumes free space is made up of uncompressed data nodes and
314 * full index nodes (one per data node, doubled because we always allow enough
315 * space to write the index twice).
316 *
317 * Note, the calculation is pessimistic, which means that most of the time
318 * UBIFS reports less space than it actually has.
319 */
320static inline long long ubifs_reported_space(const struct ubifs_info *c,
321 uint64_t free)
322{
323 int divisor, factor;
324
325 divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
326 factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
327 do_div(free, divisor);
328
329 return free * factor;
330}
331
332/**
333 * ubifs_current_time - round current time to time granularity.
334 * @inode: inode
335 */
336static inline struct timespec ubifs_current_time(struct inode *inode)
337{
338 return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
339 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
340}
341
342#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
new file mode 100644
index 000000000000..3afeb9242c6a
--- /dev/null
+++ b/fs/ubifs/orphan.c
@@ -0,0 +1,958 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Author: Adrian Hunter
20 */
21
22#include "ubifs.h"
23
24/*
25 * An orphan is an inode number whose inode node has been committed to the index
26 * with a link count of zero. That happens when an open file is deleted
27 * (unlinked) and then a commit is run. In the normal course of events the inode
28 * would be deleted when the file is closed. However in the case of an unclean
29 * unmount, orphans need to be accounted for. After an unclean unmount, the
30 * orphans' inodes must be deleted which means either scanning the entire index
31 * looking for them, or keeping a list on flash somewhere. This unit implements
32 * the latter approach.
33 *
34 * The orphan area is a fixed number of LEBs situated between the LPT area and
35 * the main area. The number of orphan area LEBs is specified when the file
36 * system is created. The minimum number is 1. The orphan area should be large
37 * enough to hold the maximum number of orphans that are expected to ever exist
38 * at one time.
39 *
40 * The number of orphans that can fit in a LEB is:
41 *
42 * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)
43 *
44 * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough.
45 *
46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
47 * zero, the inode number is added to the rb-tree. It is removed from the tree
48 * when the inode is deleted. Any new orphans that are in the orphan tree when
49 * the commit is run, are written to the orphan area in 1 or more orph nodes.
50 * If the orphan area is full, it is consolidated to make space. There is
51 * always enough space because validation prevents the user from creating more
52 * than the maximum number of orphans allowed.
53 */
54
55#ifdef CONFIG_UBIFS_FS_DEBUG
56static int dbg_check_orphans(struct ubifs_info *c);
57#else
58#define dbg_check_orphans(c) 0
59#endif
60
61/**
62 * ubifs_add_orphan - add an orphan.
63 * @c: UBIFS file-system description object
64 * @inum: orphan inode number
65 *
66 * Add an orphan. This function is called when an inodes link count drops to
67 * zero.
68 */
69int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
70{
71 struct ubifs_orphan *orphan, *o;
72 struct rb_node **p, *parent = NULL;
73
74 orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
75 if (!orphan)
76 return -ENOMEM;
77 orphan->inum = inum;
78 orphan->new = 1;
79
80 spin_lock(&c->orphan_lock);
81 if (c->tot_orphans >= c->max_orphans) {
82 spin_unlock(&c->orphan_lock);
83 kfree(orphan);
84 return -ENFILE;
85 }
86 p = &c->orph_tree.rb_node;
87 while (*p) {
88 parent = *p;
89 o = rb_entry(parent, struct ubifs_orphan, rb);
90 if (inum < o->inum)
91 p = &(*p)->rb_left;
92 else if (inum > o->inum)
93 p = &(*p)->rb_right;
94 else {
95 dbg_err("orphaned twice");
96 spin_unlock(&c->orphan_lock);
97 kfree(orphan);
98 return 0;
99 }
100 }
101 c->tot_orphans += 1;
102 c->new_orphans += 1;
103 rb_link_node(&orphan->rb, parent, p);
104 rb_insert_color(&orphan->rb, &c->orph_tree);
105 list_add_tail(&orphan->list, &c->orph_list);
106 list_add_tail(&orphan->new_list, &c->orph_new);
107 spin_unlock(&c->orphan_lock);
108 dbg_gen("ino %lu", inum);
109 return 0;
110}
111
112/**
113 * ubifs_delete_orphan - delete an orphan.
114 * @c: UBIFS file-system description object
115 * @inum: orphan inode number
116 *
117 * Delete an orphan. This function is called when an inode is deleted.
118 */
119void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
120{
121 struct ubifs_orphan *o;
122 struct rb_node *p;
123
124 spin_lock(&c->orphan_lock);
125 p = c->orph_tree.rb_node;
126 while (p) {
127 o = rb_entry(p, struct ubifs_orphan, rb);
128 if (inum < o->inum)
129 p = p->rb_left;
130 else if (inum > o->inum)
131 p = p->rb_right;
132 else {
133 if (o->dnext) {
134 spin_unlock(&c->orphan_lock);
135 dbg_gen("deleted twice ino %lu", inum);
136 return;
137 }
138 if (o->cnext) {
139 o->dnext = c->orph_dnext;
140 c->orph_dnext = o;
141 spin_unlock(&c->orphan_lock);
142 dbg_gen("delete later ino %lu", inum);
143 return;
144 }
145 rb_erase(p, &c->orph_tree);
146 list_del(&o->list);
147 c->tot_orphans -= 1;
148 if (o->new) {
149 list_del(&o->new_list);
150 c->new_orphans -= 1;
151 }
152 spin_unlock(&c->orphan_lock);
153 kfree(o);
154 dbg_gen("inum %lu", inum);
155 return;
156 }
157 }
158 spin_unlock(&c->orphan_lock);
159 dbg_err("missing orphan ino %lu", inum);
160 dbg_dump_stack();
161}
162
163/**
164 * ubifs_orphan_start_commit - start commit of orphans.
165 * @c: UBIFS file-system description object
166 *
167 * Start commit of orphans.
168 */
169int ubifs_orphan_start_commit(struct ubifs_info *c)
170{
171 struct ubifs_orphan *orphan, **last;
172
173 spin_lock(&c->orphan_lock);
174 last = &c->orph_cnext;
175 list_for_each_entry(orphan, &c->orph_new, new_list) {
176 ubifs_assert(orphan->new);
177 orphan->new = 0;
178 *last = orphan;
179 last = &orphan->cnext;
180 }
181 *last = orphan->cnext;
182 c->cmt_orphans = c->new_orphans;
183 c->new_orphans = 0;
184 dbg_cmt("%d orphans to commit", c->cmt_orphans);
185 INIT_LIST_HEAD(&c->orph_new);
186 if (c->tot_orphans == 0)
187 c->no_orphs = 1;
188 else
189 c->no_orphs = 0;
190 spin_unlock(&c->orphan_lock);
191 return 0;
192}
193
194/**
195 * avail_orphs - calculate available space.
196 * @c: UBIFS file-system description object
197 *
198 * This function returns the number of orphans that can be written in the
199 * available space.
200 */
201static int avail_orphs(struct ubifs_info *c)
202{
203 int avail_lebs, avail, gap;
204
205 avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1;
206 avail = avail_lebs *
207 ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
208 gap = c->leb_size - c->ohead_offs;
209 if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64))
210 avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
211 return avail;
212}
213
214/**
215 * tot_avail_orphs - calculate total space.
216 * @c: UBIFS file-system description object
217 *
218 * This function returns the number of orphans that can be written in half
219 * the total space. That leaves half the space for adding new orphans.
220 */
221static int tot_avail_orphs(struct ubifs_info *c)
222{
223 int avail_lebs, avail;
224
225 avail_lebs = c->orph_lebs;
226 avail = avail_lebs *
227 ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
228 return avail / 2;
229}
230
231/**
232 * do_write_orph_node - write a node
233 * @c: UBIFS file-system description object
234 * @len: length of node
235 * @atomic: write atomically
236 *
237 * This function writes a node to the orphan head from the orphan buffer. If
238 * %atomic is not zero, then the write is done atomically. On success, %0 is
239 * returned, otherwise a negative error code is returned.
240 */
241static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
242{
243 int err = 0;
244
245 if (atomic) {
246 ubifs_assert(c->ohead_offs == 0);
247 ubifs_prepare_node(c, c->orph_buf, len, 1);
248 len = ALIGN(len, c->min_io_size);
249 err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
250 UBI_SHORTTERM);
251 } else {
252 if (c->ohead_offs == 0) {
253 /* Ensure LEB has been unmapped */
254 err = ubifs_leb_unmap(c, c->ohead_lnum);
255 if (err)
256 return err;
257 }
258 err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
259 c->ohead_offs, UBI_SHORTTERM);
260 }
261 return err;
262}
263
264/**
265 * write_orph_node - write an orph node
266 * @c: UBIFS file-system description object
267 * @atomic: write atomically
268 *
269 * This function builds an orph node from the cnext list and writes it to the
270 * orphan head. On success, %0 is returned, otherwise a negative error code
271 * is returned.
272 */
273static int write_orph_node(struct ubifs_info *c, int atomic)
274{
275 struct ubifs_orphan *orphan, *cnext;
276 struct ubifs_orph_node *orph;
277 int gap, err, len, cnt, i;
278
279 ubifs_assert(c->cmt_orphans > 0);
280 gap = c->leb_size - c->ohead_offs;
281 if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) {
282 c->ohead_lnum += 1;
283 c->ohead_offs = 0;
284 gap = c->leb_size;
285 if (c->ohead_lnum > c->orph_last) {
286 /*
287 * We limit the number of orphans so that this should
288 * never happen.
289 */
290 ubifs_err("out of space in orphan area");
291 return -EINVAL;
292 }
293 }
294 cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
295 if (cnt > c->cmt_orphans)
296 cnt = c->cmt_orphans;
297 len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64);
298 ubifs_assert(c->orph_buf);
299 orph = c->orph_buf;
300 orph->ch.node_type = UBIFS_ORPH_NODE;
301 spin_lock(&c->orphan_lock);
302 cnext = c->orph_cnext;
303 for (i = 0; i < cnt; i++) {
304 orphan = cnext;
305 orph->inos[i] = cpu_to_le64(orphan->inum);
306 cnext = orphan->cnext;
307 orphan->cnext = NULL;
308 }
309 c->orph_cnext = cnext;
310 c->cmt_orphans -= cnt;
311 spin_unlock(&c->orphan_lock);
312 if (c->cmt_orphans)
313 orph->cmt_no = cpu_to_le64(c->cmt_no + 1);
314 else
315 /* Mark the last node of the commit */
316 orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63));
317 ubifs_assert(c->ohead_offs + len <= c->leb_size);
318 ubifs_assert(c->ohead_lnum >= c->orph_first);
319 ubifs_assert(c->ohead_lnum <= c->orph_last);
320 err = do_write_orph_node(c, len, atomic);
321 c->ohead_offs += ALIGN(len, c->min_io_size);
322 c->ohead_offs = ALIGN(c->ohead_offs, 8);
323 return err;
324}
325
326/**
327 * write_orph_nodes - write orph nodes until there are no more to commit
328 * @c: UBIFS file-system description object
329 * @atomic: write atomically
330 *
331 * This function writes orph nodes for all the orphans to commit. On success,
332 * %0 is returned, otherwise a negative error code is returned.
333 */
334static int write_orph_nodes(struct ubifs_info *c, int atomic)
335{
336 int err;
337
338 while (c->cmt_orphans > 0) {
339 err = write_orph_node(c, atomic);
340 if (err)
341 return err;
342 }
343 if (atomic) {
344 int lnum;
345
346 /* Unmap any unused LEBs after consolidation */
347 lnum = c->ohead_lnum + 1;
348 for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
349 err = ubifs_leb_unmap(c, lnum);
350 if (err)
351 return err;
352 }
353 }
354 return 0;
355}
356
357/**
358 * consolidate - consolidate the orphan area.
359 * @c: UBIFS file-system description object
360 *
361 * This function enables consolidation by putting all the orphans into the list
362 * to commit. The list is in the order that the orphans were added, and the
363 * LEBs are written atomically in order, so at no time can orphans be lost by
364 * an unclean unmount.
365 *
366 * This function returns %0 on success and a negative error code on failure.
367 */
368static int consolidate(struct ubifs_info *c)
369{
370 int tot_avail = tot_avail_orphs(c), err = 0;
371
372 spin_lock(&c->orphan_lock);
373 dbg_cmt("there is space for %d orphans and there are %d",
374 tot_avail, c->tot_orphans);
375 if (c->tot_orphans - c->new_orphans <= tot_avail) {
376 struct ubifs_orphan *orphan, **last;
377 int cnt = 0;
378
379 /* Change the cnext list to include all non-new orphans */
380 last = &c->orph_cnext;
381 list_for_each_entry(orphan, &c->orph_list, list) {
382 if (orphan->new)
383 continue;
384 *last = orphan;
385 last = &orphan->cnext;
386 cnt += 1;
387 }
388 *last = orphan->cnext;
389 ubifs_assert(cnt == c->tot_orphans - c->new_orphans);
390 c->cmt_orphans = cnt;
391 c->ohead_lnum = c->orph_first;
392 c->ohead_offs = 0;
393 } else {
394 /*
395 * We limit the number of orphans so that this should
396 * never happen.
397 */
398 ubifs_err("out of space in orphan area");
399 err = -EINVAL;
400 }
401 spin_unlock(&c->orphan_lock);
402 return err;
403}
404
405/**
406 * commit_orphans - commit orphans.
407 * @c: UBIFS file-system description object
408 *
409 * This function commits orphans to flash. On success, %0 is returned,
410 * otherwise a negative error code is returned.
411 */
412static int commit_orphans(struct ubifs_info *c)
413{
414 int avail, atomic = 0, err;
415
416 ubifs_assert(c->cmt_orphans > 0);
417 avail = avail_orphs(c);
418 if (avail < c->cmt_orphans) {
419 /* Not enough space to write new orphans, so consolidate */
420 err = consolidate(c);
421 if (err)
422 return err;
423 atomic = 1;
424 }
425 err = write_orph_nodes(c, atomic);
426 return err;
427}
428
429/**
430 * erase_deleted - erase the orphans marked for deletion.
431 * @c: UBIFS file-system description object
432 *
433 * During commit, the orphans being committed cannot be deleted, so they are
434 * marked for deletion and deleted by this function. Also, the recovery
435 * adds killed orphans to the deletion list, and therefore they are deleted
436 * here too.
437 */
438static void erase_deleted(struct ubifs_info *c)
439{
440 struct ubifs_orphan *orphan, *dnext;
441
442 spin_lock(&c->orphan_lock);
443 dnext = c->orph_dnext;
444 while (dnext) {
445 orphan = dnext;
446 dnext = orphan->dnext;
447 ubifs_assert(!orphan->new);
448 rb_erase(&orphan->rb, &c->orph_tree);
449 list_del(&orphan->list);
450 c->tot_orphans -= 1;
451 dbg_gen("deleting orphan ino %lu", orphan->inum);
452 kfree(orphan);
453 }
454 c->orph_dnext = NULL;
455 spin_unlock(&c->orphan_lock);
456}
457
458/**
459 * ubifs_orphan_end_commit - end commit of orphans.
460 * @c: UBIFS file-system description object
461 *
462 * End commit of orphans.
463 */
464int ubifs_orphan_end_commit(struct ubifs_info *c)
465{
466 int err;
467
468 if (c->cmt_orphans != 0) {
469 err = commit_orphans(c);
470 if (err)
471 return err;
472 }
473 erase_deleted(c);
474 err = dbg_check_orphans(c);
475 return err;
476}
477
478/**
479 * clear_orphans - erase all LEBs used for orphans.
480 * @c: UBIFS file-system description object
481 *
482 * If recovery is not required, then the orphans from the previous session
483 * are not needed. This function locates the LEBs used to record
484 * orphans, and un-maps them.
485 */
486static int clear_orphans(struct ubifs_info *c)
487{
488 int lnum, err;
489
490 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
491 err = ubifs_leb_unmap(c, lnum);
492 if (err)
493 return err;
494 }
495 c->ohead_lnum = c->orph_first;
496 c->ohead_offs = 0;
497 return 0;
498}
499
500/**
501 * insert_dead_orphan - insert an orphan.
502 * @c: UBIFS file-system description object
503 * @inum: orphan inode number
504 *
505 * This function is a helper to the 'do_kill_orphans()' function. The orphan
506 * must be kept until the next commit, so it is added to the rb-tree and the
507 * deletion list.
508 */
509static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
510{
511 struct ubifs_orphan *orphan, *o;
512 struct rb_node **p, *parent = NULL;
513
514 orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
515 if (!orphan)
516 return -ENOMEM;
517 orphan->inum = inum;
518
519 p = &c->orph_tree.rb_node;
520 while (*p) {
521 parent = *p;
522 o = rb_entry(parent, struct ubifs_orphan, rb);
523 if (inum < o->inum)
524 p = &(*p)->rb_left;
525 else if (inum > o->inum)
526 p = &(*p)->rb_right;
527 else {
528 /* Already added - no problem */
529 kfree(orphan);
530 return 0;
531 }
532 }
533 c->tot_orphans += 1;
534 rb_link_node(&orphan->rb, parent, p);
535 rb_insert_color(&orphan->rb, &c->orph_tree);
536 list_add_tail(&orphan->list, &c->orph_list);
537 orphan->dnext = c->orph_dnext;
538 c->orph_dnext = orphan;
539 dbg_mnt("ino %lu, new %d, tot %d",
540 inum, c->new_orphans, c->tot_orphans);
541 return 0;
542}
543
544/**
545 * do_kill_orphans - remove orphan inodes from the index.
546 * @c: UBIFS file-system description object
547 * @sleb: scanned LEB
548 * @last_cmt_no: cmt_no of last orph node read is passed and returned here
549 * @outofdate: whether the LEB is out of date is returned here
550 * @last_flagged: whether the end orph node is encountered
551 *
552 * This function is a helper to the 'kill_orphans()' function. It goes through
553 * every orphan node in a LEB and for every inode number recorded, removes
554 * all keys for that inode from the TNC.
555 */
556static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
557 unsigned long long *last_cmt_no, int *outofdate,
558 int *last_flagged)
559{
560 struct ubifs_scan_node *snod;
561 struct ubifs_orph_node *orph;
562 unsigned long long cmt_no;
563 ino_t inum;
564 int i, n, err, first = 1;
565
566 list_for_each_entry(snod, &sleb->nodes, list) {
567 if (snod->type != UBIFS_ORPH_NODE) {
568 ubifs_err("invalid node type %d in orphan area at "
569 "%d:%d", snod->type, sleb->lnum, snod->offs);
570 dbg_dump_node(c, snod->node);
571 return -EINVAL;
572 }
573
574 orph = snod->node;
575
576 /* Check commit number */
577 cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX;
578 /*
579 * The commit number on the master node may be less, because
580 * of a failed commit. If there are several failed commits in a
581 * row, the commit number written on orph nodes will continue to
582 * increase (because the commit number is adjusted here) even
583 * though the commit number on the master node stays the same
584 * because the master node has not been re-written.
585 */
586 if (cmt_no > c->cmt_no)
587 c->cmt_no = cmt_no;
588 if (cmt_no < *last_cmt_no && *last_flagged) {
589 /*
590 * The last orph node had a higher commit number and was
591 * flagged as the last written for that commit number.
592 * That makes this orph node, out of date.
593 */
594 if (!first) {
595 ubifs_err("out of order commit number %llu in "
596 "orphan node at %d:%d",
597 cmt_no, sleb->lnum, snod->offs);
598 dbg_dump_node(c, snod->node);
599 return -EINVAL;
600 }
601 dbg_rcvry("out of date LEB %d", sleb->lnum);
602 *outofdate = 1;
603 return 0;
604 }
605
606 if (first)
607 first = 0;
608
609 n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
610 for (i = 0; i < n; i++) {
611 inum = le64_to_cpu(orph->inos[i]);
612 dbg_rcvry("deleting orphaned inode %lu", inum);
613 err = ubifs_tnc_remove_ino(c, inum);
614 if (err)
615 return err;
616 err = insert_dead_orphan(c, inum);
617 if (err)
618 return err;
619 }
620
621 *last_cmt_no = cmt_no;
622 if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) {
623 dbg_rcvry("last orph node for commit %llu at %d:%d",
624 cmt_no, sleb->lnum, snod->offs);
625 *last_flagged = 1;
626 } else
627 *last_flagged = 0;
628 }
629
630 return 0;
631}
632
633/**
634 * kill_orphans - remove all orphan inodes from the index.
635 * @c: UBIFS file-system description object
636 *
637 * If recovery is required, then orphan inodes recorded during the previous
638 * session (which ended with an unclean unmount) must be deleted from the index.
639 * This is done by updating the TNC, but since the index is not updated until
640 * the next commit, the LEBs where the orphan information is recorded are not
641 * erased until the next commit.
642 */
643static int kill_orphans(struct ubifs_info *c)
644{
645 unsigned long long last_cmt_no = 0;
646 int lnum, err = 0, outofdate = 0, last_flagged = 0;
647
648 c->ohead_lnum = c->orph_first;
649 c->ohead_offs = 0;
650 /* Check no-orphans flag and skip this if no orphans */
651 if (c->no_orphs) {
652 dbg_rcvry("no orphans");
653 return 0;
654 }
655 /*
656 * Orph nodes always start at c->orph_first and are written to each
657 * successive LEB in turn. Generally unused LEBs will have been unmapped
658 * but may contain out of date orph nodes if the unmap didn't go
659 * through. In addition, the last orph node written for each commit is
660 * marked (top bit of orph->cmt_no is set to 1). It is possible that
661 * there are orph nodes from the next commit (i.e. the commit did not
662 * complete successfully). In that case, no orphans will have been lost
663 * due to the way that orphans are written, and any orphans added will
664 * be valid orphans anyway and so can be deleted.
665 */
666 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
667 struct ubifs_scan_leb *sleb;
668
669 dbg_rcvry("LEB %d", lnum);
670 sleb = ubifs_scan(c, lnum, 0, c->sbuf);
671 if (IS_ERR(sleb)) {
672 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
673 if (IS_ERR(sleb)) {
674 err = PTR_ERR(sleb);
675 break;
676 }
677 }
678 err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate,
679 &last_flagged);
680 if (err || outofdate) {
681 ubifs_scan_destroy(sleb);
682 break;
683 }
684 if (sleb->endpt) {
685 c->ohead_lnum = lnum;
686 c->ohead_offs = sleb->endpt;
687 }
688 ubifs_scan_destroy(sleb);
689 }
690 return err;
691}
692
693/**
694 * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them.
695 * @c: UBIFS file-system description object
696 * @unclean: indicates recovery from unclean unmount
697 * @read_only: indicates read only mount
698 *
699 * This function is called when mounting to erase orphans from the previous
700 * session. If UBIFS was not unmounted cleanly, then the inodes recorded as
701 * orphans are deleted.
702 */
703int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
704{
705 int err = 0;
706
707 c->max_orphans = tot_avail_orphs(c);
708
709 if (!read_only) {
710 c->orph_buf = vmalloc(c->leb_size);
711 if (!c->orph_buf)
712 return -ENOMEM;
713 }
714
715 if (unclean)
716 err = kill_orphans(c);
717 else if (!read_only)
718 err = clear_orphans(c);
719
720 return err;
721}
722
723#ifdef CONFIG_UBIFS_FS_DEBUG
724
/* Entry of the temporary rb-tree of inode numbers used by the orphan check */
725struct check_orphan {
726 struct rb_node rb; /* link in the check tree */
727 ino_t inum; /* inode number, the key the check tree is ordered by */
728};
729
/* State shared by the orphan check while it scans the orphan area and index */
730struct check_info {
731 unsigned long last_ino; /* inode number of the last key seen by 'dbg_orphan_check()' */
732 unsigned long tot_inos; /* incremented each time the inode number changes */
733 unsigned long missing; /* zero-nlink inodes found in neither orphan tree */
734 unsigned long long leaf_cnt; /* index entries processed by 'dbg_orphan_check()' */
735 struct ubifs_ino_node *node; /* buffer the inode nodes are read into */
736 struct rb_root root; /* tree of 'struct check_orphan' read from the orphan area */
737};
738
739static int dbg_find_orphan(struct ubifs_info *c, ino_t inum)
740{
741 struct ubifs_orphan *o;
742 struct rb_node *p;
743
744 spin_lock(&c->orphan_lock);
745 p = c->orph_tree.rb_node;
746 while (p) {
747 o = rb_entry(p, struct ubifs_orphan, rb);
748 if (inum < o->inum)
749 p = p->rb_left;
750 else if (inum > o->inum)
751 p = p->rb_right;
752 else {
753 spin_unlock(&c->orphan_lock);
754 return 1;
755 }
756 }
757 spin_unlock(&c->orphan_lock);
758 return 0;
759}
760
761static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
762{
763 struct check_orphan *orphan, *o;
764 struct rb_node **p, *parent = NULL;
765
766 orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS);
767 if (!orphan)
768 return -ENOMEM;
769 orphan->inum = inum;
770
771 p = &root->rb_node;
772 while (*p) {
773 parent = *p;
774 o = rb_entry(parent, struct check_orphan, rb);
775 if (inum < o->inum)
776 p = &(*p)->rb_left;
777 else if (inum > o->inum)
778 p = &(*p)->rb_right;
779 else {
780 kfree(orphan);
781 return 0;
782 }
783 }
784 rb_link_node(&orphan->rb, parent, p);
785 rb_insert_color(&orphan->rb, root);
786 return 0;
787}
788
789static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
790{
791 struct check_orphan *o;
792 struct rb_node *p;
793
794 p = root->rb_node;
795 while (p) {
796 o = rb_entry(p, struct check_orphan, rb);
797 if (inum < o->inum)
798 p = p->rb_left;
799 else if (inum > o->inum)
800 p = p->rb_right;
801 else
802 return 1;
803 }
804 return 0;
805}
806
807static void dbg_free_check_tree(struct rb_root *root)
808{
809 struct rb_node *this = root->rb_node;
810 struct check_orphan *o;
811
812 while (this) {
813 if (this->rb_left) {
814 this = this->rb_left;
815 continue;
816 } else if (this->rb_right) {
817 this = this->rb_right;
818 continue;
819 }
820 o = rb_entry(this, struct check_orphan, rb);
821 this = rb_parent(this);
822 if (this) {
823 if (this->rb_left == &o->rb)
824 this->rb_left = NULL;
825 else
826 this->rb_right = NULL;
827 }
828 kfree(o);
829 }
830}
831
832static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
833 void *priv)
834{
835 struct check_info *ci = priv;
836 ino_t inum;
837 int err;
838
839 inum = key_inum(c, &zbr->key);
840 if (inum != ci->last_ino) {
841 /* Lowest node type is the inode node, so it comes first */
842 if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
843 ubifs_err("found orphan node ino %lu, type %d", inum,
844 key_type(c, &zbr->key));
845 ci->last_ino = inum;
846 ci->tot_inos += 1;
847 err = ubifs_tnc_read_node(c, zbr, ci->node);
848 if (err) {
849 ubifs_err("node read failed, error %d", err);
850 return err;
851 }
852 if (ci->node->nlink == 0)
853 /* Must be recorded as an orphan */
854 if (!dbg_find_check_orphan(&ci->root, inum) &&
855 !dbg_find_orphan(c, inum)) {
856 ubifs_err("missing orphan, ino %lu", inum);
857 ci->missing += 1;
858 }
859 }
860 ci->leaf_cnt += 1;
861 return 0;
862}
863
864static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
865{
866 struct ubifs_scan_node *snod;
867 struct ubifs_orph_node *orph;
868 ino_t inum;
869 int i, n, err;
870
871 list_for_each_entry(snod, &sleb->nodes, list) {
872 cond_resched();
873 if (snod->type != UBIFS_ORPH_NODE)
874 continue;
875 orph = snod->node;
876 n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
877 for (i = 0; i < n; i++) {
878 inum = le64_to_cpu(orph->inos[i]);
879 err = dbg_ins_check_orphan(&ci->root, inum);
880 if (err)
881 return err;
882 }
883 }
884 return 0;
885}
886
887static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
888{
889 int lnum, err = 0;
890
891 /* Check no-orphans flag and skip this if no orphans */
892 if (c->no_orphs)
893 return 0;
894
895 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
896 struct ubifs_scan_leb *sleb;
897
898 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
899 if (IS_ERR(sleb)) {
900 err = PTR_ERR(sleb);
901 break;
902 }
903
904 err = dbg_read_orphans(ci, sleb);
905 ubifs_scan_destroy(sleb);
906 if (err)
907 break;
908 }
909
910 return err;
911}
912
913static int dbg_check_orphans(struct ubifs_info *c)
914{
915 struct check_info ci;
916 int err;
917
918 if (!(ubifs_chk_flags & UBIFS_CHK_ORPH))
919 return 0;
920
921 ci.last_ino = 0;
922 ci.tot_inos = 0;
923 ci.missing = 0;
924 ci.leaf_cnt = 0;
925 ci.root = RB_ROOT;
926 ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
927 if (!ci.node) {
928 ubifs_err("out of memory");
929 return -ENOMEM;
930 }
931
932 err = dbg_scan_orphans(c, &ci);
933 if (err)
934 goto out;
935
936 err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci);
937 if (err) {
938 ubifs_err("cannot scan TNC, error %d", err);
939 goto out;
940 }
941
942 if (ci.missing) {
943 ubifs_err("%lu missing orphan(s)", ci.missing);
944 err = -EINVAL;
945 goto out;
946 }
947
948 dbg_cmt("last inode number is %lu", ci.last_ino);
949 dbg_cmt("total number of inodes is %lu", ci.tot_inos);
950 dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt);
951
952out:
953 dbg_free_check_tree(&ci.root);
954 kfree(ci.node);
955 return err;
956}
957
958#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
new file mode 100644
index 000000000000..77d26c141cf6
--- /dev/null
+++ b/fs/ubifs/recovery.c
@@ -0,0 +1,1519 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements functions needed to recover from unclean un-mounts.
25 * When UBIFS is mounted, it checks a flag on the master node to determine if
26 * an un-mount was completed successfully. If not, the process of mounting
27 * incorporates additional checking and fixing of on-flash data structures.
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case.
31 */
32
33#include <linux/crc32.h>
34#include "ubifs.h"
35
/**
 * is_empty - determine whether a buffer is empty (contains all 0xff).
 * @buf: buffer to check
 * @len: number of bytes of @buf to check
 *
 * This function returns %1 if every one of the first @len bytes of @buf is
 * 0xff (which is trivially the case when @len is not positive) and %0 as soon
 * as any other byte value is seen.
 */
static int is_empty(void *buf, int len)
{
	uint8_t *byte = buf;

	for (; len > 0; len--, byte++)
		if (*byte != 0xff)
			return 0;
	return 1;
}
54
55/**
56 * get_master_node - get the last valid master node allowing for corruption.
57 * @c: UBIFS file-system description object
58 * @lnum: LEB number
59 * @pbuf: buffer containing the LEB read, is returned here
60 * @mst: master node, if found, is returned here
61 * @cor: corruption, if found, is returned here
62 *
63 * This function allocates a buffer, reads the LEB into it, and finds and
64 * returns the last valid master node allowing for one area of corruption.
65 * The corrupt area, if there is one, must be consistent with the assumption
66 * that it is the result of an unclean unmount while the master node was being
67 * written. Under those circumstances, it is valid to use the previously written
68 * master node.
69 *
70 * This function returns %0 on success and a negative error code on failure.
71 */
72static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
73 struct ubifs_mst_node **mst, void **cor)
74{
75 const int sz = c->mst_node_alsz;
76 int err, offs, len;
77 void *sbuf, *buf;
78
79 sbuf = vmalloc(c->leb_size);
80 if (!sbuf)
81 return -ENOMEM;
82
83 err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size);
84 if (err && err != -EBADMSG)
85 goto out_free;
86
87 /* Find the first position that is definitely not a node */
88 offs = 0;
89 buf = sbuf;
90 len = c->leb_size;
91 while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
92 struct ubifs_ch *ch = buf;
93
94 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
95 break;
96 offs += sz;
97 buf += sz;
98 len -= sz;
99 }
100 /* See if there was a valid master node before that */
101 if (offs) {
102 int ret;
103
104 offs -= sz;
105 buf -= sz;
106 len += sz;
107 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
108 if (ret != SCANNED_A_NODE && offs) {
109 /* Could have been corruption so check one place back */
110 offs -= sz;
111 buf -= sz;
112 len += sz;
113 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
114 if (ret != SCANNED_A_NODE)
115 /*
116 * We accept only one area of corruption because
117 * we are assuming that it was caused while
118 * trying to write a master node.
119 */
120 goto out_err;
121 }
122 if (ret == SCANNED_A_NODE) {
123 struct ubifs_ch *ch = buf;
124
125 if (ch->node_type != UBIFS_MST_NODE)
126 goto out_err;
127 dbg_rcvry("found a master node at %d:%d", lnum, offs);
128 *mst = buf;
129 offs += sz;
130 buf += sz;
131 len -= sz;
132 }
133 }
134 /* Check for corruption */
135 if (offs < c->leb_size) {
136 if (!is_empty(buf, min_t(int, len, sz))) {
137 *cor = buf;
138 dbg_rcvry("found corruption at %d:%d", lnum, offs);
139 }
140 offs += sz;
141 buf += sz;
142 len -= sz;
143 }
144 /* Check remaining empty space */
145 if (offs < c->leb_size)
146 if (!is_empty(buf, len))
147 goto out_err;
148 *pbuf = sbuf;
149 return 0;
150
151out_err:
152 err = -EINVAL;
153out_free:
154 vfree(sbuf);
155 *mst = NULL;
156 *cor = NULL;
157 return err;
158}
159
160/**
161 * write_rcvrd_mst_node - write recovered master node.
162 * @c: UBIFS file-system description object
163 * @mst: master node
164 *
165 * This function returns %0 on success and a negative error code on failure.
166 */
167static int write_rcvrd_mst_node(struct ubifs_info *c,
168 struct ubifs_mst_node *mst)
169{
170 int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
171 uint32_t save_flags;
172
173 dbg_rcvry("recovery");
174
175 save_flags = mst->flags;
176 mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY);
177
178 ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
179 err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
180 if (err)
181 goto out;
182 err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM);
183 if (err)
184 goto out;
185out:
186 mst->flags = save_flags;
187 return err;
188}
189
190/**
191 * ubifs_recover_master_node - recover the master node.
192 * @c: UBIFS file-system description object
193 *
194 * This function recovers the master node from corruption that may occur due to
195 * an unclean unmount.
196 *
197 * This function returns %0 on success and a negative error code on failure.
198 */
199int ubifs_recover_master_node(struct ubifs_info *c)
200{
201 void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
202 struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
203 const int sz = c->mst_node_alsz;
204 int err, offs1, offs2;
205
206 dbg_rcvry("recovery");
207
208 err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
209 if (err)
210 goto out_free;
211
212 err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
213 if (err)
214 goto out_free;
215
216 if (mst1) {
217 offs1 = (void *)mst1 - buf1;
218 if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) &&
219 (offs1 == 0 && !cor1)) {
220 /*
221 * mst1 was written by recovery at offset 0 with no
222 * corruption.
223 */
224 dbg_rcvry("recovery recovery");
225 mst = mst1;
226 } else if (mst2) {
227 offs2 = (void *)mst2 - buf2;
228 if (offs1 == offs2) {
229 /* Same offset, so must be the same */
230 if (memcmp((void *)mst1 + UBIFS_CH_SZ,
231 (void *)mst2 + UBIFS_CH_SZ,
232 UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
233 goto out_err;
234 mst = mst1;
235 } else if (offs2 + sz == offs1) {
236 /* 1st LEB was written, 2nd was not */
237 if (cor1)
238 goto out_err;
239 mst = mst1;
240 } else if (offs1 == 0 && offs2 + sz >= c->leb_size) {
241 /* 1st LEB was unmapped and written, 2nd not */
242 if (cor1)
243 goto out_err;
244 mst = mst1;
245 } else
246 goto out_err;
247 } else {
248 /*
249 * 2nd LEB was unmapped and about to be written, so
250 * there must be only one master node in the first LEB
251 * and no corruption.
252 */
253 if (offs1 != 0 || cor1)
254 goto out_err;
255 mst = mst1;
256 }
257 } else {
258 if (!mst2)
259 goto out_err;
260 /*
261 * 1st LEB was unmapped and about to be written, so there must
262 * be no room left in 2nd LEB.
263 */
264 offs2 = (void *)mst2 - buf2;
265 if (offs2 + sz + sz <= c->leb_size)
266 goto out_err;
267 mst = mst2;
268 }
269
270 dbg_rcvry("recovered master node from LEB %d",
271 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
272
273 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
274
275 if ((c->vfs_sb->s_flags & MS_RDONLY)) {
276 /* Read-only mode. Keep a copy for switching to rw mode */
277 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
278 if (!c->rcvrd_mst_node) {
279 err = -ENOMEM;
280 goto out_free;
281 }
282 memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
283 } else {
284 /* Write the recovered master node */
285 c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
286 err = write_rcvrd_mst_node(c, c->mst_node);
287 if (err)
288 goto out_free;
289 }
290
291 vfree(buf2);
292 vfree(buf1);
293
294 return 0;
295
296out_err:
297 err = -EINVAL;
298out_free:
299 ubifs_err("failed to recover master node");
300 if (mst1) {
301 dbg_err("dumping first master node");
302 dbg_dump_node(c, mst1);
303 }
304 if (mst2) {
305 dbg_err("dumping second master node");
306 dbg_dump_node(c, mst2);
307 }
308 vfree(buf2);
309 vfree(buf1);
310 return err;
311}
312
313/**
314 * ubifs_write_rcvrd_mst_node - write the recovered master node.
315 * @c: UBIFS file-system description object
316 *
317 * This function writes the master node that was recovered during mounting in
318 * read-only mode and must now be written because we are remounting rw.
319 *
320 * This function returns %0 on success and a negative error code on failure.
321 */
322int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
323{
324 int err;
325
326 if (!c->rcvrd_mst_node)
327 return 0;
328 c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
329 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
330 err = write_rcvrd_mst_node(c, c->rcvrd_mst_node);
331 if (err)
332 return err;
333 kfree(c->rcvrd_mst_node);
334 c->rcvrd_mst_node = NULL;
335 return 0;
336}
337
338/**
339 * is_last_write - determine if an offset was in the last write to a LEB.
340 * @c: UBIFS file-system description object
341 * @buf: buffer to check
342 * @offs: offset to check
343 *
344 * This function returns %1 if @offs was in the last write to the LEB whose data
345 * is in @buf, otherwise %0 is returned. The determination is made by checking
346 * for subsequent empty space starting from the next min_io_size boundary (or a
347 * bit less than the common header size if min_io_size is one).
348 */
349static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
350{
351 int empty_offs;
352 int check_len;
353 uint8_t *p;
354
355 if (c->min_io_size == 1) {
356 check_len = c->leb_size - offs;
357 p = buf + check_len;
358 for (; check_len > 0; check_len--)
359 if (*--p != 0xff)
360 break;
361 /*
362 * 'check_len' is the size of the corruption which cannot be
363 * more than the size of 1 node if it was caused by an unclean
364 * unmount.
365 */
366 if (check_len > UBIFS_MAX_NODE_SZ)
367 return 0;
368 return 1;
369 }
370
371 /*
372 * Round up to the next c->min_io_size boundary i.e. 'offs' is in the
373 * last wbuf written. After that should be empty space.
374 */
375 empty_offs = ALIGN(offs + 1, c->min_io_size);
376 check_len = c->leb_size - empty_offs;
377 p = buf + empty_offs - offs;
378
379 for (; check_len > 0; check_len--)
380 if (*p++ != 0xff)
381 return 0;
382 return 1;
383}
384
385/**
386 * clean_buf - clean the data from an LEB sitting in a buffer.
387 * @c: UBIFS file-system description object
388 * @buf: buffer to clean
389 * @lnum: LEB number to clean
390 * @offs: offset from which to clean
391 * @len: length of buffer
392 *
393 * This function pads up to the next min_io_size boundary (if there is one) and
394 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
395 * min_io_size boundary (if there is one).
396 */
397static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
398 int *offs, int *len)
399{
400 int empty_offs, pad_len;
401
402 lnum = lnum;
403 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
404
405 if (c->min_io_size == 1) {
406 memset(*buf, 0xff, c->leb_size - *offs);
407 return;
408 }
409
410 ubifs_assert(!(*offs & 7));
411 empty_offs = ALIGN(*offs, c->min_io_size);
412 pad_len = empty_offs - *offs;
413 ubifs_pad(c, *buf, pad_len);
414 *offs += pad_len;
415 *buf += pad_len;
416 *len -= pad_len;
417 memset(*buf, 0xff, c->leb_size - empty_offs);
418}
419
420/**
421 * no_more_nodes - determine if there are no more nodes in a buffer.
422 * @c: UBIFS file-system description object
423 * @buf: buffer to check
424 * @len: length of buffer
425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read
427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and
429 * %1 if no more nodes are found.
430 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs)
433{
434 int skip, next_offs = 0;
435
436 if (len > UBIFS_DATA_NODE_SZ) {
437 struct ubifs_ch *ch = buf;
438 int dlen = le32_to_cpu(ch->len);
439
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
441 dlen <= UBIFS_MAX_DATA_NODE_SZ)
442 /* The corrupt node looks like a data node */
443 next_offs = ALIGN(offs + dlen, 8);
444 }
445
446 if (c->min_io_size == 1)
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 }
480 return 1;
481}
482
483/**
484 * fix_unclean_leb - fix an unclean LEB.
485 * @c: UBIFS file-system description object
486 * @sleb: scanned LEB information
487 * @start: offset where scan started
488 */
489static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
490 int start)
491{
492 int lnum = sleb->lnum, endpt = start;
493
494 /* Get the end offset of the last node we are keeping */
495 if (!list_empty(&sleb->nodes)) {
496 struct ubifs_scan_node *snod;
497
498 snod = list_entry(sleb->nodes.prev,
499 struct ubifs_scan_node, list);
500 endpt = snod->offs + snod->len;
501 }
502
503 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
504 /* Add to recovery list */
505 struct ubifs_unclean_leb *ucleb;
506
507 dbg_rcvry("need to fix LEB %d start %d endpt %d",
508 lnum, start, sleb->endpt);
509 ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
510 if (!ucleb)
511 return -ENOMEM;
512 ucleb->lnum = lnum;
513 ucleb->endpt = endpt;
514 list_add_tail(&ucleb->list, &c->unclean_leb_list);
515 } else {
516 /* Write the fixed LEB back to flash */
517 int err;
518
519 dbg_rcvry("fixing LEB %d start %d endpt %d",
520 lnum, start, sleb->endpt);
521 if (endpt == 0) {
522 err = ubifs_leb_unmap(c, lnum);
523 if (err)
524 return err;
525 } else {
526 int len = ALIGN(endpt, c->min_io_size);
527
528 if (start) {
529 err = ubi_read(c->ubi, lnum, sleb->buf, 0,
530 start);
531 if (err)
532 return err;
533 }
534 /* Pad to min_io_size */
535 if (len > endpt) {
536 int pad_len = len - ALIGN(endpt, 8);
537
538 if (pad_len > 0) {
539 void *buf = sleb->buf + len - pad_len;
540
541 ubifs_pad(c, buf, pad_len);
542 }
543 }
544 err = ubi_leb_change(c->ubi, lnum, sleb->buf, len,
545 UBI_UNKNOWN);
546 if (err)
547 return err;
548 }
549 }
550 return 0;
551}
552
553/**
554 * drop_incomplete_group - drop nodes from an incomplete group.
555 * @sleb: scanned LEB information
556 * @offs: offset of dropped nodes is returned here
557 *
558 * This function returns %1 if nodes are dropped and %0 otherwise.
559 */
560static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
561{
562 int dropped = 0;
563
564 while (!list_empty(&sleb->nodes)) {
565 struct ubifs_scan_node *snod;
566 struct ubifs_ch *ch;
567
568 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
569 list);
570 ch = snod->node;
571 if (ch->group_type != UBIFS_IN_NODE_GROUP)
572 return dropped;
573 dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
574 *offs = snod->offs;
575 list_del(&snod->list);
576 kfree(snod);
577 sleb->nodes_cnt -= 1;
578 dropped = 1;
579 }
580 return dropped;
581}
582
583/**
584 * ubifs_recover_leb - scan and recover a LEB.
585 * @c: UBIFS file-system description object
586 * @lnum: LEB number
587 * @offs: offset
588 * @sbuf: LEB-sized buffer to use
589 * @grouped: nodes may be grouped for recovery
590 *
591 * This function does a scan of a LEB, but caters for errors that might have
592 * been caused by the unclean unmount from which we are attempting to recover.
593 *
594 * This function returns %0 on success and a negative error code on failure.
595 */
596struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
597 int offs, void *sbuf, int grouped)
598{
599 int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
600 int empty_chkd = 0, start = offs;
601 struct ubifs_scan_leb *sleb;
602 void *buf = sbuf + offs;
603
604 dbg_rcvry("%d:%d", lnum, offs);
605
606 sleb = ubifs_start_scan(c, lnum, offs, sbuf);
607 if (IS_ERR(sleb))
608 return sleb;
609
610 if (sleb->ecc)
611 need_clean = 1;
612
613 while (len >= 8) {
614 int ret;
615
616 dbg_scan("look at LEB %d:%d (%d bytes left)",
617 lnum, offs, len);
618
619 cond_resched();
620
621 /*
622 * Scan quietly until there is an error from which we cannot
623 * recover
624 */
625 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
626
627 if (ret == SCANNED_A_NODE) {
628 /* A valid node, and not a padding node */
629 struct ubifs_ch *ch = buf;
630 int node_len;
631
632 err = ubifs_add_snod(c, sleb, buf, offs);
633 if (err)
634 goto error;
635 node_len = ALIGN(le32_to_cpu(ch->len), 8);
636 offs += node_len;
637 buf += node_len;
638 len -= node_len;
639 continue;
640 }
641
642 if (ret > 0) {
643 /* Padding bytes or a valid padding node */
644 offs += ret;
645 buf += ret;
646 len -= ret;
647 continue;
648 }
649
650 if (ret == SCANNED_EMPTY_SPACE) {
651 if (!is_empty(buf, len)) {
652 if (!is_last_write(c, buf, offs))
653 break;
654 clean_buf(c, &buf, lnum, &offs, &len);
655 need_clean = 1;
656 }
657 empty_chkd = 1;
658 break;
659 }
660
661 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
662 if (is_last_write(c, buf, offs)) {
663 clean_buf(c, &buf, lnum, &offs, &len);
664 need_clean = 1;
665 empty_chkd = 1;
666 break;
667 }
668
669 if (ret == SCANNED_A_CORRUPT_NODE)
670 if (no_more_nodes(c, buf, len, lnum, offs)) {
671 clean_buf(c, &buf, lnum, &offs, &len);
672 need_clean = 1;
673 empty_chkd = 1;
674 break;
675 }
676
677 if (quiet) {
678 /* Redo the last scan but noisily */
679 quiet = 0;
680 continue;
681 }
682
683 switch (ret) {
684 case SCANNED_GARBAGE:
685 dbg_err("garbage");
686 goto corrupted;
687 case SCANNED_A_CORRUPT_NODE:
688 case SCANNED_A_BAD_PAD_NODE:
689 dbg_err("bad node");
690 goto corrupted;
691 default:
692 dbg_err("unknown");
693 goto corrupted;
694 }
695 }
696
697 if (!empty_chkd && !is_empty(buf, len)) {
698 if (is_last_write(c, buf, offs)) {
699 clean_buf(c, &buf, lnum, &offs, &len);
700 need_clean = 1;
701 } else {
702 ubifs_err("corrupt empty space at LEB %d:%d",
703 lnum, offs);
704 goto corrupted;
705 }
706 }
707
708 /* Drop nodes from incomplete group */
709 if (grouped && drop_incomplete_group(sleb, &offs)) {
710 buf = sbuf + offs;
711 len = c->leb_size - offs;
712 clean_buf(c, &buf, lnum, &offs, &len);
713 need_clean = 1;
714 }
715
716 if (offs % c->min_io_size) {
717 clean_buf(c, &buf, lnum, &offs, &len);
718 need_clean = 1;
719 }
720
721 ubifs_end_scan(c, sleb, lnum, offs);
722
723 if (need_clean) {
724 err = fix_unclean_leb(c, sleb, start);
725 if (err)
726 goto error;
727 }
728
729 return sleb;
730
731corrupted:
732 ubifs_scanned_corruption(c, lnum, offs, buf);
733 err = -EUCLEAN;
734error:
735 ubifs_err("LEB %d scanning failed", lnum);
736 ubifs_scan_destroy(sleb);
737 return ERR_PTR(err);
738}
739
740/**
741 * get_cs_sqnum - get commit start sequence number.
742 * @c: UBIFS file-system description object
743 * @lnum: LEB number of commit start node
744 * @offs: offset of commit start node
745 * @cs_sqnum: commit start sequence number is returned here
746 *
747 * This function returns %0 on success and a negative error code on failure.
748 */
749static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
750 unsigned long long *cs_sqnum)
751{
752 struct ubifs_cs_node *cs_node = NULL;
753 int err, ret;
754
755 dbg_rcvry("at %d:%d", lnum, offs);
756 cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL);
757 if (!cs_node)
758 return -ENOMEM;
759 if (c->leb_size - offs < UBIFS_CS_NODE_SZ)
760 goto out_err;
761 err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ);
762 if (err && err != -EBADMSG)
763 goto out_free;
764 ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
765 if (ret != SCANNED_A_NODE) {
766 dbg_err("Not a valid node");
767 goto out_err;
768 }
769 if (cs_node->ch.node_type != UBIFS_CS_NODE) {
770 dbg_err("Node a CS node, type is %d", cs_node->ch.node_type);
771 goto out_err;
772 }
773 if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
774 dbg_err("CS node cmt_no %llu != current cmt_no %llu",
775 (unsigned long long)le64_to_cpu(cs_node->cmt_no),
776 c->cmt_no);
777 goto out_err;
778 }
779 *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
780 dbg_rcvry("commit start sqnum %llu", *cs_sqnum);
781 kfree(cs_node);
782 return 0;
783
784out_err:
785 err = -EINVAL;
786out_free:
787 ubifs_err("failed to get CS sqnum");
788 kfree(cs_node);
789 return err;
790}
791
792/**
793 * ubifs_recover_log_leb - scan and recover a log LEB.
794 * @c: UBIFS file-system description object
795 * @lnum: LEB number
796 * @offs: offset
797 * @sbuf: LEB-sized buffer to use
798 *
799 * This function does a scan of a LEB, but caters for errors that might have
800 * been caused by the unclean unmount from which we are attempting to recover.
801 *
802 * This function returns %0 on success and a negative error code on failure.
803 */
804struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
805 int offs, void *sbuf)
806{
807 struct ubifs_scan_leb *sleb;
808 int next_lnum;
809
810 dbg_rcvry("LEB %d", lnum);
811 next_lnum = lnum + 1;
812 if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs)
813 next_lnum = UBIFS_LOG_LNUM;
814 if (next_lnum != c->ltail_lnum) {
815 /*
816 * We can only recover at the end of the log, so check that the
817 * next log LEB is empty or out of date.
818 */
819 sleb = ubifs_scan(c, next_lnum, 0, sbuf);
820 if (IS_ERR(sleb))
821 return sleb;
822 if (sleb->nodes_cnt) {
823 struct ubifs_scan_node *snod;
824 unsigned long long cs_sqnum = c->cs_sqnum;
825
826 snod = list_entry(sleb->nodes.next,
827 struct ubifs_scan_node, list);
828 if (cs_sqnum == 0) {
829 int err;
830
831 err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
832 if (err) {
833 ubifs_scan_destroy(sleb);
834 return ERR_PTR(err);
835 }
836 }
837 if (snod->sqnum > cs_sqnum) {
838 ubifs_err("unrecoverable log corruption "
839 "in LEB %d", lnum);
840 ubifs_scan_destroy(sleb);
841 return ERR_PTR(-EUCLEAN);
842 }
843 }
844 ubifs_scan_destroy(sleb);
845 }
846 return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
847}
848
849/**
850 * recover_head - recover a head.
851 * @c: UBIFS file-system description object
852 * @lnum: LEB number of head to recover
853 * @offs: offset of head to recover
854 * @sbuf: LEB-sized buffer to use
855 *
856 * This function ensures that there is no data on the flash at a head location.
857 *
858 * This function returns %0 on success and a negative error code on failure.
859 */
860static int recover_head(const struct ubifs_info *c, int lnum, int offs,
861 void *sbuf)
862{
863 int len, err, need_clean = 0;
864
865 if (c->min_io_size > 1)
866 len = c->min_io_size;
867 else
868 len = 512;
869 if (offs + len > c->leb_size)
870 len = c->leb_size - offs;
871
872 if (!len)
873 return 0;
874
875 /* Read at the head location and check it is empty flash */
876 err = ubi_read(c->ubi, lnum, sbuf, offs, len);
877 if (err)
878 need_clean = 1;
879 else {
880 uint8_t *p = sbuf;
881
882 while (len--)
883 if (*p++ != 0xff) {
884 need_clean = 1;
885 break;
886 }
887 }
888
889 if (need_clean) {
890 dbg_rcvry("cleaning head at %d:%d", lnum, offs);
891 if (offs == 0)
892 return ubifs_leb_unmap(c, lnum);
893 err = ubi_read(c->ubi, lnum, sbuf, 0, offs);
894 if (err)
895 return err;
896 return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN);
897 }
898
899 return 0;
900}
901
902/**
903 * ubifs_recover_inl_heads - recover index and LPT heads.
904 * @c: UBIFS file-system description object
905 * @sbuf: LEB-sized buffer to use
906 *
907 * This function ensures that there is no data on the flash at the index and
908 * LPT head locations.
909 *
910 * This deals with the recovery of a half-completed journal commit. UBIFS is
911 * careful never to overwrite the last version of the index or the LPT. Because
912 * the index and LPT are wandering trees, data from a half-completed commit will
913 * not be referenced anywhere in UBIFS. The data will be either in LEBs that are
914 * assumed to be empty and will be unmapped anyway before use, or in the index
915 * and LPT heads.
916 *
917 * This function returns %0 on success and a negative error code on failure.
918 */
919int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
920{
921 int err;
922
923 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
924
925 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
926 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
927 if (err)
928 return err;
929
930 dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs);
931 err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
932 if (err)
933 return err;
934
935 return 0;
936}
937
938/**
939 * clean_an_unclean_leb - read and write a LEB to remove corruption.
940 * @c: UBIFS file-system description object
941 * @ucleb: unclean LEB information
942 * @sbuf: LEB-sized buffer to use
943 *
944 * This function reads a LEB up to a point pre-determined by the mount recovery,
945 * checks the nodes, and writes the result back to the flash, thereby cleaning
946 * off any following corruption, or non-fatal ECC errors.
947 *
948 * This function returns %0 on success and a negative error code on failure.
949 */
950static int clean_an_unclean_leb(const struct ubifs_info *c,
951 struct ubifs_unclean_leb *ucleb, void *sbuf)
952{
953 int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
954 void *buf = sbuf;
955
956 dbg_rcvry("LEB %d len %d", lnum, len);
957
958 if (len == 0) {
959 /* Nothing to read, just unmap it */
960 err = ubifs_leb_unmap(c, lnum);
961 if (err)
962 return err;
963 return 0;
964 }
965
966 err = ubi_read(c->ubi, lnum, buf, offs, len);
967 if (err && err != -EBADMSG)
968 return err;
969
970 while (len >= 8) {
971 int ret;
972
973 cond_resched();
974
975 /* Scan quietly until there is an error */
976 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
977
978 if (ret == SCANNED_A_NODE) {
979 /* A valid node, and not a padding node */
980 struct ubifs_ch *ch = buf;
981 int node_len;
982
983 node_len = ALIGN(le32_to_cpu(ch->len), 8);
984 offs += node_len;
985 buf += node_len;
986 len -= node_len;
987 continue;
988 }
989
990 if (ret > 0) {
991 /* Padding bytes or a valid padding node */
992 offs += ret;
993 buf += ret;
994 len -= ret;
995 continue;
996 }
997
998 if (ret == SCANNED_EMPTY_SPACE) {
999 ubifs_err("unexpected empty space at %d:%d",
1000 lnum, offs);
1001 return -EUCLEAN;
1002 }
1003
1004 if (quiet) {
1005 /* Redo the last scan but noisily */
1006 quiet = 0;
1007 continue;
1008 }
1009
1010 ubifs_scanned_corruption(c, lnum, offs, buf);
1011 return -EUCLEAN;
1012 }
1013
1014 /* Pad to min_io_size */
1015 len = ALIGN(ucleb->endpt, c->min_io_size);
1016 if (len > ucleb->endpt) {
1017 int pad_len = len - ALIGN(ucleb->endpt, 8);
1018
1019 if (pad_len > 0) {
1020 buf = c->sbuf + len - pad_len;
1021 ubifs_pad(c, buf, pad_len);
1022 }
1023 }
1024
1025 /* Write back the LEB atomically */
1026 err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN);
1027 if (err)
1028 return err;
1029
1030 dbg_rcvry("cleaned LEB %d", lnum);
1031
1032 return 0;
1033}
1034
1035/**
1036 * ubifs_clean_lebs - clean LEBs recovered during read-only mount.
1037 * @c: UBIFS file-system description object
1038 * @sbuf: LEB-sized buffer to use
1039 *
1040 * This function cleans a LEB identified during recovery that needs to be
1041 * written but was not because UBIFS was mounted read-only. This happens when
1042 * remounting to read-write mode.
1043 *
1044 * This function returns %0 on success and a negative error code on failure.
1045 */
1046int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
1047{
1048 dbg_rcvry("recovery");
1049 while (!list_empty(&c->unclean_leb_list)) {
1050 struct ubifs_unclean_leb *ucleb;
1051 int err;
1052
1053 ucleb = list_entry(c->unclean_leb_list.next,
1054 struct ubifs_unclean_leb, list);
1055 err = clean_an_unclean_leb(c, ucleb, sbuf);
1056 if (err)
1057 return err;
1058 list_del(&ucleb->list);
1059 kfree(ucleb);
1060 }
1061 return 0;
1062}
1063
1064/**
1065 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
1066 * @c: UBIFS file-system description object
1067 *
1068 * Out-of-place garbage collection requires always one empty LEB with which to
1069 * start garbage collection. The LEB number is recorded in c->gc_lnum and is
1070 * written to the master node on unmounting. In the case of an unclean unmount
1071 * the value of gc_lnum recorded in the master node is out of date and cannot
1072 * be used. Instead, recovery must allocate an empty LEB for this purpose.
1073 * However, there may not be enough empty space, in which case it must be
1074 * possible to GC the dirtiest LEB into the GC head LEB.
1075 *
1076 * This function also runs the commit which causes the TNC updates from
1077 * size-recovery and orphans to be written to the flash. That is important to
1078 * ensure correct replay order for subsequent mounts.
1079 *
1080 * This function returns %0 on success and a negative error code on failure.
1081 */
1082int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1083{
1084 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
1085 struct ubifs_lprops lp;
1086 int lnum, err;
1087
1088 c->gc_lnum = -1;
1089 if (wbuf->lnum == -1) {
1090 dbg_rcvry("no GC head LEB");
1091 goto find_free;
1092 }
1093 /*
1094 * See whether the used space in the dirtiest LEB fits in the GC head
1095 * LEB.
1096 */
1097 if (wbuf->offs == c->leb_size) {
1098 dbg_rcvry("no room in GC head LEB");
1099 goto find_free;
1100 }
1101 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
1102 if (err) {
1103 if (err == -ENOSPC)
1104 dbg_err("could not find a dirty LEB");
1105 return err;
1106 }
1107 ubifs_assert(!(lp.flags & LPROPS_INDEX));
1108 lnum = lp.lnum;
1109 if (lp.free + lp.dirty == c->leb_size) {
1110 /* An empty LEB was returned */
1111 if (lp.free != c->leb_size) {
1112 err = ubifs_change_one_lp(c, lnum, c->leb_size,
1113 0, 0, 0, 0);
1114 if (err)
1115 return err;
1116 }
1117 err = ubifs_leb_unmap(c, lnum);
1118 if (err)
1119 return err;
1120 c->gc_lnum = lnum;
1121 dbg_rcvry("allocated LEB %d for GC", lnum);
1122 /* Run the commit */
1123 dbg_rcvry("committing");
1124 return ubifs_run_commit(c);
1125 }
1126 /*
1127 * There was no empty LEB so the used space in the dirtiest LEB must fit
1128 * in the GC head LEB.
1129 */
1130 if (lp.free + lp.dirty < wbuf->offs) {
1131 dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
1132 lnum, wbuf->lnum, wbuf->offs);
1133 err = ubifs_return_leb(c, lnum);
1134 if (err)
1135 return err;
1136 goto find_free;
1137 }
1138 /*
1139 * We run the commit before garbage collection otherwise subsequent
1140 * mounts will see the GC and orphan deletion in a different order.
1141 */
1142 dbg_rcvry("committing");
1143 err = ubifs_run_commit(c);
1144 if (err)
1145 return err;
1146 /*
1147 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
1148 * - use locking to keep 'ubifs_assert()' happy.
1149 */
1150 dbg_rcvry("GC'ing LEB %d", lnum);
1151 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
1152 err = ubifs_garbage_collect_leb(c, &lp);
1153 if (err >= 0) {
1154 int err2 = ubifs_wbuf_sync_nolock(wbuf);
1155
1156 if (err2)
1157 err = err2;
1158 }
1159 mutex_unlock(&wbuf->io_mutex);
1160 if (err < 0) {
1161 dbg_err("GC failed, error %d", err);
1162 if (err == -EAGAIN)
1163 err = -EINVAL;
1164 return err;
1165 }
1166 if (err != LEB_RETAINED) {
1167 dbg_err("GC returned %d", err);
1168 return -EINVAL;
1169 }
1170 err = ubifs_leb_unmap(c, c->gc_lnum);
1171 if (err)
1172 return err;
1173 dbg_rcvry("allocated LEB %d for GC", lnum);
1174 return 0;
1175
1176find_free:
1177 /*
1178 * There is no GC head LEB or the free space in the GC head LEB is too
1179 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
1180 * GC is not run.
1181 */
1182 lnum = ubifs_find_free_leb_for_idx(c);
1183 if (lnum < 0) {
1184 dbg_err("could not find an empty LEB");
1185 return lnum;
1186 }
1187 /* And reset the index flag */
1188 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
1189 LPROPS_INDEX, 0);
1190 if (err)
1191 return err;
1192 c->gc_lnum = lnum;
1193 dbg_rcvry("allocated LEB %d for GC", lnum);
1194 /* Run the commit */
1195 dbg_rcvry("committing");
1196 return ubifs_run_commit(c);
1197}
1198
1199/**
1200 * struct size_entry - inode size information for recovery.
1201 * @rb: link in the RB-tree of sizes
1202 * @inum: inode number
1203 * @i_size: size on inode
1204 * @d_size: maximum size based on data nodes
1205 * @exists: indicates whether the inode exists
1206 * @inode: inode if pinned in memory awaiting rw mode to fix it
1207 */
1208struct size_entry {
1209 struct rb_node rb;
1210 ino_t inum;
1211 loff_t i_size;
1212 loff_t d_size;
1213 int exists;
1214 struct inode *inode;
1215};
1216
1217/**
1218 * add_ino - add an entry to the size tree.
1219 * @c: UBIFS file-system description object
1220 * @inum: inode number
1221 * @i_size: size on inode
1222 * @d_size: maximum size based on data nodes
1223 * @exists: indicates whether the inode exists
1224 */
1225static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size,
1226 loff_t d_size, int exists)
1227{
1228 struct rb_node **p = &c->size_tree.rb_node, *parent = NULL;
1229 struct size_entry *e;
1230
1231 while (*p) {
1232 parent = *p;
1233 e = rb_entry(parent, struct size_entry, rb);
1234 if (inum < e->inum)
1235 p = &(*p)->rb_left;
1236 else
1237 p = &(*p)->rb_right;
1238 }
1239
1240 e = kzalloc(sizeof(struct size_entry), GFP_KERNEL);
1241 if (!e)
1242 return -ENOMEM;
1243
1244 e->inum = inum;
1245 e->i_size = i_size;
1246 e->d_size = d_size;
1247 e->exists = exists;
1248
1249 rb_link_node(&e->rb, parent, p);
1250 rb_insert_color(&e->rb, &c->size_tree);
1251
1252 return 0;
1253}
1254
1255/**
1256 * find_ino - find an entry on the size tree.
1257 * @c: UBIFS file-system description object
1258 * @inum: inode number
1259 */
1260static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum)
1261{
1262 struct rb_node *p = c->size_tree.rb_node;
1263 struct size_entry *e;
1264
1265 while (p) {
1266 e = rb_entry(p, struct size_entry, rb);
1267 if (inum < e->inum)
1268 p = p->rb_left;
1269 else if (inum > e->inum)
1270 p = p->rb_right;
1271 else
1272 return e;
1273 }
1274 return NULL;
1275}
1276
1277/**
1278 * remove_ino - remove an entry from the size tree.
1279 * @c: UBIFS file-system description object
1280 * @inum: inode number
1281 */
1282static void remove_ino(struct ubifs_info *c, ino_t inum)
1283{
1284 struct size_entry *e = find_ino(c, inum);
1285
1286 if (!e)
1287 return;
1288 rb_erase(&e->rb, &c->size_tree);
1289 kfree(e);
1290}
1291
1292/**
1293 * ubifs_destroy_size_tree - free resources related to the size tree.
1294 * @c: UBIFS file-system description object
1295 */
1296void ubifs_destroy_size_tree(struct ubifs_info *c)
1297{
1298 struct rb_node *this = c->size_tree.rb_node;
1299 struct size_entry *e;
1300
1301 while (this) {
1302 if (this->rb_left) {
1303 this = this->rb_left;
1304 continue;
1305 } else if (this->rb_right) {
1306 this = this->rb_right;
1307 continue;
1308 }
1309 e = rb_entry(this, struct size_entry, rb);
1310 if (e->inode)
1311 iput(e->inode);
1312 this = rb_parent(this);
1313 if (this) {
1314 if (this->rb_left == &e->rb)
1315 this->rb_left = NULL;
1316 else
1317 this->rb_right = NULL;
1318 }
1319 kfree(e);
1320 }
1321 c->size_tree = RB_ROOT;
1322}
1323
1324/**
1325 * ubifs_recover_size_accum - accumulate inode sizes for recovery.
1326 * @c: UBIFS file-system description object
1327 * @key: node key
1328 * @deletion: node is for a deletion
1329 * @new_size: inode size
1330 *
1331 * This function has two purposes:
1332 * 1) to ensure there are no data nodes that fall outside the inode size
1333 * 2) to ensure there are no data nodes for inodes that do not exist
1334 * To accomplish those purposes, a rb-tree is constructed containing an entry
1335 * for each inode number in the journal that has not been deleted, and recording
1336 * the size from the inode node, the maximum size of any data node (also altered
1337 * by truncations) and a flag indicating a inode number for which no inode node
1338 * was present in the journal.
1339 *
1340 * Note that there is still the possibility that there are data nodes that have
1341 * been committed that are beyond the inode size, however the only way to find
1342 * them would be to scan the entire index. Alternatively, some provision could
1343 * be made to record the size of inodes at the start of commit, which would seem
1344 * very cumbersome for a scenario that is quite unlikely and the only negative
1345 * consequence of which is wasted space.
1346 *
1347 * This functions returns %0 on success and a negative error code on failure.
1348 */
1349int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
1350 int deletion, loff_t new_size)
1351{
1352 ino_t inum = key_inum(c, key);
1353 struct size_entry *e;
1354 int err;
1355
1356 switch (key_type(c, key)) {
1357 case UBIFS_INO_KEY:
1358 if (deletion)
1359 remove_ino(c, inum);
1360 else {
1361 e = find_ino(c, inum);
1362 if (e) {
1363 e->i_size = new_size;
1364 e->exists = 1;
1365 } else {
1366 err = add_ino(c, inum, new_size, 0, 1);
1367 if (err)
1368 return err;
1369 }
1370 }
1371 break;
1372 case UBIFS_DATA_KEY:
1373 e = find_ino(c, inum);
1374 if (e) {
1375 if (new_size > e->d_size)
1376 e->d_size = new_size;
1377 } else {
1378 err = add_ino(c, inum, 0, new_size, 0);
1379 if (err)
1380 return err;
1381 }
1382 break;
1383 case UBIFS_TRUN_KEY:
1384 e = find_ino(c, inum);
1385 if (e)
1386 e->d_size = new_size;
1387 break;
1388 }
1389 return 0;
1390}
1391
1392/**
1393 * fix_size_in_place - fix inode size in place on flash.
1394 * @c: UBIFS file-system description object
1395 * @e: inode size information for recovery
1396 */
1397static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
1398{
1399 struct ubifs_ino_node *ino = c->sbuf;
1400 unsigned char *p;
1401 union ubifs_key key;
1402 int err, lnum, offs, len;
1403 loff_t i_size;
1404 uint32_t crc;
1405
1406 /* Locate the inode node LEB number and offset */
1407 ino_key_init(c, &key, e->inum);
1408 err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
1409 if (err)
1410 goto out;
1411 /*
1412 * If the size recorded on the inode node is greater than the size that
1413 * was calculated from nodes in the journal then don't change the inode.
1414 */
1415 i_size = le64_to_cpu(ino->size);
1416 if (i_size >= e->d_size)
1417 return 0;
1418 /* Read the LEB */
1419 err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size);
1420 if (err)
1421 goto out;
1422 /* Change the size field and recalculate the CRC */
1423 ino = c->sbuf + offs;
1424 ino->size = cpu_to_le64(e->d_size);
1425 len = le32_to_cpu(ino->ch.len);
1426 crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8);
1427 ino->ch.crc = cpu_to_le32(crc);
1428 /* Work out where data in the LEB ends and free space begins */
1429 p = c->sbuf;
1430 len = c->leb_size - 1;
1431 while (p[len] == 0xff)
1432 len -= 1;
1433 len = ALIGN(len + 1, c->min_io_size);
1434 /* Atomically write the fixed LEB back again */
1435 err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
1436 if (err)
1437 goto out;
1438 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
1439 i_size, e->d_size);
1440 return 0;
1441
1442out:
1443 ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
1444 e->inum, e->i_size, e->d_size, err);
1445 return err;
1446}
1447
1448/**
1449 * ubifs_recover_size - recover inode size.
1450 * @c: UBIFS file-system description object
1451 *
1452 * This function attempts to fix inode size discrepancies identified by the
1453 * 'ubifs_recover_size_accum()' function.
1454 *
1455 * This functions returns %0 on success and a negative error code on failure.
1456 */
1457int ubifs_recover_size(struct ubifs_info *c)
1458{
1459 struct rb_node *this = rb_first(&c->size_tree);
1460
1461 while (this) {
1462 struct size_entry *e;
1463 int err;
1464
1465 e = rb_entry(this, struct size_entry, rb);
1466 if (!e->exists) {
1467 union ubifs_key key;
1468
1469 ino_key_init(c, &key, e->inum);
1470 err = ubifs_tnc_lookup(c, &key, c->sbuf);
1471 if (err && err != -ENOENT)
1472 return err;
1473 if (err == -ENOENT) {
1474 /* Remove data nodes that have no inode */
1475 dbg_rcvry("removing ino %lu", e->inum);
1476 err = ubifs_tnc_remove_ino(c, e->inum);
1477 if (err)
1478 return err;
1479 } else {
1480 struct ubifs_ino_node *ino = c->sbuf;
1481
1482 e->exists = 1;
1483 e->i_size = le64_to_cpu(ino->size);
1484 }
1485 }
1486 if (e->exists && e->i_size < e->d_size) {
1487 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
1488 /* Fix the inode size and pin it in memory */
1489 struct inode *inode;
1490
1491 inode = ubifs_iget(c->vfs_sb, e->inum);
1492 if (IS_ERR(inode))
1493 return PTR_ERR(inode);
1494 if (inode->i_size < e->d_size) {
1495 dbg_rcvry("ino %lu size %lld -> %lld",
1496 e->inum, e->d_size,
1497 inode->i_size);
1498 inode->i_size = e->d_size;
1499 ubifs_inode(inode)->ui_size = e->d_size;
1500 e->inode = inode;
1501 this = rb_next(this);
1502 continue;
1503 }
1504 iput(inode);
1505 } else {
1506 /* Fix the size in place */
1507 err = fix_size_in_place(c, e);
1508 if (err)
1509 return err;
1510 if (e->inode)
1511 iput(e->inode);
1512 }
1513 }
1514 this = rb_next(this);
1515 rb_erase(&e->rb, &c->size_tree);
1516 kfree(e);
1517 }
1518 return 0;
1519}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
new file mode 100644
index 000000000000..7399692af859
--- /dev/null
+++ b/fs/ubifs/replay.c
@@ -0,0 +1,1075 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file contains journal replay code. It runs when the file-system is being
25 * mounted and requires no locking.
26 *
27 * The larger is the journal, the longer it takes to scan it, so the longer it
28 * takes to mount UBIFS. This is why the journal has limited size which may be
29 * changed depending on the system requirements. But a larger journal gives
30 * faster I/O speed because it writes the index less frequently. So this is a
31 * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
32 * larger is the journal, the more memory its index may consume.
33 */
34
35#include "ubifs.h"
36
/*
 * Flags stored in the 'flags' field of a replay entry.
 *
 * REPLAY_DELETION: the entry describes a deletion
 * REPLAY_REF: the entry describes a reference node
 */
enum {
	REPLAY_DELETION = 1 << 0,
	REPLAY_REF      = 1 << 1,
};
47
48/**
49 * struct replay_entry - replay tree entry.
50 * @lnum: logical eraseblock number of the node
51 * @offs: node offset
52 * @len: node length
53 * @sqnum: node sequence number
54 * @flags: replay flags
55 * @rb: links the replay tree
56 * @key: node key
57 * @nm: directory entry name
58 * @old_size: truncation old size
59 * @new_size: truncation new size
60 * @free: amount of free space in a bud
61 * @dirty: amount of dirty space in a bud from padding and deletion nodes
62 *
63 * UBIFS journal replay must compare node sequence numbers, which means it must
64 * build a tree of node information to insert into the TNC.
65 */
66struct replay_entry {
67 int lnum;
68 int offs;
69 int len;
70 unsigned long long sqnum;
71 int flags;
72 struct rb_node rb;
73 union ubifs_key key;
74 union {
75 struct qstr nm;
76 struct {
77 loff_t old_size;
78 loff_t new_size;
79 };
80 struct {
81 int free;
82 int dirty;
83 };
84 };
85};
86
87/**
88 * struct bud_entry - entry in the list of buds to replay.
89 * @list: next bud in the list
90 * @bud: bud description object
91 * @free: free bytes in the bud
92 * @sqnum: reference node sequence number
93 */
94struct bud_entry {
95 struct list_head list;
96 struct ubifs_bud *bud;
97 int free;
98 unsigned long long sqnum;
99};
100
101/**
102 * set_bud_lprops - set free and dirty space used by a bud.
103 * @c: UBIFS file-system description object
104 * @r: replay entry of bud
105 */
106static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
107{
108 const struct ubifs_lprops *lp;
109 int err = 0, dirty;
110
111 ubifs_get_lprops(c);
112
113 lp = ubifs_lpt_lookup_dirty(c, r->lnum);
114 if (IS_ERR(lp)) {
115 err = PTR_ERR(lp);
116 goto out;
117 }
118
119 dirty = lp->dirty;
120 if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
121 /*
122 * The LEB was added to the journal with a starting offset of
123 * zero which means the LEB must have been empty. The LEB
124 * property values should be lp->free == c->leb_size and
125 * lp->dirty == 0, but that is not the case. The reason is that
126 * the LEB was garbage collected. The garbage collector resets
127 * the free and dirty space without recording it anywhere except
128 * lprops, so if there is not a commit then lprops does not have
129 * that information next time the file system is mounted.
130 *
131 * We do not need to adjust free space because the scan has told
132 * us the exact value which is recorded in the replay entry as
133 * r->free.
134 *
135 * However we do need to subtract from the dirty space the
136 * amount of space that the garbage collector reclaimed, which
137 * is the whole LEB minus the amount of space that was free.
138 */
139 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
140 lp->free, lp->dirty);
141 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
142 lp->free, lp->dirty);
143 dirty -= c->leb_size - lp->free;
144 /*
145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads
147 * race with eachother. This is not a problem but is does mean
148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay.
150 */
151 if (dirty != 0)
152 dbg_msg("LEB %d lp: %d free %d dirty "
153 "replay: %d free %d dirty", r->lnum, lp->free,
154 lp->dirty, r->free, r->dirty);
155 }
156 lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
157 lp->flags | LPROPS_TAKEN, 0);
158 if (IS_ERR(lp)) {
159 err = PTR_ERR(lp);
160 goto out;
161 }
162out:
163 ubifs_release_lprops(c);
164 return err;
165}
166
167/**
168 * trun_remove_range - apply a replay entry for a truncation to the TNC.
169 * @c: UBIFS file-system description object
170 * @r: replay entry of truncation
171 */
172static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
173{
174 unsigned min_blk, max_blk;
175 union ubifs_key min_key, max_key;
176 ino_t ino;
177
178 min_blk = r->new_size / UBIFS_BLOCK_SIZE;
179 if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
180 min_blk += 1;
181
182 max_blk = r->old_size / UBIFS_BLOCK_SIZE;
183 if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
184 max_blk -= 1;
185
186 ino = key_inum(c, &r->key);
187
188 data_key_init(c, &min_key, ino, min_blk);
189 data_key_init(c, &max_key, ino, max_blk);
190
191 return ubifs_tnc_remove_range(c, &min_key, &max_key);
192}
193
194/**
195 * apply_replay_entry - apply a replay entry to the TNC.
196 * @c: UBIFS file-system description object
197 * @r: replay entry to apply
198 *
199 * Apply a replay entry to the TNC.
200 */
201static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
202{
203 int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
204
205 dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
206 r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
207
208 /* Set c->replay_sqnum to help deal with dangling branches. */
209 c->replay_sqnum = r->sqnum;
210
211 if (r->flags & REPLAY_REF)
212 err = set_bud_lprops(c, r);
213 else if (is_hash_key(c, &r->key)) {
214 if (deletion)
215 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
216 else
217 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
218 r->len, &r->nm);
219 } else {
220 if (deletion)
221 switch (key_type(c, &r->key)) {
222 case UBIFS_INO_KEY:
223 {
224 ino_t inum = key_inum(c, &r->key);
225
226 err = ubifs_tnc_remove_ino(c, inum);
227 break;
228 }
229 case UBIFS_TRUN_KEY:
230 err = trun_remove_range(c, r);
231 break;
232 default:
233 err = ubifs_tnc_remove(c, &r->key);
234 break;
235 }
236 else
237 err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
238 r->len);
239 if (err)
240 return err;
241
242 if (c->need_recovery)
243 err = ubifs_recover_size_accum(c, &r->key, deletion,
244 r->new_size);
245 }
246
247 return err;
248}
249
250/**
251 * destroy_replay_tree - destroy the replay.
252 * @c: UBIFS file-system description object
253 *
254 * Destroy the replay tree.
255 */
256static void destroy_replay_tree(struct ubifs_info *c)
257{
258 struct rb_node *this = c->replay_tree.rb_node;
259 struct replay_entry *r;
260
261 while (this) {
262 if (this->rb_left) {
263 this = this->rb_left;
264 continue;
265 } else if (this->rb_right) {
266 this = this->rb_right;
267 continue;
268 }
269 r = rb_entry(this, struct replay_entry, rb);
270 this = rb_parent(this);
271 if (this) {
272 if (this->rb_left == &r->rb)
273 this->rb_left = NULL;
274 else
275 this->rb_right = NULL;
276 }
277 if (is_hash_key(c, &r->key))
278 kfree(r->nm.name);
279 kfree(r);
280 }
281 c->replay_tree = RB_ROOT;
282}
283
284/**
285 * apply_replay_tree - apply the replay tree to the TNC.
286 * @c: UBIFS file-system description object
287 *
288 * Apply the replay tree.
289 * Returns zero in case of success and a negative error code in case of
290 * failure.
291 */
292static int apply_replay_tree(struct ubifs_info *c)
293{
294 struct rb_node *this = rb_first(&c->replay_tree);
295
296 while (this) {
297 struct replay_entry *r;
298 int err;
299
300 cond_resched();
301
302 r = rb_entry(this, struct replay_entry, rb);
303 err = apply_replay_entry(c, r);
304 if (err)
305 return err;
306 this = rb_next(this);
307 }
308 return 0;
309}
310
311/**
312 * insert_node - insert a node to the replay tree.
313 * @c: UBIFS file-system description object
314 * @lnum: node logical eraseblock number
315 * @offs: node offset
316 * @len: node length
317 * @key: node key
318 * @sqnum: sequence number
319 * @deletion: non-zero if this is a deletion
320 * @used: number of bytes in use in a LEB
321 * @old_size: truncation old size
322 * @new_size: truncation new size
323 *
324 * This function inserts a scanned non-direntry node to the replay tree. The
325 * replay tree is an RB-tree containing @struct replay_entry elements which are
326 * indexed by the sequence number. The replay tree is applied at the very end
327 * of the replay process. Since the tree is sorted in sequence number order,
328 * the older modifications are applied first. This function returns zero in
329 * case of success and a negative error code in case of failure.
330 */
331static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
332 union ubifs_key *key, unsigned long long sqnum,
333 int deletion, int *used, loff_t old_size,
334 loff_t new_size)
335{
336 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
337 struct replay_entry *r;
338
339 if (key_inum(c, key) >= c->highest_inum)
340 c->highest_inum = key_inum(c, key);
341
342 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
343 while (*p) {
344 parent = *p;
345 r = rb_entry(parent, struct replay_entry, rb);
346 if (sqnum < r->sqnum) {
347 p = &(*p)->rb_left;
348 continue;
349 } else if (sqnum > r->sqnum) {
350 p = &(*p)->rb_right;
351 continue;
352 }
353 ubifs_err("duplicate sqnum in replay");
354 return -EINVAL;
355 }
356
357 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
358 if (!r)
359 return -ENOMEM;
360
361 if (!deletion)
362 *used += ALIGN(len, 8);
363 r->lnum = lnum;
364 r->offs = offs;
365 r->len = len;
366 r->sqnum = sqnum;
367 r->flags = (deletion ? REPLAY_DELETION : 0);
368 r->old_size = old_size;
369 r->new_size = new_size;
370 key_copy(c, key, &r->key);
371
372 rb_link_node(&r->rb, parent, p);
373 rb_insert_color(&r->rb, &c->replay_tree);
374 return 0;
375}
376
377/**
378 * insert_dent - insert a directory entry node into the replay tree.
379 * @c: UBIFS file-system description object
380 * @lnum: node logical eraseblock number
381 * @offs: node offset
382 * @len: node length
383 * @key: node key
384 * @name: directory entry name
385 * @nlen: directory entry name length
386 * @sqnum: sequence number
387 * @deletion: non-zero if this is a deletion
388 * @used: number of bytes in use in a LEB
389 *
390 * This function inserts a scanned directory entry node to the replay tree.
391 * Returns zero in case of success and a negative error code in case of
392 * failure.
393 *
394 * This function is also used for extended attribute entries because they are
395 * implemented as directory entry nodes.
396 */
397static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
398 union ubifs_key *key, const char *name, int nlen,
399 unsigned long long sqnum, int deletion, int *used)
400{
401 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
402 struct replay_entry *r;
403 char *nbuf;
404
405 if (key_inum(c, key) >= c->highest_inum)
406 c->highest_inum = key_inum(c, key);
407
408 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
409 while (*p) {
410 parent = *p;
411 r = rb_entry(parent, struct replay_entry, rb);
412 if (sqnum < r->sqnum) {
413 p = &(*p)->rb_left;
414 continue;
415 }
416 if (sqnum > r->sqnum) {
417 p = &(*p)->rb_right;
418 continue;
419 }
420 ubifs_err("duplicate sqnum in replay");
421 return -EINVAL;
422 }
423
424 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
425 if (!r)
426 return -ENOMEM;
427 nbuf = kmalloc(nlen + 1, GFP_KERNEL);
428 if (!nbuf) {
429 kfree(r);
430 return -ENOMEM;
431 }
432
433 if (!deletion)
434 *used += ALIGN(len, 8);
435 r->lnum = lnum;
436 r->offs = offs;
437 r->len = len;
438 r->sqnum = sqnum;
439 r->nm.len = nlen;
440 memcpy(nbuf, name, nlen);
441 nbuf[nlen] = '\0';
442 r->nm.name = nbuf;
443 r->flags = (deletion ? REPLAY_DELETION : 0);
444 key_copy(c, key, &r->key);
445
446 ubifs_assert(!*p);
447 rb_link_node(&r->rb, parent, p);
448 rb_insert_color(&r->rb, &c->replay_tree);
449 return 0;
450}
451
452/**
453 * ubifs_validate_entry - validate directory or extended attribute entry node.
454 * @c: UBIFS file-system description object
455 * @dent: the node to validate
456 *
457 * This function validates directory or extended attribute entry node @dent.
458 * Returns zero if the node is all right and a %-EINVAL if not.
459 */
460int ubifs_validate_entry(struct ubifs_info *c,
461 const struct ubifs_dent_node *dent)
462{
463 int key_type = key_type_flash(c, dent->key);
464 int nlen = le16_to_cpu(dent->nlen);
465
466 if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
467 dent->type >= UBIFS_ITYPES_CNT ||
468 nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
469 strnlen(dent->name, nlen) != nlen ||
470 le64_to_cpu(dent->inum) > MAX_INUM) {
471 ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ?
472 "directory entry" : "extended attribute entry");
473 return -EINVAL;
474 }
475
476 if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
477 ubifs_err("bad key type %d", key_type);
478 return -EINVAL;
479 }
480
481 return 0;
482}
483
484/**
485 * replay_bud - replay a bud logical eraseblock.
486 * @c: UBIFS file-system description object
487 * @lnum: bud logical eraseblock number to replay
488 * @offs: bud start offset
489 * @jhead: journal head to which this bud belongs
490 * @free: amount of free space in the bud is returned here
491 * @dirty: amount of dirty space from padding and deletion nodes is returned
492 * here
493 *
494 * This function returns zero in case of success and a negative error code in
495 * case of failure.
496 */
497static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
498 int *free, int *dirty)
499{
500 int err = 0, used = 0;
501 struct ubifs_scan_leb *sleb;
502 struct ubifs_scan_node *snod;
503 struct ubifs_bud *bud;
504
505 dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
506 if (c->need_recovery)
507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
508 else
509 sleb = ubifs_scan(c, lnum, offs, c->sbuf);
510 if (IS_ERR(sleb))
511 return PTR_ERR(sleb);
512
513 /*
514 * The bud does not have to start from offset zero - the beginning of
515 * the 'lnum' LEB may contain previously committed data. One of the
516 * things we have to do in replay is to correctly update lprops with
517 * newer information about this LEB.
518 *
519 * At this point lprops thinks that this LEB has 'c->leb_size - offs'
520 * bytes of free space because it only contain information about
521 * committed data.
522 *
523 * But we know that real amount of free space is 'c->leb_size -
524 * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
525 * 'sleb->endpt' is used by bud data. We have to correctly calculate
526 * how much of these data are dirty and update lprops with this
527 * information.
528 *
529 * The dirt in that LEB region is comprised of padding nodes, deletion
530 * nodes, truncation nodes and nodes which are obsoleted by subsequent
531 * nodes in this LEB. So instead of calculating clean space, we
532 * calculate used space ('used' variable).
533 */
534
535 list_for_each_entry(snod, &sleb->nodes, list) {
536 int deletion = 0;
537
538 cond_resched();
539
540 if (snod->sqnum >= SQNUM_WATERMARK) {
541 ubifs_err("file system's life ended");
542 goto out_dump;
543 }
544
545 if (snod->sqnum > c->max_sqnum)
546 c->max_sqnum = snod->sqnum;
547
548 switch (snod->type) {
549 case UBIFS_INO_NODE:
550 {
551 struct ubifs_ino_node *ino = snod->node;
552 loff_t new_size = le64_to_cpu(ino->size);
553
554 if (le32_to_cpu(ino->nlink) == 0)
555 deletion = 1;
556 err = insert_node(c, lnum, snod->offs, snod->len,
557 &snod->key, snod->sqnum, deletion,
558 &used, 0, new_size);
559 break;
560 }
561 case UBIFS_DATA_NODE:
562 {
563 struct ubifs_data_node *dn = snod->node;
564 loff_t new_size = le32_to_cpu(dn->size) +
565 key_block(c, &snod->key) *
566 UBIFS_BLOCK_SIZE;
567
568 err = insert_node(c, lnum, snod->offs, snod->len,
569 &snod->key, snod->sqnum, deletion,
570 &used, 0, new_size);
571 break;
572 }
573 case UBIFS_DENT_NODE:
574 case UBIFS_XENT_NODE:
575 {
576 struct ubifs_dent_node *dent = snod->node;
577
578 err = ubifs_validate_entry(c, dent);
579 if (err)
580 goto out_dump;
581
582 err = insert_dent(c, lnum, snod->offs, snod->len,
583 &snod->key, dent->name,
584 le16_to_cpu(dent->nlen), snod->sqnum,
585 !le64_to_cpu(dent->inum), &used);
586 break;
587 }
588 case UBIFS_TRUN_NODE:
589 {
590 struct ubifs_trun_node *trun = snod->node;
591 loff_t old_size = le64_to_cpu(trun->old_size);
592 loff_t new_size = le64_to_cpu(trun->new_size);
593 union ubifs_key key;
594
595 /* Validate truncation node */
596 if (old_size < 0 || old_size > c->max_inode_sz ||
597 new_size < 0 || new_size > c->max_inode_sz ||
598 old_size <= new_size) {
599 ubifs_err("bad truncation node");
600 goto out_dump;
601 }
602
603 /*
604 * Create a fake truncation key just to use the same
605 * functions which expect nodes to have keys.
606 */
607 trun_key_init(c, &key, le32_to_cpu(trun->inum));
608 err = insert_node(c, lnum, snod->offs, snod->len,
609 &key, snod->sqnum, 1, &used,
610 old_size, new_size);
611 break;
612 }
613 default:
614 ubifs_err("unexpected node type %d in bud LEB %d:%d",
615 snod->type, lnum, snod->offs);
616 err = -EINVAL;
617 goto out_dump;
618 }
619 if (err)
620 goto out;
621 }
622
623 bud = ubifs_search_bud(c, lnum);
624 if (!bud)
625 BUG();
626
627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629
630 if (sleb->endpt + c->min_io_size <= c->leb_size &&
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM);
634
635 *dirty = sleb->endpt - offs - used;
636 *free = c->leb_size - sleb->endpt;
637
638out:
639 ubifs_scan_destroy(sleb);
640 return err;
641
642out_dump:
643 ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
644 dbg_dump_node(c, snod->node);
645 ubifs_scan_destroy(sleb);
646 return -EINVAL;
647}
648
649/**
650 * insert_ref_node - insert a reference node to the replay tree.
651 * @c: UBIFS file-system description object
652 * @lnum: node logical eraseblock number
653 * @offs: node offset
654 * @sqnum: sequence number
655 * @free: amount of free space in bud
656 * @dirty: amount of dirty space from padding and deletion nodes
657 *
658 * This function inserts a reference node to the replay tree and returns zero
659 * in case of success ort a negative error code in case of failure.
660 */
661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
662 unsigned long long sqnum, int free, int dirty)
663{
664 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
665 struct replay_entry *r;
666
667 dbg_mnt("add ref LEB %d:%d", lnum, offs);
668 while (*p) {
669 parent = *p;
670 r = rb_entry(parent, struct replay_entry, rb);
671 if (sqnum < r->sqnum) {
672 p = &(*p)->rb_left;
673 continue;
674 } else if (sqnum > r->sqnum) {
675 p = &(*p)->rb_right;
676 continue;
677 }
678 ubifs_err("duplicate sqnum in replay tree");
679 return -EINVAL;
680 }
681
682 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
683 if (!r)
684 return -ENOMEM;
685
686 r->lnum = lnum;
687 r->offs = offs;
688 r->sqnum = sqnum;
689 r->flags = REPLAY_REF;
690 r->free = free;
691 r->dirty = dirty;
692
693 rb_link_node(&r->rb, parent, p);
694 rb_insert_color(&r->rb, &c->replay_tree);
695 return 0;
696}
697
698/**
699 * replay_buds - replay all buds.
700 * @c: UBIFS file-system description object
701 *
702 * This function returns zero in case of success and a negative error code in
703 * case of failure.
704 */
705static int replay_buds(struct ubifs_info *c)
706{
707 struct bud_entry *b;
708 int err, uninitialized_var(free), uninitialized_var(dirty);
709
710 list_for_each_entry(b, &c->replay_buds, list) {
711 err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
712 &free, &dirty);
713 if (err)
714 return err;
715 err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
716 free, dirty);
717 if (err)
718 return err;
719 }
720
721 return 0;
722}
723
724/**
725 * destroy_bud_list - destroy the list of buds to replay.
726 * @c: UBIFS file-system description object
727 */
728static void destroy_bud_list(struct ubifs_info *c)
729{
730 struct bud_entry *b;
731
732 while (!list_empty(&c->replay_buds)) {
733 b = list_entry(c->replay_buds.next, struct bud_entry, list);
734 list_del(&b->list);
735 kfree(b);
736 }
737}
738
739/**
740 * add_replay_bud - add a bud to the list of buds to replay.
741 * @c: UBIFS file-system description object
742 * @lnum: bud logical eraseblock number to replay
743 * @offs: bud start offset
744 * @jhead: journal head to which this bud belongs
745 * @sqnum: reference node sequence number
746 *
747 * This function returns zero in case of success and a negative error code in
748 * case of failure.
749 */
750static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
751 unsigned long long sqnum)
752{
753 struct ubifs_bud *bud;
754 struct bud_entry *b;
755
756 dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
757
758 bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL);
759 if (!bud)
760 return -ENOMEM;
761
762 b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
763 if (!b) {
764 kfree(bud);
765 return -ENOMEM;
766 }
767
768 bud->lnum = lnum;
769 bud->start = offs;
770 bud->jhead = jhead;
771 ubifs_add_bud(c, bud);
772
773 b->bud = bud;
774 b->sqnum = sqnum;
775 list_add_tail(&b->list, &c->replay_buds);
776
777 return 0;
778}
779
780/**
781 * validate_ref - validate a reference node.
782 * @c: UBIFS file-system description object
783 * @ref: the reference node to validate
784 * @ref_lnum: LEB number of the reference node
785 * @ref_offs: reference node offset
786 *
787 * This function returns %1 if a bud reference already exists for the LEB. %0 is
788 * returned if the reference node is new, otherwise %-EINVAL is returned if
789 * validation failed.
790 */
791static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
792{
793 struct ubifs_bud *bud;
794 int lnum = le32_to_cpu(ref->lnum);
795 unsigned int offs = le32_to_cpu(ref->offs);
796 unsigned int jhead = le32_to_cpu(ref->jhead);
797
798 /*
799 * ref->offs may point to the end of LEB when the journal head points
800 * to the end of LEB and we write reference node for it during commit.
801 * So this is why we require 'offs > c->leb_size'.
802 */
803 if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
804 lnum < c->main_first || offs > c->leb_size ||
805 offs & (c->min_io_size - 1))
806 return -EINVAL;
807
808 /* Make sure we have not already looked at this bud */
809 bud = ubifs_search_bud(c, lnum);
810 if (bud) {
811 if (bud->jhead == jhead && bud->start <= offs)
812 return 1;
813 ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
814 return -EINVAL;
815 }
816
817 return 0;
818}
819
820/**
821 * replay_log_leb - replay a log logical eraseblock.
822 * @c: UBIFS file-system description object
823 * @lnum: log logical eraseblock to replay
824 * @offs: offset to start replaying from
825 * @sbuf: scan buffer
826 *
827 * This function replays a log LEB and returns zero in case of success, %1 if
828 * this is the last LEB in the log, and a negative error code in case of
829 * failure.
830 */
831static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
832{
833 int err;
834 struct ubifs_scan_leb *sleb;
835 struct ubifs_scan_node *snod;
836 const struct ubifs_cs_node *node;
837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf);
840 if (IS_ERR(sleb)) {
841 if (c->need_recovery)
842 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
843 if (IS_ERR(sleb))
844 return PTR_ERR(sleb);
845 }
846
847 if (sleb->nodes_cnt == 0) {
848 err = 1;
849 goto out;
850 }
851
852 node = sleb->buf;
853
854 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
855 if (c->cs_sqnum == 0) {
856 /*
857 * This is the first log LEB we are looking at, make sure that
858 * the first node is a commit start node. Also record its
859 * sequence number so that UBIFS can determine where the log
860 * ends, because all nodes which were have higher sequence
861 * numbers.
862 */
863 if (snod->type != UBIFS_CS_NODE) {
864 dbg_err("first log node at LEB %d:%d is not CS node",
865 lnum, offs);
866 goto out_dump;
867 }
868 if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
869 dbg_err("first CS node at LEB %d:%d has wrong "
870 "commit number %llu expected %llu",
871 lnum, offs,
872 (unsigned long long)le64_to_cpu(node->cmt_no),
873 c->cmt_no);
874 goto out_dump;
875 }
876
877 c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
878 dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
879 }
880
881 if (snod->sqnum < c->cs_sqnum) {
882 /*
883 * This means that we reached end of log and now
884 * look to the older log data, which was already
885 * committed but the eraseblock was not erased (UBIFS
886 * only unmaps it). So this basically means we have to
887 * exit with "end of log" code.
888 */
889 err = 1;
890 goto out;
891 }
892
893 /* Make sure the first node sits at offset zero of the LEB */
894 if (snod->offs != 0) {
895 dbg_err("first node is not at zero offset");
896 goto out_dump;
897 }
898
899 list_for_each_entry(snod, &sleb->nodes, list) {
900
901 cond_resched();
902
903 if (snod->sqnum >= SQNUM_WATERMARK) {
904 ubifs_err("file system's life ended");
905 goto out_dump;
906 }
907
908 if (snod->sqnum < c->cs_sqnum) {
909 dbg_err("bad sqnum %llu, commit sqnum %llu",
910 snod->sqnum, c->cs_sqnum);
911 goto out_dump;
912 }
913
914 if (snod->sqnum > c->max_sqnum)
915 c->max_sqnum = snod->sqnum;
916
917 switch (snod->type) {
918 case UBIFS_REF_NODE: {
919 const struct ubifs_ref_node *ref = snod->node;
920
921 err = validate_ref(c, ref);
922 if (err == 1)
923 break; /* Already have this bud */
924 if (err)
925 goto out_dump;
926
927 err = add_replay_bud(c, le32_to_cpu(ref->lnum),
928 le32_to_cpu(ref->offs),
929 le32_to_cpu(ref->jhead),
930 snod->sqnum);
931 if (err)
932 goto out;
933
934 break;
935 }
936 case UBIFS_CS_NODE:
937 /* Make sure it sits at the beginning of LEB */
938 if (snod->offs != 0) {
939 ubifs_err("unexpected node in log");
940 goto out_dump;
941 }
942 break;
943 default:
944 ubifs_err("unexpected node in log");
945 goto out_dump;
946 }
947 }
948
949 if (sleb->endpt || c->lhead_offs >= c->leb_size) {
950 c->lhead_lnum = lnum;
951 c->lhead_offs = sleb->endpt;
952 }
953
954 err = !sleb->endpt;
955out:
956 ubifs_scan_destroy(sleb);
957 return err;
958
959out_dump:
960 ubifs_err("log error detected while replying the log at LEB %d:%d",
961 lnum, offs + snod->offs);
962 dbg_dump_node(c, snod->node);
963 ubifs_scan_destroy(sleb);
964 return -EINVAL;
965}
966
967/**
968 * take_ihead - update the status of the index head in lprops to 'taken'.
969 * @c: UBIFS file-system description object
970 *
971 * This function returns the amount of free space in the index head LEB or a
972 * negative error code.
973 */
974static int take_ihead(struct ubifs_info *c)
975{
976 const struct ubifs_lprops *lp;
977 int err, free;
978
979 ubifs_get_lprops(c);
980
981 lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum);
982 if (IS_ERR(lp)) {
983 err = PTR_ERR(lp);
984 goto out;
985 }
986
987 free = lp->free;
988
989 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
990 lp->flags | LPROPS_TAKEN, 0);
991 if (IS_ERR(lp)) {
992 err = PTR_ERR(lp);
993 goto out;
994 }
995
996 err = free;
997out:
998 ubifs_release_lprops(c);
999 return err;
1000}
1001
1002/**
1003 * ubifs_replay_journal - replay journal.
1004 * @c: UBIFS file-system description object
1005 *
1006 * This function scans the journal, replays and cleans it up. It makes sure all
1007 * memory data structures related to uncommitted journal are built (dirty TNC
1008 * tree, tree of buds, modified lprops, etc).
1009 */
1010int ubifs_replay_journal(struct ubifs_info *c)
1011{
1012 int err, i, lnum, offs, free;
1013 void *sbuf = NULL;
1014
1015 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1016
1017 /* Update the status of the index head in lprops to 'taken' */
1018 free = take_ihead(c);
1019 if (free < 0)
1020 return free; /* Error code */
1021
1022 if (c->ihead_offs != c->leb_size - free) {
1023 ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
1024 c->ihead_offs);
1025 return -EINVAL;
1026 }
1027
1028 sbuf = vmalloc(c->leb_size);
1029 if (!sbuf)
1030 return -ENOMEM;
1031
1032 dbg_mnt("start replaying the journal");
1033
1034 c->replaying = 1;
1035
1036 lnum = c->ltail_lnum = c->lhead_lnum;
1037 offs = c->lhead_offs;
1038
1039 for (i = 0; i < c->log_lebs; i++, lnum++) {
1040 if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
1041 /*
1042 * The log is logically circular, we reached the last
1043 * LEB, switch to the first one.
1044 */
1045 lnum = UBIFS_LOG_LNUM;
1046 offs = 0;
1047 }
1048 err = replay_log_leb(c, lnum, offs, sbuf);
1049 if (err == 1)
1050 /* We hit the end of the log */
1051 break;
1052 if (err)
1053 goto out;
1054 offs = 0;
1055 }
1056
1057 err = replay_buds(c);
1058 if (err)
1059 goto out;
1060
1061 err = apply_replay_tree(c);
1062 if (err)
1063 goto out;
1064
1065 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1066 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1067 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
1068 c->highest_inum);
1069out:
1070 destroy_replay_tree(c);
1071 destroy_bud_list(c);
1072 vfree(sbuf);
1073 c->replaying = 0;
1074 return err;
1075}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
new file mode 100644
index 000000000000..2bf753b38889
--- /dev/null
+++ b/fs/ubifs/sb.c
@@ -0,0 +1,629 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS superblock. The superblock is stored at the first
25 * LEB of the volume and is never changed by UBIFS. Only user-space tools may
26 * change it. The superblock node mostly contains geometry information.
27 */
28
29#include "ubifs.h"
30#include <linux/random.h>
31
/* Default journal size, as a percentage of the total number of LEBs */
#define DEFAULT_JNL_PERCENT	5

/* Upper limit for the default journal size, in bytes */
#define DEFAULT_MAX_JNL		(32 * 1024 * 1024)

/* Fanout of the indexing tree of a default file-system */
#define DEFAULT_FANOUT		8

/* Number of data journal heads of a default file-system */
#define DEFAULT_JHEADS_CNT	1

/* Positions of special LEBs, relative to the first LEB of the main area */
#define DEFAULT_IDX_LEB		0
#define DEFAULT_DATA_LEB	1
#define DEFAULT_GC_LEB		2

/* Number of LEB numbers in the LPT's save table of a default file-system */
#define DEFAULT_LSAVE_CNT	256

/* Default reserved pool size, as a percentage of the main area size */
#define DEFAULT_RP_PERCENT	5

/* Upper limit for the default reserved pool size, in bytes */
#define DEFAULT_MAX_RP_SIZE	(5 * 1024 * 1024)

/* Default time granularity, in nanoseconds */
#define DEFAULT_TIME_GRAN	1000000000
63
64/**
65 * create_default_filesystem - format empty UBI volume.
66 * @c: UBIFS file-system description object
67 *
68 * This function creates default empty file-system. Returns zero in case of
69 * success and a negative error code in case of failure.
70 */
71static int create_default_filesystem(struct ubifs_info *c)
72{
73 struct ubifs_sb_node *sup;
74 struct ubifs_mst_node *mst;
75 struct ubifs_idx_node *idx;
76 struct ubifs_branch *br;
77 struct ubifs_ino_node *ino;
78 struct ubifs_cs_node *cs;
79 union ubifs_key key;
80 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
81 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
82 int min_leb_cnt = UBIFS_MIN_LEB_CNT;
83 uint64_t tmp64, main_bytes;
84
85 /* Some functions called from here depend on the @c->key_len filed */
86 c->key_len = UBIFS_SK_LEN;
87
88 /*
89 * First of all, we have to calculate default file-system geometry -
90 * log size, journal size, etc.
91 */
92 if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)
93 /* We can first multiply then divide and have no overflow */
94 jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;
95 else
96 jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;
97
98 if (jnl_lebs < UBIFS_MIN_JNL_LEBS)
99 jnl_lebs = UBIFS_MIN_JNL_LEBS;
100 if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)
101 jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;
102
103 /*
104 * The log should be large enough to fit reference nodes for all bud
105 * LEBs. Because buds do not have to start from the beginning of LEBs
106 * (half of the LEB may contain committed data), the log should
107 * generally be larger, make it twice as large.
108 */
109 tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;
110 log_lebs = tmp / c->leb_size;
111 /* Plus one LEB reserved for commit */
112 log_lebs += 1;
113 if (c->leb_cnt - min_leb_cnt > 8) {
114 /* And some extra space to allow writes while committing */
115 log_lebs += 1;
116 min_leb_cnt += 1;
117 }
118
119 max_buds = jnl_lebs - log_lebs;
120 if (max_buds < UBIFS_MIN_BUD_LEBS)
121 max_buds = UBIFS_MIN_BUD_LEBS;
122
123 /*
124 * Orphan nodes are stored in a separate area. One node can store a lot
125 * of orphan inode numbers, but when new orphan comes we just add a new
126 * orphan node. At some point the nodes are consolidated into one
127 * orphan node.
128 */
129 orph_lebs = UBIFS_MIN_ORPH_LEBS;
130#ifdef CONFIG_UBIFS_FS_DEBUG
131 if (c->leb_cnt - min_leb_cnt > 1)
132 /*
133 * For debugging purposes it is better to have at least 2
134 * orphan LEBs, because the orphan subsystem would need to do
135 * consolidations and would be stressed more.
136 */
137 orph_lebs += 1;
138#endif
139
140 main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
141 main_lebs -= orph_lebs;
142
143 lpt_first = UBIFS_LOG_LNUM + log_lebs;
144 c->lsave_cnt = DEFAULT_LSAVE_CNT;
145 c->max_leb_cnt = c->leb_cnt;
146 err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
147 &big_lpt);
148 if (err)
149 return err;
150
151 dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,
152 lpt_first + lpt_lebs - 1);
153
154 main_first = c->leb_cnt - main_lebs;
155
156 /* Create default superblock */
157 tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
158 sup = kzalloc(tmp, GFP_KERNEL);
159 if (!sup)
160 return -ENOMEM;
161
162 tmp64 = (uint64_t)max_buds * c->leb_size;
163 if (big_lpt)
164 sup_flags |= UBIFS_FLG_BIGLPT;
165
166 sup->ch.node_type = UBIFS_SB_NODE;
167 sup->key_hash = UBIFS_KEY_HASH_R5;
168 sup->flags = cpu_to_le32(sup_flags);
169 sup->min_io_size = cpu_to_le32(c->min_io_size);
170 sup->leb_size = cpu_to_le32(c->leb_size);
171 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
172 sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt);
173 sup->max_bud_bytes = cpu_to_le64(tmp64);
174 sup->log_lebs = cpu_to_le32(log_lebs);
175 sup->lpt_lebs = cpu_to_le32(lpt_lebs);
176 sup->orph_lebs = cpu_to_le32(orph_lebs);
177 sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT);
178 sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
179 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
180 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
181 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
182 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
183
184 generate_random_uuid(sup->uuid);
185
186 main_bytes = (uint64_t)main_lebs * c->leb_size;
187 tmp64 = main_bytes * DEFAULT_RP_PERCENT;
188 do_div(tmp64, 100);
189 if (tmp64 > DEFAULT_MAX_RP_SIZE)
190 tmp64 = DEFAULT_MAX_RP_SIZE;
191 sup->rp_size = cpu_to_le64(tmp64);
192
193 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
194 kfree(sup);
195 if (err)
196 return err;
197
198 dbg_gen("default superblock created at LEB 0:0");
199
200 /* Create default master node */
201 mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
202 if (!mst)
203 return -ENOMEM;
204
205 mst->ch.node_type = UBIFS_MST_NODE;
206 mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM);
207 mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);
208 mst->cmt_no = 0;
209 mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
210 mst->root_offs = 0;
211 tmp = ubifs_idx_node_sz(c, 1);
212 mst->root_len = cpu_to_le32(tmp);
213 mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB);
214 mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
215 mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size));
216 mst->index_size = cpu_to_le64(ALIGN(tmp, 8));
217 mst->lpt_lnum = cpu_to_le32(c->lpt_lnum);
218 mst->lpt_offs = cpu_to_le32(c->lpt_offs);
219 mst->nhead_lnum = cpu_to_le32(c->nhead_lnum);
220 mst->nhead_offs = cpu_to_le32(c->nhead_offs);
221 mst->ltab_lnum = cpu_to_le32(c->ltab_lnum);
222 mst->ltab_offs = cpu_to_le32(c->ltab_offs);
223 mst->lsave_lnum = cpu_to_le32(c->lsave_lnum);
224 mst->lsave_offs = cpu_to_le32(c->lsave_offs);
225 mst->lscan_lnum = cpu_to_le32(main_first);
226 mst->empty_lebs = cpu_to_le32(main_lebs - 2);
227 mst->idx_lebs = cpu_to_le32(1);
228 mst->leb_cnt = cpu_to_le32(c->leb_cnt);
229
230 /* Calculate lprops statistics */
231 tmp64 = main_bytes;
232 tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
233 tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
234 mst->total_free = cpu_to_le64(tmp64);
235
236 tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
237 ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -
238 UBIFS_INO_NODE_SZ;
239 tmp64 += ino_waste;
240 tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);
241 mst->total_dirty = cpu_to_le64(tmp64);
242
243 /* The indexing LEB does not contribute to dark space */
244 tmp64 = (c->main_lebs - 1) * c->dark_wm;
245 mst->total_dark = cpu_to_le64(tmp64);
246
247 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
248
249 err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
250 UBI_UNKNOWN);
251 if (err) {
252 kfree(mst);
253 return err;
254 }
255 err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
256 UBI_UNKNOWN);
257 kfree(mst);
258 if (err)
259 return err;
260
261 dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
262
263 /* Create the root indexing node */
264 tmp = ubifs_idx_node_sz(c, 1);
265 idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
266 if (!idx)
267 return -ENOMEM;
268
269 c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
270 c->key_hash = key_r5_hash;
271
272 idx->ch.node_type = UBIFS_IDX_NODE;
273 idx->child_cnt = cpu_to_le16(1);
274 ino_key_init(c, &key, UBIFS_ROOT_INO);
275 br = ubifs_idx_branch(c, idx, 0);
276 key_write_idx(c, &key, &br->key);
277 br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
278 br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);
279 err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
280 UBI_UNKNOWN);
281 kfree(idx);
282 if (err)
283 return err;
284
285 dbg_gen("default root indexing node created LEB %d:0",
286 main_first + DEFAULT_IDX_LEB);
287
288 /* Create default root inode */
289 tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
290 ino = kzalloc(tmp, GFP_KERNEL);
291 if (!ino)
292 return -ENOMEM;
293
294 ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
295 ino->ch.node_type = UBIFS_INO_NODE;
296 ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
297 ino->nlink = cpu_to_le32(2);
298 tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
299 ino->atime_sec = tmp;
300 ino->ctime_sec = tmp;
301 ino->mtime_sec = tmp;
302 ino->atime_nsec = 0;
303 ino->ctime_nsec = 0;
304 ino->mtime_nsec = 0;
305 ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
306 ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
307
308 /* Set compression enabled by default */
309 ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
310
311 err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
312 main_first + DEFAULT_DATA_LEB, 0,
313 UBI_UNKNOWN);
314 kfree(ino);
315 if (err)
316 return err;
317
318 dbg_gen("root inode created at LEB %d:0",
319 main_first + DEFAULT_DATA_LEB);
320
321 /*
322 * The first node in the log has to be the commit start node. This is
323 * always the case during normal file-system operation. Write a fake
324 * commit start node to the log.
325 */
326 tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
327 cs = kzalloc(tmp, GFP_KERNEL);
328 if (!cs)
329 return -ENOMEM;
330
331 cs->ch.node_type = UBIFS_CS_NODE;
332 err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
333 0, UBI_UNKNOWN);
334 kfree(cs);
335
336 ubifs_msg("default file-system created");
337 return 0;
338}
339
340/**
341 * validate_sb - validate superblock node.
342 * @c: UBIFS file-system description object
343 * @sup: superblock node
344 *
345 * This function validates superblock node @sup. Since most of data was read
346 * from the superblock and stored in @c, the function validates fields in @c
347 * instead. Returns zero in case of success and %-EINVAL in case of validation
348 * failure.
349 */
350static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
351{
352 long long max_bytes;
353 int err = 1, min_leb_cnt;
354
355 if (!c->key_hash) {
356 err = 2;
357 goto failed;
358 }
359
360 if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
361 err = 3;
362 goto failed;
363 }
364
365 if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
366 ubifs_err("min. I/O unit mismatch: %d in superblock, %d real",
367 le32_to_cpu(sup->min_io_size), c->min_io_size);
368 goto failed;
369 }
370
371 if (le32_to_cpu(sup->leb_size) != c->leb_size) {
372 ubifs_err("LEB size mismatch: %d in superblock, %d real",
373 le32_to_cpu(sup->leb_size), c->leb_size);
374 goto failed;
375 }
376
377 if (c->log_lebs < UBIFS_MIN_LOG_LEBS ||
378 c->lpt_lebs < UBIFS_MIN_LPT_LEBS ||
379 c->orph_lebs < UBIFS_MIN_ORPH_LEBS ||
380 c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
381 err = 4;
382 goto failed;
383 }
384
385 /*
386 * Calculate minimum allowed amount of main area LEBs. This is very
387 * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we
388 * have just read from the superblock.
389 */
390 min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs;
391 min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
392
393 if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
394 ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, "
395 "%d minimum required", c->leb_cnt, c->vi.size,
396 min_leb_cnt);
397 goto failed;
398 }
399
400 if (c->max_leb_cnt < c->leb_cnt) {
401 ubifs_err("max. LEB count %d less than LEB count %d",
402 c->max_leb_cnt, c->leb_cnt);
403 goto failed;
404 }
405
406 if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
407 err = 7;
408 goto failed;
409 }
410
411 if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
412 c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
413 err = 8;
414 goto failed;
415 }
416
417 if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 ||
418 c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) {
419 err = 9;
420 goto failed;
421 }
422
423 if (c->fanout < UBIFS_MIN_FANOUT ||
424 ubifs_idx_node_sz(c, c->fanout) > c->leb_size) {
425 err = 10;
426 goto failed;
427 }
428
429 if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT &&
430 c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS -
431 c->log_lebs - c->lpt_lebs - c->orph_lebs)) {
432 err = 11;
433 goto failed;
434 }
435
436 if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs +
437 c->orph_lebs + c->main_lebs != c->leb_cnt) {
438 err = 12;
439 goto failed;
440 }
441
442 if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
443 err = 13;
444 goto failed;
445 }
446
447 max_bytes = c->main_lebs * (long long)c->leb_size;
448 if (c->rp_size < 0 || max_bytes < c->rp_size) {
449 err = 14;
450 goto failed;
451 }
452
453 if (le32_to_cpu(sup->time_gran) > 1000000000 ||
454 le32_to_cpu(sup->time_gran) < 1) {
455 err = 15;
456 goto failed;
457 }
458
459 return 0;
460
461failed:
462 ubifs_err("bad superblock, error %d", err);
463 dbg_dump_node(c, sup);
464 return -EINVAL;
465}
466
467/**
468 * ubifs_read_sb_node - read superblock node.
469 * @c: UBIFS file-system description object
470 *
471 * This function returns a pointer to the superblock node or a negative error
472 * code.
473 */
474struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
475{
476 struct ubifs_sb_node *sup;
477 int err;
478
479 sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS);
480 if (!sup)
481 return ERR_PTR(-ENOMEM);
482
483 err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ,
484 UBIFS_SB_LNUM, 0);
485 if (err) {
486 kfree(sup);
487 return ERR_PTR(err);
488 }
489
490 return sup;
491}
492
493/**
494 * ubifs_write_sb_node - write superblock node.
495 * @c: UBIFS file-system description object
496 * @sup: superblock node read with 'ubifs_read_sb_node()'
497 *
498 * This function returns %0 on success and a negative error code on failure.
499 */
500int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
501{
502 int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
503
504 ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
505 return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM);
506}
507
508/**
509 * ubifs_read_superblock - read superblock.
510 * @c: UBIFS file-system description object
511 *
512 * This function finds, reads and checks the superblock. If an empty UBI volume
513 * is being mounted, this function creates default superblock. Returns zero in
514 * case of success, and a negative error code in case of failure.
515 */
516int ubifs_read_superblock(struct ubifs_info *c)
517{
518 int err, sup_flags;
519 struct ubifs_sb_node *sup;
520
521 if (c->empty) {
522 err = create_default_filesystem(c);
523 if (err)
524 return err;
525 }
526
527 sup = ubifs_read_sb_node(c);
528 if (IS_ERR(sup))
529 return PTR_ERR(sup);
530
531 /*
532 * The software supports all previous versions but not future versions,
533 * due to the unavailability of time-travelling equipment.
534 */
535 c->fmt_version = le32_to_cpu(sup->fmt_version);
536 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
537 ubifs_err("on-flash format version is %d, but software only "
538 "supports up to version %d", c->fmt_version,
539 UBIFS_FORMAT_VERSION);
540 err = -EINVAL;
541 goto out;
542 }
543
544 if (c->fmt_version < 3) {
545 ubifs_err("on-flash format version %d is not supported",
546 c->fmt_version);
547 err = -EINVAL;
548 goto out;
549 }
550
551 switch (sup->key_hash) {
552 case UBIFS_KEY_HASH_R5:
553 c->key_hash = key_r5_hash;
554 c->key_hash_type = UBIFS_KEY_HASH_R5;
555 break;
556
557 case UBIFS_KEY_HASH_TEST:
558 c->key_hash = key_test_hash;
559 c->key_hash_type = UBIFS_KEY_HASH_TEST;
560 break;
561 };
562
563 c->key_fmt = sup->key_fmt;
564
565 switch (c->key_fmt) {
566 case UBIFS_SIMPLE_KEY_FMT:
567 c->key_len = UBIFS_SK_LEN;
568 break;
569 default:
570 ubifs_err("unsupported key format");
571 err = -EINVAL;
572 goto out;
573 }
574
575 c->leb_cnt = le32_to_cpu(sup->leb_cnt);
576 c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt);
577 c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes);
578 c->log_lebs = le32_to_cpu(sup->log_lebs);
579 c->lpt_lebs = le32_to_cpu(sup->lpt_lebs);
580 c->orph_lebs = le32_to_cpu(sup->orph_lebs);
581 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
582 c->fanout = le32_to_cpu(sup->fanout);
583 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
584 c->default_compr = le16_to_cpu(sup->default_compr);
585 c->rp_size = le64_to_cpu(sup->rp_size);
586 c->rp_uid = le32_to_cpu(sup->rp_uid);
587 c->rp_gid = le32_to_cpu(sup->rp_gid);
588 sup_flags = le32_to_cpu(sup->flags);
589
590 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
591
592 memcpy(&c->uuid, &sup->uuid, 16);
593
594 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
595
596 /* Automatically increase file system size to the maximum size */
597 c->old_leb_cnt = c->leb_cnt;
598 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
599 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
600 if (c->vfs_sb->s_flags & MS_RDONLY)
601 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
602 c->old_leb_cnt, c->leb_cnt);
603 else {
604 dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",
605 c->old_leb_cnt, c->leb_cnt);
606 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
607 err = ubifs_write_sb_node(c, sup);
608 if (err)
609 goto out;
610 c->old_leb_cnt = c->leb_cnt;
611 }
612 }
613
614 c->log_bytes = (long long)c->log_lebs * c->leb_size;
615 c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;
616 c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;
617 c->lpt_last = c->lpt_first + c->lpt_lebs - 1;
618 c->orph_first = c->lpt_last + 1;
619 c->orph_last = c->orph_first + c->orph_lebs - 1;
620 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
621 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
622 c->main_first = c->leb_cnt - c->main_lebs;
623 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
624
625 err = validate_sb(c, sup);
626out:
627 kfree(sup);
628 return err;
629}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
new file mode 100644
index 000000000000..acf5c5fffc60
--- /dev/null
+++ b/fs/ubifs/scan.c
@@ -0,0 +1,362 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements the scan which is a general-purpose function for
25 * determining what nodes are in an eraseblock. The scan is used to replay the
26 * journal, to do garbage collection, for the TNC in-the-gaps method, and by
27 * debugging functions.
28 */
29
30#include "ubifs.h"
31
32/**
33 * scan_padding_bytes - scan for padding bytes.
34 * @buf: buffer to scan
35 * @len: length of buffer
36 *
37 * This function returns the number of padding bytes on success and
38 * %SCANNED_GARBAGE on failure.
39 */
40static int scan_padding_bytes(void *buf, int len)
41{
42 int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len);
43 uint8_t *p = buf;
44
45 dbg_scan("not a node");
46
47 while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE)
48 pad_len += 1;
49
50 if (!pad_len || (pad_len & 7))
51 return SCANNED_GARBAGE;
52
53 dbg_scan("%d padding bytes", pad_len);
54
55 return pad_len;
56}
57
58/**
59 * ubifs_scan_a_node - scan for a node or padding.
60 * @c: UBIFS file-system description object
61 * @buf: buffer to scan
62 * @len: length of buffer
63 * @lnum: logical eraseblock number
64 * @offs: offset within the logical eraseblock
65 * @quiet: print no messages
66 *
67 * This function returns a scanning code to indicate what was scanned.
68 */
69int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
70 int offs, int quiet)
71{
72 struct ubifs_ch *ch = buf;
73 uint32_t magic;
74
75 magic = le32_to_cpu(ch->magic);
76
77 if (magic == 0xFFFFFFFF) {
78 dbg_scan("hit empty space");
79 return SCANNED_EMPTY_SPACE;
80 }
81
82 if (magic != UBIFS_NODE_MAGIC)
83 return scan_padding_bytes(buf, len);
84
85 if (len < UBIFS_CH_SZ)
86 return SCANNED_GARBAGE;
87
88 dbg_scan("scanning %s", dbg_ntype(ch->node_type));
89
90 if (ubifs_check_node(c, buf, lnum, offs, quiet))
91 return SCANNED_A_CORRUPT_NODE;
92
93 if (ch->node_type == UBIFS_PAD_NODE) {
94 struct ubifs_pad_node *pad = buf;
95 int pad_len = le32_to_cpu(pad->pad_len);
96 int node_len = le32_to_cpu(ch->len);
97
98 /* Validate the padding node */
99 if (pad_len < 0 ||
100 offs + node_len + pad_len > c->leb_size) {
101 if (!quiet) {
102 ubifs_err("bad pad node at LEB %d:%d",
103 lnum, offs);
104 dbg_dump_node(c, pad);
105 }
106 return SCANNED_A_BAD_PAD_NODE;
107 }
108
109 /* Make the node pads to 8-byte boundary */
110 if ((node_len + pad_len) & 7) {
111 if (!quiet) {
112 dbg_err("bad padding length %d - %d",
113 offs, offs + node_len + pad_len);
114 }
115 return SCANNED_A_BAD_PAD_NODE;
116 }
117
118 dbg_scan("%d bytes padded, offset now %d",
119 pad_len, ALIGN(offs + node_len + pad_len, 8));
120
121 return node_len + pad_len;
122 }
123
124 return SCANNED_A_NODE;
125}
126
127/**
128 * ubifs_start_scan - create LEB scanning information at start of scan.
129 * @c: UBIFS file-system description object
130 * @lnum: logical eraseblock number
131 * @offs: offset to start at (usually zero)
132 * @sbuf: scan buffer (must be c->leb_size)
133 *
134 * This function returns %0 on success and a negative error code on failure.
135 */
136struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
137 int offs, void *sbuf)
138{
139 struct ubifs_scan_leb *sleb;
140 int err;
141
142 dbg_scan("scan LEB %d:%d", lnum, offs);
143
144 sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS);
145 if (!sleb)
146 return ERR_PTR(-ENOMEM);
147
148 sleb->lnum = lnum;
149 INIT_LIST_HEAD(&sleb->nodes);
150 sleb->buf = sbuf;
151
152 err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs);
153 if (err && err != -EBADMSG) {
154 ubifs_err("cannot read %d bytes from LEB %d:%d,"
155 " error %d", c->leb_size - offs, lnum, offs, err);
156 kfree(sleb);
157 return ERR_PTR(err);
158 }
159
160 if (err == -EBADMSG)
161 sleb->ecc = 1;
162
163 return sleb;
164}
165
166/**
167 * ubifs_end_scan - update LEB scanning information at end of scan.
168 * @c: UBIFS file-system description object
169 * @sleb: scanning information
170 * @lnum: logical eraseblock number
171 * @offs: offset to start at (usually zero)
172 *
173 * This function returns %0 on success and a negative error code on failure.
174 */
175void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
176 int lnum, int offs)
177{
178 lnum = lnum;
179 dbg_scan("stop scanning LEB %d at offset %d", lnum, offs);
180 ubifs_assert(offs % c->min_io_size == 0);
181
182 sleb->endpt = ALIGN(offs, c->min_io_size);
183}
184
185/**
186 * ubifs_add_snod - add a scanned node to LEB scanning information.
187 * @c: UBIFS file-system description object
188 * @sleb: scanning information
189 * @buf: buffer containing node
190 * @offs: offset of node on flash
191 *
192 * This function returns %0 on success and a negative error code on failure.
193 */
194int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
195 void *buf, int offs)
196{
197 struct ubifs_ch *ch = buf;
198 struct ubifs_ino_node *ino = buf;
199 struct ubifs_scan_node *snod;
200
201 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
202 if (!snod)
203 return -ENOMEM;
204
205 snod->sqnum = le64_to_cpu(ch->sqnum);
206 snod->type = ch->node_type;
207 snod->offs = offs;
208 snod->len = le32_to_cpu(ch->len);
209 snod->node = buf;
210
211 switch (ch->node_type) {
212 case UBIFS_INO_NODE:
213 case UBIFS_DENT_NODE:
214 case UBIFS_XENT_NODE:
215 case UBIFS_DATA_NODE:
216 case UBIFS_TRUN_NODE:
217 /*
218 * The key is in the same place in all keyed
219 * nodes.
220 */
221 key_read(c, &ino->key, &snod->key);
222 break;
223 }
224 list_add_tail(&snod->list, &sleb->nodes);
225 sleb->nodes_cnt += 1;
226 return 0;
227}
228
229/**
230 * ubifs_scanned_corruption - print information after UBIFS scanned corruption.
231 * @c: UBIFS file-system description object
232 * @lnum: LEB number of corruption
233 * @offs: offset of corruption
234 * @buf: buffer containing corruption
235 */
236void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
237 void *buf)
238{
239 int len;
240
241 ubifs_err("corrupted data at LEB %d:%d", lnum, offs);
242 if (dbg_failure_mode)
243 return;
244 len = c->leb_size - offs;
245 if (len > 4096)
246 len = 4096;
247 dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
248 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
249}
250
251/**
252 * ubifs_scan - scan a logical eraseblock.
253 * @c: UBIFS file-system description object
254 * @lnum: logical eraseblock number
255 * @offs: offset to start at (usually zero)
256 * @sbuf: scan buffer (must be c->leb_size)
257 *
258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns an error code in case of failure.
260 */
261struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
262 int offs, void *sbuf)
263{
264 void *buf = sbuf + offs;
265 int err, len = c->leb_size - offs;
266 struct ubifs_scan_leb *sleb;
267
268 sleb = ubifs_start_scan(c, lnum, offs, sbuf);
269 if (IS_ERR(sleb))
270 return sleb;
271
272 while (len >= 8) {
273 struct ubifs_ch *ch = buf;
274 int node_len, ret;
275
276 dbg_scan("look at LEB %d:%d (%d bytes left)",
277 lnum, offs, len);
278
279 cond_resched();
280
281 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
282
283 if (ret > 0) {
284 /* Padding bytes or a valid padding node */
285 offs += ret;
286 buf += ret;
287 len -= ret;
288 continue;
289 }
290
291 if (ret == SCANNED_EMPTY_SPACE)
292 /* Empty space is checked later */
293 break;
294
295 switch (ret) {
296 case SCANNED_GARBAGE:
297 dbg_err("garbage");
298 goto corrupted;
299 case SCANNED_A_NODE:
300 break;
301 case SCANNED_A_CORRUPT_NODE:
302 case SCANNED_A_BAD_PAD_NODE:
303 dbg_err("bad node");
304 goto corrupted;
305 default:
306 dbg_err("unknown");
307 goto corrupted;
308 }
309
310 err = ubifs_add_snod(c, sleb, buf, offs);
311 if (err)
312 goto error;
313
314 node_len = ALIGN(le32_to_cpu(ch->len), 8);
315 offs += node_len;
316 buf += node_len;
317 len -= node_len;
318 }
319
320 if (offs % c->min_io_size)
321 goto corrupted;
322
323 ubifs_end_scan(c, sleb, lnum, offs);
324
325 for (; len > 4; offs += 4, buf = buf + 4, len -= 4)
326 if (*(uint32_t *)buf != 0xffffffff)
327 break;
328 for (; len; offs++, buf++, len--)
329 if (*(uint8_t *)buf != 0xff) {
330 ubifs_err("corrupt empty space at LEB %d:%d",
331 lnum, offs);
332 goto corrupted;
333 }
334
335 return sleb;
336
337corrupted:
338 ubifs_scanned_corruption(c, lnum, offs, buf);
339 err = -EUCLEAN;
340error:
341 ubifs_err("LEB %d scanning failed", lnum);
342 ubifs_scan_destroy(sleb);
343 return ERR_PTR(err);
344}
345
346/**
347 * ubifs_scan_destroy - destroy LEB scanning information.
348 * @sleb: scanning information to free
349 */
350void ubifs_scan_destroy(struct ubifs_scan_leb *sleb)
351{
352 struct ubifs_scan_node *node;
353 struct list_head *head;
354
355 head = &sleb->nodes;
356 while (!list_empty(head)) {
357 node = list_entry(head->next, struct ubifs_scan_node, list);
358 list_del(&node->list);
359 kfree(node);
360 }
361 kfree(sleb);
362}
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
new file mode 100644
index 000000000000..f248533841a2
--- /dev/null
+++ b/fs/ubifs/shrinker.c
@@ -0,0 +1,322 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS shrinker which evicts clean znodes from the TNC
25 * tree when Linux VM needs more RAM.
26 *
27 * We do not implement any LRU lists to find oldest znodes to free because it
28 * would add additional overhead to the file system fast paths. So the shrinker
29 * just walks the TNC tree when searching for znodes to free.
30 *
31 * If the root of a TNC sub-tree is clean and old enough, then the children are
32 * also clean and old enough. So the shrinker walks the TNC in level order and
33 * dumps entire sub-trees.
34 *
35 * The age of znodes is just the time-stamp when they were last looked at.
36 * The current shrinker first tries to evict old znodes, then young ones.
37 *
38 * Since the shrinker is global, it has to protect against races with FS
39 * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
40 */
41
42#include "ubifs.h"
43
44/* List of all UBIFS file-system instances, linked through 'infos_list' */
45LIST_HEAD(ubifs_infos);
46
47/*
48 * We number each shrinker run and record the number on the ubifs_info structure
49 * so that we can easily work out which ubifs_info structures have already been
50 * done by the current run.
 * Zero is never used as a run number: 'shrink_tnc_trees()' keeps incrementing
 * this counter until it gets a non-zero value.
51 */
52static unsigned int shrinker_run_no;
53
54/* Protects 'ubifs_infos' list; 'shrinker_run_no' is also incremented under it */
55DEFINE_SPINLOCK(ubifs_infos_lock);
56
57/*
 * Global clean znode counter (for all mounted UBIFS instances). This is the
 * value 'ubifs_shrinker()' returns when it is called with a zero 'nr'.
 */
58atomic_long_t ubifs_clean_zn_cnt;
59
60/**
61 * shrink_tnc - shrink TNC tree.
62 * @c: UBIFS file-system description object
63 * @nr: number of znodes to free
64 * @age: the age of znodes to free
65 * @contention: if any contention, this is set to %1
66 *
67 * This function traverses TNC tree and frees clean znodes. It does not free
68 * clean znodes which younger then @age. Returns number of freed znodes.
69 */
70static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
71{
72 int total_freed = 0;
73 struct ubifs_znode *znode, *zprev;
74 int time = get_seconds();
75
76 ubifs_assert(mutex_is_locked(&c->umount_mutex));
77 ubifs_assert(mutex_is_locked(&c->tnc_mutex));
78
79 if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
80 return 0;
81
82 /*
83 * Traverse the TNC tree in levelorder manner, so that it is possible
84 * to destroy large sub-trees. Indeed, if a znode is old, then all its
85 * children are older or of the same age.
86 *
87 * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
88 * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
89 * changed only when the 'c->tnc_mutex' is held.
90 */
91 zprev = NULL;
92 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
93 while (znode && total_freed < nr &&
94 atomic_long_read(&c->clean_zn_cnt) > 0) {
95 int freed;
96
97 /*
98 * If the znode is clean, but it is in the 'c->cnext' list, this
99 * means that this znode has just been written to flash as a
100 * part of commit and was marked clean. They will be removed
101 * from the list at end commit. We cannot change the list,
102 * because it is not protected by any mutex (design decision to
103 * make commit really independent and parallel to main I/O). So
104 * we just skip these znodes.
105 *
106 * Note, the 'clean_zn_cnt' counters are not updated until
107 * after the commit, so the UBIFS shrinker does not report
108 * the znodes which are in the 'c->cnext' list as freeable.
109 *
110 * Also note, if the root of a sub-tree is not in 'c->cnext',
111 * then the whole sub-tree is not in 'c->cnext' as well, so it
112 * is safe to dump whole sub-tree.
113 */
114
115 if (znode->cnext) {
116 /*
117 * Very soon these znodes will be removed from the list
118 * and become freeable.
119 */
120 *contention = 1;
121 } else if (!ubifs_zn_dirty(znode) &&
122 abs(time - znode->time) >= age) {
123 if (znode->parent)
124 znode->parent->zbranch[znode->iip].znode = NULL;
125 else
126 c->zroot.znode = NULL;
127
128 freed = ubifs_destroy_tnc_subtree(znode);
129 atomic_long_sub(freed, &ubifs_clean_zn_cnt);
130 atomic_long_sub(freed, &c->clean_zn_cnt);
131 ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
132 total_freed += freed;
133 znode = zprev;
134 }
135
136 if (unlikely(!c->zroot.znode))
137 break;
138
139 zprev = znode;
140 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
141 cond_resched();
142 }
143
144 return total_freed;
145}
146
147/**
148 * shrink_tnc_trees - shrink UBIFS TNC trees.
149 * @nr: number of znodes to free
150 * @age: the age of znodes to free
151 * @contention: if any contention, this is set to %1
152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean
154 * znodes which are older then @age, until at least @nr znodes are freed.
155 * Returns the number of freed znodes.
156 */
157static int shrink_tnc_trees(int nr, int age, int *contention)
158{
159 struct ubifs_info *c;
160 struct list_head *p;
161 unsigned int run_no;
162 int freed = 0;
163
164 spin_lock(&ubifs_infos_lock);
165 do {
166 run_no = ++shrinker_run_no;
167 } while (run_no == 0);
168 /* Iterate over all mounted UBIFS file-systems and try to shrink them */
169 p = ubifs_infos.next;
170 while (p != &ubifs_infos) {
171 c = list_entry(p, struct ubifs_info, infos_list);
172 /*
173 * We move the ones we do to the end of the list, so we stop
174 * when we see one we have already done.
175 */
176 if (c->shrinker_run_no == run_no)
177 break;
178 if (!mutex_trylock(&c->umount_mutex)) {
179 /* Some un-mount is in progress, try next FS */
180 *contention = 1;
181 p = p->next;
182 continue;
183 }
184 /*
185 * We're holding 'c->umount_mutex', so the file-system won't go
186 * away.
187 */
188 if (!mutex_trylock(&c->tnc_mutex)) {
189 mutex_unlock(&c->umount_mutex);
190 *contention = 1;
191 p = p->next;
192 continue;
193 }
194 spin_unlock(&ubifs_infos_lock);
195 /*
196 * OK, now we have TNC locked, the file-system cannot go away -
197 * it is safe to reap the cache.
198 */
199 c->shrinker_run_no = run_no;
200 freed += shrink_tnc(c, nr, age, contention);
201 mutex_unlock(&c->tnc_mutex);
202 spin_lock(&ubifs_infos_lock);
203 /* Get the next list element before we move this one */
204 p = p->next;
205 /*
206 * Move this one to the end of the list to provide some
207 * fairness.
208 */
209 list_del(&c->infos_list);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr)
213 break;
214 }
215 spin_unlock(&ubifs_infos_lock);
216 return freed;
217}
218
219/**
220 * kick_a_thread - kick a background thread to start commit.
221 *
222 * This function kicks a background thread to start background commit. Returns
223 * %-1 if a thread was kicked or there is another reason to assume the memory
224 * will soon be freed or become freeable. If there are no dirty znodes, returns
225 * %0.
226 */
227static int kick_a_thread(void)
228{
229 int i;
230 struct ubifs_info *c;
231
232 /*
233 * Iterate over all mounted UBIFS file-systems and find out if there is
234 * already an ongoing commit operation there. If no, then iterate for
235 * the second time and initiate background commit.
236 */
237 spin_lock(&ubifs_infos_lock);
238 for (i = 0; i < 2; i++) {
239 list_for_each_entry(c, &ubifs_infos, infos_list) {
240 long dirty_zn_cnt;
241
242 if (!mutex_trylock(&c->umount_mutex)) {
243 /*
244 * Some un-mount is in progress, it will
245 * certainly free memory, so just return.
246 */
247 spin_unlock(&ubifs_infos_lock);
248 return -1;
249 }
250
251 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
252
253 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
254 c->ro_media) {
255 mutex_unlock(&c->umount_mutex);
256 continue;
257 }
258
259 if (c->cmt_state != COMMIT_RESTING) {
260 spin_unlock(&ubifs_infos_lock);
261 mutex_unlock(&c->umount_mutex);
262 return -1;
263 }
264
265 if (i == 1) {
266 list_del(&c->infos_list);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock);
269
270 ubifs_request_bg_commit(c);
271 mutex_unlock(&c->umount_mutex);
272 return -1;
273 }
274 mutex_unlock(&c->umount_mutex);
275 }
276 }
277 spin_unlock(&ubifs_infos_lock);
278
279 return 0;
280}
281
282int ubifs_shrinker(int nr, gfp_t gfp_mask)
283{
284 int freed, contention = 0;
285 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
286
287 if (nr == 0)
288 return clean_zn_cnt;
289
290 if (!clean_zn_cnt) {
291 /*
292 * No clean znodes, nothing to reap. All we can do in this case
293 * is to kick background threads to start commit, which will
294 * probably make clean znodes which, in turn, will be freeable.
295 * And we return -1 which means will make VM call us again
296 * later.
297 */
298 dbg_tnc("no clean znodes, kick a thread");
299 return kick_a_thread();
300 }
301
302 freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
303 if (freed >= nr)
304 goto out;
305
306 dbg_tnc("not enough old znodes, try to free young ones");
307 freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
308 if (freed >= nr)
309 goto out;
310
311 dbg_tnc("not enough young znodes, free all");
312 freed += shrink_tnc_trees(nr - freed, 0, &contention);
313
314 if (!freed && contention) {
315 dbg_tnc("freed nothing, but contention");
316 return -1;
317 }
318
319out:
320 dbg_tnc("%d znodes were freed, requested %d", freed, nr);
321 return freed;
322}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
new file mode 100644
index 000000000000..00eb9c68ad03
--- /dev/null
+++ b/fs/ubifs/super.c
@@ -0,0 +1,1951 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS initialization and VFS superblock operations. Some
25 * initialization stuff which is rather large and complex is placed at
26 * corresponding subsystems, but most of it is here.
27 */
28
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/module.h>
32#include <linux/ctype.h>
33#include <linux/random.h>
34#include <linux/kthread.h>
35#include <linux/parser.h>
36#include <linux/seq_file.h>
37#include <linux/mount.h>
38#include "ubifs.h"
39
40/*
 * Slab cache for UBIFS inodes: 'ubifs_alloc_inode()' allocates from it and
 * 'ubifs_destroy_inode()' frees back to it.
 */
41struct kmem_cache *ubifs_inode_slab;
42
43/* UBIFS TNC shrinker description, 'ubifs_shrinker()' is the shrink callback */
44static struct shrinker ubifs_shrinker_info = {
45 .shrink = ubifs_shrinker,
46 .seeks = DEFAULT_SEEKS,
47};
48
49/**
50 * validate_inode - validate inode.
51 * @c: UBIFS file-system description object
52 * @inode: the inode to validate
53 *
54 * This is a helper function for 'ubifs_iget()' which validates various fields
55 * of a newly built inode to make sure they contain sane values and prevent
56 * possible vulnerabilities. Returns zero if the inode is all right and
57 * a non-zero error code if not.
58 */
59static int validate_inode(struct ubifs_info *c, const struct inode *inode)
60{
61 int err;
62 const struct ubifs_inode *ui = ubifs_inode(inode);
63
64 if (inode->i_size > c->max_inode_sz) {
65 ubifs_err("inode is too large (%lld)",
66 (long long)inode->i_size);
67 return 1;
68 }
69
70 if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
71 ubifs_err("unknown compression type %d", ui->compr_type);
72 return 2;
73 }
74
75 if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX)
76 return 3;
77
78 if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA)
79 return 4;
80
81 if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG)
82 return 5;
83
84 if (!ubifs_compr_present(ui->compr_type)) {
85 ubifs_warn("inode %lu uses '%s' compression, but it was not "
86 "compiled in", inode->i_ino,
87 ubifs_compr_name(ui->compr_type));
88 }
89
90 err = dbg_check_dir_size(c, inode);
91 return err;
92}
93
94struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
95{
96 int err;
97 union ubifs_key key;
98 struct ubifs_ino_node *ino;
99 struct ubifs_info *c = sb->s_fs_info;
100 struct inode *inode;
101 struct ubifs_inode *ui;
102
103 dbg_gen("inode %lu", inum);
104
105 inode = iget_locked(sb, inum);
106 if (!inode)
107 return ERR_PTR(-ENOMEM);
108 if (!(inode->i_state & I_NEW))
109 return inode;
110 ui = ubifs_inode(inode);
111
112 ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
113 if (!ino) {
114 err = -ENOMEM;
115 goto out;
116 }
117
118 ino_key_init(c, &key, inode->i_ino);
119
120 err = ubifs_tnc_lookup(c, &key, ino);
121 if (err)
122 goto out_ino;
123
124 inode->i_flags |= (S_NOCMTIME | S_NOATIME);
125 inode->i_nlink = le32_to_cpu(ino->nlink);
126 inode->i_uid = le32_to_cpu(ino->uid);
127 inode->i_gid = le32_to_cpu(ino->gid);
128 inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
129 inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
130 inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
131 inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
132 inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec);
133 inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
134 inode->i_mode = le32_to_cpu(ino->mode);
135 inode->i_size = le64_to_cpu(ino->size);
136
137 ui->data_len = le32_to_cpu(ino->data_len);
138 ui->flags = le32_to_cpu(ino->flags);
139 ui->compr_type = le16_to_cpu(ino->compr_type);
140 ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum);
141 ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
142 ui->xattr_size = le32_to_cpu(ino->xattr_size);
143 ui->xattr_names = le32_to_cpu(ino->xattr_names);
144 ui->synced_i_size = ui->ui_size = inode->i_size;
145
146 ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0;
147
148 err = validate_inode(c, inode);
149 if (err)
150 goto out_invalid;
151
152 /* Disable readahead */
153 inode->i_mapping->backing_dev_info = &c->bdi;
154
155 switch (inode->i_mode & S_IFMT) {
156 case S_IFREG:
157 inode->i_mapping->a_ops = &ubifs_file_address_operations;
158 inode->i_op = &ubifs_file_inode_operations;
159 inode->i_fop = &ubifs_file_operations;
160 if (ui->xattr) {
161 ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
162 if (!ui->data) {
163 err = -ENOMEM;
164 goto out_ino;
165 }
166 memcpy(ui->data, ino->data, ui->data_len);
167 ((char *)ui->data)[ui->data_len] = '\0';
168 } else if (ui->data_len != 0) {
169 err = 10;
170 goto out_invalid;
171 }
172 break;
173 case S_IFDIR:
174 inode->i_op = &ubifs_dir_inode_operations;
175 inode->i_fop = &ubifs_dir_operations;
176 if (ui->data_len != 0) {
177 err = 11;
178 goto out_invalid;
179 }
180 break;
181 case S_IFLNK:
182 inode->i_op = &ubifs_symlink_inode_operations;
183 if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) {
184 err = 12;
185 goto out_invalid;
186 }
187 ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
188 if (!ui->data) {
189 err = -ENOMEM;
190 goto out_ino;
191 }
192 memcpy(ui->data, ino->data, ui->data_len);
193 ((char *)ui->data)[ui->data_len] = '\0';
194 break;
195 case S_IFBLK:
196 case S_IFCHR:
197 {
198 dev_t rdev;
199 union ubifs_dev_desc *dev;
200
201 ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
202 if (!ui->data) {
203 err = -ENOMEM;
204 goto out_ino;
205 }
206
207 dev = (union ubifs_dev_desc *)ino->data;
208 if (ui->data_len == sizeof(dev->new))
209 rdev = new_decode_dev(le32_to_cpu(dev->new));
210 else if (ui->data_len == sizeof(dev->huge))
211 rdev = huge_decode_dev(le64_to_cpu(dev->huge));
212 else {
213 err = 13;
214 goto out_invalid;
215 }
216 memcpy(ui->data, ino->data, ui->data_len);
217 inode->i_op = &ubifs_file_inode_operations;
218 init_special_inode(inode, inode->i_mode, rdev);
219 break;
220 }
221 case S_IFSOCK:
222 case S_IFIFO:
223 inode->i_op = &ubifs_file_inode_operations;
224 init_special_inode(inode, inode->i_mode, 0);
225 if (ui->data_len != 0) {
226 err = 14;
227 goto out_invalid;
228 }
229 break;
230 default:
231 err = 15;
232 goto out_invalid;
233 }
234
235 kfree(ino);
236 ubifs_set_inode_flags(inode);
237 unlock_new_inode(inode);
238 return inode;
239
240out_invalid:
241 ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
242 dbg_dump_node(c, ino);
243 dbg_dump_inode(c, inode);
244 err = -EINVAL;
245out_ino:
246 kfree(ino);
247out:
248 ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err);
249 iget_failed(inode);
250 return ERR_PTR(err);
251}
252
253static struct inode *ubifs_alloc_inode(struct super_block *sb)
254{
255 struct ubifs_inode *ui;
256
257 ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
258 if (!ui)
259 return NULL;
260
261 memset((void *)ui + sizeof(struct inode), 0,
262 sizeof(struct ubifs_inode) - sizeof(struct inode));
263 mutex_init(&ui->ui_mutex);
264 spin_lock_init(&ui->ui_lock);
265 return &ui->vfs_inode;
266};
267
268static void ubifs_destroy_inode(struct inode *inode)
269{
270 struct ubifs_inode *ui = ubifs_inode(inode);
271
272 kfree(ui->data);
273 kmem_cache_free(ubifs_inode_slab, inode);
274}
275
276/*
277 * Note, Linux write-back code calls this without 'i_mutex'.
278 */
279static int ubifs_write_inode(struct inode *inode, int wait)
280{
281 int err;
282 struct ubifs_info *c = inode->i_sb->s_fs_info;
283 struct ubifs_inode *ui = ubifs_inode(inode);
284
285 ubifs_assert(!ui->xattr);
286 if (is_bad_inode(inode))
287 return 0;
288
289 mutex_lock(&ui->ui_mutex);
290 /*
291 * Due to races between write-back forced by budgeting
292 * (see 'sync_some_inodes()') and pdflush write-back, the inode may
293 * have already been synchronized, do not do this again. This might
294 * also happen if it was synchronized in an VFS operation, e.g.
295 * 'ubifs_link()'.
296 */
297 if (!ui->dirty) {
298 mutex_unlock(&ui->ui_mutex);
299 return 0;
300 }
301
302 dbg_gen("inode %lu", inode->i_ino);
303 err = ubifs_jnl_write_inode(c, inode, 0);
304 if (err)
305 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
306
307 ui->dirty = 0;
308 mutex_unlock(&ui->ui_mutex);
309 ubifs_release_dirty_inode_budget(c, ui);
310 return err;
311}
312
313static void ubifs_delete_inode(struct inode *inode)
314{
315 int err;
316 struct ubifs_info *c = inode->i_sb->s_fs_info;
317
318 if (ubifs_inode(inode)->xattr)
319 /*
320 * Extended attribute inode deletions are fully handled in
321 * 'ubifs_removexattr()'. These inodes are special and have
322 * limited usage, so there is nothing to do here.
323 */
324 goto out;
325
326 dbg_gen("inode %lu", inode->i_ino);
327 ubifs_assert(!atomic_read(&inode->i_count));
328 ubifs_assert(inode->i_nlink == 0);
329
330 truncate_inode_pages(&inode->i_data, 0);
331 if (is_bad_inode(inode))
332 goto out;
333
334 ubifs_inode(inode)->ui_size = inode->i_size = 0;
335 err = ubifs_jnl_write_inode(c, inode, 1);
336 if (err)
337 /*
338 * Worst case we have a lost orphan inode wasting space, so a
339 * simple error message is ok here.
340 */
341 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
342out:
343 clear_inode(inode);
344}
345
346static void ubifs_dirty_inode(struct inode *inode)
347{
348 struct ubifs_inode *ui = ubifs_inode(inode);
349
350 ubifs_assert(mutex_is_locked(&ui->ui_mutex));
351 if (!ui->dirty) {
352 ui->dirty = 1;
353 dbg_gen("inode %lu", inode->i_ino);
354 }
355}
356
357static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
358{
359 struct ubifs_info *c = dentry->d_sb->s_fs_info;
360 unsigned long long free;
361
362 free = ubifs_budg_get_free_space(c);
363 dbg_gen("free space %lld bytes (%lld blocks)",
364 free, free >> UBIFS_BLOCK_SHIFT);
365
366 buf->f_type = UBIFS_SUPER_MAGIC;
367 buf->f_bsize = UBIFS_BLOCK_SIZE;
368 buf->f_blocks = c->block_cnt;
369 buf->f_bfree = free >> UBIFS_BLOCK_SHIFT;
370 if (free > c->report_rp_size)
371 buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT;
372 else
373 buf->f_bavail = 0;
374 buf->f_files = 0;
375 buf->f_ffree = 0;
376 buf->f_namelen = UBIFS_MAX_NLEN;
377
378 return 0;
379}
380
381static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
382{
383 struct ubifs_info *c = mnt->mnt_sb->s_fs_info;
384
385 if (c->mount_opts.unmount_mode == 2)
386 seq_printf(s, ",fast_unmount");
387 else if (c->mount_opts.unmount_mode == 1)
388 seq_printf(s, ",norm_unmount");
389
390 return 0;
391}
392
393static int ubifs_sync_fs(struct super_block *sb, int wait)
394{
395 struct ubifs_info *c = sb->s_fs_info;
396 int i, ret = 0, err;
397
398 if (c->jheads)
399 for (i = 0; i < c->jhead_cnt; i++) {
400 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
401 if (err && !ret)
402 ret = err;
403 }
404 /*
405 * We ought to call sync for c->ubi but it does not have one. If it had
406 * it would in turn call mtd->sync, however mtd operations are
407 * synchronous anyway, so we don't lose any sleep here.
408 */
409 return ret;
410}
411
412/**
413 * init_constants_early - initialize UBIFS constants.
414 * @c: UBIFS file-system description object
415 *
416 * This function initialize UBIFS constants which do not need the superblock to
417 * be read. It also checks that the UBI volume satisfies basic UBIFS
418 * requirements. Returns zero in case of success and a negative error code in
419 * case of failure.
420 */
421static int init_constants_early(struct ubifs_info *c)
422{
423 if (c->vi.corrupted) {
424 ubifs_warn("UBI volume is corrupted - read-only mode");
425 c->ro_media = 1;
426 }
427
428 if (c->di.ro_mode) {
429 ubifs_msg("read-only UBI device");
430 c->ro_media = 1;
431 }
432
433 if (c->vi.vol_type == UBI_STATIC_VOLUME) {
434 ubifs_msg("static UBI volume - read-only mode");
435 c->ro_media = 1;
436 }
437
438 c->leb_cnt = c->vi.size;
439 c->leb_size = c->vi.usable_leb_size;
440 c->half_leb_size = c->leb_size / 2;
441 c->min_io_size = c->di.min_io_size;
442 c->min_io_shift = fls(c->min_io_size) - 1;
443
444 if (c->leb_size < UBIFS_MIN_LEB_SZ) {
445 ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
446 c->leb_size, UBIFS_MIN_LEB_SZ);
447 return -EINVAL;
448 }
449
450 if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
451 ubifs_err("too few LEBs (%d), min. is %d",
452 c->leb_cnt, UBIFS_MIN_LEB_CNT);
453 return -EINVAL;
454 }
455
456 if (!is_power_of_2(c->min_io_size)) {
457 ubifs_err("bad min. I/O size %d", c->min_io_size);
458 return -EINVAL;
459 }
460
461 /*
462 * UBIFS aligns all node to 8-byte boundary, so to make function in
463 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
464 * less than 8.
465 */
466 if (c->min_io_size < 8) {
467 c->min_io_size = 8;
468 c->min_io_shift = 3;
469 }
470
471 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
472 c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);
473
474 /*
475 * Initialize node length ranges which are mostly needed for node
476 * length validation.
477 */
478 c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ;
479 c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
480 c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
481 c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
482 c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
483 c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
484
485 c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
486 c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
487 c->ranges[UBIFS_ORPH_NODE].min_len =
488 UBIFS_ORPH_NODE_SZ + sizeof(__le64);
489 c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
490 c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
491 c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
492 c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
493 c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
494 c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
495 c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
496 /*
497 * Minimum indexing node size is amended later when superblock is
498 * read and the key length is known.
499 */
500 c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
501 /*
502 * Maximum indexing node size is amended later when superblock is
503 * read and the fanout is known.
504 */
505 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
506
507 /*
508 * Initialize dead and dark LEB space watermarks.
509 *
510 * Dead space is the space which cannot be used. Its watermark is
511 * equivalent to min. I/O unit or minimum node size if it is greater
512 * then min. I/O unit.
513 *
514 * Dark space is the space which might be used, or might not, depending
515 * on which node should be written to the LEB. Its watermark is
516 * equivalent to maximum UBIFS node size.
517 */
518 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
519 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
520
521 return 0;
522}
523
/**
 * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
 * @c: UBIFS file-system description object
 * @lnum: LEB the write-buffer was synchronized to
 * @free: how many free bytes left in this LEB
 * @pad: how many bytes were padded
 *
 * The I/O unit invokes this call-back when a write-buffer is synchronized, so
 * that space accounting of bud logical eraseblocks stays correct. Returns zero
 * in case of success and a negative error code in case of failure.
 *
 * Logically this function is part of the journal, but it lives here so that
 * it can stay static.
 */
static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
{
	int err;

	err = ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
	return err;
}
543
544/*
545 * init_constants_late - initialize UBIFS constants.
546 * @c: UBIFS file-system description object
547 *
548 * This is a helper function which initializes various UBIFS constants after
549 * the superblock has been read. It also checks various UBIFS parameters and
550 * makes sure they are all right. Returns zero in case of success and a
551 * negative error code in case of failure.
552 */
553static int init_constants_late(struct ubifs_info *c)
554{
555 int tmp, err;
556 uint64_t tmp64;
557
558 c->main_bytes = (long long)c->main_lebs * c->leb_size;
559 c->max_znode_sz = sizeof(struct ubifs_znode) +
560 c->fanout * sizeof(struct ubifs_zbranch);
561
562 tmp = ubifs_idx_node_sz(c, 1);
563 c->ranges[UBIFS_IDX_NODE].min_len = tmp;
564 c->min_idx_node_sz = ALIGN(tmp, 8);
565
566 tmp = ubifs_idx_node_sz(c, c->fanout);
567 c->ranges[UBIFS_IDX_NODE].max_len = tmp;
568 c->max_idx_node_sz = ALIGN(tmp, 8);
569
570 /* Make sure LEB size is large enough to fit full commit */
571 tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
572 tmp = ALIGN(tmp, c->min_io_size);
573 if (tmp > c->leb_size) {
574 dbg_err("too small LEB size %d, at least %d needed",
575 c->leb_size, tmp);
576 return -EINVAL;
577 }
578
579 /*
580 * Make sure that the log is large enough to fit reference nodes for
581 * all buds plus one reserved LEB.
582 */
583 tmp64 = c->max_bud_bytes;
584 tmp = do_div(tmp64, c->leb_size);
585 c->max_bud_cnt = tmp64 + !!tmp;
586 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
587 tmp /= c->leb_size;
588 tmp += 1;
589 if (c->log_lebs < tmp) {
590 dbg_err("too small log %d LEBs, required min. %d LEBs",
591 c->log_lebs, tmp);
592 return -EINVAL;
593 }
594
595 /*
596 * When budgeting we assume worst-case scenarios when the pages are not
597 * be compressed and direntries are of the maximum size.
598 *
599 * Note, data, which may be stored in inodes is budgeted separately, so
600 * it is not included into 'c->inode_budget'.
601 */
602 c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
603 c->inode_budget = UBIFS_INO_NODE_SZ;
604 c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
605
606 /*
607 * When the amount of flash space used by buds becomes
608 * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
609 * The writers are unblocked when the commit is finished. To avoid
610 * writers to be blocked UBIFS initiates background commit in advance,
611 * when number of bud bytes becomes above the limit defined below.
612 */
613 c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4;
614
615 /*
616 * Ensure minimum journal size. All the bytes in the journal heads are
617 * considered to be used, when calculating the current journal usage.
618 * Consequently, if the journal is too small, UBIFS will treat it as
619 * always full.
620 */
621 tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
622 if (c->bg_bud_bytes < tmp64)
623 c->bg_bud_bytes = tmp64;
624 if (c->max_bud_bytes < tmp64 + c->leb_size)
625 c->max_bud_bytes = tmp64 + c->leb_size;
626
627 err = ubifs_calc_lpt_geom(c);
628 if (err)
629 return err;
630
631 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
632
633 /*
634 * Calculate total amount of FS blocks. This number is not used
635 * internally because it does not make much sense for UBIFS, but it is
636 * necessary to report something for the 'statfs()' call.
637 *
638 * Subtract the LEB reserved for GC and the LEB which is reserved for
639 * deletions.
640 *
641 * Review 'ubifs_calc_available()' if changing this calculation.
642 */
643 tmp64 = c->main_lebs - 2;
644 tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
645 tmp64 = ubifs_reported_space(c, tmp64);
646 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
647
648 return 0;
649}
650
651/**
652 * take_gc_lnum - reserve GC LEB.
653 * @c: UBIFS file-system description object
654 *
655 * This function ensures that the LEB reserved for garbage collection is
656 * unmapped and is marked as "taken" in lprops. We also have to set free space
657 * to LEB size and dirty space to zero, because lprops may contain out-of-date
658 * information if the file-system was un-mounted before it has been committed.
659 * This function returns zero in case of success and a negative error code in
660 * case of failure.
661 */
662static int take_gc_lnum(struct ubifs_info *c)
663{
664 int err;
665
666 if (c->gc_lnum == -1) {
667 ubifs_err("no LEB for GC");
668 return -EINVAL;
669 }
670
671 err = ubifs_leb_unmap(c, c->gc_lnum);
672 if (err)
673 return err;
674
675 /* And we have to tell lprops that this LEB is taken */
676 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
677 LPROPS_TAKEN, 0, 0);
678 return err;
679}
680
681/**
682 * alloc_wbufs - allocate write-buffers.
683 * @c: UBIFS file-system description object
684 *
685 * This helper function allocates and initializes UBIFS write-buffers. Returns
686 * zero in case of success and %-ENOMEM in case of failure.
687 */
688static int alloc_wbufs(struct ubifs_info *c)
689{
690 int i, err;
691
692 c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
693 GFP_KERNEL);
694 if (!c->jheads)
695 return -ENOMEM;
696
697 /* Initialize journal heads */
698 for (i = 0; i < c->jhead_cnt; i++) {
699 INIT_LIST_HEAD(&c->jheads[i].buds_list);
700 err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
701 if (err)
702 return err;
703
704 c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
705 c->jheads[i].wbuf.jhead = i;
706 }
707
708 c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
709 /*
710 * Garbage Collector head likely contains long-term data and
711 * does not need to be synchronized by timer.
712 */
713 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
714 c->jheads[GCHD].wbuf.timeout = 0;
715
716 return 0;
717}
718
719/**
720 * free_wbufs - free write-buffers.
721 * @c: UBIFS file-system description object
722 */
723static void free_wbufs(struct ubifs_info *c)
724{
725 int i;
726
727 if (c->jheads) {
728 for (i = 0; i < c->jhead_cnt; i++) {
729 kfree(c->jheads[i].wbuf.buf);
730 kfree(c->jheads[i].wbuf.inodes);
731 }
732 kfree(c->jheads);
733 c->jheads = NULL;
734 }
735}
736
737/**
738 * free_orphans - free orphans.
739 * @c: UBIFS file-system description object
740 */
741static void free_orphans(struct ubifs_info *c)
742{
743 struct ubifs_orphan *orph;
744
745 while (c->orph_dnext) {
746 orph = c->orph_dnext;
747 c->orph_dnext = orph->dnext;
748 list_del(&orph->list);
749 kfree(orph);
750 }
751
752 while (!list_empty(&c->orph_list)) {
753 orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
754 list_del(&orph->list);
755 kfree(orph);
756 dbg_err("orphan list not empty at unmount");
757 }
758
759 vfree(c->orph_buf);
760 c->orph_buf = NULL;
761}
762
763/**
764 * free_buds - free per-bud objects.
765 * @c: UBIFS file-system description object
766 */
767static void free_buds(struct ubifs_info *c)
768{
769 struct rb_node *this = c->buds.rb_node;
770 struct ubifs_bud *bud;
771
772 while (this) {
773 if (this->rb_left)
774 this = this->rb_left;
775 else if (this->rb_right)
776 this = this->rb_right;
777 else {
778 bud = rb_entry(this, struct ubifs_bud, rb);
779 this = rb_parent(this);
780 if (this) {
781 if (this->rb_left == &bud->rb)
782 this->rb_left = NULL;
783 else
784 this->rb_right = NULL;
785 }
786 kfree(bud);
787 }
788 }
789}
790
791/**
792 * check_volume_empty - check if the UBI volume is empty.
793 * @c: UBIFS file-system description object
794 *
795 * This function checks if the UBIFS volume is empty by looking if its LEBs are
796 * mapped or not. The result of checking is stored in the @c->empty variable.
797 * Returns zero in case of success and a negative error code in case of
798 * failure.
799 */
800static int check_volume_empty(struct ubifs_info *c)
801{
802 int lnum, err;
803
804 c->empty = 1;
805 for (lnum = 0; lnum < c->leb_cnt; lnum++) {
806 err = ubi_is_mapped(c->ubi, lnum);
807 if (unlikely(err < 0))
808 return err;
809 if (err == 1) {
810 c->empty = 0;
811 break;
812 }
813
814 cond_resched();
815 }
816
817 return 0;
818}
819
820/*
821 * UBIFS mount options.
822 *
823 * Opt_fast_unmount: do not run a journal commit before un-mounting
824 * Opt_norm_unmount: run a journal commit before un-mounting
825 * Opt_err: just end of array marker
826 */
827enum {
828 Opt_fast_unmount,
829 Opt_norm_unmount,
830 Opt_err,
831};
832
833static match_table_t tokens = {
834 {Opt_fast_unmount, "fast_unmount"},
835 {Opt_norm_unmount, "norm_unmount"},
836 {Opt_err, NULL},
837};
838
839/**
840 * ubifs_parse_options - parse mount parameters.
841 * @c: UBIFS file-system description object
842 * @options: parameters to parse
843 * @is_remount: non-zero if this is FS re-mount
844 *
845 * This function parses UBIFS mount options and returns zero in case success
846 * and a negative error code in case of failure.
847 */
848static int ubifs_parse_options(struct ubifs_info *c, char *options,
849 int is_remount)
850{
851 char *p;
852 substring_t args[MAX_OPT_ARGS];
853
854 if (!options)
855 return 0;
856
857 while ((p = strsep(&options, ","))) {
858 int token;
859
860 if (!*p)
861 continue;
862
863 token = match_token(p, tokens, args);
864 switch (token) {
865 case Opt_fast_unmount:
866 c->mount_opts.unmount_mode = 2;
867 c->fast_unmount = 1;
868 break;
869 case Opt_norm_unmount:
870 c->mount_opts.unmount_mode = 1;
871 c->fast_unmount = 0;
872 break;
873 default:
874 ubifs_err("unrecognized mount option \"%s\" "
875 "or missing value", p);
876 return -EINVAL;
877 }
878 }
879
880 return 0;
881}
882
883/**
884 * destroy_journal - destroy journal data structures.
885 * @c: UBIFS file-system description object
886 *
887 * This function destroys journal data structures including those that may have
888 * been created by recovery functions.
889 */
890static void destroy_journal(struct ubifs_info *c)
891{
892 while (!list_empty(&c->unclean_leb_list)) {
893 struct ubifs_unclean_leb *ucleb;
894
895 ucleb = list_entry(c->unclean_leb_list.next,
896 struct ubifs_unclean_leb, list);
897 list_del(&ucleb->list);
898 kfree(ucleb);
899 }
900 while (!list_empty(&c->old_buds)) {
901 struct ubifs_bud *bud;
902
903 bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
904 list_del(&bud->list);
905 kfree(bud);
906 }
907 ubifs_destroy_idx_gc(c);
908 ubifs_destroy_size_tree(c);
909 ubifs_tnc_close(c);
910 free_buds(c);
911}
912
/**
 * mount_ubifs - mount UBIFS file-system.
 * @c: UBIFS file-system description object
 *
 * This function mounts UBIFS file system. Returns zero in case of success and
 * a negative error code in case of failure.
 *
 * If the function fails half way through, the resources acquired up to that
 * point are released via the chain of error labels at the end of the
 * function.
 */
static int mount_ubifs(struct ubifs_info *c)
{
	struct super_block *sb = c->vfs_sb;
	int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
	long long x;
	size_t sz;

	err = init_constants_early(c);
	if (err)
		return err;

#ifdef CONFIG_UBIFS_FS_DEBUG
	c->dbg_buf = vmalloc(c->leb_size);
	if (!c->dbg_buf)
		return -ENOMEM;
#endif

	err = check_volume_empty(c);
	if (err)
		goto out_free;

	if (c->empty && (mounted_read_only || c->ro_media)) {
		/*
		 * This UBI volume is empty, and read-only, or the file system
		 * is mounted read-only - we cannot format it.
		 */
		ubifs_err("can't format empty UBI volume: read-only %s",
			  c->ro_media ? "UBI volume" : "mount");
		err = -EROFS;
		goto out_free;
	}

	if (c->ro_media && !mounted_read_only) {
		ubifs_err("cannot mount read-write - read-only media");
		err = -EROFS;
		goto out_free;
	}

	/*
	 * The requirement for the buffer is that it should fit indexing B-tree
	 * height amount of integers. We assume the height of the TNC tree will
	 * never exceed 64.
	 */
	/* 'err' is preset so the allocation failures below can jump straight out */
	err = -ENOMEM;
	c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);
	if (!c->bottom_up_buf)
		goto out_free;

	c->sbuf = vmalloc(c->leb_size);
	if (!c->sbuf)
		goto out_free;

	/* This buffer is only allocated for read-write mounts */
	if (!mounted_read_only) {
		c->ileb_buf = vmalloc(c->leb_size);
		if (!c->ileb_buf)
			goto out_free;
	}

	err = ubifs_read_superblock(c);
	if (err)
		goto out_free;

	/*
	 * Make sure the compressor which is set as the default one in the
	 * superblock was actually compiled in.
	 */
	if (!ubifs_compr_present(c->default_compr)) {
		ubifs_warn("'%s' compressor is set by superblock, but not "
			   "compiled in", ubifs_compr_name(c->default_compr));
		c->default_compr = UBIFS_COMPR_NONE;
	}

	dbg_failure_mode_registration(c);

	err = init_constants_late(c);
	if (err)
		goto out_dereg;

	/* Size 'c->cbuf': two max. index node sizes, each aligned to min. I/O */
	sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
	sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
	c->cbuf = kmalloc(sz, GFP_NOFS);
	if (!c->cbuf) {
		err = -ENOMEM;
		goto out_dereg;
	}

	/* Write-buffers and the background thread are read-write only */
	if (!mounted_read_only) {
		err = alloc_wbufs(c);
		if (err)
			goto out_cbuf;

		/* Create background thread */
		sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
			c->vi.vol_id);
		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
		/* Treat a NULL return like an error pointer */
		if (!c->bgt)
			c->bgt = ERR_PTR(-EINVAL);
		if (IS_ERR(c->bgt)) {
			err = PTR_ERR(c->bgt);
			c->bgt = NULL;
			ubifs_err("cannot spawn \"%s\", error %d",
				  c->bgt_name, err);
			goto out_wbufs;
		}
		wake_up_process(c->bgt);
	}

	err = ubifs_read_master(c);
	if (err)
		goto out_master;

	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
		/* The master node is marked dirty, so recovery is required */
		ubifs_msg("recovery needed");
		c->need_recovery = 1;
		if (!mounted_read_only) {
			err = ubifs_recover_inl_heads(c, c->sbuf);
			if (err)
				goto out_master;
		}
	} else if (!mounted_read_only) {
		/*
		 * Set the "dirty" flag so that if we reboot uncleanly we
		 * will notice this immediately on the next mount.
		 */
		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
		err = ubifs_write_master(c);
		if (err)
			goto out_master;
	}

	err = ubifs_lpt_init(c, 1, !mounted_read_only);
	if (err)
		goto out_lpt;

	err = dbg_check_idx_size(c, c->old_idx_sz);
	if (err)
		goto out_lpt;

	err = ubifs_replay_journal(c);
	if (err)
		goto out_journal;

	err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
	if (err)
		goto out_orphans;

	if (!mounted_read_only) {
		int lnum;

		/* Check for enough free space */
		if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
			ubifs_err("insufficient available space");
			err = -EINVAL;
			goto out_orphans;
		}

		/* Check for enough log space */
		lnum = c->lhead_lnum + 1;
		/* The LEB after the log head, wrapping to the first log LEB */
		if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
			lnum = UBIFS_LOG_LNUM;
		if (lnum == c->ltail_lnum) {
			err = ubifs_consolidate_log(c);
			if (err)
				goto out_orphans;
		}

		if (c->need_recovery) {
			err = ubifs_recover_size(c);
			if (err)
				goto out_orphans;
			err = ubifs_rcvry_gc_commit(c);
		} else
			err = take_gc_lnum(c);
		/* Covers both 'ubifs_rcvry_gc_commit()' and 'take_gc_lnum()' */
		if (err)
			goto out_orphans;

		err = dbg_check_lprops(c);
		if (err)
			goto out_orphans;
	} else if (c->need_recovery) {
		err = ubifs_recover_size(c);
		if (err)
			goto out_orphans;
	}

	/* Add this file-system to the global 'ubifs_infos' list */
	spin_lock(&ubifs_infos_lock);
	list_add_tail(&c->infos_list, &ubifs_infos);
	spin_unlock(&ubifs_infos_lock);

	if (c->need_recovery) {
		if (mounted_read_only)
			ubifs_msg("recovery deferred");
		else {
			c->need_recovery = 0;
			ubifs_msg("recovery completed");
		}
	}

	err = dbg_check_filesystem(c);
	if (err)
		goto out_infos;

	/* Mounted successfully - the rest is informational output only */
	ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num,
		  c->vi.vol_id);
	if (mounted_read_only)
		ubifs_msg("mounted read-only");
	x = (long long)c->main_lebs * c->leb_size;
	ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
		  x, x >> 10, x >> 20, c->main_lebs);
	x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
	ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
		  x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
	ubifs_msg("media format %d, latest format %d",
		  c->fmt_version, UBIFS_FORMAT_VERSION);

	dbg_msg("compiled on: " __DATE__ " at " __TIME__);
	dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
	dbg_msg("LEB size: %d bytes (%d KiB)",
		c->leb_size, c->leb_size / 1024);
	dbg_msg("data journal heads: %d",
		c->jhead_cnt - NONDATA_JHEADS_CNT);
	dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X"
		"-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
		c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
		c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
		c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
		c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
	dbg_msg("fast unmount: %d", c->fast_unmount);
	dbg_msg("big_lpt %d", c->big_lpt);
	dbg_msg("log LEBs: %d (%d - %d)",
		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
	dbg_msg("LPT area LEBs: %d (%d - %d)",
		c->lpt_lebs, c->lpt_first, c->lpt_last);
	dbg_msg("orphan area LEBs: %d (%d - %d)",
		c->orph_lebs, c->orph_first, c->orph_last);
	dbg_msg("main area LEBs: %d (%d - %d)",
		c->main_lebs, c->main_first, c->leb_cnt - 1);
	dbg_msg("index LEBs: %d", c->lst.idx_lebs);
	dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
		c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
	dbg_msg("key hash type: %d", c->key_hash_type);
	dbg_msg("tree fanout: %d", c->fanout);
	dbg_msg("reserved GC LEB: %d", c->gc_lnum);
	dbg_msg("first main LEB: %d", c->main_first);
	dbg_msg("dead watermark: %d", c->dead_wm);
	dbg_msg("dark watermark: %d", c->dark_wm);
	x = (long long)c->main_lebs * c->dark_wm;
	dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
		x, x >> 10, x >> 20);
	dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
		c->max_bud_bytes, c->max_bud_bytes >> 10,
		c->max_bud_bytes >> 20);
	dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
		c->bg_bud_bytes, c->bg_bud_bytes >> 10,
		c->bg_bud_bytes >> 20);
	dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)",
		c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
	dbg_msg("max. seq. number: %llu", c->max_sqnum);
	dbg_msg("commit number: %llu", c->cmt_no);

	return 0;

	/*
	 * Error unwinding: each label releases one stage and falls through to
	 * the labels for the earlier stages.
	 */
out_infos:
	spin_lock(&ubifs_infos_lock);
	list_del(&c->infos_list);
	spin_unlock(&ubifs_infos_lock);
out_orphans:
	free_orphans(c);
out_journal:
	destroy_journal(c);
out_lpt:
	ubifs_lpt_free(c, 0);
out_master:
	kfree(c->mst_node);
	kfree(c->rcvrd_mst_node);
	if (c->bgt)
		kthread_stop(c->bgt);
out_wbufs:
	free_wbufs(c);
out_cbuf:
	kfree(c->cbuf);
out_dereg:
	dbg_failure_mode_deregistration(c);
out_free:
	vfree(c->ileb_buf);
	vfree(c->sbuf);
	kfree(c->bottom_up_buf);
	UBIFS_DBG(vfree(c->dbg_buf));
	return err;
}
1214
1215/**
1216 * ubifs_umount - un-mount UBIFS file-system.
1217 * @c: UBIFS file-system description object
1218 *
1219 * Note, this function is called to free allocated resourced when un-mounting,
1220 * as well as free resources when an error occurred while we were half way
1221 * through mounting (error path cleanup function). So it has to make sure the
1222 * resource was actually allocated before freeing it.
1223 */
1224static void ubifs_umount(struct ubifs_info *c)
1225{
1226 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
1227 c->vi.vol_id);
1228
1229 spin_lock(&ubifs_infos_lock);
1230 list_del(&c->infos_list);
1231 spin_unlock(&ubifs_infos_lock);
1232
1233 if (c->bgt)
1234 kthread_stop(c->bgt);
1235
1236 destroy_journal(c);
1237 free_wbufs(c);
1238 free_orphans(c);
1239 ubifs_lpt_free(c, 0);
1240
1241 kfree(c->cbuf);
1242 kfree(c->rcvrd_mst_node);
1243 kfree(c->mst_node);
1244 vfree(c->sbuf);
1245 kfree(c->bottom_up_buf);
1246 UBIFS_DBG(vfree(c->dbg_buf));
1247 vfree(c->ileb_buf);
1248 dbg_failure_mode_deregistration(c);
1249}
1250
1251/**
1252 * ubifs_remount_rw - re-mount in read-write mode.
1253 * @c: UBIFS file-system description object
1254 *
1255 * UBIFS avoids allocating many unnecessary resources when mounted in read-only
1256 * mode. This function allocates the needed resources and re-mounts UBIFS in
1257 * read-write mode.
1258 */
1259static int ubifs_remount_rw(struct ubifs_info *c)
1260{
1261 int err, lnum;
1262
1263 if (c->ro_media)
1264 return -EINVAL;
1265
1266 mutex_lock(&c->umount_mutex);
1267 c->remounting_rw = 1;
1268
1269 /* Check for enough free space */
1270 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
1271 ubifs_err("insufficient available space");
1272 err = -EINVAL;
1273 goto out;
1274 }
1275
1276 if (c->old_leb_cnt != c->leb_cnt) {
1277 struct ubifs_sb_node *sup;
1278
1279 sup = ubifs_read_sb_node(c);
1280 if (IS_ERR(sup)) {
1281 err = PTR_ERR(sup);
1282 goto out;
1283 }
1284 sup->leb_cnt = cpu_to_le32(c->leb_cnt);
1285 err = ubifs_write_sb_node(c, sup);
1286 if (err)
1287 goto out;
1288 }
1289
1290 if (c->need_recovery) {
1291 ubifs_msg("completing deferred recovery");
1292 err = ubifs_write_rcvrd_mst_node(c);
1293 if (err)
1294 goto out;
1295 err = ubifs_recover_size(c);
1296 if (err)
1297 goto out;
1298 err = ubifs_clean_lebs(c, c->sbuf);
1299 if (err)
1300 goto out;
1301 err = ubifs_recover_inl_heads(c, c->sbuf);
1302 if (err)
1303 goto out;
1304 }
1305
1306 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
1307 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
1308 err = ubifs_write_master(c);
1309 if (err)
1310 goto out;
1311 }
1312
1313 c->ileb_buf = vmalloc(c->leb_size);
1314 if (!c->ileb_buf) {
1315 err = -ENOMEM;
1316 goto out;
1317 }
1318
1319 err = ubifs_lpt_init(c, 0, 1);
1320 if (err)
1321 goto out;
1322
1323 err = alloc_wbufs(c);
1324 if (err)
1325 goto out;
1326
1327 ubifs_create_buds_lists(c);
1328
1329 /* Create background thread */
1330 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1331 if (!c->bgt)
1332 c->bgt = ERR_PTR(-EINVAL);
1333 if (IS_ERR(c->bgt)) {
1334 err = PTR_ERR(c->bgt);
1335 c->bgt = NULL;
1336 ubifs_err("cannot spawn \"%s\", error %d",
1337 c->bgt_name, err);
1338 return err;
1339 }
1340 wake_up_process(c->bgt);
1341
1342 c->orph_buf = vmalloc(c->leb_size);
1343 if (!c->orph_buf)
1344 return -ENOMEM;
1345
1346 /* Check for enough log space */
1347 lnum = c->lhead_lnum + 1;
1348 if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
1349 lnum = UBIFS_LOG_LNUM;
1350 if (lnum == c->ltail_lnum) {
1351 err = ubifs_consolidate_log(c);
1352 if (err)
1353 goto out;
1354 }
1355
1356 if (c->need_recovery)
1357 err = ubifs_rcvry_gc_commit(c);
1358 else
1359 err = take_gc_lnum(c);
1360 if (err)
1361 goto out;
1362
1363 if (c->need_recovery) {
1364 c->need_recovery = 0;
1365 ubifs_msg("deferred recovery completed");
1366 }
1367
1368 dbg_gen("re-mounted read-write");
1369 c->vfs_sb->s_flags &= ~MS_RDONLY;
1370 c->remounting_rw = 0;
1371 mutex_unlock(&c->umount_mutex);
1372 return 0;
1373
1374out:
1375 vfree(c->orph_buf);
1376 c->orph_buf = NULL;
1377 if (c->bgt) {
1378 kthread_stop(c->bgt);
1379 c->bgt = NULL;
1380 }
1381 free_wbufs(c);
1382 vfree(c->ileb_buf);
1383 c->ileb_buf = NULL;
1384 ubifs_lpt_free(c, 1);
1385 c->remounting_rw = 0;
1386 mutex_unlock(&c->umount_mutex);
1387 return err;
1388}
1389
1390/**
1391 * commit_on_unmount - commit the journal when un-mounting.
1392 * @c: UBIFS file-system description object
1393 *
1394 * This function is called during un-mounting and it commits the journal unless
1395 * the "fast unmount" mode is enabled. It also avoids committing the journal if
1396 * it contains too few data.
1397 *
1398 * Sometimes recovery requires the journal to be committed at least once, and
1399 * this function takes care about this.
1400 */
1401static void commit_on_unmount(struct ubifs_info *c)
1402{
1403 if (!c->fast_unmount) {
1404 long long bud_bytes;
1405
1406 spin_lock(&c->buds_lock);
1407 bud_bytes = c->bud_bytes;
1408 spin_unlock(&c->buds_lock);
1409 if (bud_bytes > c->leb_size)
1410 ubifs_run_commit(c);
1411 }
1412}
1413
1414/**
1415 * ubifs_remount_ro - re-mount in read-only mode.
1416 * @c: UBIFS file-system description object
1417 *
1418 * We rely on VFS to have stopped writing. Possibly the background thread could
1419 * be running a commit, however kthread_stop will wait in that case.
1420 */
1421static void ubifs_remount_ro(struct ubifs_info *c)
1422{
1423 int i, err;
1424
1425 ubifs_assert(!c->need_recovery);
1426 commit_on_unmount(c);
1427
1428 mutex_lock(&c->umount_mutex);
1429 if (c->bgt) {
1430 kthread_stop(c->bgt);
1431 c->bgt = NULL;
1432 }
1433
1434 for (i = 0; i < c->jhead_cnt; i++) {
1435 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1436 del_timer_sync(&c->jheads[i].wbuf.timer);
1437 }
1438
1439 if (!c->ro_media) {
1440 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1441 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1442 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1443 err = ubifs_write_master(c);
1444 if (err)
1445 ubifs_ro_mode(c, err);
1446 }
1447
1448 ubifs_destroy_idx_gc(c);
1449 free_wbufs(c);
1450 vfree(c->orph_buf);
1451 c->orph_buf = NULL;
1452 vfree(c->ileb_buf);
1453 c->ileb_buf = NULL;
1454 ubifs_lpt_free(c, 1);
1455 mutex_unlock(&c->umount_mutex);
1456}
1457
1458static void ubifs_put_super(struct super_block *sb)
1459{
1460 int i;
1461 struct ubifs_info *c = sb->s_fs_info;
1462
1463 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1464 c->vi.vol_id);
1465 /*
1466 * The following asserts are only valid if there has not been a failure
1467 * of the media. For example, there will be dirty inodes if we failed
1468 * to write them back because of I/O errors.
1469 */
1470 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
1471 ubifs_assert(c->budg_idx_growth == 0);
1472 ubifs_assert(c->budg_data_growth == 0);
1473
1474 /*
1475 * The 'c->umount_lock' prevents races between UBIFS memory shrinker
1476 * and file system un-mount. Namely, it prevents the shrinker from
1477 * picking this superblock for shrinking - it will be just skipped if
1478 * the mutex is locked.
1479 */
1480 mutex_lock(&c->umount_mutex);
1481 if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
1482 /*
1483 * First of all kill the background thread to make sure it does
1484 * not interfere with un-mounting and freeing resources.
1485 */
1486 if (c->bgt) {
1487 kthread_stop(c->bgt);
1488 c->bgt = NULL;
1489 }
1490
1491 /* Synchronize write-buffers */
1492 if (c->jheads)
1493 for (i = 0; i < c->jhead_cnt; i++) {
1494 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1495 del_timer_sync(&c->jheads[i].wbuf.timer);
1496 }
1497
1498 /*
1499 * On fatal errors c->ro_media is set to 1, in which case we do
1500 * not write the master node.
1501 */
1502 if (!c->ro_media) {
1503 /*
1504 * We are being cleanly unmounted which means the
1505 * orphans were killed - indicate this in the master
1506 * node. Also save the reserved GC LEB number.
1507 */
1508 int err;
1509
1510 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1511 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1512 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1513 err = ubifs_write_master(c);
1514 if (err)
1515 /*
1516 * Recovery will attempt to fix the master area
1517 * next mount, so we just print a message and
1518 * continue to unmount normally.
1519 */
1520 ubifs_err("failed to write master node, "
1521 "error %d", err);
1522 }
1523 }
1524
1525 ubifs_umount(c);
1526 bdi_destroy(&c->bdi);
1527 ubi_close_volume(c->ubi);
1528 mutex_unlock(&c->umount_mutex);
1529 kfree(c);
1530}
1531
1532static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1533{
1534 int err;
1535 struct ubifs_info *c = sb->s_fs_info;
1536
1537 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
1538
1539 err = ubifs_parse_options(c, data, 1);
1540 if (err) {
1541 ubifs_err("invalid or unknown remount parameter");
1542 return err;
1543 }
1544 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1545 err = ubifs_remount_rw(c);
1546 if (err)
1547 return err;
1548 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
1549 ubifs_remount_ro(c);
1550
1551 return 0;
1552}
1553
1554struct super_operations ubifs_super_operations = {
1555 .alloc_inode = ubifs_alloc_inode,
1556 .destroy_inode = ubifs_destroy_inode,
1557 .put_super = ubifs_put_super,
1558 .write_inode = ubifs_write_inode,
1559 .delete_inode = ubifs_delete_inode,
1560 .statfs = ubifs_statfs,
1561 .dirty_inode = ubifs_dirty_inode,
1562 .remount_fs = ubifs_remount_fs,
1563 .show_options = ubifs_show_options,
1564 .sync_fs = ubifs_sync_fs,
1565};
1566
1567/**
1568 * open_ubi - parse UBI device name string and open the UBI device.
1569 * @name: UBI volume name
1570 * @mode: UBI volume open mode
1571 *
1572 * There are several ways to specify UBI volumes when mounting UBIFS:
1573 * o ubiX_Y - UBI device number X, volume Y;
1574 * o ubiY - UBI device number 0, volume Y;
1575 * o ubiX:NAME - mount UBI device X, volume with name NAME;
1576 * o ubi:NAME - mount UBI device 0, volume with name NAME.
1577 *
1578 * Alternative '!' separator may be used instead of ':' (because some shells
1579 * like busybox may interpret ':' as an NFS host name separator). This function
1580 * returns ubi volume object in case of success and a negative error code in
1581 * case of failure.
1582 */
1583static struct ubi_volume_desc *open_ubi(const char *name, int mode)
1584{
1585 int dev, vol;
1586 char *endptr;
1587
1588 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
1589 return ERR_PTR(-EINVAL);
1590
1591 /* ubi:NAME method */
1592 if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
1593 return ubi_open_volume_nm(0, name + 4, mode);
1594
1595 if (!isdigit(name[3]))
1596 return ERR_PTR(-EINVAL);
1597
1598 dev = simple_strtoul(name + 3, &endptr, 0);
1599
1600 /* ubiY method */
1601 if (*endptr == '\0')
1602 return ubi_open_volume(0, dev, mode);
1603
1604 /* ubiX_Y method */
1605 if (*endptr == '_' && isdigit(endptr[1])) {
1606 vol = simple_strtoul(endptr + 1, &endptr, 0);
1607 if (*endptr != '\0')
1608 return ERR_PTR(-EINVAL);
1609 return ubi_open_volume(dev, vol, mode);
1610 }
1611
1612 /* ubiX:NAME method */
1613 if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
1614 return ubi_open_volume_nm(dev, ++endptr, mode);
1615
1616 return ERR_PTR(-EINVAL);
1617}
1618
1619static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1620{
1621 struct ubi_volume_desc *ubi = sb->s_fs_info;
1622 struct ubifs_info *c;
1623 struct inode *root;
1624 int err;
1625
1626 c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
1627 if (!c)
1628 return -ENOMEM;
1629
1630 spin_lock_init(&c->cnt_lock);
1631 spin_lock_init(&c->cs_lock);
1632 spin_lock_init(&c->buds_lock);
1633 spin_lock_init(&c->space_lock);
1634 spin_lock_init(&c->orphan_lock);
1635 init_rwsem(&c->commit_sem);
1636 mutex_init(&c->lp_mutex);
1637 mutex_init(&c->tnc_mutex);
1638 mutex_init(&c->log_mutex);
1639 mutex_init(&c->mst_mutex);
1640 mutex_init(&c->umount_mutex);
1641 init_waitqueue_head(&c->cmt_wq);
1642 c->buds = RB_ROOT;
1643 c->old_idx = RB_ROOT;
1644 c->size_tree = RB_ROOT;
1645 c->orph_tree = RB_ROOT;
1646 INIT_LIST_HEAD(&c->infos_list);
1647 INIT_LIST_HEAD(&c->idx_gc);
1648 INIT_LIST_HEAD(&c->replay_list);
1649 INIT_LIST_HEAD(&c->replay_buds);
1650 INIT_LIST_HEAD(&c->uncat_list);
1651 INIT_LIST_HEAD(&c->empty_list);
1652 INIT_LIST_HEAD(&c->freeable_list);
1653 INIT_LIST_HEAD(&c->frdi_idx_list);
1654 INIT_LIST_HEAD(&c->unclean_leb_list);
1655 INIT_LIST_HEAD(&c->old_buds);
1656 INIT_LIST_HEAD(&c->orph_list);
1657 INIT_LIST_HEAD(&c->orph_new);
1658
1659 c->highest_inum = UBIFS_FIRST_INO;
1660 get_random_bytes(&c->vfs_gen, sizeof(int));
1661 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
1662
1663 ubi_get_volume_info(ubi, &c->vi);
1664 ubi_get_device_info(c->vi.ubi_num, &c->di);
1665
1666 /* Re-open the UBI device in read-write mode */
1667 c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
1668 if (IS_ERR(c->ubi)) {
1669 err = PTR_ERR(c->ubi);
1670 goto out_free;
1671 }
1672
1673 /*
1674 * UBIFS provids 'backing_dev_info' in order to disable readahead. For
1675 * UBIFS, I/O is not deferred, it is done immediately in readpage,
1676 * which means the user would have to wait not just for their own I/O
1677 * but the readahead I/O as well i.e. completely pointless.
1678 *
1679 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1680 */
1681 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1682 c->bdi.unplug_io_fn = default_unplug_io_fn;
1683 err = bdi_init(&c->bdi);
1684 if (err)
1685 goto out_close;
1686
1687 err = ubifs_parse_options(c, data, 0);
1688 if (err)
1689 goto out_bdi;
1690
1691 c->vfs_sb = sb;
1692
1693 sb->s_fs_info = c;
1694 sb->s_magic = UBIFS_SUPER_MAGIC;
1695 sb->s_blocksize = UBIFS_BLOCK_SIZE;
1696 sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
1697 sb->s_dev = c->vi.cdev;
1698 sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
1699 if (c->max_inode_sz > MAX_LFS_FILESIZE)
1700 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
1701 sb->s_op = &ubifs_super_operations;
1702
1703 mutex_lock(&c->umount_mutex);
1704 err = mount_ubifs(c);
1705 if (err) {
1706 ubifs_assert(err < 0);
1707 goto out_unlock;
1708 }
1709
1710 /* Read the root inode */
1711 root = ubifs_iget(sb, UBIFS_ROOT_INO);
1712 if (IS_ERR(root)) {
1713 err = PTR_ERR(root);
1714 goto out_umount;
1715 }
1716
1717 sb->s_root = d_alloc_root(root);
1718 if (!sb->s_root)
1719 goto out_iput;
1720
1721 mutex_unlock(&c->umount_mutex);
1722
1723 return 0;
1724
1725out_iput:
1726 iput(root);
1727out_umount:
1728 ubifs_umount(c);
1729out_unlock:
1730 mutex_unlock(&c->umount_mutex);
1731out_bdi:
1732 bdi_destroy(&c->bdi);
1733out_close:
1734 ubi_close_volume(c->ubi);
1735out_free:
1736 kfree(c);
1737 return err;
1738}
1739
1740static int sb_test(struct super_block *sb, void *data)
1741{
1742 dev_t *dev = data;
1743
1744 return sb->s_dev == *dev;
1745}
1746
1747static int sb_set(struct super_block *sb, void *data)
1748{
1749 dev_t *dev = data;
1750
1751 sb->s_dev = *dev;
1752 return 0;
1753}
1754
1755static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
1756 const char *name, void *data, struct vfsmount *mnt)
1757{
1758 struct ubi_volume_desc *ubi;
1759 struct ubi_volume_info vi;
1760 struct super_block *sb;
1761 int err;
1762
1763 dbg_gen("name %s, flags %#x", name, flags);
1764
1765 /*
1766 * Get UBI device number and volume ID. Mount it read-only so far
1767 * because this might be a new mount point, and UBI allows only one
1768 * read-write user at a time.
1769 */
1770 ubi = open_ubi(name, UBI_READONLY);
1771 if (IS_ERR(ubi)) {
1772 ubifs_err("cannot open \"%s\", error %d",
1773 name, (int)PTR_ERR(ubi));
1774 return PTR_ERR(ubi);
1775 }
1776 ubi_get_volume_info(ubi, &vi);
1777
1778 dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
1779
1780 sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
1781 if (IS_ERR(sb)) {
1782 err = PTR_ERR(sb);
1783 goto out_close;
1784 }
1785
1786 if (sb->s_root) {
1787 /* A new mount point for already mounted UBIFS */
1788 dbg_gen("this ubi volume is already mounted");
1789 if ((flags ^ sb->s_flags) & MS_RDONLY) {
1790 err = -EBUSY;
1791 goto out_deact;
1792 }
1793 } else {
1794 sb->s_flags = flags;
1795 /*
1796 * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is
1797 * replaced by 'c'.
1798 */
1799 sb->s_fs_info = ubi;
1800 err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
1801 if (err)
1802 goto out_deact;
1803 /* We do not support atime */
1804 sb->s_flags |= MS_ACTIVE | MS_NOATIME;
1805 }
1806
1807 /* 'fill_super()' opens ubi again so we must close it here */
1808 ubi_close_volume(ubi);
1809
1810 return simple_set_mnt(mnt, sb);
1811
1812out_deact:
1813 up_write(&sb->s_umount);
1814 deactivate_super(sb);
1815out_close:
1816 ubi_close_volume(ubi);
1817 return err;
1818}
1819
1820static void ubifs_kill_sb(struct super_block *sb)
1821{
1822 struct ubifs_info *c = sb->s_fs_info;
1823
1824 /*
1825 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
1826 * in order to be outside BKL.
1827 */
1828 if (sb->s_root && !(sb->s_flags & MS_RDONLY))
1829 commit_on_unmount(c);
1830 /* The un-mount routine is actually done in put_super() */
1831 generic_shutdown_super(sb);
1832}
1833
1834static struct file_system_type ubifs_fs_type = {
1835 .name = "ubifs",
1836 .owner = THIS_MODULE,
1837 .get_sb = ubifs_get_sb,
1838 .kill_sb = ubifs_kill_sb
1839};
1840
1841/*
1842 * Inode slab cache constructor.
1843 */
1844static void inode_slab_ctor(struct kmem_cache *cachep, void *obj)
1845{
1846 struct ubifs_inode *ui = obj;
1847 inode_init_once(&ui->vfs_inode);
1848}
1849
1850static int __init ubifs_init(void)
1851{
1852 int err;
1853
1854 BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
1855
1856 /* Make sure node sizes are 8-byte aligned */
1857 BUILD_BUG_ON(UBIFS_CH_SZ & 7);
1858 BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7);
1859 BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7);
1860 BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7);
1861 BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7);
1862 BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7);
1863 BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7);
1864 BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7);
1865 BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7);
1866 BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7);
1867 BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7);
1868
1869 BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7);
1870 BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7);
1871 BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7);
1872 BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7);
1873 BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7);
1874 BUILD_BUG_ON(MIN_WRITE_SZ & 7);
1875
1876 /* Check min. node size */
1877 BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ);
1878 BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ);
1879 BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ);
1880 BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ);
1881
1882 BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
1883 BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
1884 BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ);
1885 BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ);
1886
1887 /* Defined node sizes */
1888 BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096);
1889 BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512);
1890 BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
1891 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
1892
1893 /*
1894 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
1895 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
1896 */
1897 if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
1898 ubifs_err("VFS page cache size is %u bytes, but UBIFS requires"
1899 " at least 4096 bytes",
1900 (unsigned int)PAGE_CACHE_SIZE);
1901 return -EINVAL;
1902 }
1903
1904 err = register_filesystem(&ubifs_fs_type);
1905 if (err) {
1906 ubifs_err("cannot register file system, error %d", err);
1907 return err;
1908 }
1909
1910 err = -ENOMEM;
1911 ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
1912 sizeof(struct ubifs_inode), 0,
1913 SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
1914 &inode_slab_ctor);
1915 if (!ubifs_inode_slab)
1916 goto out_reg;
1917
1918 register_shrinker(&ubifs_shrinker_info);
1919
1920 err = ubifs_compressors_init();
1921 if (err)
1922 goto out_compr;
1923
1924 return 0;
1925
1926out_compr:
1927 unregister_shrinker(&ubifs_shrinker_info);
1928 kmem_cache_destroy(ubifs_inode_slab);
1929out_reg:
1930 unregister_filesystem(&ubifs_fs_type);
1931 return err;
1932}
1933/* late_initcall to let compressors initialize first */
1934late_initcall(ubifs_init);
1935
1936static void __exit ubifs_exit(void)
1937{
1938 ubifs_assert(list_empty(&ubifs_infos));
1939 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
1940
1941 ubifs_compressors_exit();
1942 unregister_shrinker(&ubifs_shrinker_info);
1943 kmem_cache_destroy(ubifs_inode_slab);
1944 unregister_filesystem(&ubifs_fs_type);
1945}
1946module_exit(ubifs_exit);
1947
1948MODULE_LICENSE("GPL");
1949MODULE_VERSION(__stringify(UBIFS_VERSION));
1950MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
1951MODULE_DESCRIPTION("UBIFS - UBI File System");
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
new file mode 100644
index 000000000000..e909f4a96443
--- /dev/null
+++ b/fs/ubifs/tnc.c
@@ -0,0 +1,2956 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file implements TNC (Tree Node Cache) which caches indexing nodes of
25 * the UBIFS B-tree.
26 *
27 * At the moment the locking rules of the TNC tree are quite simple and
28 * straightforward. We just have a mutex and lock it when we traverse the
29 * tree. If a znode is not in memory, we read it from flash while still having
30 * the mutex locked.
31 */
32
33#include <linux/crc32.h>
34#include "ubifs.h"
35
/*
 * Return codes of the 'matches_name()' and 'fallible_matches_name()'
 * functions.
 * @NAME_LESS: name of the entry referred by the zbranch is less than the
 *             name it is compared with
 * @NAME_MATCHES: names match
 * @NAME_GREATER: name of the entry referred by the zbranch is greater than
 *                the name it is compared with
 * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media
 *
 * These constants were introduced to improve readability.
 */
enum {
	NAME_LESS = 0,
	NAME_MATCHES,
	NAME_GREATER,
	NOT_ON_MEDIA,
};
52
53/**
54 * insert_old_idx - record an index node obsoleted since the last commit start.
55 * @c: UBIFS file-system description object
56 * @lnum: LEB number of obsoleted index node
57 * @offs: offset of obsoleted index node
58 *
59 * Returns %0 on success, and a negative error code on failure.
60 *
61 * For recovery, there must always be a complete intact version of the index on
62 * flash at all times. That is called the "old index". It is the index as at the
63 * time of the last successful commit. Many of the index nodes in the old index
64 * may be dirty, but they must not be erased until the next successful commit
65 * (at which point that index becomes the old index).
66 *
67 * That means that the garbage collection and the in-the-gaps method of
68 * committing must be able to determine if an index node is in the old index.
69 * Most of the old index nodes can be found by looking up the TNC using the
70 * 'lookup_znode()' function. However, some of the old index nodes may have
71 * been deleted from the current index or may have been changed so much that
72 * they cannot be easily found. In those cases, an entry is added to an RB-tree.
73 * That is what this function does. The RB-tree is ordered by LEB number and
74 * offset because they uniquely identify the old index node.
75 */
76static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
77{
78 struct ubifs_old_idx *old_idx, *o;
79 struct rb_node **p, *parent = NULL;
80
81 old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
82 if (unlikely(!old_idx))
83 return -ENOMEM;
84 old_idx->lnum = lnum;
85 old_idx->offs = offs;
86
87 p = &c->old_idx.rb_node;
88 while (*p) {
89 parent = *p;
90 o = rb_entry(parent, struct ubifs_old_idx, rb);
91 if (lnum < o->lnum)
92 p = &(*p)->rb_left;
93 else if (lnum > o->lnum)
94 p = &(*p)->rb_right;
95 else if (offs < o->offs)
96 p = &(*p)->rb_left;
97 else if (offs > o->offs)
98 p = &(*p)->rb_right;
99 else {
100 ubifs_err("old idx added twice!");
101 kfree(old_idx);
102 return 0;
103 }
104 }
105 rb_link_node(&old_idx->rb, parent, p);
106 rb_insert_color(&old_idx->rb, &c->old_idx);
107 return 0;
108}
109
110/**
111 * insert_old_idx_znode - record a znode obsoleted since last commit start.
112 * @c: UBIFS file-system description object
113 * @znode: znode of obsoleted index node
114 *
115 * Returns %0 on success, and a negative error code on failure.
116 */
117int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode)
118{
119 if (znode->parent) {
120 struct ubifs_zbranch *zbr;
121
122 zbr = &znode->parent->zbranch[znode->iip];
123 if (zbr->len)
124 return insert_old_idx(c, zbr->lnum, zbr->offs);
125 } else
126 if (c->zroot.len)
127 return insert_old_idx(c, c->zroot.lnum,
128 c->zroot.offs);
129 return 0;
130}
131
132/**
133 * ins_clr_old_idx_znode - record a znode obsoleted since last commit start.
134 * @c: UBIFS file-system description object
135 * @znode: znode of obsoleted index node
136 *
137 * Returns %0 on success, and a negative error code on failure.
138 */
139static int ins_clr_old_idx_znode(struct ubifs_info *c,
140 struct ubifs_znode *znode)
141{
142 int err;
143
144 if (znode->parent) {
145 struct ubifs_zbranch *zbr;
146
147 zbr = &znode->parent->zbranch[znode->iip];
148 if (zbr->len) {
149 err = insert_old_idx(c, zbr->lnum, zbr->offs);
150 if (err)
151 return err;
152 zbr->lnum = 0;
153 zbr->offs = 0;
154 zbr->len = 0;
155 }
156 } else
157 if (c->zroot.len) {
158 err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs);
159 if (err)
160 return err;
161 c->zroot.lnum = 0;
162 c->zroot.offs = 0;
163 c->zroot.len = 0;
164 }
165 return 0;
166}
167
168/**
169 * destroy_old_idx - destroy the old_idx RB-tree.
170 * @c: UBIFS file-system description object
171 *
172 * During start commit, the old_idx RB-tree is used to avoid overwriting index
173 * nodes that were in the index last commit but have since been deleted. This
174 * is necessary for recovery i.e. the old index must be kept intact until the
175 * new index is successfully written. The old-idx RB-tree is used for the
176 * in-the-gaps method of writing index nodes and is destroyed every commit.
177 */
178void destroy_old_idx(struct ubifs_info *c)
179{
180 struct rb_node *this = c->old_idx.rb_node;
181 struct ubifs_old_idx *old_idx;
182
183 while (this) {
184 if (this->rb_left) {
185 this = this->rb_left;
186 continue;
187 } else if (this->rb_right) {
188 this = this->rb_right;
189 continue;
190 }
191 old_idx = rb_entry(this, struct ubifs_old_idx, rb);
192 this = rb_parent(this);
193 if (this) {
194 if (this->rb_left == &old_idx->rb)
195 this->rb_left = NULL;
196 else
197 this->rb_right = NULL;
198 }
199 kfree(old_idx);
200 }
201 c->old_idx = RB_ROOT;
202}
203
204/**
205 * copy_znode - copy a dirty znode.
206 * @c: UBIFS file-system description object
207 * @znode: znode to copy
208 *
209 * A dirty znode being committed may not be changed, so it is copied.
210 */
211static struct ubifs_znode *copy_znode(struct ubifs_info *c,
212 struct ubifs_znode *znode)
213{
214 struct ubifs_znode *zn;
215
216 zn = kmalloc(c->max_znode_sz, GFP_NOFS);
217 if (unlikely(!zn))
218 return ERR_PTR(-ENOMEM);
219
220 memcpy(zn, znode, c->max_znode_sz);
221 zn->cnext = NULL;
222 __set_bit(DIRTY_ZNODE, &zn->flags);
223 __clear_bit(COW_ZNODE, &zn->flags);
224
225 ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
226 __set_bit(OBSOLETE_ZNODE, &znode->flags);
227
228 if (znode->level != 0) {
229 int i;
230 const int n = zn->child_cnt;
231
232 /* The children now have new parent */
233 for (i = 0; i < n; i++) {
234 struct ubifs_zbranch *zbr = &zn->zbranch[i];
235
236 if (zbr->znode)
237 zbr->znode->parent = zn;
238 }
239 }
240
241 atomic_long_inc(&c->dirty_zn_cnt);
242 return zn;
243}
244
245/**
246 * add_idx_dirt - add dirt due to a dirty znode.
247 * @c: UBIFS file-system description object
248 * @lnum: LEB number of index node
249 * @dirt: size of index node
250 *
251 * This function updates lprops dirty space and the new size of the index.
252 */
253static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
254{
255 c->calc_idx_sz -= ALIGN(dirt, 8);
256 return ubifs_add_dirt(c, lnum, dirt);
257}
258
259/**
260 * dirty_cow_znode - ensure a znode is not being committed.
261 * @c: UBIFS file-system description object
262 * @zbr: branch of znode to check
263 *
264 * Returns dirtied znode on success or negative error code on failure.
265 */
266static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
267 struct ubifs_zbranch *zbr)
268{
269 struct ubifs_znode *znode = zbr->znode;
270 struct ubifs_znode *zn;
271 int err;
272
273 if (!test_bit(COW_ZNODE, &znode->flags)) {
274 /* znode is not being committed */
275 if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
276 atomic_long_inc(&c->dirty_zn_cnt);
277 atomic_long_dec(&c->clean_zn_cnt);
278 atomic_long_dec(&ubifs_clean_zn_cnt);
279 err = add_idx_dirt(c, zbr->lnum, zbr->len);
280 if (unlikely(err))
281 return ERR_PTR(err);
282 }
283 return znode;
284 }
285
286 zn = copy_znode(c, znode);
287 if (unlikely(IS_ERR(zn)))
288 return zn;
289
290 if (zbr->len) {
291 err = insert_old_idx(c, zbr->lnum, zbr->offs);
292 if (unlikely(err))
293 return ERR_PTR(err);
294 err = add_idx_dirt(c, zbr->lnum, zbr->len);
295 } else
296 err = 0;
297
298 zbr->znode = zn;
299 zbr->lnum = 0;
300 zbr->offs = 0;
301 zbr->len = 0;
302
303 if (unlikely(err))
304 return ERR_PTR(err);
305 return zn;
306}
307
308/**
309 * lnc_add - add a leaf node to the leaf node cache.
310 * @c: UBIFS file-system description object
311 * @zbr: zbranch of leaf node
312 * @node: leaf node
313 *
314 * Leaf nodes are non-index nodes directory entry nodes or data nodes. The
315 * purpose of the leaf node cache is to save re-reading the same leaf node over
316 * and over again. Most things are cached by VFS, however the file system must
317 * cache directory entries for readdir and for resolving hash collisions. The
318 * present implementation of the leaf node cache is extremely simple, and
319 * allows for error returns that are not used but that may be needed if a more
320 * complex implementation is created.
321 *
322 * Note, this function does not add the @node object to LNC directly, but
323 * allocates a copy of the object and adds the copy to LNC. The reason for this
324 * is that @node has been allocated outside of the TNC subsystem and will be
325 * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC
326 * may be changed at any time, e.g. freed by the shrinker.
327 */
328static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
329 const void *node)
330{
331 int err;
332 void *lnc_node;
333 const struct ubifs_dent_node *dent = node;
334
335 ubifs_assert(!zbr->leaf);
336 ubifs_assert(zbr->len != 0);
337 ubifs_assert(is_hash_key(c, &zbr->key));
338
339 err = ubifs_validate_entry(c, dent);
340 if (err) {
341 dbg_dump_stack();
342 dbg_dump_node(c, dent);
343 return err;
344 }
345
346 lnc_node = kmalloc(zbr->len, GFP_NOFS);
347 if (!lnc_node)
348 /* We don't have to have the cache, so no error */
349 return 0;
350
351 memcpy(lnc_node, node, zbr->len);
352 zbr->leaf = lnc_node;
353 return 0;
354}
355
356 /**
357 * lnc_add_directly - add a leaf node to the leaf-node-cache.
358 * @c: UBIFS file-system description object
359 * @zbr: zbranch of leaf node
360 * @node: leaf node
361 *
362 * This function is similar to 'lnc_add()', but it does not create a copy of
363 * @node but inserts @node to TNC directly.
364 */
365static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
366 void *node)
367{
368 int err;
369
370 ubifs_assert(!zbr->leaf);
371 ubifs_assert(zbr->len != 0);
372
373 err = ubifs_validate_entry(c, node);
374 if (err) {
375 dbg_dump_stack();
376 dbg_dump_node(c, node);
377 return err;
378 }
379
380 zbr->leaf = node;
381 return 0;
382}
383
384/**
385 * lnc_free - remove a leaf node from the leaf node cache.
386 * @zbr: zbranch of leaf node
387 * @node: leaf node
388 */
389static void lnc_free(struct ubifs_zbranch *zbr)
390{
391 if (!zbr->leaf)
392 return;
393 kfree(zbr->leaf);
394 zbr->leaf = NULL;
395}
396
397/**
398 * tnc_read_node_nm - read a "hashed" leaf node.
399 * @c: UBIFS file-system description object
400 * @zbr: key and position of the node
401 * @node: node is returned here
402 *
403 * This function reads a "hashed" node defined by @zbr from the leaf node cache
404 * (in it is there) or from the hash media, in which case the node is also
405 * added to LNC. Returns zero in case of success or a negative negative error
406 * code in case of failure.
407 */
408static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
409 void *node)
410{
411 int err;
412
413 ubifs_assert(is_hash_key(c, &zbr->key));
414
415 if (zbr->leaf) {
416 /* Read from the leaf node cache */
417 ubifs_assert(zbr->len != 0);
418 memcpy(node, zbr->leaf, zbr->len);
419 return 0;
420 }
421
422 err = ubifs_tnc_read_node(c, zbr, node);
423 if (err)
424 return err;
425
426 /* Add the node to the leaf node cache */
427 err = lnc_add(c, zbr, node);
428 return err;
429}
430
431/**
432 * try_read_node - read a node if it is a node.
433 * @c: UBIFS file-system description object
434 * @buf: buffer to read to
435 * @type: node type
436 * @len: node length (not aligned)
437 * @lnum: LEB number of node to read
438 * @offs: offset of node to read
439 *
440 * This function tries to read a node of known type and length, checks it and
441 * stores it in @buf. This function returns %1 if a node is present and %0 if
442 * a node is not present. A negative error code is returned for I/O errors.
443 * This function performs that same function as ubifs_read_node except that
444 * it does not require that there is actually a node present and instead
445 * the return code indicates if a node was read.
446 */
447static int try_read_node(const struct ubifs_info *c, void *buf, int type,
448 int len, int lnum, int offs)
449{
450 int err, node_len;
451 struct ubifs_ch *ch = buf;
452 uint32_t crc, node_crc;
453
454 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
455
456 err = ubi_read(c->ubi, lnum, buf, offs, len);
457 if (err) {
458 ubifs_err("cannot read node type %d from LEB %d:%d, error %d",
459 type, lnum, offs, err);
460 return err;
461 }
462
463 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
464 return 0;
465
466 if (ch->node_type != type)
467 return 0;
468
469 node_len = le32_to_cpu(ch->len);
470 if (node_len != len)
471 return 0;
472
473 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
474 node_crc = le32_to_cpu(ch->crc);
475 if (crc != node_crc)
476 return 0;
477
478 return 1;
479}
480
481/**
482 * fallible_read_node - try to read a leaf node.
483 * @c: UBIFS file-system description object
484 * @key: key of node to read
485 * @zbr: position of node
486 * @node: node returned
487 *
488 * This function tries to read a node and returns %1 if the node is read, %0
489 * if the node is not present, and a negative error code in the case of error.
490 */
491static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
492 struct ubifs_zbranch *zbr, void *node)
493{
494 int ret;
495
496 dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
497
498 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
499 zbr->offs);
500 if (ret == 1) {
501 union ubifs_key node_key;
502 struct ubifs_dent_node *dent = node;
503
504 /* All nodes have key in the same place */
505 key_read(c, &dent->key, &node_key);
506 if (keys_cmp(c, key, &node_key) != 0)
507 ret = 0;
508 }
509 if (ret == 0)
510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
512 return ret;
513}
514
515/**
516 * matches_name - determine if a direntry or xattr entry matches a given name.
517 * @c: UBIFS file-system description object
518 * @zbr: zbranch of dent
519 * @nm: name to match
520 *
521 * This function checks if xentry/direntry referred by zbranch @zbr matches name
522 * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by
523 * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case
524 * of failure, a negative error code is returned.
525 */
526static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
527 const struct qstr *nm)
528{
529 struct ubifs_dent_node *dent;
530 int nlen, err;
531
532 /* If possible, match against the dent in the leaf node cache */
533 if (!zbr->leaf) {
534 dent = kmalloc(zbr->len, GFP_NOFS);
535 if (!dent)
536 return -ENOMEM;
537
538 err = ubifs_tnc_read_node(c, zbr, dent);
539 if (err)
540 goto out_free;
541
542 /* Add the node to the leaf node cache */
543 err = lnc_add_directly(c, zbr, dent);
544 if (err)
545 goto out_free;
546 } else
547 dent = zbr->leaf;
548
549 nlen = le16_to_cpu(dent->nlen);
550 err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
551 if (err == 0) {
552 if (nlen == nm->len)
553 return NAME_MATCHES;
554 else if (nlen < nm->len)
555 return NAME_LESS;
556 else
557 return NAME_GREATER;
558 } else if (err < 0)
559 return NAME_LESS;
560 else
561 return NAME_GREATER;
562
563out_free:
564 kfree(dent);
565 return err;
566}
567
568/**
569 * get_znode - get a TNC znode that may not be loaded yet.
570 * @c: UBIFS file-system description object
571 * @znode: parent znode
572 * @n: znode branch slot number
573 *
574 * This function returns the znode or a negative error code.
575 */
576static struct ubifs_znode *get_znode(struct ubifs_info *c,
577 struct ubifs_znode *znode, int n)
578{
579 struct ubifs_zbranch *zbr;
580
581 zbr = &znode->zbranch[n];
582 if (zbr->znode)
583 znode = zbr->znode;
584 else
585 znode = ubifs_load_znode(c, zbr, znode, n);
586 return znode;
587}
588
589/**
590 * tnc_next - find next TNC entry.
591 * @c: UBIFS file-system description object
592 * @zn: znode is passed and returned here
593 * @n: znode branch slot number is passed and returned here
594 *
595 * This function returns %0 if the next TNC entry is found, %-ENOENT if there is
596 * no next entry, or a negative error code otherwise.
597 */
598static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
599{
600 struct ubifs_znode *znode = *zn;
601 int nn = *n;
602
603 nn += 1;
604 if (nn < znode->child_cnt) {
605 *n = nn;
606 return 0;
607 }
608 while (1) {
609 struct ubifs_znode *zp;
610
611 zp = znode->parent;
612 if (!zp)
613 return -ENOENT;
614 nn = znode->iip + 1;
615 znode = zp;
616 if (nn < znode->child_cnt) {
617 znode = get_znode(c, znode, nn);
618 if (IS_ERR(znode))
619 return PTR_ERR(znode);
620 while (znode->level != 0) {
621 znode = get_znode(c, znode, 0);
622 if (IS_ERR(znode))
623 return PTR_ERR(znode);
624 }
625 nn = 0;
626 break;
627 }
628 }
629 *zn = znode;
630 *n = nn;
631 return 0;
632}
633
634/**
635 * tnc_prev - find previous TNC entry.
636 * @c: UBIFS file-system description object
637 * @zn: znode is returned here
638 * @n: znode branch slot number is passed and returned here
639 *
640 * This function returns %0 if the previous TNC entry is found, %-ENOENT if
641 * there is no next entry, or a negative error code otherwise.
642 */
643static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
644{
645 struct ubifs_znode *znode = *zn;
646 int nn = *n;
647
648 if (nn > 0) {
649 *n = nn - 1;
650 return 0;
651 }
652 while (1) {
653 struct ubifs_znode *zp;
654
655 zp = znode->parent;
656 if (!zp)
657 return -ENOENT;
658 nn = znode->iip - 1;
659 znode = zp;
660 if (nn >= 0) {
661 znode = get_znode(c, znode, nn);
662 if (IS_ERR(znode))
663 return PTR_ERR(znode);
664 while (znode->level != 0) {
665 nn = znode->child_cnt - 1;
666 znode = get_znode(c, znode, nn);
667 if (IS_ERR(znode))
668 return PTR_ERR(znode);
669 }
670 nn = znode->child_cnt - 1;
671 break;
672 }
673 }
674 *zn = znode;
675 *n = nn;
676 return 0;
677}
678
679/**
680 * resolve_collision - resolve a collision.
681 * @c: UBIFS file-system description object
682 * @key: key of a directory or extended attribute entry
683 * @zn: znode is passed and returned here
684 * @n: zbranch number is passed and returned here
685 * @nm: name of the entry
686 *
687 * This function is called for "hashed" keys to make sure that the found key
688 * really corresponds to the looked up node (directory or extended attribute
689 * entry). It returns %1 and sets @zn and @n if the collision is resolved.
690 * %0 is returned if @nm is not found and @zn and @n are set to the previous
691 * entry, i.e. to the entry after which @nm could follow if it were in TNC.
692 * This means that @n may be set to %-1 if the leftmost key in @zn is the
693 * previous one. A negative error code is returned on failures.
694 */
695static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
696 struct ubifs_znode **zn, int *n,
697 const struct qstr *nm)
698{
699 int err;
700
/* First compare against the branch the caller has already found */
701 err = matches_name(c, &(*zn)->zbranch[*n], nm);
702 if (unlikely(err < 0))
703 return err;
704 if (err == NAME_MATCHES)
705 return 1;
706
707 if (err == NAME_GREATER) {
708 /* Look left */
709 while (1) {
710 err = tnc_prev(c, zn, n);
711 if (err == -ENOENT) {
/* Nothing to the left at all: report the position before slot 0 */
712 ubifs_assert(*n == 0);
713 *n = -1;
714 return 0;
715 }
716 if (err < 0)
717 return err;
718 if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
719 /*
720 * We have found the branch after which we would
721 * like to insert, but inserting in this znode
722 * may still be wrong. Consider the following 3
723 * znodes, in the case where we are resolving a
724 * collision with Key2.
725 *
726 * znode zp
727 * ----------------------
728 * level 1 | Key0 | Key1 |
729 * -----------------------
730 * | |
731 * znode za | | znode zb
732 * ------------ ------------
733 * level 0 | Key0 | | Key2 |
734 * ------------ ------------
735 *
736 * The lookup finds Key2 in znode zb. Let's say
737 * there is no match and the name is greater so
738 * we look left. When we find Key0, we end up
739 * here. If we return now, we will insert into
740 * znode za at slot n = 1. But that is invalid
741 * according to the parent's keys. Key2 must
742 * be inserted into znode zb.
743 *
744 * Note, this problem is not relevant for the
745 * case when we go right, because
746 * 'tnc_insert()' would correct the parent key.
747 */
748 if (*n == (*zn)->child_cnt - 1) {
/* Step back into the znode we came from and point before its slot 0 */
749 err = tnc_next(c, zn, n);
750 if (err) {
751 /* Should be impossible */
752 ubifs_assert(0);
753 if (err == -ENOENT)
754 err = -EINVAL;
755 return err;
756 }
757 ubifs_assert(*n == 0);
758 *n = -1;
759 }
760 return 0;
761 }
762 err = matches_name(c, &(*zn)->zbranch[*n], nm);
763 if (err < 0)
764 return err;
765 if (err == NAME_LESS)
766 return 0;
767 if (err == NAME_MATCHES)
768 return 1;
769 ubifs_assert(err == NAME_GREATER);
770 }
771 } else {
/*
 * Walk right on local copies; @zn and @n are only advanced past
 * entries whose name is not greater than @nm.
 */
772 int nn = *n;
773 struct ubifs_znode *znode = *zn;
774
775 /* Look right */
776 while (1) {
777 err = tnc_next(c, &znode, &nn);
778 if (err == -ENOENT)
779 return 0;
780 if (err < 0)
781 return err;
782 if (keys_cmp(c, &znode->zbranch[nn].key, key))
783 return 0;
784 err = matches_name(c, &znode->zbranch[nn], nm);
785 if (err < 0)
786 return err;
787 if (err == NAME_GREATER)
788 return 0;
789 *zn = znode;
790 *n = nn;
791 if (err == NAME_MATCHES)
792 return 1;
793 ubifs_assert(err == NAME_LESS);
794 }
795 }
796}
797
798/**
799 * fallible_matches_name - determine if a dent matches a given name.
800 * @c: UBIFS file-system description object
801 * @zbr: zbranch of dent
802 * @nm: name to match
803 *
804 * This is a "fallible" version of 'matches_name()' function which does not
805 * panic if the direntry/xentry referred by @zbr does not exist on the media.
806 *
807 * This function checks if xentry/direntry referred by zbranch @zbr matches name
808 * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr
809 * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA
810 * if xentry/direntry referred by @zbr does not exist on the media. A negative
811 * error code is returned in case of failure.
812 */
813static int fallible_matches_name(struct ubifs_info *c,
814 struct ubifs_zbranch *zbr,
815 const struct qstr *nm)
816{
817 struct ubifs_dent_node *dent;
818 int nlen, err;
819
820 /* If possible, match against the dent in the leaf node cache */
821 if (!zbr->leaf) {
822 dent = kmalloc(zbr->len, GFP_NOFS);
823 if (!dent)
824 return -ENOMEM;
825
826 err = fallible_read_node(c, &zbr->key, zbr, dent);
827 if (err < 0)
828 goto out_free;
829 if (err == 0) {
830 /* The node was not present */
831 err = NOT_ON_MEDIA;
832 goto out_free;
833 }
834 ubifs_assert(err == 1);
835
836 err = lnc_add_directly(c, zbr, dent);
837 if (err)
838 goto out_free;
839 } else
840 dent = zbr->leaf;
841
842 nlen = le16_to_cpu(dent->nlen);
843 err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
844 if (err == 0) {
845 if (nlen == nm->len)
846 return NAME_MATCHES;
847 else if (nlen < nm->len)
848 return NAME_LESS;
849 else
850 return NAME_GREATER;
851 } else if (err < 0)
852 return NAME_LESS;
853 else
854 return NAME_GREATER;
855
856out_free:
857 kfree(dent);
858 return err;
859}
860
861/**
862 * fallible_resolve_collision - resolve a collision even if nodes are missing.
863 * @c: UBIFS file-system description object
864 * @key: key
865 * @zn: znode is passed and returned here
866 * @n: branch number is passed and returned here
867 * @nm: name of directory entry
868 * @adding: indicates caller is adding a key to the TNC
869 *
870 * This is a "fallible" version of the 'resolve_collision()' function which
871 * does not panic if one of the nodes referred to by TNC does not exist on the
872 * media. This may happen when replaying the journal if a deleted node was
873 * Garbage-collected and the commit was not done. A branch that refers to a node
874 * that is not present is called a dangling branch. The following are the return
875 * codes for this function:
876 * o if @nm was found, %1 is returned and @zn and @n are set to the found
877 * branch;
878 * o if we are @adding and @nm was not found, %0 is returned;
879 * o if we are not @adding and @nm was not found, but a dangling branch was
880 * found, then %1 is returned and @zn and @n are set to the dangling branch;
881 * o a negative error code is returned in case of failure.
882 */
883static int fallible_resolve_collision(struct ubifs_info *c,
884 const union ubifs_key *key,
885 struct ubifs_znode **zn, int *n,
886 const struct qstr *nm, int adding)
887{
/*
 * 'o_znode' and 'o_n' remember the most recently seen dangling branch. They
 * are always assigned together and 'o_n' is only read when 'o_znode' is
 * non-NULL. 'znode' and 'nn' keep the caller's starting position.
 */
888 struct ubifs_znode *o_znode = NULL, *znode = *zn;
889 int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
890
891 cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
892 if (unlikely(cmp < 0))
893 return cmp;
894 if (cmp == NAME_MATCHES)
895 return 1;
896 if (cmp == NOT_ON_MEDIA) {
897 o_znode = znode;
898 o_n = nn;
899 /*
900 * We are unlucky and hit a dangling branch straight away.
901 * Now we do not really know where to go to find the needed
902 * branch - to the left or to the right. Well, let's try left.
903 */
904 unsure = 1;
905 } else if (!adding)
906 unsure = 1; /* Remove a dangling branch wherever it is */
907
908 if (cmp == NAME_GREATER || unsure) {
909 /* Look left */
910 while (1) {
911 err = tnc_prev(c, zn, n);
912 if (err == -ENOENT) {
913 ubifs_assert(*n == 0);
914 *n = -1;
915 break;
916 }
917 if (err < 0)
918 return err;
919 if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
920 /* See comments in 'resolve_collision()' */
921 if (*n == (*zn)->child_cnt - 1) {
922 err = tnc_next(c, zn, n);
923 if (err) {
924 /* Should be impossible */
925 ubifs_assert(0);
926 if (err == -ENOENT)
927 err = -EINVAL;
928 return err;
929 }
930 ubifs_assert(*n == 0);
931 *n = -1;
932 }
933 break;
934 }
935 err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm);
936 if (err < 0)
937 return err;
938 if (err == NAME_MATCHES)
939 return 1;
940 if (err == NOT_ON_MEDIA) {
/* Remember this dangling branch and keep walking left */
941 o_znode = *zn;
942 o_n = *n;
943 continue;
944 }
/* When not adding, name order does not stop the walk to the left */
945 if (!adding)
946 continue;
947 if (err == NAME_LESS)
948 break;
949 else
950 unsure = 0;
951 }
952 }
953
954 if (cmp == NAME_LESS || unsure) {
955 /* Look right */
/* Restart from the caller's starting branch, saved in 'znode' and 'nn' */
956 *zn = znode;
957 *n = nn;
958 while (1) {
959 err = tnc_next(c, &znode, &nn);
960 if (err == -ENOENT)
961 break;
962 if (err < 0)
963 return err;
964 if (keys_cmp(c, &znode->zbranch[nn].key, key))
965 break;
966 err = fallible_matches_name(c, &znode->zbranch[nn], nm);
967 if (err < 0)
968 return err;
969 if (err == NAME_GREATER)
970 break;
971 *zn = znode;
972 *n = nn;
973 if (err == NAME_MATCHES)
974 return 1;
975 if (err == NOT_ON_MEDIA) {
976 o_znode = znode;
977 o_n = nn;
978 }
979 }
980 }
981
982 /* Never match a dangling branch when adding */
983 if (adding || !o_znode)
984 return 0;
985
986 dbg_mnt("dangling match LEB %d:%d len %d %s",
987 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
988 o_znode->zbranch[o_n].len, DBGKEY(key));
989 *zn = o_znode;
990 *n = o_n;
991 return 1;
992}
993
994/**
995 * matches_position - determine if a zbranch matches a given position.
996 * @zbr: zbranch of dent
997 * @lnum: LEB number of dent to match
998 * @offs: offset of dent to match
999 *
1000 * This function returns %1 if @lnum:@offs matches, and %0 otherwise.
1001 */
1002static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs)
1003{
1004 if (zbr->lnum == lnum && zbr->offs == offs)
1005 return 1;
1006 else
1007 return 0;
1008}
1009
1010/**
1011 * resolve_collision_directly - resolve a collision directly.
1012 * @c: UBIFS file-system description object
1013 * @key: key of directory entry
1014 * @zn: znode is passed and returned here
1015 * @n: zbranch number is passed and returned here
1016 * @lnum: LEB number of dent node to match
1017 * @offs: offset of dent node to match
1018 *
1019 * This function is used for "hashed" keys to make sure the found directory or
1020 * extended attribute entry node is what was looked for. It is used when the
1021 * flash address of the right node is known (@lnum:@offs) which makes it much
1022 * easier to resolve collisions (no need to read entries and match full
1023 * names). This function returns %1 and sets @zn and @n if the collision is
1024 * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the
1025 * previous directory entry. Otherwise a negative error code is returned.
1026 */
1027static int resolve_collision_directly(struct ubifs_info *c,
1028 const union ubifs_key *key,
1029 struct ubifs_znode **zn, int *n,
1030 int lnum, int offs)
1031{
1032 struct ubifs_znode *znode;
1033 int nn, err;
1034
1035 znode = *zn;
1036 nn = *n;
1037 if (matches_position(&znode->zbranch[nn], lnum, offs))
1038 return 1;
1039
1040 /* Look left */
1041 while (1) {
1042 err = tnc_prev(c, &znode, &nn);
1043 if (err == -ENOENT)
1044 break;
1045 if (err < 0)
1046 return err;
1047 if (keys_cmp(c, &znode->zbranch[nn].key, key))
1048 break;
1049 if (matches_position(&znode->zbranch[nn], lnum, offs)) {
1050 *zn = znode;
1051 *n = nn;
1052 return 1;
1053 }
1054 }
1055
1056 /* Look right */
1057 znode = *zn;
1058 nn = *n;
1059 while (1) {
1060 err = tnc_next(c, &znode, &nn);
1061 if (err == -ENOENT)
1062 return 0;
1063 if (err < 0)
1064 return err;
1065 if (keys_cmp(c, &znode->zbranch[nn].key, key))
1066 return 0;
1067 *zn = znode;
1068 *n = nn;
1069 if (matches_position(&znode->zbranch[nn], lnum, offs))
1070 return 1;
1071 }
1072}
1073
1074/**
1075 * dirty_cow_bottom_up - dirty a znode and its ancestors.
1076 * @c: UBIFS file-system description object
1077 * @znode: znode to dirty
1078 *
1079 * If we do not have a unique key that resides in a znode, then we cannot
1080 * dirty that znode from the top down (i.e. by using lookup_level0_dirty)
1081 * This function records the path back to the last dirty ancestor, and then
1082 * dirties the znodes on that path.
 *
 * Returns the znode produced by the last 'dirty_cow_znode()' call, or an
 * error pointer (%-ENOMEM if the path buffer cannot be allocated).
1083 */
1084static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
1085 struct ubifs_znode *znode)
1086{
1087 struct ubifs_znode *zp;
/* 'path' is a stack of slot numbers and 'p' is the number of entries in it */
1088 int *path = c->bottom_up_buf, p = 0;
1089
1090 ubifs_assert(c->zroot.znode);
1091 ubifs_assert(znode);
/*
 * If the root level exceeds BOTTOM_UP_HEIGHT, replace the path buffer with
 * one that has an entry for every level below the root.
 */
1092 if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) {
1093 kfree(c->bottom_up_buf);
1094 c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int),
1095 GFP_NOFS);
1096 if (!c->bottom_up_buf)
1097 return ERR_PTR(-ENOMEM);
1098 path = c->bottom_up_buf;
1099 }
1100 if (c->zroot.znode->level) {
1101 /* Go up until parent is dirty */
1102 while (1) {
1103 int n;
1104
1105 zp = znode->parent;
1106 if (!zp)
1107 break;
/* Record which slot of the parent leads to this znode */
1108 n = znode->iip;
1109 ubifs_assert(p < c->zroot.znode->level);
1110 path[p++] = n;
1111 if (!zp->cnext && ubifs_zn_dirty(znode))
1112 break;
1113 znode = zp;
1114 }
1115 }
1116
1117 /* Come back down, dirtying as we go */
1118 while (1) {
1119 struct ubifs_zbranch *zbr;
1120
1121 zp = znode->parent;
1122 if (zp) {
1123 ubifs_assert(path[p - 1] >= 0);
1124 ubifs_assert(path[p - 1] < zp->child_cnt);
/* Pop the recorded slot and dirty the znode through its parent's branch */
1125 zbr = &zp->zbranch[path[--p]];
1126 znode = dirty_cow_znode(c, zbr);
1127 } else {
1128 ubifs_assert(znode == c->zroot.znode);
1129 znode = dirty_cow_znode(c, &c->zroot);
1130 }
/* Stop on error or when the recorded path is exhausted */
1131 if (unlikely(IS_ERR(znode)) || !p)
1132 break;
1133 ubifs_assert(path[p - 1] >= 0);
1134 ubifs_assert(path[p - 1] < znode->child_cnt);
/* Move down to the child recorded for the next step of the path */
1135 znode = znode->zbranch[path[p - 1]].znode;
1136 }
1137
1138 return znode;
1139}
1140
1141/**
1142 * ubifs_lookup_level0 - search for zero-level znode.
1143 * @c: UBIFS file-system description object
1144 * @key: key to lookup
1145 * @zn: znode is returned here
1146 * @n: znode branch slot number is returned here
1147 *
1148 * This function looks up the TNC tree and searches for the zero-level znode
1149 * which refers key @key. The found zero-level znode is returned in @zn. There
1150 * are 3 cases:
1151 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1152 * is returned and slot number of the matched branch is stored in @n;
1153 * o not exact match, which means that zero-level znode does not contain
1154 * @key, then %0 is returned and slot number of the closest branch is stored
1155 * in @n;
1156 * o @key is so small that it is even less than the lowest key of the
1157 * leftmost zero-level node, then %0 is returned and %-1 is stored in @n.
1158 *
1159 * Note, when the TNC tree is traversed, some znodes may be absent, then this
1160 * function reads corresponding indexing nodes and inserts them to TNC. In
1161 * case of failure, a negative error code is returned.
1162 */
1163int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1164 struct ubifs_znode **zn, int *n)
1165{
1166 int err, exact;
1167 struct ubifs_znode *znode;
1168 unsigned long time = get_seconds();
1169
1170 dbg_tnc("search key %s", DBGKEY(key));
1171
1172 znode = c->zroot.znode;
1173 if (unlikely(!znode)) {
/* The root znode is not in memory yet - load it */
1174 znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
1175 if (IS_ERR(znode))
1176 return PTR_ERR(znode);
1177 }
1178
1179 znode->time = time;
1180
1181 while (1) {
1182 struct ubifs_zbranch *zbr;
1183
1184 exact = ubifs_search_zbranch(c, znode, key, n);
1185
1186 if (znode->level == 0)
1187 break;
1188
/* On non-zero levels a negative slot is clamped to the leftmost branch */
1189 if (*n < 0)
1190 *n = 0;
1191 zbr = &znode->zbranch[*n];
1192
1193 if (zbr->znode) {
1194 znode->time = time;
1195 znode = zbr->znode;
1196 continue;
1197 }
1198
1199 /* znode is not in TNC cache, load it from the media */
1200 znode = ubifs_load_znode(c, zbr, znode, *n);
1201 if (IS_ERR(znode))
1202 return PTR_ERR(znode);
1203 }
1204
1205 *zn = znode;
/* Only a missed "hashed" key with *n == -1 needs the look-left handling below */
1206 if (exact || !is_hash_key(c, key) || *n != -1) {
1207 dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
1208 return exact;
1209 }
1210
1211 /*
1212 * Here is a tricky place. We have not found the key and this is a
1213 * "hashed" key, which may collide. The rest of the code deals with
1214 * situations like this:
1215 *
1216 * | 3 | 5 |
1217 * / \
1218 * | 3 | 5 | | 6 | 7 | (x)
1219 *
1220 * Or a more complex example:
1221 *
1222 * | 1 | 5 |
1223 * / \
1224 * | 1 | 3 | | 5 | 8 |
1225 * \ /
1226 * | 5 | 5 | | 6 | 7 | (x)
1227 *
1228 * In the examples, if we are looking for key "5", we may reach nodes
1229 * marked with "(x)". In this case what we have to do is to look at the
1230 * left and see if there is "5" key there. If there is, we have to
1231 * return it.
1232 *
1233 * Note, this whole situation is possible because we allow to have
1234 * elements which are equivalent to the next key in the parent in the
1235 * children of current znode. For example, this happens if we split a
1236 * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something
1237 * like this:
1238 * | 3 | 5 |
1239 * / \
1240 * | 3 | 5 | | 5 | 6 | 7 |
1241 * ^
1242 * And this becomes what is at the first "picture" after key "5" marked
1243 * with "^" is removed. What could be done is we could prohibit
1244 * splitting in the middle of the colliding sequence. Also, when
1245 * removing the leftmost key, we would have to correct the key of the
1246 * parent node, which would introduce additional complications. Namely,
1247 * if we changed the leftmost key of the parent znode, the garbage
1248 * collector would be unable to find it (GC is doing this when GC'ing
1249 * indexing LEBs). Although we already have an additional RB-tree where
1250 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
1251 * after the commit. But anyway, this does not look easy to implement
1252 * so we did not try this.
1253 */
1254 err = tnc_prev(c, &znode, n);
1255 if (err == -ENOENT) {
1256 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1257 *n = -1;
1258 return 0;
1259 }
1260 if (unlikely(err < 0))
1261 return err;
/* The previous entry has a different key, so @key is not in the TNC */
1262 if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
1263 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1264 *n = -1;
1265 return 0;
1266 }
1267
1268 dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
1269 *zn = znode;
1270 return 1;
1271}
1272
1273/**
1274 * lookup_level0_dirty - search for zero-level znode dirtying.
1275 * @c: UBIFS file-system description object
1276 * @key: key to lookup
1277 * @zn: znode is returned here
1278 * @n: znode branch slot number is returned here
1279 *
1280 * This function looks up the TNC tree and searches for the zero-level znode
1281 * which refers key @key. The found zero-level znode is returned in @zn. There
1282 * are 3 cases:
1283 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1284 * is returned and slot number of the matched branch is stored in @n;
1285 * o not exact match, which means that zero-level znode does not contain @key
1286 * then %0 is returned and slot number of the closest branch is stored in
1287 * @n;
1288 * o @key is so small that it is even less than the lowest key of the
1289 * leftmost zero-level node, then %0 is returned and %-1 is stored in @n.
1290 *
1291 * Additionally all znodes in the path from the root to the located zero-level
1292 * znode are marked as dirty.
1293 *
1294 * Note, when the TNC tree is traversed, some znodes may be absent, then this
1295 * function reads corresponding indexing nodes and inserts them to TNC. In
1296 * case of failure, a negative error code is returned.
1297 */
1298static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1299 struct ubifs_znode **zn, int *n)
1300{
1301 int err, exact;
1302 struct ubifs_znode *znode;
1303 unsigned long time = get_seconds();
1304
1305 dbg_tnc("search and dirty key %s", DBGKEY(key));
1306
1307 znode = c->zroot.znode;
1308 if (unlikely(!znode)) {
/* The root znode is not in memory yet - load it */
1309 znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
1310 if (IS_ERR(znode))
1311 return PTR_ERR(znode);
1312 }
1313
/* Continue with whatever znode 'dirty_cow_znode()' returns for the root */
1314 znode = dirty_cow_znode(c, &c->zroot);
1315 if (IS_ERR(znode))
1316 return PTR_ERR(znode);
1317
1318 znode->time = time;
1319
1320 while (1) {
1321 struct ubifs_zbranch *zbr;
1322
1323 exact = ubifs_search_zbranch(c, znode, key, n);
1324
1325 if (znode->level == 0)
1326 break;
1327
/* On non-zero levels a negative slot is clamped to the leftmost branch */
1328 if (*n < 0)
1329 *n = 0;
1330 zbr = &znode->zbranch[*n];
1331
1332 if (zbr->znode) {
1333 znode->time = time;
1334 znode = dirty_cow_znode(c, zbr);
1335 if (IS_ERR(znode))
1336 return PTR_ERR(znode);
1337 continue;
1338 }
1339
1340 /* znode is not in TNC cache, load it from the media */
1341 znode = ubifs_load_znode(c, zbr, znode, *n);
1342 if (IS_ERR(znode))
1343 return PTR_ERR(znode);
1344 znode = dirty_cow_znode(c, zbr);
1345 if (IS_ERR(znode))
1346 return PTR_ERR(znode);
1347 }
1348
1349 *zn = znode;
1350 if (exact || !is_hash_key(c, key) || *n != -1) {
1351 dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
1352 return exact;
1353 }
1354
1355 /*
1356 * See the huge comment in 'ubifs_lookup_level0()' for what the rest of
1357 * the code is for.
1358 */
1359 err = tnc_prev(c, &znode, n);
1360 if (err == -ENOENT) {
1361 *n = -1;
1362 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1363 return 0;
1364 }
1365 if (unlikely(err < 0))
1366 return err;
1367 if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
1368 *n = -1;
1369 dbg_tnc("found 0, lvl %d, n -1", znode->level);
1370 return 0;
1371 }
1372
/*
 * 'tnc_prev()' may have moved to a znode that the top-down pass above did
 * not dirty; if so, dirty it from the bottom up.
 */
1373 if (znode->cnext || !ubifs_zn_dirty(znode)) {
1374 znode = dirty_cow_bottom_up(c, znode);
1375 if (IS_ERR(znode))
1376 return PTR_ERR(znode);
1377 }
1378
1379 dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
1380 *zn = znode;
1381 return 1;
1382}
1383
1384/**
1385 * ubifs_tnc_lookup - look up a file-system node.
1386 * @c: UBIFS file-system description object
1387 * @key: node key to lookup
1388 * @node: the node is returned here
1389 *
1390 * This function look up and reads node with key @key. The caller has to make
1391 * sure the @node buffer is large enough to fit the node. Returns zero in case
1392 * of success, %-ENOENT if the node was not found, and a negative error code in
1393 * case of failure.
1394 */
1395int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1396 void *node)
1397{
1398 int found, n, err;
1399 struct ubifs_znode *znode;
1400 struct ubifs_zbranch zbr, *zt;
1401
1402 mutex_lock(&c->tnc_mutex);
1403 found = ubifs_lookup_level0(c, key, &znode, &n);
1404 if (!found) {
1405 err = -ENOENT;
1406 goto out;
1407 } else if (found < 0) {
1408 err = found;
1409 goto out;
1410 }
1411 zt = &znode->zbranch[n];
1412 if (is_hash_key(c, key)) {
1413 /*
1414 * In this case the leaf node cache gets used, so we pass the
1415 * address of the zbranch and keep the mutex locked
1416 */
1417 err = tnc_read_node_nm(c, zt, node);
1418 goto out;
1419 }
1420 zbr = znode->zbranch[n];
1421 mutex_unlock(&c->tnc_mutex);
1422
1423 err = ubifs_tnc_read_node(c, &zbr, node);
1424 return err;
1425
1426out:
1427 mutex_unlock(&c->tnc_mutex);
1428 return err;
1429}
1430
1431/**
1432 * ubifs_tnc_locate - look up a file-system node and return it and its location.
1433 * @c: UBIFS file-system description object
1434 * @key: node key to lookup
1435 * @node: the node is returned here
1436 * @lnum: LEB number is returned here
1437 * @offs: offset is returned here
1438 *
1439 * This function is the same as 'ubifs_tnc_lookup()' but it returns the node
1440 * location also. See 'ubifs_tnc_lookup()'.
1441 */
1442int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1443 void *node, int *lnum, int *offs)
1444{
1445 int found, n, err;
1446 struct ubifs_znode *znode;
1447 struct ubifs_zbranch zbr, *zt;
1448
1449 mutex_lock(&c->tnc_mutex);
1450 found = ubifs_lookup_level0(c, key, &znode, &n);
1451 if (!found) {
1452 err = -ENOENT;
1453 goto out;
1454 } else if (found < 0) {
1455 err = found;
1456 goto out;
1457 }
1458 zt = &znode->zbranch[n];
1459 if (is_hash_key(c, key)) {
1460 /*
1461 * In this case the leaf node cache gets used, so we pass the
1462 * address of the zbranch and keep the mutex locked
1463 */
1464 *lnum = zt->lnum;
1465 *offs = zt->offs;
1466 err = tnc_read_node_nm(c, zt, node);
1467 goto out;
1468 }
1469 zbr = znode->zbranch[n];
1470 mutex_unlock(&c->tnc_mutex);
1471
1472 *lnum = zbr.lnum;
1473 *offs = zbr.offs;
1474
1475 err = ubifs_tnc_read_node(c, &zbr, node);
1476 return err;
1477
1478out:
1479 mutex_unlock(&c->tnc_mutex);
1480 return err;
1481}
1482
1483/**
1484 * do_lookup_nm- look up a "hashed" node.
1485 * @c: UBIFS file-system description object
1486 * @key: node key to lookup
1487 * @node: the node is returned here
1488 * @nm: node name
1489 *
1490 * This function look up and reads a node which contains name hash in the key.
1491 * Since the hash may have collisions, there may be many nodes with the same
1492 * key, so we have to sequentially look to all of them until the needed one is
1493 * found. This function returns zero in case of success, %-ENOENT if the node
1494 * was not found, and a negative error code in case of failure.
1495 */
1496static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1497 void *node, const struct qstr *nm)
1498{
1499 int found, n, err;
1500 struct ubifs_znode *znode;
1501 struct ubifs_zbranch zbr;
1502
1503 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
1504 mutex_lock(&c->tnc_mutex);
1505 found = ubifs_lookup_level0(c, key, &znode, &n);
1506 if (!found) {
1507 err = -ENOENT;
1508 goto out_unlock;
1509 } else if (found < 0) {
1510 err = found;
1511 goto out_unlock;
1512 }
1513
1514 ubifs_assert(n >= 0);
1515
1516 err = resolve_collision(c, key, &znode, &n, nm);
1517 dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
1518 if (unlikely(err < 0))
1519 goto out_unlock;
1520 if (err == 0) {
1521 err = -ENOENT;
1522 goto out_unlock;
1523 }
1524
1525 zbr = znode->zbranch[n];
1526 mutex_unlock(&c->tnc_mutex);
1527
1528 err = tnc_read_node_nm(c, &zbr, node);
1529 return err;
1530
1531out_unlock:
1532 mutex_unlock(&c->tnc_mutex);
1533 return err;
1534}
1535
1536/**
1537 * ubifs_tnc_lookup_nm - look up a "hashed" node.
1538 * @c: UBIFS file-system description object
1539 * @key: node key to lookup
1540 * @node: the node is returned here
1541 * @nm: node name
1542 *
1543 * This function look up and reads a node which contains name hash in the key.
1544 * Since the hash may have collisions, there may be many nodes with the same
1545 * key, so we have to sequentially look to all of them until the needed one is
1546 * found. This function returns zero in case of success, %-ENOENT if the node
1547 * was not found, and a negative error code in case of failure.
1548 */
1549int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1550 void *node, const struct qstr *nm)
1551{
1552 int err, len;
1553 const struct ubifs_dent_node *dent = node;
1554
1555 /*
1556 * We assume that in most of the cases there are no name collisions and
1557 * 'ubifs_tnc_lookup()' returns us the right direntry.
1558 */
1559 err = ubifs_tnc_lookup(c, key, node);
1560 if (err)
1561 return err;
1562
1563 len = le16_to_cpu(dent->nlen);
1564 if (nm->len == len && !memcmp(dent->name, nm->name, len))
1565 return 0;
1566
1567 /*
1568 * Unluckily, there are hash collisions and we have to iterate over
1569 * them look at each direntry with colliding name hash sequentially.
1570 */
1571 return do_lookup_nm(c, key, node, nm);
1572}
1573
1574/**
1575 * correct_parent_keys - correct parent znodes' keys.
1576 * @c: UBIFS file-system description object
1577 * @znode: znode to correct parent znodes for
1578 *
1579 * This is a helper function for 'tnc_insert()'. When the key of the leftmost
1580 * zbranch changes, keys of parent znodes have to be corrected. This helper
1581 * function is called in such situations and corrects the keys if needed.
1582 */
1583static void correct_parent_keys(const struct ubifs_info *c,
1584 struct ubifs_znode *znode)
1585{
1586 union ubifs_key *key, *key1;
1587
1588 ubifs_assert(znode->parent);
1589 ubifs_assert(znode->iip == 0);
1590
1591 key = &znode->zbranch[0].key;
1592 key1 = &znode->parent->zbranch[0].key;
1593
1594 while (keys_cmp(c, key, key1) < 0) {
1595 key_copy(c, key, key1);
1596 znode = znode->parent;
1597 znode->alt = 1;
1598 if (!znode->parent || znode->iip)
1599 break;
1600 key1 = &znode->parent->zbranch[0].key;
1601 }
1602}
1603
1604/**
1605 * insert_zbranch - insert a zbranch into a znode.
1606 * @znode: znode into which to insert
1607 * @zbr: zbranch to insert
1608 * @n: slot number to insert to
1609 *
1610 * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in
1611 * znode's array of zbranches and keeps zbranches consolidated, so when a new
1612 * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th
1613 * slot, zbranches starting from @n have to be moved right.
1614 */
1615static void insert_zbranch(struct ubifs_znode *znode,
1616 const struct ubifs_zbranch *zbr, int n)
1617{
1618 int i;
1619
1620 ubifs_assert(ubifs_zn_dirty(znode));
1621
1622 if (znode->level) {
1623 for (i = znode->child_cnt; i > n; i--) {
1624 znode->zbranch[i] = znode->zbranch[i - 1];
1625 if (znode->zbranch[i].znode)
1626 znode->zbranch[i].znode->iip = i;
1627 }
1628 if (zbr->znode)
1629 zbr->znode->iip = n;
1630 } else
1631 for (i = znode->child_cnt; i > n; i--)
1632 znode->zbranch[i] = znode->zbranch[i - 1];
1633
1634 znode->zbranch[n] = *zbr;
1635 znode->child_cnt += 1;
1636
1637 /*
1638 * After inserting at slot zero, the lower bound of the key range of
1639 * this znode may have changed. If this znode is subsequently split
1640 * then the upper bound of the key range may change, and furthermore
1641 * it could change to be lower than the original lower bound. If that
1642 * happens, then it will no longer be possible to find this znode in the
1643 * TNC using the key from the index node on flash. That is bad because
1644 * if it is not found, we will assume it is obsolete and may overwrite
1645 * it. Then if there is an unclean unmount, we will start using the
1646 * old index which will be broken.
1647 *
1648 * So we first mark znodes that have insertions at slot zero, and then
1649 * if they are split we add their lnum/offs to the old_idx tree.
1650 */
1651 if (n == 0)
1652 znode->alt = 1;
1653}
1654
1655/**
1656 * tnc_insert - insert a node into TNC.
1657 * @c: UBIFS file-system description object
1658 * @znode: znode to insert into
1659 * @zbr: branch to insert
1660 * @n: slot number to insert new zbranch to
1661 *
1662 * This function inserts a new node described by @zbr into znode @znode. If
1663 * znode does not have a free slot for new zbranch, it is split. Parent znodes
1664 * are split as well if needed. Returns zero in case of success or a negative
1665 * error code in case of failure.
1666 */
1667static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
1668 struct ubifs_zbranch *zbr, int n)
1669{
1670 struct ubifs_znode *zn, *zi, *zp;
1671 int i, keep, move, appending = 0;
1672 union ubifs_key *key = &zbr->key;
1673
1674 ubifs_assert(n >= 0 && n <= c->fanout);
1675
1676 /* Implement naive insert for now */
1677again:
1678 zp = znode->parent;
1679 if (znode->child_cnt < c->fanout) {
1680 ubifs_assert(n != c->fanout);
1681 dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
1682 DBGKEY(key));
1683
1684 insert_zbranch(znode, zbr, n);
1685
1686 /* Ensure parent's key is correct */
1687 if (n == 0 && zp && znode->iip == 0)
1688 correct_parent_keys(c, znode);
1689
1690 return 0;
1691 }
1692
1693 /*
1694 * Unfortunately, @znode does not have more empty slots and we have to
1695 * split it.
1696 */
1697 dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
1698
1699 if (znode->alt)
1700 /*
1701 * We can no longer be sure of finding this znode by key, so we
1702 * record it in the old_idx tree.
1703 */
1704 ins_clr_old_idx_znode(c, znode);
1705
1706 zn = kzalloc(c->max_znode_sz, GFP_NOFS);
1707 if (!zn)
1708 return -ENOMEM;
1709 zn->parent = zp;
1710 zn->level = znode->level;
1711
1712 /* Decide where to split */
1713 if (znode->level == 0 && n == c->fanout &&
1714 key_type(c, key) == UBIFS_DATA_KEY) {
1715 union ubifs_key *key1;
1716
1717 /*
1718 * If this is an inode which is being appended - do not split
1719 * it because no other zbranches can be inserted between
1720 * zbranches of consecutive data nodes anyway.
1721 */
1722 key1 = &znode->zbranch[n - 1].key;
1723 if (key_inum(c, key1) == key_inum(c, key) &&
1724 key_type(c, key1) == UBIFS_DATA_KEY &&
1725 key_block(c, key1) == key_block(c, key) - 1)
1726 appending = 1;
1727 }
1728
1729 if (appending) {
1730 keep = c->fanout;
1731 move = 0;
1732 } else {
1733 keep = (c->fanout + 1) / 2;
1734 move = c->fanout - keep;
1735 }
1736
1737 /*
1738 * Although we don't at present, we could look at the neighbors and see
1739 * if we can move some zbranches there.
1740 */
1741
1742 if (n < keep) {
1743 /* Insert into existing znode */
1744 zi = znode;
1745 move += 1;
1746 keep -= 1;
1747 } else {
1748 /* Insert into new znode */
1749 zi = zn;
1750 n -= keep;
1751 /* Re-parent */
1752 if (zn->level != 0)
1753 zbr->znode->parent = zn;
1754 }
1755
1756 __set_bit(DIRTY_ZNODE, &zn->flags);
1757 atomic_long_inc(&c->dirty_zn_cnt);
1758
1759 zn->child_cnt = move;
1760 znode->child_cnt = keep;
1761
1762 dbg_tnc("moving %d, keeping %d", move, keep);
1763
1764 /* Move zbranch */
1765 for (i = 0; i < move; i++) {
1766 zn->zbranch[i] = znode->zbranch[keep + i];
1767 /* Re-parent */
1768 if (zn->level != 0)
1769 if (zn->zbranch[i].znode) {
1770 zn->zbranch[i].znode->parent = zn;
1771 zn->zbranch[i].znode->iip = i;
1772 }
1773 }
1774
1775 /* Insert new key and branch */
1776 dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
1777
1778 insert_zbranch(zi, zbr, n);
1779
1780 /* Insert new znode (produced by spitting) into the parent */
1781 if (zp) {
1782 i = n;
1783 /* Locate insertion point */
1784 n = znode->iip + 1;
1785 if (appending && n != c->fanout)
1786 appending = 0;
1787
1788 if (i == 0 && zi == znode && znode->iip == 0)
1789 correct_parent_keys(c, znode);
1790
1791 /* Tail recursion */
1792 zbr->key = zn->zbranch[0].key;
1793 zbr->znode = zn;
1794 zbr->lnum = 0;
1795 zbr->offs = 0;
1796 zbr->len = 0;
1797 znode = zp;
1798
1799 goto again;
1800 }
1801
1802 /* We have to split root znode */
1803 dbg_tnc("creating new zroot at level %d", znode->level + 1);
1804
1805 zi = kzalloc(c->max_znode_sz, GFP_NOFS);
1806 if (!zi)
1807 return -ENOMEM;
1808
1809 zi->child_cnt = 2;
1810 zi->level = znode->level + 1;
1811
1812 __set_bit(DIRTY_ZNODE, &zi->flags);
1813 atomic_long_inc(&c->dirty_zn_cnt);
1814
1815 zi->zbranch[0].key = znode->zbranch[0].key;
1816 zi->zbranch[0].znode = znode;
1817 zi->zbranch[0].lnum = c->zroot.lnum;
1818 zi->zbranch[0].offs = c->zroot.offs;
1819 zi->zbranch[0].len = c->zroot.len;
1820 zi->zbranch[1].key = zn->zbranch[0].key;
1821 zi->zbranch[1].znode = zn;
1822
1823 c->zroot.lnum = 0;
1824 c->zroot.offs = 0;
1825 c->zroot.len = 0;
1826 c->zroot.znode = zi;
1827
1828 zn->parent = zi;
1829 zn->iip = 1;
1830 znode->parent = zi;
1831 znode->iip = 0;
1832
1833 return 0;
1834}
1835
1836/**
1837 * ubifs_tnc_add - add a node to TNC.
1838 * @c: UBIFS file-system description object
1839 * @key: key to add
1840 * @lnum: LEB number of node
1841 * @offs: node offset
1842 * @len: node length
1843 *
1844 * This function adds a node with key @key to TNC. The node may be new or it may
1845 * obsolete some existing one. Returns %0 on success or negative error code on
1846 * failure.
1847 */
1848int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
1849 int offs, int len)
1850{
1851 int found, n, err = 0;
1852 struct ubifs_znode *znode;
1853
1854 mutex_lock(&c->tnc_mutex);
1855 dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
1856 found = lookup_level0_dirty(c, key, &znode, &n);
1857 if (!found) {
1858 struct ubifs_zbranch zbr;
1859
1860 zbr.znode = NULL;
1861 zbr.lnum = lnum;
1862 zbr.offs = offs;
1863 zbr.len = len;
1864 key_copy(c, key, &zbr.key);
1865 err = tnc_insert(c, znode, &zbr, n + 1);
1866 } else if (found == 1) {
1867 struct ubifs_zbranch *zbr = &znode->zbranch[n];
1868
1869 lnc_free(zbr);
1870 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
1871 zbr->lnum = lnum;
1872 zbr->offs = offs;
1873 zbr->len = len;
1874 } else
1875 err = found;
1876 if (!err)
1877 err = dbg_check_tnc(c, 0);
1878 mutex_unlock(&c->tnc_mutex);
1879
1880 return err;
1881}
1882
1883/**
1884 * ubifs_tnc_replace - replace a node in the TNC only if the old node is found.
1885 * @c: UBIFS file-system description object
1886 * @key: key to add
1887 * @old_lnum: LEB number of old node
1888 * @old_offs: old node offset
1889 * @lnum: LEB number of node
1890 * @offs: node offset
1891 * @len: node length
1892 *
1893 * This function replaces a node with key @key in the TNC only if the old node
1894 * is found. This function is called by garbage collection when node are moved.
1895 * Returns %0 on success or negative error code on failure.
1896 */
1897int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
1898 int old_lnum, int old_offs, int lnum, int offs, int len)
1899{
1900 int found, n, err = 0;
1901 struct ubifs_znode *znode;
1902
1903 mutex_lock(&c->tnc_mutex);
1904 dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
1905 old_offs, lnum, offs, len, DBGKEY(key));
1906 found = lookup_level0_dirty(c, key, &znode, &n);
1907 if (found < 0) {
1908 err = found;
1909 goto out_unlock;
1910 }
1911
1912 if (found == 1) {
1913 struct ubifs_zbranch *zbr = &znode->zbranch[n];
1914
1915 found = 0;
1916 if (zbr->lnum == old_lnum && zbr->offs == old_offs) {
1917 lnc_free(zbr);
1918 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
1919 if (err)
1920 goto out_unlock;
1921 zbr->lnum = lnum;
1922 zbr->offs = offs;
1923 zbr->len = len;
1924 found = 1;
1925 } else if (is_hash_key(c, key)) {
1926 found = resolve_collision_directly(c, key, &znode, &n,
1927 old_lnum, old_offs);
1928 dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d",
1929 found, znode, n, old_lnum, old_offs);
1930 if (found < 0) {
1931 err = found;
1932 goto out_unlock;
1933 }
1934
1935 if (found) {
1936 /* Ensure the znode is dirtied */
1937 if (znode->cnext || !ubifs_zn_dirty(znode)) {
1938 znode = dirty_cow_bottom_up(c,
1939 znode);
1940 if (IS_ERR(znode)) {
1941 err = PTR_ERR(znode);
1942 goto out_unlock;
1943 }
1944 }
1945 zbr = &znode->zbranch[n];
1946 lnc_free(zbr);
1947 err = ubifs_add_dirt(c, zbr->lnum,
1948 zbr->len);
1949 if (err)
1950 goto out_unlock;
1951 zbr->lnum = lnum;
1952 zbr->offs = offs;
1953 zbr->len = len;
1954 }
1955 }
1956 }
1957
1958 if (!found)
1959 err = ubifs_add_dirt(c, lnum, len);
1960
1961 if (!err)
1962 err = dbg_check_tnc(c, 0);
1963
1964out_unlock:
1965 mutex_unlock(&c->tnc_mutex);
1966 return err;
1967}
1968
1969/**
1970 * ubifs_tnc_add_nm - add a "hashed" node to TNC.
1971 * @c: UBIFS file-system description object
1972 * @key: key to add
1973 * @lnum: LEB number of node
1974 * @offs: node offset
1975 * @len: node length
1976 * @nm: node name
1977 *
1978 * This is the same as 'ubifs_tnc_add()' but it should be used with keys which
1979 * may have collisions, like directory entry keys.
1980 */
1981int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
1982 int lnum, int offs, int len, const struct qstr *nm)
1983{
1984 int found, n, err = 0;
1985 struct ubifs_znode *znode;
1986
1987 mutex_lock(&c->tnc_mutex);
1988 dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
1989 DBGKEY(key));
1990 found = lookup_level0_dirty(c, key, &znode, &n);
1991 if (found < 0) {
1992 err = found;
1993 goto out_unlock;
1994 }
1995
1996 if (found == 1) {
1997 if (c->replaying)
1998 found = fallible_resolve_collision(c, key, &znode, &n,
1999 nm, 1);
2000 else
2001 found = resolve_collision(c, key, &znode, &n, nm);
2002 dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n);
2003 if (found < 0) {
2004 err = found;
2005 goto out_unlock;
2006 }
2007
2008 /* Ensure the znode is dirtied */
2009 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2010 znode = dirty_cow_bottom_up(c, znode);
2011 if (IS_ERR(znode)) {
2012 err = PTR_ERR(znode);
2013 goto out_unlock;
2014 }
2015 }
2016
2017 if (found == 1) {
2018 struct ubifs_zbranch *zbr = &znode->zbranch[n];
2019
2020 lnc_free(zbr);
2021 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
2022 zbr->lnum = lnum;
2023 zbr->offs = offs;
2024 zbr->len = len;
2025 goto out_unlock;
2026 }
2027 }
2028
2029 if (!found) {
2030 struct ubifs_zbranch zbr;
2031
2032 zbr.znode = NULL;
2033 zbr.lnum = lnum;
2034 zbr.offs = offs;
2035 zbr.len = len;
2036 key_copy(c, key, &zbr.key);
2037 err = tnc_insert(c, znode, &zbr, n + 1);
2038 if (err)
2039 goto out_unlock;
2040 if (c->replaying) {
2041 /*
2042 * We did not find it in the index so there may be a
2043 * dangling branch still in the index. So we remove it
2044 * by passing 'ubifs_tnc_remove_nm()' the same key but
2045 * an unmatchable name.
2046 */
2047 struct qstr noname = { .len = 0, .name = "" };
2048
2049 err = dbg_check_tnc(c, 0);
2050 mutex_unlock(&c->tnc_mutex);
2051 if (err)
2052 return err;
2053 return ubifs_tnc_remove_nm(c, key, &noname);
2054 }
2055 }
2056
2057out_unlock:
2058 if (!err)
2059 err = dbg_check_tnc(c, 0);
2060 mutex_unlock(&c->tnc_mutex);
2061 return err;
2062}
2063
2064/**
2065 * tnc_delete - delete a znode form TNC.
2066 * @c: UBIFS file-system description object
2067 * @znode: znode to delete from
2068 * @n: zbranch slot number to delete
2069 *
2070 * This function deletes a leaf node from @n-th slot of @znode. Returns zero in
2071 * case of success and a negative error code in case of failure.
2072 */
2073static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
2074{
2075 struct ubifs_zbranch *zbr;
2076 struct ubifs_znode *zp;
2077 int i, err;
2078
2079 /* Delete without merge for now */
2080 ubifs_assert(znode->level == 0);
2081 ubifs_assert(n >= 0 && n < c->fanout);
2082 dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
2083
2084 zbr = &znode->zbranch[n];
2085 lnc_free(zbr);
2086
2087 err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
2088 if (err) {
2089 dbg_dump_znode(c, znode);
2090 return err;
2091 }
2092
2093 /* We do not "gap" zbranch slots */
2094 for (i = n; i < znode->child_cnt - 1; i++)
2095 znode->zbranch[i] = znode->zbranch[i + 1];
2096 znode->child_cnt -= 1;
2097
2098 if (znode->child_cnt > 0)
2099 return 0;
2100
2101 /*
2102 * This was the last zbranch, we have to delete this znode from the
2103 * parent.
2104 */
2105
2106 do {
2107 ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
2108 ubifs_assert(ubifs_zn_dirty(znode));
2109
2110 zp = znode->parent;
2111 n = znode->iip;
2112
2113 atomic_long_dec(&c->dirty_zn_cnt);
2114
2115 err = insert_old_idx_znode(c, znode);
2116 if (err)
2117 return err;
2118
2119 if (znode->cnext) {
2120 __set_bit(OBSOLETE_ZNODE, &znode->flags);
2121 atomic_long_inc(&c->clean_zn_cnt);
2122 atomic_long_inc(&ubifs_clean_zn_cnt);
2123 } else
2124 kfree(znode);
2125 znode = zp;
2126 } while (znode->child_cnt == 1); /* while removing last child */
2127
2128 /* Remove from znode, entry n - 1 */
2129 znode->child_cnt -= 1;
2130 ubifs_assert(znode->level != 0);
2131 for (i = n; i < znode->child_cnt; i++) {
2132 znode->zbranch[i] = znode->zbranch[i + 1];
2133 if (znode->zbranch[i].znode)
2134 znode->zbranch[i].znode->iip = i;
2135 }
2136
2137 /*
2138 * If this is the root and it has only 1 child then
2139 * collapse the tree.
2140 */
2141 if (!znode->parent) {
2142 while (znode->child_cnt == 1 && znode->level != 0) {
2143 zp = znode;
2144 zbr = &znode->zbranch[0];
2145 znode = get_znode(c, znode, 0);
2146 if (IS_ERR(znode))
2147 return PTR_ERR(znode);
2148 znode = dirty_cow_znode(c, zbr);
2149 if (IS_ERR(znode))
2150 return PTR_ERR(znode);
2151 znode->parent = NULL;
2152 znode->iip = 0;
2153 if (c->zroot.len) {
2154 err = insert_old_idx(c, c->zroot.lnum,
2155 c->zroot.offs);
2156 if (err)
2157 return err;
2158 }
2159 c->zroot.lnum = zbr->lnum;
2160 c->zroot.offs = zbr->offs;
2161 c->zroot.len = zbr->len;
2162 c->zroot.znode = znode;
2163 ubifs_assert(!test_bit(OBSOLETE_ZNODE,
2164 &zp->flags));
2165 ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags));
2166 atomic_long_dec(&c->dirty_zn_cnt);
2167
2168 if (zp->cnext) {
2169 __set_bit(OBSOLETE_ZNODE, &zp->flags);
2170 atomic_long_inc(&c->clean_zn_cnt);
2171 atomic_long_inc(&ubifs_clean_zn_cnt);
2172 } else
2173 kfree(zp);
2174 }
2175 }
2176
2177 return 0;
2178}
2179
2180/**
2181 * ubifs_tnc_remove - remove an index entry of a node.
2182 * @c: UBIFS file-system description object
2183 * @key: key of node
2184 *
2185 * Returns %0 on success or negative error code on failure.
2186 */
2187int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
2188{
2189 int found, n, err = 0;
2190 struct ubifs_znode *znode;
2191
2192 mutex_lock(&c->tnc_mutex);
2193 dbg_tnc("key %s", DBGKEY(key));
2194 found = lookup_level0_dirty(c, key, &znode, &n);
2195 if (found < 0) {
2196 err = found;
2197 goto out_unlock;
2198 }
2199 if (found == 1)
2200 err = tnc_delete(c, znode, n);
2201 if (!err)
2202 err = dbg_check_tnc(c, 0);
2203
2204out_unlock:
2205 mutex_unlock(&c->tnc_mutex);
2206 return err;
2207}
2208
2209/**
2210 * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node.
2211 * @c: UBIFS file-system description object
2212 * @key: key of node
2213 * @nm: directory entry name
2214 *
2215 * Returns %0 on success or negative error code on failure.
2216 */
2217int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2218 const struct qstr *nm)
2219{
2220 int n, err;
2221 struct ubifs_znode *znode;
2222
2223 mutex_lock(&c->tnc_mutex);
2224 dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
2225 err = lookup_level0_dirty(c, key, &znode, &n);
2226 if (err < 0)
2227 goto out_unlock;
2228
2229 if (err) {
2230 if (c->replaying)
2231 err = fallible_resolve_collision(c, key, &znode, &n,
2232 nm, 0);
2233 else
2234 err = resolve_collision(c, key, &znode, &n, nm);
2235 dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
2236 if (err < 0)
2237 goto out_unlock;
2238 if (err) {
2239 /* Ensure the znode is dirtied */
2240 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2241 znode = dirty_cow_bottom_up(c, znode);
2242 if (IS_ERR(znode)) {
2243 err = PTR_ERR(znode);
2244 goto out_unlock;
2245 }
2246 }
2247 err = tnc_delete(c, znode, n);
2248 }
2249 }
2250
2251out_unlock:
2252 if (!err)
2253 err = dbg_check_tnc(c, 0);
2254 mutex_unlock(&c->tnc_mutex);
2255 return err;
2256}
2257
/**
 * key_in_range - determine if a key falls within a range of keys.
 * @c: UBIFS file-system description object
 * @key: key to check
 * @from_key: lowest key in range
 * @to_key: highest key in range
 *
 * Both ends of the range are inclusive. This function returns %1 if the key is
 * in range and %0 otherwise.
 */
static int key_in_range(struct ubifs_info *c, union ubifs_key *key,
			union ubifs_key *from_key, union ubifs_key *to_key)
{
	/* The upper bound is only compared if the lower bound check passes */
	return keys_cmp(c, key, from_key) >= 0 &&
	       keys_cmp(c, key, to_key) <= 0;
}
2276
2277/**
2278 * ubifs_tnc_remove_range - remove index entries in range.
2279 * @c: UBIFS file-system description object
2280 * @from_key: lowest key to remove
2281 * @to_key: highest key to remove
2282 *
2283 * This function removes index entries starting at @from_key and ending at
2284 * @to_key. This function returns zero in case of success and a negative error
2285 * code in case of failure.
2286 */
2287int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2288 union ubifs_key *to_key)
2289{
2290 int i, n, k, err = 0;
2291 struct ubifs_znode *znode;
2292 union ubifs_key *key;
2293
2294 mutex_lock(&c->tnc_mutex);
2295 while (1) {
2296 /* Find first level 0 znode that contains keys to remove */
2297 err = ubifs_lookup_level0(c, from_key, &znode, &n);
2298 if (err < 0)
2299 goto out_unlock;
2300
2301 if (err)
2302 key = from_key;
2303 else {
2304 err = tnc_next(c, &znode, &n);
2305 if (err == -ENOENT) {
2306 err = 0;
2307 goto out_unlock;
2308 }
2309 if (err < 0)
2310 goto out_unlock;
2311 key = &znode->zbranch[n].key;
2312 if (!key_in_range(c, key, from_key, to_key)) {
2313 err = 0;
2314 goto out_unlock;
2315 }
2316 }
2317
2318 /* Ensure the znode is dirtied */
2319 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2320 znode = dirty_cow_bottom_up(c, znode);
2321 if (IS_ERR(znode)) {
2322 err = PTR_ERR(znode);
2323 goto out_unlock;
2324 }
2325 }
2326
2327 /* Remove all keys in range except the first */
2328 for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) {
2329 key = &znode->zbranch[i].key;
2330 if (!key_in_range(c, key, from_key, to_key))
2331 break;
2332 lnc_free(&znode->zbranch[i]);
2333 err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
2334 znode->zbranch[i].len);
2335 if (err) {
2336 dbg_dump_znode(c, znode);
2337 goto out_unlock;
2338 }
2339 dbg_tnc("removing %s", DBGKEY(key));
2340 }
2341 if (k) {
2342 for (i = n + 1 + k; i < znode->child_cnt; i++)
2343 znode->zbranch[i - k] = znode->zbranch[i];
2344 znode->child_cnt -= k;
2345 }
2346
2347 /* Now delete the first */
2348 err = tnc_delete(c, znode, n);
2349 if (err)
2350 goto out_unlock;
2351 }
2352
2353out_unlock:
2354 if (!err)
2355 err = dbg_check_tnc(c, 0);
2356 mutex_unlock(&c->tnc_mutex);
2357 return err;
2358}
2359
2360/**
2361 * ubifs_tnc_remove_ino - remove an inode from TNC.
2362 * @c: UBIFS file-system description object
2363 * @inum: inode number to remove
2364 *
2365 * This function remove inode @inum and all the extended attributes associated
2366 * with the anode from TNC and returns zero in case of success or a negative
2367 * error code in case of failure.
2368 */
2369int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
2370{
2371 union ubifs_key key1, key2;
2372 struct ubifs_dent_node *xent, *pxent = NULL;
2373 struct qstr nm = { .name = NULL };
2374
2375 dbg_tnc("ino %lu", inum);
2376
2377 /*
2378 * Walk all extended attribute entries and remove them together with
2379 * corresponding extended attribute inodes.
2380 */
2381 lowest_xent_key(c, &key1, inum);
2382 while (1) {
2383 ino_t xattr_inum;
2384 int err;
2385
2386 xent = ubifs_tnc_next_ent(c, &key1, &nm);
2387 if (IS_ERR(xent)) {
2388 err = PTR_ERR(xent);
2389 if (err == -ENOENT)
2390 break;
2391 return err;
2392 }
2393
2394 xattr_inum = le64_to_cpu(xent->inum);
2395 dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum);
2396
2397 nm.name = xent->name;
2398 nm.len = le16_to_cpu(xent->nlen);
2399 err = ubifs_tnc_remove_nm(c, &key1, &nm);
2400 if (err) {
2401 kfree(xent);
2402 return err;
2403 }
2404
2405 lowest_ino_key(c, &key1, xattr_inum);
2406 highest_ino_key(c, &key2, xattr_inum);
2407 err = ubifs_tnc_remove_range(c, &key1, &key2);
2408 if (err) {
2409 kfree(xent);
2410 return err;
2411 }
2412
2413 kfree(pxent);
2414 pxent = xent;
2415 key_read(c, &xent->key, &key1);
2416 }
2417
2418 kfree(pxent);
2419 lowest_ino_key(c, &key1, inum);
2420 highest_ino_key(c, &key2, inum);
2421
2422 return ubifs_tnc_remove_range(c, &key1, &key2);
2423}
2424
2425/**
2426 * ubifs_tnc_next_ent - walk directory or extended attribute entries.
2427 * @c: UBIFS file-system description object
2428 * @key: key of last entry
2429 * @nm: name of last entry found or %NULL
2430 *
2431 * This function finds and reads the next directory or extended attribute entry
2432 * after the given key (@key) if there is one. @nm is used to resolve
2433 * collisions.
2434 *
2435 * If the name of the current entry is not known and only the key is known,
2436 * @nm->name has to be %NULL. In this case the semantics of this function is a
2437 * little bit different and it returns the entry corresponding to this key, not
2438 * the next one. If the key was not found, the closest "right" entry is
2439 * returned.
2440 *
2441 * If the fist entry has to be found, @key has to contain the lowest possible
2442 * key value for this inode and @name has to be %NULL.
2443 *
2444 * This function returns the found directory or extended attribute entry node
2445 * in case of success, %-ENOENT is returned if no entry was found, and a
2446 * negative error code is returned in case of failure.
2447 */
2448struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
2449 union ubifs_key *key,
2450 const struct qstr *nm)
2451{
2452 int n, err, type = key_type(c, key);
2453 struct ubifs_znode *znode;
2454 struct ubifs_dent_node *dent;
2455 struct ubifs_zbranch *zbr;
2456 union ubifs_key *dkey;
2457
2458 dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
2459 ubifs_assert(is_hash_key(c, key));
2460
2461 mutex_lock(&c->tnc_mutex);
2462 err = ubifs_lookup_level0(c, key, &znode, &n);
2463 if (unlikely(err < 0))
2464 goto out_unlock;
2465
2466 if (nm->name) {
2467 if (err) {
2468 /* Handle collisions */
2469 err = resolve_collision(c, key, &znode, &n, nm);
2470 dbg_tnc("rc returned %d, znode %p, n %d",
2471 err, znode, n);
2472 if (unlikely(err < 0))
2473 goto out_unlock;
2474 }
2475
2476 /* Now find next entry */
2477 err = tnc_next(c, &znode, &n);
2478 if (unlikely(err))
2479 goto out_unlock;
2480 } else {
2481 /*
2482 * The full name of the entry was not given, in which case the
2483 * behavior of this function is a little different and it
2484 * returns current entry, not the next one.
2485 */
2486 if (!err) {
2487 /*
2488 * However, the given key does not exist in the TNC
2489 * tree and @znode/@n variables contain the closest
2490 * "preceding" element. Switch to the next one.
2491 */
2492 err = tnc_next(c, &znode, &n);
2493 if (err)
2494 goto out_unlock;
2495 }
2496 }
2497
2498 zbr = &znode->zbranch[n];
2499 dent = kmalloc(zbr->len, GFP_NOFS);
2500 if (unlikely(!dent)) {
2501 err = -ENOMEM;
2502 goto out_unlock;
2503 }
2504
2505 /*
2506 * The above 'tnc_next()' call could lead us to the next inode, check
2507 * this.
2508 */
2509 dkey = &zbr->key;
2510 if (key_inum(c, dkey) != key_inum(c, key) ||
2511 key_type(c, dkey) != type) {
2512 err = -ENOENT;
2513 goto out_free;
2514 }
2515
2516 err = tnc_read_node_nm(c, zbr, dent);
2517 if (unlikely(err))
2518 goto out_free;
2519
2520 mutex_unlock(&c->tnc_mutex);
2521 return dent;
2522
2523out_free:
2524 kfree(dent);
2525out_unlock:
2526 mutex_unlock(&c->tnc_mutex);
2527 return ERR_PTR(err);
2528}
2529
2530/**
2531 * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit.
2532 * @c: UBIFS file-system description object
2533 *
2534 * Destroy left-over obsolete znodes from a failed commit.
2535 */
2536static void tnc_destroy_cnext(struct ubifs_info *c)
2537{
2538 struct ubifs_znode *cnext;
2539
2540 if (!c->cnext)
2541 return;
2542 ubifs_assert(c->cmt_state == COMMIT_BROKEN);
2543 cnext = c->cnext;
2544 do {
2545 struct ubifs_znode *znode = cnext;
2546
2547 cnext = cnext->cnext;
2548 if (test_bit(OBSOLETE_ZNODE, &znode->flags))
2549 kfree(znode);
2550 } while (cnext && cnext != c->cnext);
2551}
2552
2553/**
2554 * ubifs_tnc_close - close TNC subsystem and free all related resources.
2555 * @c: UBIFS file-system description object
2556 */
2557void ubifs_tnc_close(struct ubifs_info *c)
2558{
2559 long clean_freed;
2560
2561 tnc_destroy_cnext(c);
2562 if (c->zroot.znode) {
2563 clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
2564 atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
2565 }
2566 kfree(c->gap_lebs);
2567 kfree(c->ilebs);
2568 destroy_old_idx(c);
2569}
2570
2571/**
2572 * left_znode - get the znode to the left.
2573 * @c: UBIFS file-system description object
2574 * @znode: znode
2575 *
2576 * This function returns a pointer to the znode to the left of @znode or NULL if
2577 * there is not one. A negative error code is returned on failure.
2578 */
2579static struct ubifs_znode *left_znode(struct ubifs_info *c,
2580 struct ubifs_znode *znode)
2581{
2582 int level = znode->level;
2583
2584 while (1) {
2585 int n = znode->iip - 1;
2586
2587 /* Go up until we can go left */
2588 znode = znode->parent;
2589 if (!znode)
2590 return NULL;
2591 if (n >= 0) {
2592 /* Now go down the rightmost branch to 'level' */
2593 znode = get_znode(c, znode, n);
2594 if (IS_ERR(znode))
2595 return znode;
2596 while (znode->level != level) {
2597 n = znode->child_cnt - 1;
2598 znode = get_znode(c, znode, n);
2599 if (IS_ERR(znode))
2600 return znode;
2601 }
2602 break;
2603 }
2604 }
2605 return znode;
2606}
2607
2608/**
2609 * right_znode - get the znode to the right.
2610 * @c: UBIFS file-system description object
2611 * @znode: znode
2612 *
2613 * This function returns a pointer to the znode to the right of @znode or NULL
2614 * if there is not one. A negative error code is returned on failure.
2615 */
2616static struct ubifs_znode *right_znode(struct ubifs_info *c,
2617 struct ubifs_znode *znode)
2618{
2619 int level = znode->level;
2620
2621 while (1) {
2622 int n = znode->iip + 1;
2623
2624 /* Go up until we can go right */
2625 znode = znode->parent;
2626 if (!znode)
2627 return NULL;
2628 if (n < znode->child_cnt) {
2629 /* Now go down the leftmost branch to 'level' */
2630 znode = get_znode(c, znode, n);
2631 if (IS_ERR(znode))
2632 return znode;
2633 while (znode->level != level) {
2634 znode = get_znode(c, znode, 0);
2635 if (IS_ERR(znode))
2636 return znode;
2637 }
2638 break;
2639 }
2640 }
2641 return znode;
2642}
2643
2644/**
2645 * lookup_znode - find a particular indexing node from TNC.
2646 * @c: UBIFS file-system description object
2647 * @key: index node key to lookup
2648 * @level: index node level
2649 * @lnum: index node LEB number
2650 * @offs: index node offset
2651 *
2652 * This function searches an indexing node by its first key @key and its
2653 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2654 * nodes it traverses to TNC. This function is called fro indexing nodes which
2655 * were found on the media by scanning, for example when garbage-collecting or
2656 * when doing in-the-gaps commit. This means that the indexing node which is
2657 * looked for does not have to have exactly the same leftmost key @key, because
2658 * the leftmost key may have been changed, in which case TNC will contain a
2659 * dirty znode which still refers the same @lnum:@offs. This function is clever
2660 * enough to recognize such indexing nodes.
2661 *
2662 * Note, if a znode was deleted or changed too much, then this function will
2663 * not find it. For situations like this UBIFS has the old index RB-tree
2664 * (indexed by @lnum:@offs).
2665 *
2666 * This function returns a pointer to the znode found or %NULL if it is not
2667 * found. A negative error code is returned on failure.
2668 */
2669static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2670 union ubifs_key *key, int level,
2671 int lnum, int offs)
2672{
2673 struct ubifs_znode *znode, *zn;
2674 int n, nn;
2675
2676 /*
2677 * The arguments have probably been read off flash, so don't assume
2678 * they are valid.
2679 */
2680 if (level < 0)
2681 return ERR_PTR(-EINVAL);
2682
2683 /* Get the root znode */
2684 znode = c->zroot.znode;
2685 if (!znode) {
2686 znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
2687 if (IS_ERR(znode))
2688 return znode;
2689 }
2690 /* Check if it is the one we are looking for */
2691 if (c->zroot.lnum == lnum && c->zroot.offs == offs)
2692 return znode;
2693 /* Descend to the parent level i.e. (level + 1) */
2694 if (level >= znode->level)
2695 return NULL;
2696 while (1) {
2697 ubifs_search_zbranch(c, znode, key, &n);
2698 if (n < 0) {
2699 /*
2700 * We reached a znode where the leftmost key is greater
2701 * than the key we are searching for. This is the same
2702 * situation as the one described in a huge comment at
2703 * the end of the 'ubifs_lookup_level0()' function. And
2704 * for exactly the same reasons we have to try to look
2705 * left before giving up.
2706 */
2707 znode = left_znode(c, znode);
2708 if (!znode)
2709 return NULL;
2710 if (IS_ERR(znode))
2711 return znode;
2712 ubifs_search_zbranch(c, znode, key, &n);
2713 ubifs_assert(n >= 0);
2714 }
2715 if (znode->level == level + 1)
2716 break;
2717 znode = get_znode(c, znode, n);
2718 if (IS_ERR(znode))
2719 return znode;
2720 }
2721 /* Check if the child is the one we are looking for */
2722 if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
2723 return get_znode(c, znode, n);
2724 /* If the key is unique, there is nowhere else to look */
2725 if (!is_hash_key(c, key))
2726 return NULL;
2727 /*
2728 * The key is not unique and so may be also in the znodes to either
2729 * side.
2730 */
2731 zn = znode;
2732 nn = n;
2733 /* Look left */
2734 while (1) {
2735 /* Move one branch to the left */
2736 if (n)
2737 n -= 1;
2738 else {
2739 znode = left_znode(c, znode);
2740 if (!znode)
2741 break;
2742 if (IS_ERR(znode))
2743 return znode;
2744 n = znode->child_cnt - 1;
2745 }
2746 /* Check it */
2747 if (znode->zbranch[n].lnum == lnum &&
2748 znode->zbranch[n].offs == offs)
2749 return get_znode(c, znode, n);
2750 /* Stop if the key is less than the one we are looking for */
2751 if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
2752 break;
2753 }
2754 /* Back to the middle */
2755 znode = zn;
2756 n = nn;
2757 /* Look right */
2758 while (1) {
2759 /* Move one branch to the right */
2760 if (++n >= znode->child_cnt) {
2761 znode = right_znode(c, znode);
2762 if (!znode)
2763 break;
2764 if (IS_ERR(znode))
2765 return znode;
2766 n = 0;
2767 }
2768 /* Check it */
2769 if (znode->zbranch[n].lnum == lnum &&
2770 znode->zbranch[n].offs == offs)
2771 return get_znode(c, znode, n);
2772 /* Stop if the key is greater than the one we are looking for */
2773 if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
2774 break;
2775 }
2776 return NULL;
2777}
2778
/**
 * is_idx_node_in_tnc - determine if an index node is in the TNC.
 * @c: UBIFS file-system description object
 * @key: key of index node
 * @level: index node level
 * @lnum: LEB number of index node
 * @offs: offset of index node
 *
 * This function returns %0 if the index node is not referred to in the TNC, %1
 * if the index node is referred to in the TNC and the corresponding znode is
 * dirty, %2 if an index node is referred to in the TNC and the corresponding
 * znode is clean, and a negative error code in case of failure.
 *
 * Note, the @key argument has to be the key of the first child. Also note,
 * this function relies on the fact that 0:0 is never a valid LEB number and
 * offset for a main-area node.
 */
int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
		       int lnum, int offs)
{
	struct ubifs_znode *found = lookup_znode(c, key, level, lnum, offs);

	if (!found)
		return 0;
	if (IS_ERR(found))
		return PTR_ERR(found);
	if (ubifs_zn_dirty(found))
		return 1;
	return 2;
}
2809
2810/**
2811 * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC.
2812 * @c: UBIFS file-system description object
2813 * @key: node key
2814 * @lnum: node LEB number
2815 * @offs: node offset
2816 *
2817 * This function returns %1 if the node is referred to in the TNC, %0 if it is
2818 * not, and a negative error code in case of failure.
2819 *
2820 * Note, this function relies on the fact that 0:0 is never a valid LEB number
2821 * and offset for a main-area node.
2822 */
2823static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
2824 int lnum, int offs)
2825{
2826 struct ubifs_zbranch *zbr;
2827 struct ubifs_znode *znode, *zn;
2828 int n, found, err, nn;
2829 const int unique = !is_hash_key(c, key);
2830
2831 found = ubifs_lookup_level0(c, key, &znode, &n);
2832 if (found < 0)
2833 return found; /* Error code */
2834 if (!found)
2835 return 0;
2836 zbr = &znode->zbranch[n];
2837 if (lnum == zbr->lnum && offs == zbr->offs)
2838 return 1; /* Found it */
2839 if (unique)
2840 return 0;
2841 /*
2842 * Because the key is not unique, we have to look left
2843 * and right as well
2844 */
2845 zn = znode;
2846 nn = n;
2847 /* Look left */
2848 while (1) {
2849 err = tnc_prev(c, &znode, &n);
2850 if (err == -ENOENT)
2851 break;
2852 if (err)
2853 return err;
2854 if (keys_cmp(c, key, &znode->zbranch[n].key))
2855 break;
2856 zbr = &znode->zbranch[n];
2857 if (lnum == zbr->lnum && offs == zbr->offs)
2858 return 1; /* Found it */
2859 }
2860 /* Look right */
2861 znode = zn;
2862 n = nn;
2863 while (1) {
2864 err = tnc_next(c, &znode, &n);
2865 if (err) {
2866 if (err == -ENOENT)
2867 return 0;
2868 return err;
2869 }
2870 if (keys_cmp(c, key, &znode->zbranch[n].key))
2871 break;
2872 zbr = &znode->zbranch[n];
2873 if (lnum == zbr->lnum && offs == zbr->offs)
2874 return 1; /* Found it */
2875 }
2876 return 0;
2877}
2878
2879/**
2880 * ubifs_tnc_has_node - determine whether a node is in the TNC.
2881 * @c: UBIFS file-system description object
2882 * @key: node key
2883 * @level: index node level (if it is an index node)
2884 * @lnum: node LEB number
2885 * @offs: node offset
2886 * @is_idx: non-zero if the node is an index node
2887 *
2888 * This function returns %1 if the node is in the TNC, %0 if it is not, and a
2889 * negative error code in case of failure. For index nodes, @key has to be the
2890 * key of the first child. An index node is considered to be in the TNC only if
2891 * the corresponding znode is clean or has not been loaded.
2892 */
2893int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
2894 int lnum, int offs, int is_idx)
2895{
2896 int err;
2897
2898 mutex_lock(&c->tnc_mutex);
2899 if (is_idx) {
2900 err = is_idx_node_in_tnc(c, key, level, lnum, offs);
2901 if (err < 0)
2902 goto out_unlock;
2903 if (err == 1)
2904 /* The index node was found but it was dirty */
2905 err = 0;
2906 else if (err == 2)
2907 /* The index node was found and it was clean */
2908 err = 1;
2909 else
2910 BUG_ON(err != 0);
2911 } else
2912 err = is_leaf_node_in_tnc(c, key, lnum, offs);
2913
2914out_unlock:
2915 mutex_unlock(&c->tnc_mutex);
2916 return err;
2917}
2918
2919/**
2920 * ubifs_dirty_idx_node - dirty an index node.
2921 * @c: UBIFS file-system description object
2922 * @key: index node key
2923 * @level: index node level
2924 * @lnum: index node LEB number
2925 * @offs: index node offset
2926 *
2927 * This function loads and dirties an index node so that it can be garbage
2928 * collected. The @key argument has to be the key of the first child. This
2929 * function relies on the fact that 0:0 is never a valid LEB number and offset
2930 * for a main-area node. Returns %0 on success and a negative error code on
2931 * failure.
2932 */
2933int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
2934 int lnum, int offs)
2935{
2936 struct ubifs_znode *znode;
2937 int err = 0;
2938
2939 mutex_lock(&c->tnc_mutex);
2940 znode = lookup_znode(c, key, level, lnum, offs);
2941 if (!znode)
2942 goto out_unlock;
2943 if (IS_ERR(znode)) {
2944 err = PTR_ERR(znode);
2945 goto out_unlock;
2946 }
2947 znode = dirty_cow_bottom_up(c, znode);
2948 if (IS_ERR(znode)) {
2949 err = PTR_ERR(znode);
2950 goto out_unlock;
2951 }
2952
2953out_unlock:
2954 mutex_unlock(&c->tnc_mutex);
2955 return err;
2956}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
new file mode 100644
index 000000000000..8117e65ba2e9
--- /dev/null
+++ b/fs/ubifs/tnc_commit.c
@@ -0,0 +1,1103 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/* This file implements TNC functions for committing */
24
25#include "ubifs.h"
26
27/**
28 * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
29 * @c: UBIFS file-system description object
30 * @idx: buffer in which to place new index node
31 * @znode: znode from which to make new index node
32 * @lnum: LEB number where new index node will be written
33 * @offs: offset where new index node will be written
34 * @len: length of new index node
35 */
36static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
37 struct ubifs_znode *znode, int lnum, int offs, int len)
38{
39 struct ubifs_znode *zp;
40 int i, err;
41
42 /* Make index node */
43 idx->ch.node_type = UBIFS_IDX_NODE;
44 idx->child_cnt = cpu_to_le16(znode->child_cnt);
45 idx->level = cpu_to_le16(znode->level);
46 for (i = 0; i < znode->child_cnt; i++) {
47 struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
48 struct ubifs_zbranch *zbr = &znode->zbranch[i];
49
50 key_write_idx(c, &zbr->key, &br->key);
51 br->lnum = cpu_to_le32(zbr->lnum);
52 br->offs = cpu_to_le32(zbr->offs);
53 br->len = cpu_to_le32(zbr->len);
54 if (!zbr->lnum || !zbr->len) {
55 ubifs_err("bad ref in znode");
56 dbg_dump_znode(c, znode);
57 if (zbr->znode)
58 dbg_dump_znode(c, zbr->znode);
59 }
60 }
61 ubifs_prepare_node(c, idx, len, 0);
62
63#ifdef CONFIG_UBIFS_FS_DEBUG
64 znode->lnum = lnum;
65 znode->offs = offs;
66 znode->len = len;
67#endif
68
69 err = insert_old_idx_znode(c, znode);
70
71 /* Update the parent */
72 zp = znode->parent;
73 if (zp) {
74 struct ubifs_zbranch *zbr;
75
76 zbr = &zp->zbranch[znode->iip];
77 zbr->lnum = lnum;
78 zbr->offs = offs;
79 zbr->len = len;
80 } else {
81 c->zroot.lnum = lnum;
82 c->zroot.offs = offs;
83 c->zroot.len = len;
84 }
85 c->calc_idx_sz += ALIGN(len, 8);
86
87 atomic_long_dec(&c->dirty_zn_cnt);
88
89 ubifs_assert(ubifs_zn_dirty(znode));
90 ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
91
92 __clear_bit(DIRTY_ZNODE, &znode->flags);
93 __clear_bit(COW_ZNODE, &znode->flags);
94
95 return err;
96}
97
98/**
99 * fill_gap - make index nodes in gaps in dirty index LEBs.
100 * @c: UBIFS file-system description object
101 * @lnum: LEB number that gap appears in
102 * @gap_start: offset of start of gap
103 * @gap_end: offset of end of gap
104 * @dirt: adds dirty space to this
105 *
106 * This function returns the number of index nodes written into the gap.
107 */
108static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
109 int *dirt)
110{
111 int len, gap_remains, gap_pos, written, pad_len;
112
113 ubifs_assert((gap_start & 7) == 0);
114 ubifs_assert((gap_end & 7) == 0);
115 ubifs_assert(gap_end >= gap_start);
116
117 gap_remains = gap_end - gap_start;
118 if (!gap_remains)
119 return 0;
120 gap_pos = gap_start;
121 written = 0;
122 while (c->enext) {
123 len = ubifs_idx_node_sz(c, c->enext->child_cnt);
124 if (len < gap_remains) {
125 struct ubifs_znode *znode = c->enext;
126 const int alen = ALIGN(len, 8);
127 int err;
128
129 ubifs_assert(alen <= gap_remains);
130 err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
131 lnum, gap_pos, len);
132 if (err)
133 return err;
134 gap_remains -= alen;
135 gap_pos += alen;
136 c->enext = znode->cnext;
137 if (c->enext == c->cnext)
138 c->enext = NULL;
139 written += 1;
140 } else
141 break;
142 }
143 if (gap_end == c->leb_size) {
144 c->ileb_len = ALIGN(gap_pos, c->min_io_size);
145 /* Pad to end of min_io_size */
146 pad_len = c->ileb_len - gap_pos;
147 } else
148 /* Pad to end of gap */
149 pad_len = gap_remains;
150 dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
151 lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
152 ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
153 *dirt += pad_len;
154 return written;
155}
156
157/**
158 * find_old_idx - find an index node obsoleted since the last commit start.
159 * @c: UBIFS file-system description object
160 * @lnum: LEB number of obsoleted index node
161 * @offs: offset of obsoleted index node
162 *
163 * Returns %1 if found and %0 otherwise.
164 */
165static int find_old_idx(struct ubifs_info *c, int lnum, int offs)
166{
167 struct ubifs_old_idx *o;
168 struct rb_node *p;
169
170 p = c->old_idx.rb_node;
171 while (p) {
172 o = rb_entry(p, struct ubifs_old_idx, rb);
173 if (lnum < o->lnum)
174 p = p->rb_left;
175 else if (lnum > o->lnum)
176 p = p->rb_right;
177 else if (offs < o->offs)
178 p = p->rb_left;
179 else if (offs > o->offs)
180 p = p->rb_right;
181 else
182 return 1;
183 }
184 return 0;
185}
186
/**
 * is_idx_node_in_use - determine if an index node can be overwritten.
 * @c: UBIFS file-system description object
 * @key: key of index node
 * @level: index node level
 * @lnum: LEB number of index node
 * @offs: offset of index node
 *
 * If @key / @lnum / @offs identify an index node that was not part of the old
 * index, then this function returns %0 (obsolete). Else if the index node was
 * part of the old index but is now dirty %1 is returned, else if it is clean %2
 * is returned. A negative error code is returned on failure.
 */
static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
			      int level, int lnum, int offs)
{
	int state = is_idx_node_in_tnc(c, key, level, lnum, offs);

	/* Errors and positive TNC answers are passed through as they are */
	if (state != 0)
		return state;

	/* Not in the TNC: it is still in use if the old index refers to it */
	return find_old_idx(c, lnum, offs) ? 1 : 0;
}
213
214/**
215 * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
216 * @c: UBIFS file-system description object
217 * @p: return LEB number here
218 *
219 * This function lays out new index nodes for dirty znodes using in-the-gaps
220 * method of TNC commit.
221 * This function merely puts the next znode into the next gap, making no attempt
222 * to try to maximise the number of znodes that fit.
223 * This function returns the number of index nodes written into the gaps, or a
224 * negative error code on failure.
225 */
226static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
227{
228 struct ubifs_scan_leb *sleb;
229 struct ubifs_scan_node *snod;
230 int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;
231
232 tot_written = 0;
233 /* Get an index LEB with lots of obsolete index nodes */
234 lnum = ubifs_find_dirty_idx_leb(c);
235 if (lnum < 0)
236 /*
237 * There also may be dirt in the index head that could be
238 * filled, however we do not check there at present.
239 */
240 return lnum; /* Error code */
241 *p = lnum;
242 dbg_gc("LEB %d", lnum);
243 /*
244 * Scan the index LEB. We use the generic scan for this even though
245 * it is more comprehensive and less efficient than is needed for this
246 * purpose.
247 */
248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
249 c->ileb_len = 0;
250 if (IS_ERR(sleb))
251 return PTR_ERR(sleb);
252 gap_start = 0;
253 list_for_each_entry(snod, &sleb->nodes, list) {
254 struct ubifs_idx_node *idx;
255 int in_use, level;
256
257 ubifs_assert(snod->type == UBIFS_IDX_NODE);
258 idx = snod->node;
259 key_read(c, ubifs_idx_key(c, idx), &snod->key);
260 level = le16_to_cpu(idx->level);
261 /* Determine if the index node is in use (not obsolete) */
262 in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
263 snod->offs);
264 if (in_use < 0) {
265 ubifs_scan_destroy(sleb);
266 return in_use; /* Error code */
267 }
268 if (in_use) {
269 if (in_use == 1)
270 dirt += ALIGN(snod->len, 8);
271 /*
272 * The obsolete index nodes form gaps that can be
273 * overwritten. This gap has ended because we have
274 * found an index node that is still in use
275 * i.e. not obsolete
276 */
277 gap_end = snod->offs;
278 /* Try to fill gap */
279 written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
280 if (written < 0) {
281 ubifs_scan_destroy(sleb);
282 return written; /* Error code */
283 }
284 tot_written += written;
285 gap_start = ALIGN(snod->offs + snod->len, 8);
286 }
287 }
288 ubifs_scan_destroy(sleb);
289 c->ileb_len = c->leb_size;
290 gap_end = c->leb_size;
291 /* Try to fill gap */
292 written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
293 if (written < 0)
294 return written; /* Error code */
295 tot_written += written;
296 if (tot_written == 0) {
297 struct ubifs_lprops lp;
298
299 dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
300 err = ubifs_read_one_lp(c, lnum, &lp);
301 if (err)
302 return err;
303 if (lp.free == c->leb_size) {
304 /*
305 * We must have snatched this LEB from the idx_gc list
306 * so we need to correct the free and dirty space.
307 */
308 err = ubifs_change_one_lp(c, lnum,
309 c->leb_size - c->ileb_len,
310 dirt, 0, 0, 0);
311 if (err)
312 return err;
313 }
314 return 0;
315 }
316 err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
317 0, 0, 0);
318 if (err)
319 return err;
320 err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
321 UBI_SHORTTERM);
322 if (err)
323 return err;
324 dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
325 return tot_written;
326}
327
328/**
329 * get_leb_cnt - calculate the number of empty LEBs needed to commit.
330 * @c: UBIFS file-system description object
331 * @cnt: number of znodes to commit
332 *
333 * This function returns the number of empty LEBs needed to commit @cnt znodes
334 * to the current index head. The number is not exact and may be more than
335 * needed.
336 */
337static int get_leb_cnt(struct ubifs_info *c, int cnt)
338{
339 int d;
340
341 /* Assume maximum index node size (i.e. overestimate space needed) */
342 cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz;
343 if (cnt < 0)
344 cnt = 0;
345 d = c->leb_size / c->max_idx_node_sz;
346 return DIV_ROUND_UP(cnt, d);
347}
348
349/**
350 * layout_in_gaps - in-the-gaps method of committing TNC.
351 * @c: UBIFS file-system description object
352 * @cnt: number of dirty znodes to commit.
353 *
354 * This function lays out new index nodes for dirty znodes using in-the-gaps
355 * method of TNC commit.
356 *
357 * This function returns %0 on success and a negative error code on failure.
358 */
359static int layout_in_gaps(struct ubifs_info *c, int cnt)
360{
361 int err, leb_needed_cnt, written, *p;
362
363 dbg_gc("%d znodes to write", cnt);
364
365 c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS);
366 if (!c->gap_lebs)
367 return -ENOMEM;
368
369 p = c->gap_lebs;
370 do {
371 ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
372 written = layout_leb_in_gaps(c, p);
373 if (written < 0) {
374 err = written;
375 if (err == -ENOSPC) {
376 if (!dbg_force_in_the_gaps_enabled) {
377 /*
378 * Do not print scary warnings if the
379 * debugging option which forces
380 * in-the-gaps is enabled.
381 */
382 ubifs_err("out of space");
383 spin_lock(&c->space_lock);
384 dbg_dump_budg(c);
385 spin_unlock(&c->space_lock);
386 dbg_dump_lprops(c);
387 }
388 /* Try to commit anyway */
389 err = 0;
390 break;
391 }
392 kfree(c->gap_lebs);
393 c->gap_lebs = NULL;
394 return err;
395 }
396 p++;
397 cnt -= written;
398 leb_needed_cnt = get_leb_cnt(c, cnt);
399 dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
400 leb_needed_cnt, c->ileb_cnt);
401 } while (leb_needed_cnt > c->ileb_cnt);
402
403 *p = -1;
404 return 0;
405}
406
407/**
408 * layout_in_empty_space - layout index nodes in empty space.
409 * @c: UBIFS file-system description object
410 *
411 * This function lays out new index nodes for dirty znodes using empty LEBs.
 * It walks the circular commit list starting at @c->enext and, for every znode,
 * stores the chosen LEB number, offset and length in the parent's zbranch (or
 * in @c->zroot for a znode without a parent). Positions start at the index head
 * (@c->ihead_lnum, @c->ihead_offs) and continue in the LEBs listed in
 * @c->ilebs. This function does not build or write any index node itself.
412 *
413 * This function returns %0 on success and a negative error code on failure.
 * %-ENOSPC is returned when a new LEB is needed but @c->ilebs is exhausted.
414 */
415static int layout_in_empty_space(struct ubifs_info *c)
416{
417 struct ubifs_znode *znode, *cnext, *zp;
418 int lnum, offs, len, next_len, buf_len, buf_offs, used, avail;
419 int wlen, blen, err;
420
421 cnext = c->enext;
422 if (!cnext)
423 return 0;
424
425 lnum = c->ihead_lnum;
426 buf_offs = c->ihead_offs;
427
 /*
  * Size of the virtual buffer used for grouping: one index node with
  * c->fanout children, rounded up to the minimum I/O unit.
  */
428 buf_len = ubifs_idx_node_sz(c, c->fanout);
429 buf_len = ALIGN(buf_len, c->min_io_size);
430 used = 0;
431 avail = buf_len;
432
433 /* Ensure there is enough room for first write */
434 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
 /* lnum == -1 means "take the next LEB from c->ilebs" (see loop top) */
435 if (buf_offs + next_len > c->leb_size)
436 lnum = -1;
437
438 while (1) {
439 znode = cnext;
440
441 len = ubifs_idx_node_sz(c, znode->child_cnt);
442
443 /* Determine the index node position */
444 if (lnum == -1) {
445 if (c->ileb_nxt >= c->ileb_cnt) {
446 ubifs_err("out of space");
447 return -ENOSPC;
448 }
449 lnum = c->ilebs[c->ileb_nxt++];
450 buf_offs = 0;
451 used = 0;
452 avail = buf_len;
453 }
454
455 offs = buf_offs + used;
456
 /* Remembered so that write_index() can cross-check the position */
457#ifdef CONFIG_UBIFS_FS_DEBUG
458 znode->lnum = lnum;
459 znode->offs = offs;
460 znode->len = len;
461#endif
462
463 /* Update the parent */
464 zp = znode->parent;
465 if (zp) {
466 struct ubifs_zbranch *zbr;
467 int i;
468
469 i = znode->iip;
470 zbr = &zp->zbranch[i];
471 zbr->lnum = lnum;
472 zbr->offs = offs;
473 zbr->len = len;
474 } else {
475 c->zroot.lnum = lnum;
476 c->zroot.offs = offs;
477 c->zroot.len = len;
478 }
479 c->calc_idx_sz += ALIGN(len, 8);
480
481 /*
482 * Once lprops is updated, we can decrease the dirty znode count
483 * but it is easier to just do it here.
484 */
485 atomic_long_dec(&c->dirty_zn_cnt);
486
487 /*
488 * Calculate the next index node length to see if there is
489 * enough room for it
490 */
491 cnext = znode->cnext;
 /* The list is circular: reaching c->cnext again means we are done */
492 if (cnext == c->cnext)
493 next_len = 0;
494 else
495 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
496
 /*
  * With a minimum I/O unit of 1 there is no grouping: nodes are
  * simply packed at 8-byte aligned offsets.
  */
497 if (c->min_io_size == 1) {
498 buf_offs += ALIGN(len, 8);
499 if (next_len) {
500 if (buf_offs + next_len <= c->leb_size)
501 continue;
 /*
  * NOTE(review): the two values after lnum are presumably
  * the LEB's free and dirty space - confirm against
  * ubifs_update_one_lp().
  */
502 err = ubifs_update_one_lp(c, lnum, 0,
503 c->leb_size - buf_offs, 0, 0);
504 if (err)
505 return err;
506 lnum = -1;
507 continue;
508 }
509 err = ubifs_update_one_lp(c, lnum,
510 c->leb_size - buf_offs, 0, 0, 0);
511 if (err)
512 return err;
513 break;
514 }
515
516 /* Update buffer positions */
517 wlen = used + len;
518 used += ALIGN(len, 8);
519 avail -= ALIGN(len, 8);
520
 /* Keep accumulating while the next node fits and the buffer has room */
521 if (next_len != 0 &&
522 buf_offs + used + next_len <= c->leb_size &&
523 avail > 0)
524 continue;
525
 /*
  * Either a whole buffer is consumed (buffer full, more nodes to come
  * in this LEB) or only the used part rounded up to min_io_size.
  */
526 if (avail <= 0 && next_len &&
527 buf_offs + used + next_len <= c->leb_size)
528 blen = buf_len;
529 else
530 blen = ALIGN(wlen, c->min_io_size);
531
532 /* The buffer is full or there are no more znodes to do */
533 buf_offs += blen;
534 if (next_len) {
535 if (buf_offs + next_len > c->leb_size) {
536 err = ubifs_update_one_lp(c, lnum,
537 c->leb_size - buf_offs, blen - used,
538 0, 0);
539 if (err)
540 return err;
541 lnum = -1;
542 }
 /* Carry over whatever extends beyond the consumed blen bytes */
543 used -= blen;
544 if (used < 0)
545 used = 0;
546 avail = buf_len - used;
547 continue;
548 }
549 err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs,
550 blen - used, 0, 0);
551 if (err)
552 return err;
553 break;
554 }
555
 /* Expected index head position after write_index(), for its self-check */
556#ifdef CONFIG_UBIFS_FS_DEBUG
557 c->new_ihead_lnum = lnum;
558 c->new_ihead_offs = buf_offs;
559#endif
560
561 return 0;
562}
563
/**
 * layout_commit - determine positions of index nodes to commit.
 * @c: UBIFS file-system description object
 * @no_space: indicates that insufficient empty LEBs were allocated
 * @cnt: number of znodes to commit
 *
 * Calculate and update the positions of index nodes to commit. If there were
 * an insufficient number of empty LEBs allocated, then index nodes are first
 * placed into the gaps created by obsolete index nodes in non-empty index
 * LEBs. For this purpose, an obsolete index node is one that was not in the
 * index as at the end of the last commit. To write "in-the-gaps" requires that
 * those index LEBs are updated atomically in-place. Whatever is left is then
 * laid out in empty space.
 *
 * Returns %0 on success and a negative error code on failure.
 */
static int layout_commit(struct ubifs_info *c, int no_space, int cnt)
{
	if (no_space) {
		int err = layout_in_gaps(c, cnt);

		if (err)
			return err;
	}
	return layout_in_empty_space(c);
}
589
590/**
591 * find_first_dirty - find first dirty znode.
592 * @znode: znode to begin searching from
593 */
594static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode)
595{
596 int i, cont;
597
598 if (!znode)
599 return NULL;
600
601 while (1) {
602 if (znode->level == 0) {
603 if (ubifs_zn_dirty(znode))
604 return znode;
605 return NULL;
606 }
607 cont = 0;
608 for (i = 0; i < znode->child_cnt; i++) {
609 struct ubifs_zbranch *zbr = &znode->zbranch[i];
610
611 if (zbr->znode && ubifs_zn_dirty(zbr->znode)) {
612 znode = zbr->znode;
613 cont = 1;
614 break;
615 }
616 }
617 if (!cont) {
618 if (ubifs_zn_dirty(znode))
619 return znode;
620 return NULL;
621 }
622 }
623}
624
625/**
626 * find_next_dirty - find next dirty znode.
627 * @znode: znode to begin searching from
628 */
629static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode)
630{
631 int n = znode->iip + 1;
632
633 znode = znode->parent;
634 if (!znode)
635 return NULL;
636 for (; n < znode->child_cnt; n++) {
637 struct ubifs_zbranch *zbr = &znode->zbranch[n];
638
639 if (zbr->znode && ubifs_zn_dirty(zbr->znode))
640 return find_first_dirty(zbr->znode);
641 }
642 return znode;
643}
644
645/**
646 * get_znodes_to_commit - create list of dirty znodes to commit.
647 * @c: UBIFS file-system description object
648 *
649 * This function returns the number of znodes to commit.
650 */
651static int get_znodes_to_commit(struct ubifs_info *c)
652{
653 struct ubifs_znode *znode, *cnext;
654 int cnt = 0;
655
656 c->cnext = find_first_dirty(c->zroot.znode);
657 znode = c->enext = c->cnext;
658 if (!znode) {
659 dbg_cmt("no znodes to commit");
660 return 0;
661 }
662 cnt += 1;
663 while (1) {
664 ubifs_assert(!test_bit(COW_ZNODE, &znode->flags));
665 __set_bit(COW_ZNODE, &znode->flags);
666 znode->alt = 0;
667 cnext = find_next_dirty(znode);
668 if (!cnext) {
669 znode->cnext = c->cnext;
670 break;
671 }
672 znode->cnext = cnext;
673 znode = cnext;
674 cnt += 1;
675 }
676 dbg_cmt("committing %d znodes", cnt);
677 ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt));
678 return cnt;
679}
680
681/**
682 * alloc_idx_lebs - allocate empty LEBs to be used to commit.
683 * @c: UBIFS file-system description object
684 * @cnt: number of znodes to commit
685 *
686 * This function returns %-ENOSPC if it cannot allocate a sufficient number of
687 * empty LEBs. %0 is returned on success, otherwise a negative error code
688 * is returned.
689 */
690static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
691{
692 int i, leb_cnt, lnum;
693
694 c->ileb_cnt = 0;
695 c->ileb_nxt = 0;
696 leb_cnt = get_leb_cnt(c, cnt);
697 dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt);
698 if (!leb_cnt)
699 return 0;
700 c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS);
701 if (!c->ilebs)
702 return -ENOMEM;
703 for (i = 0; i < leb_cnt; i++) {
704 lnum = ubifs_find_free_leb_for_idx(c);
705 if (lnum < 0)
706 return lnum;
707 c->ilebs[c->ileb_cnt++] = lnum;
708 dbg_cmt("LEB %d", lnum);
709 }
710 if (dbg_force_in_the_gaps())
711 return -ENOSPC;
712 return 0;
713}
714
715/**
716 * free_unused_idx_lebs - free unused LEBs that were allocated for the commit.
717 * @c: UBIFS file-system description object
718 *
719 * It is possible that we allocate more empty LEBs for the commit than we need.
720 * This functions frees the surplus.
721 *
722 * This function returns %0 on success and a negative error code on failure.
723 */
724static int free_unused_idx_lebs(struct ubifs_info *c)
725{
726 int i, err = 0, lnum, er;
727
728 for (i = c->ileb_nxt; i < c->ileb_cnt; i++) {
729 lnum = c->ilebs[i];
730 dbg_cmt("LEB %d", lnum);
731 er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
732 LPROPS_INDEX | LPROPS_TAKEN, 0);
733 if (!err)
734 err = er;
735 }
736 return err;
737}
738
739/**
740 * free_idx_lebs - free unused LEBs after commit end.
741 * @c: UBIFS file-system description object
742 *
743 * This function returns %0 on success and a negative error code on failure.
744 */
745static int free_idx_lebs(struct ubifs_info *c)
746{
747 int err;
748
749 err = free_unused_idx_lebs(c);
750 kfree(c->ilebs);
751 c->ilebs = NULL;
752 return err;
753}
754
755/**
756 * ubifs_tnc_start_commit - start TNC commit.
757 * @c: UBIFS file-system description object
758 * @zroot: new index root position is returned here
759 *
760 * This function prepares the list of indexing nodes to commit and lays out
761 * their positions on flash. If there is not enough free space it uses the
762 * in-gap commit method. Returns zero in case of success and a negative error
763 * code in case of failure.
764 */
765int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
766{
767 int err = 0, cnt;
768
769 mutex_lock(&c->tnc_mutex);
770 err = dbg_check_tnc(c, 1);
771 if (err)
772 goto out;
773 cnt = get_znodes_to_commit(c);
774 if (cnt != 0) {
775 int no_space = 0;
776
777 err = alloc_idx_lebs(c, cnt);
778 if (err == -ENOSPC)
779 no_space = 1;
780 else if (err)
781 goto out_free;
782 err = layout_commit(c, no_space, cnt);
783 if (err)
784 goto out_free;
785 ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
786 err = free_unused_idx_lebs(c);
787 if (err)
788 goto out;
789 }
790 destroy_old_idx(c);
791 memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch));
792
793 err = ubifs_save_dirty_idx_lnums(c);
794 if (err)
795 goto out;
796
797 spin_lock(&c->space_lock);
798 /*
799 * Although we have not finished committing yet, update size of the
800 * committed index ('c->old_idx_sz') and zero out the index growth
801 * budget. It is OK to do this now, because we've reserved all the
802 * space which is needed to commit the index, and it is save for the
803 * budgeting subsystem to assume the index is already committed,
804 * even though it is not.
805 */
806 c->old_idx_sz = c->calc_idx_sz;
807 c->budg_uncommitted_idx = 0;
808 spin_unlock(&c->space_lock);
809 mutex_unlock(&c->tnc_mutex);
810
811 dbg_cmt("number of index LEBs %d", c->lst.idx_lebs);
812 dbg_cmt("size of index %llu", c->calc_idx_sz);
813 return err;
814
815out_free:
816 free_idx_lebs(c);
817out:
818 mutex_unlock(&c->tnc_mutex);
819 return err;
820}
821
822/**
823 * write_index - write index nodes.
824 * @c: UBIFS file-system description object
825 *
826 * This function writes the index nodes whose positions were laid out in the
827 * layout_in_empty_space function.
 *
 * Index nodes are built one after another in the commit buffer (@c->cbuf) from
 * the znodes on the commit list starting at @c->enext, and written starting at
 * the index head and continuing in the LEBs listed in @c->ilebs. Each znode has
 * its dirty and copy-on-write flags cleared once its index node is prepared.
 * On success @c->ihead_lnum and @c->ihead_offs are updated.
 *
 * Returns %0 on success, otherwise the error code of the failing helper, or
 * %-EINVAL if a debugging consistency check fails.
828 */
829static int write_index(struct ubifs_info *c)
830{
831 struct ubifs_idx_node *idx;
832 struct ubifs_znode *znode, *cnext;
833 int i, lnum, offs, len, next_len, buf_len, buf_offs, used;
834 int avail, wlen, err, lnum_pos = 0;
835
836 cnext = c->enext;
837 if (!cnext)
838 return 0;
839
840 /*
841 * Always write index nodes to the index head so that index nodes and
842 * other types of nodes are never mixed in the same erase block.
843 */
844 lnum = c->ihead_lnum;
845 buf_offs = c->ihead_offs;
846
847 /* Allocate commit buffer */
 /* (no allocation happens here: only the usable length is computed) */
848 buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size);
849 used = 0;
850 avail = buf_len;
851
852 /* Ensure there is enough room for first write */
853 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
854 if (buf_offs + next_len > c->leb_size) {
855 err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0,
856 LPROPS_TAKEN);
857 if (err)
858 return err;
 /* -1 means "take the next LEB from c->ilebs" (see below) */
859 lnum = -1;
860 }
861
862 while (1) {
863 cond_resched();
864
865 znode = cnext;
 /* Build the node right after the bytes already pending in the buffer */
866 idx = c->cbuf + used;
867
868 /* Make index node */
869 idx->ch.node_type = UBIFS_IDX_NODE;
870 idx->child_cnt = cpu_to_le16(znode->child_cnt);
871 idx->level = cpu_to_le16(znode->level);
872 for (i = 0; i < znode->child_cnt; i++) {
873 struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
874 struct ubifs_zbranch *zbr = &znode->zbranch[i];
875
876 key_write_idx(c, &zbr->key, &br->key);
877 br->lnum = cpu_to_le32(zbr->lnum);
878 br->offs = cpu_to_le32(zbr->offs);
879 br->len = cpu_to_le32(zbr->len);
 /* Only reported; the node is still written */
880 if (!zbr->lnum || !zbr->len) {
881 ubifs_err("bad ref in znode");
882 dbg_dump_znode(c, znode);
883 if (zbr->znode)
884 dbg_dump_znode(c, zbr->znode);
885 }
886 }
887 len = ubifs_idx_node_sz(c, znode->child_cnt);
888 ubifs_prepare_node(c, idx, len, 0);
889
890 /* Determine the index node position */
891 if (lnum == -1) {
 /* lnum_pos walks c->ilebs from its first entry */
892 lnum = c->ilebs[lnum_pos++];
893 buf_offs = 0;
894 used = 0;
895 avail = buf_len;
896 }
897 offs = buf_offs + used;
898
 /* Cross-check against the position recorded at layout time */
899#ifdef CONFIG_UBIFS_FS_DEBUG
900 if (lnum != znode->lnum || offs != znode->offs ||
901 len != znode->len) {
902 ubifs_err("inconsistent znode posn");
903 return -EINVAL;
904 }
905#endif
906
907 /* Grab some stuff from znode while we still can */
908 cnext = znode->cnext;
909
910 ubifs_assert(ubifs_zn_dirty(znode));
911 ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
912
913 /*
914 * It is important that other threads should see %DIRTY_ZNODE
915 * flag cleared before %COW_ZNODE. Specifically, it matters in
916 * the 'dirty_cow_znode()' function. This is the reason for the
917 * first barrier. Also, we want the bit changes to be seen to
918 * other threads ASAP, to avoid unnecessary copying, which is
919 * the reason for the second barrier.
920 */
921 clear_bit(DIRTY_ZNODE, &znode->flags);
922 smp_mb__before_clear_bit();
923 clear_bit(COW_ZNODE, &znode->flags);
924 smp_mb__after_clear_bit();
925
926 /* Do not access znode from this point on */
927
928 /* Update buffer positions */
929 wlen = used + len;
930 used += ALIGN(len, 8);
931 avail -= ALIGN(len, 8);
932
933 /*
934 * Calculate the next index node length to see if there is
935 * enough room for it
936 */
 /* The list is circular: reaching c->cnext again means we are done */
937 if (cnext == c->cnext)
938 next_len = 0;
939 else
940 next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
941
942 if (c->min_io_size == 1) {
943 /*
944 * Write the prepared index node immediately if there is
945 * no minimum IO size
946 */
947 err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
948 wlen, UBI_SHORTTERM);
949 if (err)
950 return err;
951 buf_offs += ALIGN(wlen, 8);
952 if (next_len) {
953 used = 0;
954 avail = buf_len;
955 if (buf_offs + next_len > c->leb_size) {
956 err = ubifs_update_one_lp(c, lnum,
957 LPROPS_NC, 0, 0, LPROPS_TAKEN);
958 if (err)
959 return err;
960 lnum = -1;
961 }
962 continue;
963 }
964 } else {
965 int blen, nxt_offs = buf_offs + used + next_len;
966
967 if (next_len && nxt_offs <= c->leb_size) {
 /* Next node fits in this LEB: keep filling the buffer if it has room */
968 if (avail > 0)
969 continue;
970 else
971 blen = buf_len;
972 } else {
 /* Last write to this LEB: pad up to the minimum I/O unit */
973 wlen = ALIGN(wlen, 8);
974 blen = ALIGN(wlen, c->min_io_size);
975 ubifs_pad(c, c->cbuf + wlen, blen - wlen);
976 }
977 /*
978 * The buffer is full or there are no more znodes
979 * to do
980 */
981 err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
982 blen, UBI_SHORTTERM);
983 if (err)
984 return err;
985 buf_offs += blen;
986 if (next_len) {
987 if (nxt_offs > c->leb_size) {
988 err = ubifs_update_one_lp(c, lnum,
989 LPROPS_NC, 0, 0, LPROPS_TAKEN);
990 if (err)
991 return err;
992 lnum = -1;
993 }
 /* Keep the bytes beyond blen: move them to the buffer start */
994 used -= blen;
995 if (used < 0)
996 used = 0;
997 avail = buf_len - used;
998 memmove(c->cbuf, c->cbuf + blen, used);
999 continue;
1000 }
1001 }
1002 break;
1003 }
1004
 /* The final position must match what layout_in_empty_space() predicted */
1005#ifdef CONFIG_UBIFS_FS_DEBUG
1006 if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
1007 ubifs_err("inconsistent ihead");
1008 return -EINVAL;
1009 }
1010#endif
1011
1012 c->ihead_lnum = lnum;
1013 c->ihead_offs = buf_offs;
1014
1015 return 0;
1016}
1017
1018/**
1019 * free_obsolete_znodes - free obsolete znodes.
1020 * @c: UBIFS file-system description object
1021 *
1022 * At the end of commit end, obsolete znodes are freed.
1023 */
1024static void free_obsolete_znodes(struct ubifs_info *c)
1025{
1026 struct ubifs_znode *znode, *cnext;
1027
1028 cnext = c->cnext;
1029 do {
1030 znode = cnext;
1031 cnext = znode->cnext;
1032 if (test_bit(OBSOLETE_ZNODE, &znode->flags))
1033 kfree(znode);
1034 else {
1035 znode->cnext = NULL;
1036 atomic_long_inc(&c->clean_zn_cnt);
1037 atomic_long_inc(&ubifs_clean_zn_cnt);
1038 }
1039 } while (cnext != c->cnext);
1040}
1041
1042/**
1043 * return_gap_lebs - return LEBs used by the in-gap commit method.
1044 * @c: UBIFS file-system description object
1045 *
1046 * This function clears the "taken" flag for the LEBs which were used by the
1047 * "commit in-the-gaps" method.
1048 */
1049static int return_gap_lebs(struct ubifs_info *c)
1050{
1051 int *p, err;
1052
1053 if (!c->gap_lebs)
1054 return 0;
1055
1056 dbg_cmt("");
1057 for (p = c->gap_lebs; *p != -1; p++) {
1058 err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0,
1059 LPROPS_TAKEN, 0);
1060 if (err)
1061 return err;
1062 }
1063
1064 kfree(c->gap_lebs);
1065 c->gap_lebs = NULL;
1066 return 0;
1067}
1068
1069/**
1070 * ubifs_tnc_end_commit - update the TNC for commit end.
1071 * @c: UBIFS file-system description object
1072 *
1073 * Write the dirty znodes.
1074 */
1075int ubifs_tnc_end_commit(struct ubifs_info *c)
1076{
1077 int err;
1078
1079 if (!c->cnext)
1080 return 0;
1081
1082 err = return_gap_lebs(c);
1083 if (err)
1084 return err;
1085
1086 err = write_index(c);
1087 if (err)
1088 return err;
1089
1090 mutex_lock(&c->tnc_mutex);
1091
1092 dbg_cmt("TNC height is %d", c->zroot.znode->level + 1);
1093
1094 free_obsolete_znodes(c);
1095
1096 c->cnext = NULL;
1097 kfree(c->ilebs);
1098 c->ilebs = NULL;
1099
1100 mutex_unlock(&c->tnc_mutex);
1101
1102 return 0;
1103}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
new file mode 100644
index 000000000000..a25c1cc1f8d9
--- /dev/null
+++ b/fs/ubifs/tnc_misc.c
@@ -0,0 +1,494 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Adrian Hunter
20 * Artem Bityutskiy (Битюцкий Артём)
21 */
22
23/*
24 * This file contains miscellaneous TNC-related functions shared between
25 * different files. This file does not form any logically separate TNC
26 * sub-system. The file was created because there is a lot of TNC code and
27 * putting it all in one file would make that file too big and unreadable.
28 */
29
30#include "ubifs.h"
31
32/**
33 * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal.
34 * @zr: root of the subtree to traverse
35 * @znode: previous znode
36 *
37 * This function implements levelorder TNC traversal. The LNC is ignored.
38 * Returns the next element or %NULL if @znode is already the last one.
39 */
40struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
41 struct ubifs_znode *znode)
42{
43 int level, iip, level_search = 0;
44 struct ubifs_znode *zn;
45
46 ubifs_assert(zr);
47
48 if (unlikely(!znode))
49 return zr;
50
51 if (unlikely(znode == zr)) {
52 if (znode->level == 0)
53 return NULL;
54 return ubifs_tnc_find_child(zr, 0);
55 }
56
57 level = znode->level;
58
59 iip = znode->iip;
60 while (1) {
61 ubifs_assert(znode->level <= zr->level);
62
63 /*
64 * First walk up until there is a znode with next branch to
65 * look at.
66 */
67 while (znode->parent != zr && iip >= znode->parent->child_cnt) {
68 znode = znode->parent;
69 iip = znode->iip;
70 }
71
72 if (unlikely(znode->parent == zr &&
73 iip >= znode->parent->child_cnt)) {
74 /* This level is done, switch to the lower one */
75 level -= 1;
76 if (level_search || level < 0)
77 /*
78 * We were already looking for znode at lower
79 * level ('level_search'). As we are here
80 * again, it just does not exist. Or all levels
81 * were finished ('level < 0').
82 */
83 return NULL;
84
85 level_search = 1;
86 iip = -1;
87 znode = ubifs_tnc_find_child(zr, 0);
88 ubifs_assert(znode);
89 }
90
91 /* Switch to the next index */
92 zn = ubifs_tnc_find_child(znode->parent, iip + 1);
93 if (!zn) {
94 /* No more children to look at, we have walk up */
95 iip = znode->parent->child_cnt;
96 continue;
97 }
98
99 /* Walk back down to the level we came from ('level') */
100 while (zn->level != level) {
101 znode = zn;
102 zn = ubifs_tnc_find_child(zn, 0);
103 if (!zn) {
104 /*
105 * This path is not too deep so it does not
106 * reach 'level'. Try next path.
107 */
108 iip = znode->iip;
109 break;
110 }
111 }
112
113 if (zn) {
114 ubifs_assert(zn->level >= 0);
115 return zn;
116 }
117 }
118}
119
120/**
121 * ubifs_search_zbranch - search znode branch.
122 * @c: UBIFS file-system description object
123 * @znode: znode to search in
124 * @key: key to search for
125 * @n: znode branch slot number is returned here
126 *
127 * This is a helper function which search branch with key @key in @znode using
128 * binary search. The result of the search may be:
129 * o exact match, then %1 is returned, and the slot number of the branch is
130 * stored in @n;
131 * o no exact match, then %0 is returned and the slot number of the left
132 * closest branch is returned in @n; the slot if all keys in this znode are
133 * greater than @key, then %-1 is returned in @n.
134 */
135int ubifs_search_zbranch(const struct ubifs_info *c,
136 const struct ubifs_znode *znode,
137 const union ubifs_key *key, int *n)
138{
139 int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
140 int uninitialized_var(cmp);
141 const struct ubifs_zbranch *zbr = &znode->zbranch[0];
142
143 ubifs_assert(end > beg);
144
145 while (end > beg) {
146 mid = (beg + end) >> 1;
147 cmp = keys_cmp(c, key, &zbr[mid].key);
148 if (cmp > 0)
149 beg = mid + 1;
150 else if (cmp < 0)
151 end = mid;
152 else {
153 *n = mid;
154 return 1;
155 }
156 }
157
158 *n = end - 1;
159
160 /* The insert point is after *n */
161 ubifs_assert(*n >= -1 && *n < znode->child_cnt);
162 if (*n == -1)
163 ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0);
164 else
165 ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0);
166 if (*n + 1 < znode->child_cnt)
167 ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0);
168
169 return 0;
170}
171
172/**
173 * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal.
174 * @znode: znode to start at (root of the sub-tree to traverse)
175 *
176 * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is
177 * ignored.
178 */
179struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode)
180{
181 if (unlikely(!znode))
182 return NULL;
183
184 while (znode->level > 0) {
185 struct ubifs_znode *child;
186
187 child = ubifs_tnc_find_child(znode, 0);
188 if (!child)
189 return znode;
190 znode = child;
191 }
192
193 return znode;
194}
195
196/**
197 * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal.
198 * @znode: previous znode
199 *
200 * This function implements postorder TNC traversal. The LNC is ignored.
201 * Returns the next element or %NULL if @znode is already the last one.
202 */
203struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode)
204{
205 struct ubifs_znode *zn;
206
207 ubifs_assert(znode);
208 if (unlikely(!znode->parent))
209 return NULL;
210
211 /* Switch to the next index in the parent */
212 zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1);
213 if (!zn)
214 /* This is in fact the last child, return parent */
215 return znode->parent;
216
217 /* Go to the first znode in this new subtree */
218 return ubifs_tnc_postorder_first(zn);
219}
220
221/**
222 * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree.
223 * @znode: znode defining subtree to destroy
224 *
225 * This function destroys subtree of the TNC tree. Returns number of clean
226 * znodes in the subtree.
227 */
228long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode)
229{
230 struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode);
231 long clean_freed = 0;
232 int n;
233
234 ubifs_assert(zn);
235 while (1) {
236 for (n = 0; n < zn->child_cnt; n++) {
237 if (!zn->zbranch[n].znode)
238 continue;
239
240 if (zn->level > 0 &&
241 !ubifs_zn_dirty(zn->zbranch[n].znode))
242 clean_freed += 1;
243
244 cond_resched();
245 kfree(zn->zbranch[n].znode);
246 }
247
248 if (zn == znode) {
249 if (!ubifs_zn_dirty(zn))
250 clean_freed += 1;
251 kfree(zn);
252 return clean_freed;
253 }
254
255 zn = ubifs_tnc_postorder_next(zn);
256 }
257}
258
259/**
260 * read_znode - read an indexing node from flash and fill znode.
261 * @c: UBIFS file-system description object
262 * @lnum: LEB of the indexing node to read
263 * @offs: node offset
264 * @len: node length
265 * @znode: znode to read to
266 *
267 * This function reads an indexing node from the flash media and fills znode
268 * with the read data. Returns zero in case of success and a negative error
269 * code in case of failure. The read indexing node is validated and if anything
270 * is wrong with it, this function prints complaint messages and returns
271 * %-EINVAL.
272 */
273static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
274 struct ubifs_znode *znode)
275{
276 int i, err, type, cmp;
277 struct ubifs_idx_node *idx;
278
279 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
280 if (!idx)
281 return -ENOMEM;
282
283 err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
284 if (err < 0) {
285 kfree(idx);
286 return err;
287 }
288
289 znode->child_cnt = le16_to_cpu(idx->child_cnt);
290 znode->level = le16_to_cpu(idx->level);
291
292 dbg_tnc("LEB %d:%d, level %d, %d branch",
293 lnum, offs, znode->level, znode->child_cnt);
294
295 if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
296 dbg_err("current fanout %d, branch count %d",
297 c->fanout, znode->child_cnt);
298 dbg_err("max levels %d, znode level %d",
299 UBIFS_MAX_LEVELS, znode->level);
300 err = 1;
301 goto out_dump;
302 }
303
304 for (i = 0; i < znode->child_cnt; i++) {
305 const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
306 struct ubifs_zbranch *zbr = &znode->zbranch[i];
307
308 key_read(c, &br->key, &zbr->key);
309 zbr->lnum = le32_to_cpu(br->lnum);
310 zbr->offs = le32_to_cpu(br->offs);
311 zbr->len = le32_to_cpu(br->len);
312 zbr->znode = NULL;
313
314 /* Validate branch */
315
316 if (zbr->lnum < c->main_first ||
317 zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
318 zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
319 dbg_err("bad branch %d", i);
320 err = 2;
321 goto out_dump;
322 }
323
324 switch (key_type(c, &zbr->key)) {
325 case UBIFS_INO_KEY:
326 case UBIFS_DATA_KEY:
327 case UBIFS_DENT_KEY:
328 case UBIFS_XENT_KEY:
329 break;
330 default:
331 dbg_msg("bad key type at slot %d: %s", i,
332 DBGKEY(&zbr->key));
333 err = 3;
334 goto out_dump;
335 }
336
337 if (znode->level)
338 continue;
339
340 type = key_type(c, &zbr->key);
341 if (c->ranges[type].max_len == 0) {
342 if (zbr->len != c->ranges[type].len) {
343 dbg_err("bad target node (type %d) length (%d)",
344 type, zbr->len);
345 dbg_err("have to be %d", c->ranges[type].len);
346 err = 4;
347 goto out_dump;
348 }
349 } else if (zbr->len < c->ranges[type].min_len ||
350 zbr->len > c->ranges[type].max_len) {
351 dbg_err("bad target node (type %d) length (%d)",
352 type, zbr->len);
353 dbg_err("have to be in range of %d-%d",
354 c->ranges[type].min_len,
355 c->ranges[type].max_len);
356 err = 5;
357 goto out_dump;
358 }
359 }
360
361 /*
362 * Ensure that the next key is greater or equivalent to the
363 * previous one.
364 */
365 for (i = 0; i < znode->child_cnt - 1; i++) {
366 const union ubifs_key *key1, *key2;
367
368 key1 = &znode->zbranch[i].key;
369 key2 = &znode->zbranch[i + 1].key;
370
371 cmp = keys_cmp(c, key1, key2);
372 if (cmp > 0) {
373 dbg_err("bad key order (keys %d and %d)", i, i + 1);
374 err = 6;
375 goto out_dump;
376 } else if (cmp == 0 && !is_hash_key(c, key1)) {
377 /* These can only be keys with colliding hash */
378 dbg_err("keys %d and %d are not hashed but equivalent",
379 i, i + 1);
380 err = 7;
381 goto out_dump;
382 }
383 }
384
385 kfree(idx);
386 return 0;
387
388out_dump:
389 ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
390 dbg_dump_node(c, idx);
391 kfree(idx);
392 return -EINVAL;
393}
394
395/**
396 * ubifs_load_znode - load znode to TNC cache.
397 * @c: UBIFS file-system description object
398 * @zbr: znode branch
399 * @parent: znode's parent
400 * @iip: index in parent
401 *
402 * This function loads znode pointed to by @zbr into the TNC cache and
403 * returns pointer to it in case of success and a negative error code in case
404 * of failure.
405 */
406struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
407 struct ubifs_zbranch *zbr,
408 struct ubifs_znode *parent, int iip)
409{
410 int err;
411 struct ubifs_znode *znode;
412
413 ubifs_assert(!zbr->znode);
414 /*
415 * A slab cache is not presently used for znodes because the znode size
416 * depends on the fanout which is stored in the superblock.
417 */
418 znode = kzalloc(c->max_znode_sz, GFP_NOFS);
419 if (!znode)
420 return ERR_PTR(-ENOMEM);
421
422 err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
423 if (err)
424 goto out;
425
426 atomic_long_inc(&c->clean_zn_cnt);
427
428 /*
429 * Increment the global clean znode counter as well. It is OK that
430 * global and per-FS clean znode counters may be inconsistent for some
431 * short time (because we might be preempted at this point), the global
432 * one is only used in shrinker.
433 */
434 atomic_long_inc(&ubifs_clean_zn_cnt);
435
436 zbr->znode = znode;
437 znode->parent = parent;
438 znode->time = get_seconds();
439 znode->iip = iip;
440
441 return znode;
442
443out:
444 kfree(znode);
445 return ERR_PTR(err);
446}
447
448/**
449 * ubifs_tnc_read_node - read a leaf node from the flash media.
450 * @c: UBIFS file-system description object
451 * @zbr: key and position of the node
452 * @node: node is returned here
453 *
454 * This function reads a node defined by @zbr from the flash media. Returns
455 * zero in case of success or a negative negative error code in case of
456 * failure.
457 */
458int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
459 void *node)
460{
461 union ubifs_key key1, *key = &zbr->key;
462 int err, type = key_type(c, key);
463 struct ubifs_wbuf *wbuf;
464
465 /*
466 * 'zbr' has to point to on-flash node. The node may sit in a bud and
467 * may even be in a write buffer, so we have to take care about this.
468 */
469 wbuf = ubifs_get_wbuf(c, zbr->lnum);
470 if (wbuf)
471 err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len,
472 zbr->lnum, zbr->offs);
473 else
474 err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum,
475 zbr->offs);
476
477 if (err) {
478 dbg_tnc("key %s", DBGKEY(key));
479 return err;
480 }
481
482 /* Make sure the key of the read node is correct */
483 key_read(c, key, &key1);
484 if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) {
485 ubifs_err("bad key in node at LEB %d:%d",
486 zbr->lnum, zbr->offs);
487 dbg_tnc("looked for key %s found node's key %s",
488 DBGKEY(key), DBGKEY1(&key1));
489 dbg_dump_node(c, node);
490 return -EINVAL;
491 }
492
493 return 0;
494}
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
new file mode 100644
index 000000000000..0cc7da9bed47
--- /dev/null
+++ b/fs/ubifs/ubifs-media.h
@@ -0,0 +1,745 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file describes UBIFS on-flash format and contains definitions of all the
25 * relevant data structures and constants.
26 *
27 * All UBIFS on-flash objects are stored in the form of nodes. All nodes start
28 * with the UBIFS node magic number and have the same common header. Nodes
29 * always sit at 8-byte aligned positions on the media and node header sizes are
30 * also 8-byte aligned (except for the indexing node and the padding node).
31 */
32
33#ifndef __UBIFS_MEDIA_H__
34#define __UBIFS_MEDIA_H__
35
/*
 * Magic number all UBIFS nodes start with (the padding byte must be neither
 * its first nor its last byte).
 */
#define UBIFS_NODE_MAGIC	0x06101831

/* Version of the UBIFS on-flash format */
#define UBIFS_FORMAT_VERSION	4

/* The smallest supported logical eraseblock size in bytes */
#define UBIFS_MIN_LEB_SZ	(15 * 1024)

/* Seed value for CRC32 checksum calculations */
#define UBIFS_CRC32_INIT	0xFFFFFFFFU

/* Data shorter than this is not even tried to be compressed by UBIFS */
#define UBIFS_MIN_COMPR_LEN	128

/* Inode number of the root inode */
#define UBIFS_ROOT_INO		1

/*
 * The lowest inode number regular inodes may have (the numbers below are for
 * UBIFS-only internal inodes).
 */
#define UBIFS_FIRST_INO		64

/*
 * The longest file name and extended attribute name (has to be a multiple of
 * 8, minus 1).
 */
#define UBIFS_MAX_NLEN		255

/* How many data journal heads there may be at most */
#define UBIFS_MAX_JHEADS	1

/*
 * UBIFS data block size, given as a shift, a size in bytes and a mask. Do not
 * be confused: UBIFS is not a block oriented file-system and does not treat
 * the underlying media as consisting of blocks like in case of hard drives.
 * UBIFS block is just the maximum amount of data which one data node can have
 * or which can be attached to an inode node.
 */
#define UBIFS_BLOCK_SHIFT	12
#define UBIFS_BLOCK_SIZE	(1 << UBIFS_BLOCK_SHIFT)
#define UBIFS_BLOCK_MASK	(UBIFS_BLOCK_SIZE - 1)
79
/*
 * Byte pattern UBIFS pads with (must be neither the first nor the last byte
 * of the node magic).
 */
#define UBIFS_PADDING_BYTE	0xCE

/* The longest key any key format may have */
#define UBIFS_MAX_KEY_LEN	16

/* Length of keys of the "simple" format */
#define UBIFS_SK_LEN		8

/* The smallest possible fanout of the index tree */
#define UBIFS_MIN_FANOUT	2

/* How many levels the UBIFS indexing B-tree may have at most */
#define UBIFS_MAX_LEVELS	512

/* The largest amount of data which can be attached to an inode, in bytes */
#define UBIFS_MAX_INO_DATA	UBIFS_BLOCK_SIZE

/* Fanout shift and fanout (a power of 2) of the LEB Properties Tree */
#define UBIFS_LPT_FANOUT_SHIFT	2
#define UBIFS_LPT_FANOUT	(1 << UBIFS_LPT_FANOUT_SHIFT)

/* Sizes of the bit fields used in the LEB Properties Tree */
#define UBIFS_LPT_CRC_BYTES	2
#define UBIFS_LPT_CRC_BITS	(UBIFS_LPT_CRC_BYTES * 8)
#define UBIFS_LPT_TYPE_BITS	4

/* All keyed nodes have the key at one and the same offset */
#define UBIFS_KEY_OFFSET	offsetof(struct ubifs_ino_node, key)
109
110/*
111 * LEB Properties Tree node types.
112 *
113 * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties)
114 * UBIFS_LPT_NNODE: LPT internal node
115 * UBIFS_LPT_LTAB: LPT's own lprops table
116 * UBIFS_LPT_LSAVE: LPT's save table (big model only)
117 * UBIFS_LPT_NODE_CNT: count of LPT node types
118 * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type
119 */
120enum {
121 UBIFS_LPT_PNODE,
122 UBIFS_LPT_NNODE,
123 UBIFS_LPT_LTAB,
124 UBIFS_LPT_LSAVE,
125 UBIFS_LPT_NODE_CNT,
126 UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1,
127};
128
/*
 * Types of UBIFS inodes.
 *
 * UBIFS_ITYPE_REG: regular file
 * UBIFS_ITYPE_DIR: directory
 * UBIFS_ITYPE_LNK: soft link
 * UBIFS_ITYPE_BLK: block device node
 * UBIFS_ITYPE_CHR: character device node
 * UBIFS_ITYPE_FIFO: fifo
 * UBIFS_ITYPE_SOCK: socket
 * UBIFS_ITYPES_CNT: how many file types are supported
 */
enum {
	UBIFS_ITYPE_REG  = 0,
	UBIFS_ITYPE_DIR  = 1,
	UBIFS_ITYPE_LNK  = 2,
	UBIFS_ITYPE_BLK  = 3,
	UBIFS_ITYPE_CHR  = 4,
	UBIFS_ITYPE_FIFO = 5,
	UBIFS_ITYPE_SOCK = 6,
	UBIFS_ITYPES_CNT = 7,
};
151
/*
 * Hash functions which may be used in keys.
 *
 * UBIFS_KEY_HASH_R5: R5 hash
 * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name
 */
enum {
	UBIFS_KEY_HASH_R5   = 0,
	UBIFS_KEY_HASH_TEST = 1,
};
162
/*
 * Key formats which are supported.
 *
 * UBIFS_SIMPLE_KEY_FMT: simple key format
 */
enum {
	UBIFS_SIMPLE_KEY_FMT = 0,
};
171
/*
 * In the simple key format, the UBIFS block number and the hash value are
 * both stored in 29 bits.
 */
#define UBIFS_S_KEY_BLOCK_BITS	29
#define UBIFS_S_KEY_BLOCK_MASK	((1 << UBIFS_S_KEY_BLOCK_BITS) - 1)
#define UBIFS_S_KEY_HASH_BITS	UBIFS_S_KEY_BLOCK_BITS
#define UBIFS_S_KEY_HASH_MASK	UBIFS_S_KEY_BLOCK_MASK
180
/*
 * Types of keys.
 *
 * UBIFS_INO_KEY: key of an inode node
 * UBIFS_DATA_KEY: key of a data node
 * UBIFS_DENT_KEY: key of a directory entry node
 * UBIFS_XENT_KEY: key of an extended attribute entry
 * UBIFS_KEY_TYPES_CNT: how many key types are supported
 */
enum {
	UBIFS_INO_KEY       = 0,
	UBIFS_DATA_KEY      = 1,
	UBIFS_DENT_KEY      = 2,
	UBIFS_XENT_KEY      = 3,
	UBIFS_KEY_TYPES_CNT = 4,
};
197
/* How many LEBs are reserved for the superblock area */
#define UBIFS_SB_LEBS	1
/* How many LEBs are reserved for the master area */
#define UBIFS_MST_LEBS	2

/* The superblock area begins at this LEB */
#define UBIFS_SB_LNUM	0
/* The master area begins at this LEB (right after the superblock area) */
#define UBIFS_MST_LNUM	(UBIFS_SB_LNUM + UBIFS_SB_LEBS)
/* The log area begins at this LEB (right after the master area) */
#define UBIFS_LOG_LNUM	(UBIFS_MST_LNUM + UBIFS_MST_LEBS)
209
/*
 * The constants below are the absolute minimum sizes of the various UBIFS
 * media areas. Many of them actually depend on the flash geometry and the FS
 * configuration (number of journal heads, orphan LEBs, etc), so the smallest
 * volume size usable for UBIFS cannot be pre-defined by these constants. A
 * file-system which meets the limits below will not necessarily mount: UBIFS
 * does run-time calculations and validates the FS size.
 */

/* The log has to have at least this many logical eraseblocks */
#define UBIFS_MIN_LOG_LEBS	2
/* At least this many bud logical eraseblocks (one for each head) */
#define UBIFS_MIN_BUD_LEBS	3
/* At least this many journal logical eraseblocks */
#define UBIFS_MIN_JNL_LEBS	(UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS)
/* The LPT area has to have at least this many logical eraseblocks */
#define UBIFS_MIN_LPT_LEBS	2
/* The orphan area has to have at least this many logical eraseblocks */
#define UBIFS_MIN_ORPH_LEBS	1
/*
 * The main area has to have at least this many logical eraseblocks (buds, 2
 * for the index, 1 for GC, 1 for deletions, and at least 1 for committed
 * data).
 */
#define UBIFS_MIN_MAIN_LEBS	(UBIFS_MIN_BUD_LEBS + 5)

/* The smallest number of logical eraseblocks in total */
#define UBIFS_MIN_LEB_CNT	(UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
				 UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \
				 UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS)
240
/* Sizes of the nodes (N.B. these are guaranteed to be multiples of 8) */
#define UBIFS_CH_SZ		sizeof(struct ubifs_ch)
#define UBIFS_INO_NODE_SZ	sizeof(struct ubifs_ino_node)
#define UBIFS_DATA_NODE_SZ	sizeof(struct ubifs_data_node)
#define UBIFS_DENT_NODE_SZ	sizeof(struct ubifs_dent_node)
#define UBIFS_TRUN_NODE_SZ	sizeof(struct ubifs_trun_node)
#define UBIFS_PAD_NODE_SZ	sizeof(struct ubifs_pad_node)
#define UBIFS_SB_NODE_SZ	sizeof(struct ubifs_sb_node)
#define UBIFS_MST_NODE_SZ	sizeof(struct ubifs_mst_node)
#define UBIFS_REF_NODE_SZ	sizeof(struct ubifs_ref_node)
#define UBIFS_IDX_NODE_SZ	sizeof(struct ubifs_idx_node)
#define UBIFS_CS_NODE_SZ	sizeof(struct ubifs_cs_node)
#define UBIFS_ORPH_NODE_SZ	sizeof(struct ubifs_orph_node)
/* An extended attribute entry node is the same as a directory entry node */
#define UBIFS_XENT_NODE_SZ	UBIFS_DENT_NODE_SZ
/* This is the only size which does not have to be multiple of 8 bytes */
#define UBIFS_BRANCH_SZ		sizeof(struct ubifs_branch)

/* The largest node sizes (N.B. these are guaranteed to be multiples of 8) */
#define UBIFS_MAX_DATA_NODE_SZ	(UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE)
#define UBIFS_MAX_INO_NODE_SZ	(UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA)
#define UBIFS_MAX_DENT_NODE_SZ	(UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1)
#define UBIFS_MAX_XENT_NODE_SZ	UBIFS_MAX_DENT_NODE_SZ

/* The size of the largest UBIFS node */
#define UBIFS_MAX_NODE_SZ	UBIFS_MAX_INO_NODE_SZ
267
/*
 * Inode flags as they are stored on the flash.
 *
 * UBIFS_COMPR_FL: compression is used for this inode
 * UBIFS_SYNC_FL: I/O on this inode has to be synchronous
 * UBIFS_IMMUTABLE_FL: the inode is immutable
 * UBIFS_APPEND_FL: data may only be appended to the inode
 * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
 * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value
 *
 * Note, these are on-flash flags which correspond to ioctl flags
 * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not
 * have to be the same.
 */
enum {
	UBIFS_COMPR_FL     = 1 << 0,
	UBIFS_SYNC_FL      = 1 << 1,
	UBIFS_IMMUTABLE_FL = 1 << 2,
	UBIFS_APPEND_FL    = 1 << 3,
	UBIFS_DIRSYNC_FL   = 1 << 4,
	UBIFS_XATTR_FL     = 1 << 5,
};

/*
 * Inode flag bits used by UBIFS. Note, the mask covers the five lowest bits,
 * so %UBIFS_XATTR_FL (0x20) is not part of it.
 */
#define UBIFS_FL_MASK 0x0000001F
293
/*
 * Compression algorithms of UBIFS.
 *
 * UBIFS_COMPR_NONE: no compression
 * UBIFS_COMPR_LZO: LZO compression
 * UBIFS_COMPR_ZLIB: ZLIB compression
 * UBIFS_COMPR_TYPES_CNT: how many compression types are supported
 */
enum {
	UBIFS_COMPR_NONE      = 0,
	UBIFS_COMPR_LZO       = 1,
	UBIFS_COMPR_ZLIB      = 2,
	UBIFS_COMPR_TYPES_CNT = 3,
};
308
/*
 * Types of UBIFS nodes.
 *
 * UBIFS_INO_NODE: inode node
 * UBIFS_DATA_NODE: data node
 * UBIFS_DENT_NODE: directory entry node
 * UBIFS_XENT_NODE: extended attribute node
 * UBIFS_TRUN_NODE: truncation node
 * UBIFS_PAD_NODE: padding node
 * UBIFS_SB_NODE: superblock node
 * UBIFS_MST_NODE: master node
 * UBIFS_REF_NODE: LEB reference node
 * UBIFS_IDX_NODE: index node
 * UBIFS_CS_NODE: commit start node
 * UBIFS_ORPH_NODE: orphan node
 * UBIFS_NODE_TYPES_CNT: how many node types are supported
 *
 * Note, arrays are indexed by these numbers, so keep them low and contiguous.
 * The node type constants of inodes, direntries and so on (the first four
 * ones) have to be the same as the corresponding key type constants.
 */
enum {
	UBIFS_INO_NODE       = 0,
	UBIFS_DATA_NODE      = 1,
	UBIFS_DENT_NODE      = 2,
	UBIFS_XENT_NODE      = 3,
	UBIFS_TRUN_NODE      = 4,
	UBIFS_PAD_NODE       = 5,
	UBIFS_SB_NODE        = 6,
	UBIFS_MST_NODE       = 7,
	UBIFS_REF_NODE       = 8,
	UBIFS_IDX_NODE       = 9,
	UBIFS_CS_NODE        = 10,
	UBIFS_ORPH_NODE      = 11,
	UBIFS_NODE_TYPES_CNT = 12,
};
345
/*
 * Flags of the master node.
 *
 * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty
 * UBIFS_MST_NO_ORPHS: no orphan inodes present
 * UBIFS_MST_RCVRY: written by recovery
 */
enum {
	UBIFS_MST_DIRTY    = 1 << 0,
	UBIFS_MST_NO_ORPHS = 1 << 1,
	UBIFS_MST_RCVRY    = 1 << 2,
};
358
/*
 * Type of node group (recovery uses it to recover the whole group or none).
 *
 * UBIFS_NO_NODE_GROUP: the node does not belong to a group
 * UBIFS_IN_NODE_GROUP: the node belongs to a group
 * UBIFS_LAST_OF_NODE_GROUP: the node is the last one of a group
 */
enum {
	UBIFS_NO_NODE_GROUP      = 0,
	UBIFS_IN_NODE_GROUP      = 1,
	UBIFS_LAST_OF_NODE_GROUP = 2,
};
371
/*
 * Flags of the superblock.
 *
 * UBIFS_FLG_BIGLPT: if set, the "big" LPT model is used
 */
enum {
	UBIFS_FLG_BIGLPT = 1 << 1,
};
380
381/**
382 * struct ubifs_ch - common header node.
383 * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
384 * @crc: CRC-32 checksum of the node header
385 * @sqnum: sequence number
386 * @len: full node length
387 * @node_type: node type
388 * @group_type: node group type
389 * @padding: reserved for future, zeroes
390 *
391 * Every UBIFS node starts with this common part. If the node has a key, the
392 * key always goes next.
393 */
394struct ubifs_ch {
395 __le32 magic;
396 __le32 crc;
397 __le64 sqnum;
398 __le32 len;
399 __u8 node_type;
400 __u8 group_type;
401 __u8 padding[2];
402} __attribute__ ((packed));
403
404/**
405 * union ubifs_dev_desc - device node descriptor.
406 * @new: new type device descriptor
407 * @huge: huge type device descriptor
408 *
409 * This data structure describes major/minor numbers of a device node. In an
410 * inode is a device node then its data contains an object of this type. UBIFS
411 * uses standard Linux "new" and "huge" device node encodings.
412 */
413union ubifs_dev_desc {
414 __le32 new;
415 __le64 huge;
416} __attribute__ ((packed));
417
418/**
419 * struct ubifs_ino_node - inode node.
420 * @ch: common header
421 * @key: node key
422 * @creat_sqnum: sequence number at time of creation
423 * @size: inode size in bytes (amount of uncompressed data)
424 * @atime_sec: access time seconds
425 * @ctime_sec: creation time seconds
426 * @mtime_sec: modification time seconds
427 * @atime_nsec: access time nanoseconds
428 * @ctime_nsec: creation time nanoseconds
429 * @mtime_nsec: modification time nanoseconds
430 * @nlink: number of hard links
431 * @uid: owner ID
432 * @gid: group ID
433 * @mode: access flags
434 * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc)
435 * @data_len: inode data length
436 * @xattr_cnt: count of extended attributes this inode has
437 * @xattr_size: summarized size of all extended attributes in bytes
438 * @padding1: reserved for future, zeroes
439 * @xattr_names: sum of lengths of all extended attribute names belonging to
440 * this inode
441 * @compr_type: compression type used for this inode
442 * @padding2: reserved for future, zeroes
443 * @data: data attached to the inode
444 *
445 * Note, even though inode compression type is defined by @compr_type, some
446 * nodes of this inode may be compressed with different compressor - this
447 * happens if compression type is changed while the inode already has data
448 * nodes. But @compr_type will be use for further writes to the inode.
449 *
450 * Note, do not forget to amend 'zero_ino_node_unused()' function when changing
451 * the padding fields.
452 */
453struct ubifs_ino_node {
454 struct ubifs_ch ch;
455 __u8 key[UBIFS_MAX_KEY_LEN];
456 __le64 creat_sqnum;
457 __le64 size;
458 __le64 atime_sec;
459 __le64 ctime_sec;
460 __le64 mtime_sec;
461 __le32 atime_nsec;
462 __le32 ctime_nsec;
463 __le32 mtime_nsec;
464 __le32 nlink;
465 __le32 uid;
466 __le32 gid;
467 __le32 mode;
468 __le32 flags;
469 __le32 data_len;
470 __le32 xattr_cnt;
471 __le32 xattr_size;
472 __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
473 __le32 xattr_names;
474 __le16 compr_type;
475 __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
476 __u8 data[];
477} __attribute__ ((packed));
478
479/**
480 * struct ubifs_dent_node - directory entry node.
481 * @ch: common header
482 * @key: node key
483 * @inum: target inode number
484 * @padding1: reserved for future, zeroes
485 * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
486 * @nlen: name length
487 * @padding2: reserved for future, zeroes
488 * @name: zero-terminated name
489 *
490 * Note, do not forget to amend 'zero_dent_node_unused()' function when
491 * changing the padding fields.
492 */
493struct ubifs_dent_node {
494 struct ubifs_ch ch;
495 __u8 key[UBIFS_MAX_KEY_LEN];
496 __le64 inum;
497 __u8 padding1;
498 __u8 type;
499 __le16 nlen;
500 __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
501 __u8 name[];
502} __attribute__ ((packed));
503
504/**
505 * struct ubifs_data_node - data node.
506 * @ch: common header
507 * @key: node key
508 * @size: uncompressed data size in bytes
509 * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
510 * @padding: reserved for future, zeroes
511 * @data: data
512 *
513 * Note, do not forget to amend 'zero_data_node_unused()' function when
514 * changing the padding fields.
515 */
516struct ubifs_data_node {
517 struct ubifs_ch ch;
518 __u8 key[UBIFS_MAX_KEY_LEN];
519 __le32 size;
520 __le16 compr_type;
521 __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
522 __u8 data[];
523} __attribute__ ((packed));
524
525/**
526 * struct ubifs_trun_node - truncation node.
527 * @ch: common header
528 * @inum: truncated inode number
529 * @padding: reserved for future, zeroes
530 * @old_size: size before truncation
531 * @new_size: size after truncation
532 *
533 * This node exists only in the journal and never goes to the main area. Note,
534 * do not forget to amend 'zero_trun_node_unused()' function when changing the
535 * padding fields.
536 */
537struct ubifs_trun_node {
538 struct ubifs_ch ch;
539 __le32 inum;
540 __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
541 __le64 old_size;
542 __le64 new_size;
543} __attribute__ ((packed));
544
545/**
546 * struct ubifs_pad_node - padding node.
547 * @ch: common header
548 * @pad_len: how many bytes after this node are unused (because padded)
550 */
551struct ubifs_pad_node {
552 struct ubifs_ch ch;
553 __le32 pad_len;
554} __attribute__ ((packed));
555
556/**
557 * struct ubifs_sb_node - superblock node.
558 * @ch: common header
559 * @padding: reserved for future, zeroes
560 * @key_hash: type of hash function used in keys
561 * @key_fmt: format of the key
562 * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc)
563 * @min_io_size: minimal input/output unit size
564 * @leb_size: logical eraseblock size in bytes
565 * @leb_cnt: count of LEBs used by file-system
566 * @max_leb_cnt: maximum count of LEBs used by file-system
567 * @max_bud_bytes: maximum amount of data stored in buds
568 * @log_lebs: log size in logical eraseblocks
569 * @lpt_lebs: number of LEBs used for lprops table
570 * @orph_lebs: number of LEBs used for recording orphans
571 * @jhead_cnt: count of journal heads
572 * @fanout: tree fanout (max. number of links per indexing node)
573 * @lsave_cnt: number of LEB numbers in LPT's save table
574 * @fmt_version: UBIFS on-flash format version
575 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
576 * @padding1: reserved for future, zeroes
577 * @rp_uid: reserve pool UID
578 * @rp_gid: reserve pool GID
579 * @rp_size: size of the reserved pool in bytes
580 * @padding2: reserved for future, zeroes
581 * @time_gran: time granularity in nanoseconds
582 * @uuid: UUID generated when the file system image was created
583 */
584struct ubifs_sb_node {
585 struct ubifs_ch ch;
586 __u8 padding[2];
587 __u8 key_hash;
588 __u8 key_fmt;
589 __le32 flags;
590 __le32 min_io_size;
591 __le32 leb_size;
592 __le32 leb_cnt;
593 __le32 max_leb_cnt;
594 __le64 max_bud_bytes;
595 __le32 log_lebs;
596 __le32 lpt_lebs;
597 __le32 orph_lebs;
598 __le32 jhead_cnt;
599 __le32 fanout;
600 __le32 lsave_cnt;
601 __le32 fmt_version;
602 __le16 default_compr;
603 __u8 padding1[2];
604 __le32 rp_uid;
605 __le32 rp_gid;
606 __le64 rp_size;
607 __le32 time_gran;
608 __u8 uuid[16];
609 __u8 padding2[3972];
610} __attribute__ ((packed));
611
612/**
613 * struct ubifs_mst_node - master node.
614 * @ch: common header
615 * @highest_inum: highest inode number in the committed index
616 * @cmt_no: commit number
617 * @flags: various flags (%UBIFS_MST_DIRTY, etc)
618 * @log_lnum: start of the log
619 * @root_lnum: LEB number of the root indexing node
620 * @root_offs: offset within @root_lnum
621 * @root_len: root indexing node length
622 * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was
623 * not reserved and should be reserved on mount)
624 * @ihead_lnum: LEB number of index head
625 * @ihead_offs: offset of index head
626 * @index_size: size of index on flash
627 * @total_free: total free space in bytes
628 * @total_dirty: total dirty space in bytes
629 * @total_used: total used space in bytes (includes only data LEBs)
630 * @total_dead: total dead space in bytes (includes only data LEBs)
631 * @total_dark: total dark space in bytes (includes only data LEBs)
632 * @lpt_lnum: LEB number of LPT root nnode
633 * @lpt_offs: offset of LPT root nnode
634 * @nhead_lnum: LEB number of LPT head
635 * @nhead_offs: offset of LPT head
636 * @ltab_lnum: LEB number of LPT's own lprops table
637 * @ltab_offs: offset of LPT's own lprops table
638 * @lsave_lnum: LEB number of LPT's save table (big model only)
639 * @lsave_offs: offset of LPT's save table (big model only)
640 * @lscan_lnum: LEB number of last LPT scan
641 * @empty_lebs: number of empty logical eraseblocks
642 * @idx_lebs: number of indexing logical eraseblocks
643 * @leb_cnt: count of LEBs used by file-system
644 * @padding: reserved for future, zeroes
645 */
646struct ubifs_mst_node {
647 struct ubifs_ch ch;
648 __le64 highest_inum;
649 __le64 cmt_no;
650 __le32 flags;
651 __le32 log_lnum;
652 __le32 root_lnum;
653 __le32 root_offs;
654 __le32 root_len;
655 __le32 gc_lnum;
656 __le32 ihead_lnum;
657 __le32 ihead_offs;
658 __le64 index_size;
659 __le64 total_free;
660 __le64 total_dirty;
661 __le64 total_used;
662 __le64 total_dead;
663 __le64 total_dark;
664 __le32 lpt_lnum;
665 __le32 lpt_offs;
666 __le32 nhead_lnum;
667 __le32 nhead_offs;
668 __le32 ltab_lnum;
669 __le32 ltab_offs;
670 __le32 lsave_lnum;
671 __le32 lsave_offs;
672 __le32 lscan_lnum;
673 __le32 empty_lebs;
674 __le32 idx_lebs;
675 __le32 leb_cnt;
676 __u8 padding[344];
677} __attribute__ ((packed));
678
679/**
680 * struct ubifs_ref_node - logical eraseblock reference node.
681 * @ch: common header
682 * @lnum: the referred logical eraseblock number
683 * @offs: start offset in the referred LEB
684 * @jhead: journal head number
685 * @padding: reserved for future, zeroes
686 */
687struct ubifs_ref_node {
688 struct ubifs_ch ch;
689 __le32 lnum;
690 __le32 offs;
691 __le32 jhead;
692 __u8 padding[28];
693} __attribute__ ((packed));
694
695/**
696 * struct ubifs_branch - key/reference/length branch
697 * @lnum: LEB number of the target node
698 * @offs: offset within @lnum
699 * @len: target node length
700 * @key: key
701 */
702struct ubifs_branch {
703 __le32 lnum;
704 __le32 offs;
705 __le32 len;
706 __u8 key[];
707} __attribute__ ((packed));
708
709/**
710 * struct ubifs_idx_node - indexing node.
711 * @ch: common header
712 * @child_cnt: number of child index nodes
713 * @level: tree level
714 * @branches: LEB number / offset / length / key branches
715 */
716struct ubifs_idx_node {
717 struct ubifs_ch ch;
718 __le16 child_cnt;
719 __le16 level;
720 __u8 branches[];
721} __attribute__ ((packed));
722
723/**
724 * struct ubifs_cs_node - commit start node.
725 * @ch: common header
726 * @cmt_no: commit number
727 */
728struct ubifs_cs_node {
729 struct ubifs_ch ch;
730 __le64 cmt_no;
731} __attribute__ ((packed));
732
733/**
734 * struct ubifs_orph_node - orphan node.
735 * @ch: common header
736 * @cmt_no: commit number (also top bit is set on the last node of the commit)
737 * @inos: inode numbers of orphans
738 */
739struct ubifs_orph_node {
740 struct ubifs_ch ch;
741 __le64 cmt_no;
742 __le64 inos[];
743} __attribute__ ((packed));
744
745#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
new file mode 100644
index 000000000000..e4f89f271827
--- /dev/null
+++ b/fs/ubifs/ubifs.h
@@ -0,0 +1,1649 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/* Implementation version 0.7 */
24
25#ifndef __UBIFS_H__
26#define __UBIFS_H__
27
28#include <asm/div64.h>
29#include <linux/statfs.h>
30#include <linux/fs.h>
31#include <linux/err.h>
32#include <linux/sched.h>
33#include <linux/vmalloc.h>
34#include <linux/spinlock.h>
35#include <linux/mutex.h>
36#include <linux/rwsem.h>
37#include <linux/mtd/ubi.h>
38#include <linux/pagemap.h>
39#include <linux/backing-dev.h>
40#include "ubifs-media.h"
41
42/* Version of this UBIFS implementation */
43#define UBIFS_VERSION 1
44
45/* Normal UBIFS messages */
46#define ubifs_msg(fmt, ...) \
47 printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
48/* UBIFS error messages */
49#define ubifs_err(fmt, ...) \
50 printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
51 __func__, ##__VA_ARGS__)
52/* UBIFS warning messages */
53#define ubifs_warn(fmt, ...) \
54 printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \
55 current->pid, __func__, ##__VA_ARGS__)
56
57/* UBIFS file system VFS magic number */
58#define UBIFS_SUPER_MAGIC 0x24051905
59
60/* Number of UBIFS blocks per VFS page */
61#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
62#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
63
64/* "File system end of life" sequence number watermark */
65#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
66#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
67
68/* Minimum amount of data UBIFS writes to the flash */
69#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
70
71/*
72 * Currently we do not support inode number overlapping and re-using, so this
73 * watermark defines dangerous inode number level. This should be fixed later,
74 * although it is difficult to exceed current limit. Another option is to use
75 * 64-bit inode numbers, but this means more overhead.
76 */
77#define INUM_WARN_WATERMARK 0xFFF00000
78#define INUM_WATERMARK 0xFFFFFF00
79
80/* Largest key size supported in this implementation */
81#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
82
83/* Maximum number of entries in each LPT (LEB category) heap */
84#define LPT_HEAP_SZ 256
85
86/*
87 * Background thread name pattern. The numbers are UBI device and volume
88 * numbers.
89 */
90#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
91
92/* Default write-buffer synchronization timeout (5 secs) */
93#define DEFAULT_WBUF_TIMEOUT (5 * HZ)
94
95/* Maximum possible inode number (only 32-bit inodes are supported now) */
96#define MAX_INUM 0xFFFFFFFF
97
98/* Number of non-data journal heads */
99#define NONDATA_JHEADS_CNT 2
100
101/* Garbage collector head */
102#define GCHD 0
103/* Base journal head number */
104#define BASEHD 1
105/* First "general purpose" journal head */
106#define DATAHD 2
107
108/* 'No change' value for 'ubifs_change_lp()' */
109#define LPROPS_NC 0x80000001
110
111/*
112 * There is no notion of truncation key because truncation nodes do not exist
113 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
114 * keys for truncation nodes because the code becomes simpler. So we define
115 * %UBIFS_TRUN_KEY type.
116 */
117#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
118
119/*
120 * How much a directory entry/extended attribute entry adds to the parent/host
121 * inode.
122 */
123#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8)
124
125/* How much an extended attribute adds to the host inode */
126#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8)
127
128/*
129 * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered
130 * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are
131 * considered "young". This is used by shrinker when selecting znode to trim
132 * off.
133 */
134#define OLD_ZNODE_AGE 20
135#define YOUNG_ZNODE_AGE 5
136
137/*
138 * Some compressors, like LZO, may end up with more data than the input buffer.
139 * So UBIFS always allocates larger output buffer, to be sure the compressor
140 * will not corrupt memory in case of worst case compression.
141 */
142#define WORST_COMPR_FACTOR 2
143
144/* Maximum expected tree height for use by bottom_up_buf */
145#define BOTTOM_UP_HEIGHT 64
146
147/*
148 * Lockdep classes for UBIFS inode @ui_mutex.
149 */
150enum {
151 WB_MUTEX_1 = 0,
152 WB_MUTEX_2 = 1,
153 WB_MUTEX_3 = 2,
154};
155
156/*
157 * Znode flags (actually, bit numbers which store the flags).
158 *
159 * DIRTY_ZNODE: znode is dirty
160 * COW_ZNODE: znode is being committed and a new instance of this znode has to
161 * be created before changing this znode
162 * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is
163 * still in the commit list and the ongoing commit operation
164 * will commit it, and delete this znode after it is done
165 */
166enum {
167 DIRTY_ZNODE = 0,
168 COW_ZNODE = 1,
169 OBSOLETE_ZNODE = 2,
170};
171
172/*
173 * Commit states.
174 *
175 * COMMIT_RESTING: commit is not wanted
176 * COMMIT_BACKGROUND: background commit has been requested
177 * COMMIT_REQUIRED: commit is required
178 * COMMIT_RUNNING_BACKGROUND: background commit is running
179 * COMMIT_RUNNING_REQUIRED: commit is running and it is required
180 * COMMIT_BROKEN: commit failed
181 */
182enum {
183 COMMIT_RESTING = 0,
184 COMMIT_BACKGROUND,
185 COMMIT_REQUIRED,
186 COMMIT_RUNNING_BACKGROUND,
187 COMMIT_RUNNING_REQUIRED,
188 COMMIT_BROKEN,
189};
190
191/*
192 * 'ubifs_scan_a_node()' return values.
193 *
194 * SCANNED_GARBAGE: scanned garbage
195 * SCANNED_EMPTY_SPACE: scanned empty space
196 * SCANNED_A_NODE: scanned a valid node
197 * SCANNED_A_CORRUPT_NODE: scanned a corrupted node
198 * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length
199 *
200 * Greater than zero means: 'scanned that number of padding bytes'
201 */
202enum {
203 SCANNED_GARBAGE = 0,
204 SCANNED_EMPTY_SPACE = -1,
205 SCANNED_A_NODE = -2,
206 SCANNED_A_CORRUPT_NODE = -3,
207 SCANNED_A_BAD_PAD_NODE = -4,
208};
209
210/*
211 * LPT cnode flag bits.
212 *
213 * DIRTY_CNODE: cnode is dirty
214 * COW_CNODE: cnode is being committed and must be copied before writing
215 * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted),
216 * so it can (and must) be freed when the commit is finished
217 */
218enum {
219 DIRTY_CNODE = 0,
220 COW_CNODE = 1,
221 OBSOLETE_CNODE = 2,
222};
223
224/*
225 * Dirty flag bits (lpt_drty_flgs) for LPT special nodes.
226 *
227 * LTAB_DIRTY: ltab node is dirty
228 * LSAVE_DIRTY: lsave node is dirty
229 */
230enum {
231 LTAB_DIRTY = 1,
232 LSAVE_DIRTY = 2,
233};
234
235/*
236 * Return codes used by the garbage collector.
237 * @LEB_FREED: the logical eraseblock was freed and is ready to use
238 * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit
239 * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes
240 */
241enum {
242 LEB_FREED,
243 LEB_FREED_IDX,
244 LEB_RETAINED,
245};
246
247/**
248 * struct ubifs_old_idx - index node obsoleted since last commit start.
249 * @rb: rb-tree node
250 * @lnum: LEB number of obsoleted index node
251 * @offs: offset of obsoleted index node
252 */
253struct ubifs_old_idx {
254 struct rb_node rb;
255 int lnum;
256 int offs;
257};
258
259/* The below union makes it easier to deal with keys */
260union ubifs_key {
261 uint8_t u8[CUR_MAX_KEY_LEN];
262 uint32_t u32[CUR_MAX_KEY_LEN/4];
263 uint64_t u64[CUR_MAX_KEY_LEN/8];
264 __le32 j32[CUR_MAX_KEY_LEN/4];
265};
266
267/**
268 * struct ubifs_scan_node - UBIFS scanned node information.
269 * @list: list of scanned nodes
270 * @key: key of node scanned (if it has one)
271 * @sqnum: sequence number
272 * @type: type of node scanned
273 * @offs: offset within LEB of node scanned
274 * @len: length of node scanned
275 * @node: raw node
276 */
277struct ubifs_scan_node {
278 struct list_head list;
279 union ubifs_key key;
280 unsigned long long sqnum;
281 int type;
282 int offs;
283 int len;
284 void *node;
285};
286
287/**
288 * struct ubifs_scan_leb - UBIFS scanned LEB information.
289 * @lnum: logical eraseblock number
290 * @nodes_cnt: number of nodes scanned
291 * @nodes: list of struct ubifs_scan_node
292 * @endpt: end point (and therefore the start of empty space)
293 * @ecc: read returned -EBADMSG
294 * @buf: buffer containing entire LEB scanned
295 */
296struct ubifs_scan_leb {
297 int lnum;
298 int nodes_cnt;
299 struct list_head nodes;
300 int endpt;
301 int ecc;
302 void *buf;
303};
304
305/**
306 * struct ubifs_gced_idx_leb - garbage-collected indexing LEB.
307 * @list: list
308 * @lnum: LEB number
309 * @unmap: OK to unmap this LEB
310 *
311 * This data structure is used to temporarily store garbage-collected indexing
312 * LEBs - they are not released immediately, but only after the next commit.
313 * This is needed to guarantee recoverability.
314 */
315struct ubifs_gced_idx_leb {
316 struct list_head list;
317 int lnum;
318 int unmap;
319};
320
321/**
322 * struct ubifs_inode - UBIFS in-memory inode description.
323 * @vfs_inode: VFS inode description object
324 * @creat_sqnum: sequence number at time of creation
325 * @xattr_size: summarized size of all extended attributes in bytes
326 * @xattr_cnt: count of extended attributes this inode has
327 * @xattr_names: sum of lengths of all extended attribute names belonging to
328 * this inode
329 * @dirty: non-zero if the inode is dirty
330 * @xattr: non-zero if this is an extended attribute inode
331 * @ui_mutex: serializes inode write-back with the rest of VFS operations,
332 * serializes "clean <-> dirty" state changes, protects @dirty,
333 * @ui_size, and @xattr_size
334 * @ui_lock: protects @synced_i_size
335 * @synced_i_size: synchronized size of inode, i.e. the value of inode size
336 * currently stored on the flash; used only for regular file
337 * inodes
338 * @ui_size: inode size used by UBIFS when writing to flash
339 * @flags: inode flags (@UBIFS_COMPR_FL, etc)
340 * @compr_type: default compression type used for this inode
341 * @data_len: length of the data attached to the inode
342 * @data: inode's data
343 *
344 * @ui_mutex exists for two main reasons. First, it prevents inodes from
345 * being written back while UBIFS is changing them in the middle of a VFS
346 * operation. This way UBIFS makes sure the inode fields are consistent. For
347 * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
348 * write-back must not write any of them before we have finished.
349 *
350 * The second reason is budgeting - UBIFS has to budget all operations. If an
351 * operation is going to mark an inode dirty, it has to allocate budget for
352 * this. It cannot just mark it dirty because there is no guarantee there will
353 * be enough flash space to write the inode back later. This means UBIFS has
354 * to have full control over inode "clean <-> dirty" transitions (and pages
355 * actually). But unfortunately, VFS marks inodes dirty in many places, and it
356 * does not ask the file-system if it is allowed to do so (there is a notifier,
357 * but it is not enough), i.e., there is no mechanism to synchronize with this.
358 * So UBIFS has its own inode dirty flag and its own mutex to serialize
359 * "clean <-> dirty" transitions.
360 *
361 * The @synced_i_size field is used to make sure we never write pages which are
362 * beyond last synchronized inode size. See 'ubifs_writepage()' for more
363 * information.
364 *
365 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
366 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
367 * make sure @inode->i_size is always changed under @ui_mutex, because it
368 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
369 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
370 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
371 * could consider to rework locking and base it on "shadow" fields.
372 */
373struct ubifs_inode {
374 struct inode vfs_inode;
375 unsigned long long creat_sqnum;
376 unsigned int xattr_size;
377 unsigned int xattr_cnt;
378 unsigned int xattr_names;
379 unsigned int dirty:1;
380 unsigned int xattr:1;
381 struct mutex ui_mutex;
382 spinlock_t ui_lock;
383 loff_t synced_i_size;
384 loff_t ui_size;
385 int flags;
386 int compr_type;
387 int data_len;
388 void *data;
389};
390
391/**
392 * struct ubifs_unclean_leb - records a LEB recovered under read-only mode.
393 * @list: list
394 * @lnum: LEB number of recovered LEB
395 * @endpt: offset where recovery ended
396 *
397 * This structure records a LEB identified during recovery that needs to be
398 * cleaned but was not because UBIFS was mounted read-only. The information
399 * is used to clean the LEB when remounting to read-write mode.
400 */
401struct ubifs_unclean_leb {
402 struct list_head list;
403 int lnum;
404 int endpt;
405};
406
407/*
408 * LEB properties flags.
409 *
410 * LPROPS_UNCAT: not categorized
411 * LPROPS_DIRTY: dirty > 0, not index
412 * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index
413 * LPROPS_FREE: free > 0, not empty, not index
414 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
415 * LPROPS_EMPTY: LEB is empty, not taken
416 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
417 * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken
418 * LPROPS_CAT_MASK: mask for the LEB categories above
419 * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media)
420 * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash)
421 */
422enum {
423 LPROPS_UNCAT = 0,
424 LPROPS_DIRTY = 1,
425 LPROPS_DIRTY_IDX = 2,
426 LPROPS_FREE = 3,
427 LPROPS_HEAP_CNT = 3, /* a count, not a category; same value as LPROPS_FREE */
428 LPROPS_EMPTY = 4,
429 LPROPS_FREEABLE = 5,
430 LPROPS_FRDI_IDX = 6,
431 LPROPS_CAT_MASK = 15, /* 0x0f: the low four bits hold the category */
432 LPROPS_TAKEN = 16, /* 0x10: single-bit flag above the category bits */
433 LPROPS_INDEX = 32, /* 0x20: single-bit flag above the category bits */
434};
435
436/**
437 * struct ubifs_lprops - logical eraseblock properties.
438 * @free: amount of free space in bytes
439 * @dirty: amount of dirty space in bytes
440 * @flags: LEB properties flags (see above)
441 * @lnum: LEB number
442 * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE)
443 * @hpos: heap position in heap of same-category lprops (other categories)
444 */
445struct ubifs_lprops {
446 int free;
447 int dirty;
448 int flags;
449 int lnum;
450 union {
451 struct list_head list;
452 int hpos;
453 };
454};
455
456/**
457 * struct ubifs_lpt_lprops - LPT logical eraseblock properties.
458 * @free: amount of free space in bytes
459 * @dirty: amount of dirty space in bytes
460 * @tgc: trivial GC flag (1 => unmap after commit end)
461 * @cmt: commit flag (1 => reserved for commit)
462 */
463struct ubifs_lpt_lprops {
464 int free;
465 int dirty;
466 unsigned tgc : 1;
467 unsigned cmt : 1;
468};
469
470/**
471 * struct ubifs_lp_stats - statistics of eraseblocks in the main area.
472 * @empty_lebs: number of empty LEBs
473 * @taken_empty_lebs: number of taken LEBs
474 * @idx_lebs: number of indexing LEBs
475 * @total_free: total free space in bytes
476 * @total_dirty: total dirty space in bytes
477 * @total_used: total used space in bytes (includes only data LEBs)
478 * @total_dead: total dead space in bytes (includes only data LEBs)
479 * @total_dark: total dark space in bytes (includes only data LEBs)
480 *
481 * N.B. total_dirty and total_used are different to other total_* fields,
482 * because they account _all_ LEBs, not just data LEBs.
483 *
484 * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
485 * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
486 * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
487 * by itself (in which case 'unused_lebs' would be a better name). In the case
488 * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
489 * but unlike other empty LEBs that are 'taken', it may not be written straight
490 * away (i.e. before the next commit start or unmount), so either gc_lnum must
491 * be specially accounted for, or the current approach followed i.e. count it
492 * under 'taken_empty_lebs'.
493 */
494struct ubifs_lp_stats {
495 int empty_lebs;
496 int taken_empty_lebs;
497 int idx_lebs;
498 long long total_free;
499 long long total_dirty;
500 long long total_used;
501 long long total_dead;
502 long long total_dark;
503};
504
505struct ubifs_nnode;
506
507/**
508 * struct ubifs_cnode - LEB Properties Tree common node.
509 * @parent: parent nnode
510 * @cnext: next cnode to commit
511 * @flags: flags (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
512 * @iip: index in parent
513 * @level: level in the tree (zero for pnodes, greater than zero for nnodes)
514 * @num: node number
515 */
516struct ubifs_cnode {
517 struct ubifs_nnode *parent;
518 struct ubifs_cnode *cnext;
519 unsigned long flags;
520 int iip;
521 int level;
522 int num;
523};
524
525/**
526 * struct ubifs_pnode - LEB Properties Tree leaf node.
527 * @parent: parent nnode
528 * @cnext: next cnode to commit
529 * @flags: flags (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
530 * @iip: index in parent
531 * @level: level in the tree (always zero for pnodes)
532 * @num: node number
533 * @lprops: LEB properties array
534 */
535struct ubifs_pnode {
536 struct ubifs_nnode *parent;
537 struct ubifs_cnode *cnext;
538 unsigned long flags;
539 int iip;
540 int level;
541 int num;
542 struct ubifs_lprops lprops[UBIFS_LPT_FANOUT];
543};
544
545/**
546 * struct ubifs_nbranch - LEB Properties Tree internal node branch.
547 * @lnum: LEB number of child
548 * @offs: offset of child
549 * @nnode: nnode child
550 * @pnode: pnode child
551 * @cnode: cnode child
552 */
553struct ubifs_nbranch {
554 int lnum;
555 int offs;
556 union {
557 struct ubifs_nnode *nnode;
558 struct ubifs_pnode *pnode;
559 struct ubifs_cnode *cnode;
560 };
561};
562
563/**
564 * struct ubifs_nnode - LEB Properties Tree internal node.
565 * @parent: parent nnode
566 * @cnext: next cnode to commit
567 * @flags: flags (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
568 * @iip: index in parent
569 * @level: level in the tree (always greater than zero for nnodes)
570 * @num: node number
571 * @nbranch: branches to child nodes
572 */
573struct ubifs_nnode {
574 struct ubifs_nnode *parent;
575 struct ubifs_cnode *cnext;
576 unsigned long flags;
577 int iip;
578 int level;
579 int num;
580 struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT];
581};
582
583/**
584 * struct ubifs_lpt_heap - heap of categorized lprops.
585 * @arr: heap array
586 * @cnt: number in heap
587 * @max_cnt: maximum number allowed in heap
588 *
589 * There are %LPROPS_HEAP_CNT heaps.
590 */
591struct ubifs_lpt_heap {
592 struct ubifs_lprops **arr;
593 int cnt;
594 int max_cnt;
595};
596
597/*
598 * Return codes for LPT scan callback function.
599 *
600 * LPT_SCAN_CONTINUE: continue scanning
601 * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory
602 * LPT_SCAN_STOP: stop scanning
603 */
604enum {
605 LPT_SCAN_CONTINUE = 0,
606 LPT_SCAN_ADD = 1,
607 LPT_SCAN_STOP = 2,
608};
609
610struct ubifs_info;
611
612/* Callback used by the 'ubifs_lpt_scan_nolock()' function */
613typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
614 const struct ubifs_lprops *lprops,
615 int in_tree, void *data);
616
617/**
618 * struct ubifs_wbuf - UBIFS write-buffer.
619 * @c: UBIFS file-system description object
620 * @buf: write-buffer (of min. flash I/O unit size)
621 * @lnum: logical eraseblock number the write-buffer points to
622 * @offs: write-buffer offset in this logical eraseblock
623 * @avail: number of bytes available in the write-buffer
624 * @used: number of used bytes in the write-buffer
625 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
626 * %UBI_UNKNOWN)
627 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
628 * up by 'mutex_lock_nested()').
629 * @sync_callback: write-buffer synchronization callback
630 * @io_mutex: serializes write-buffer I/O
631 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
632 * fields
633 * @timer: write-buffer timer
634 * @timeout: timer expire interval in jiffies
635 * @need_sync: it is set if its timer expired and needs sync
636 * @next_ino: points to the next position of the following inode number
637 * @inodes: stores the inode numbers of the nodes which are in wbuf
638 *
639 * The write-buffer synchronization callback is called when the write-buffer is
640 * synchronized in order to notify how much space was wasted due to
641 * write-buffer padding and how much free space is left in the LEB.
642 *
643 * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under
644 * spin-lock or mutex because they are written under both mutex and spin-lock.
645 * @buf is appended to under mutex but overwritten under both mutex and
646 * spin-lock. Thus the data between @buf and @buf + @used can be read under
647 * spinlock.
648 */
649struct ubifs_wbuf {
650 struct ubifs_info *c;
651 void *buf;
652 int lnum;
653 int offs;
654 int avail;
655 int used;
656 int dtype;
657 int jhead;
658 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
659 struct mutex io_mutex;
660 spinlock_t lock;
661 struct timer_list timer;
662 int timeout;
663 int need_sync;
664 int next_ino;
665 ino_t *inodes;
666};
667
668/**
669 * struct ubifs_bud - bud logical eraseblock.
670 * @lnum: logical eraseblock number
671 * @start: where the (uncommitted) bud data starts
672 * @jhead: journal head number this bud belongs to
673 * @list: link in the list buds belonging to the same journal head
674 * @rb: link in the tree of all buds
675 */
676struct ubifs_bud {
677 int lnum;
678 int start;
679 int jhead;
680 struct list_head list;
681 struct rb_node rb;
682};
683
684/**
685 * struct ubifs_jhead - journal head.
686 * @wbuf: head's write-buffer
687 * @buds_list: list of bud LEBs belonging to this journal head
688 *
689 * Note, the @buds_list list is protected by the @c->buds_lock.
690 */
691struct ubifs_jhead {
692 struct ubifs_wbuf wbuf;
693 struct list_head buds_list;
694};
695
696/**
697 * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
698 * @key: key
699 * @znode: znode address in memory
700 * @lnum: LEB number of the indexing node
701 * @offs: offset of the indexing node within @lnum
702 * @len: target node length
703 */
704struct ubifs_zbranch {
705 union ubifs_key key;
706 union {
707 struct ubifs_znode *znode;
708 void *leaf;
709 };
710 int lnum;
711 int offs;
712 int len;
713};
714
715/**
716 * struct ubifs_znode - in-memory representation of an indexing node.
717 * @parent: parent znode or NULL if it is the root
718 * @cnext: next znode to commit
719 * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
720 * @time: last access time (seconds)
721 * @level: level of the entry in the TNC tree
722 * @child_cnt: count of child znodes
723 * @iip: index in parent's zbranch array
724 * @alt: lower bound of key range has altered i.e. child inserted at slot 0
725 * @lnum: LEB number of the corresponding indexing node
726 * @offs: offset of the corresponding indexing node
727 * @len: length of the corresponding indexing node
728 * @zbranch: array of znode branches (@c->fanout elements)
729 */
730struct ubifs_znode {
731 struct ubifs_znode *parent;
732 struct ubifs_znode *cnext;
733 unsigned long flags;
734 unsigned long time;
735 int level;
736 int child_cnt;
737 int iip;
738 int alt;
739#ifdef CONFIG_UBIFS_FS_DEBUG
740 int lnum, offs, len; /* compiled in only when CONFIG_UBIFS_FS_DEBUG is set */
741#endif
742 struct ubifs_zbranch zbranch[]; /* flexible array member; must stay last */
743};
745/**
746 * struct ubifs_node_range - node length range description data structure.
747 * @len: fixed node length
748 * @min_len: minimum possible node length
749 * @max_len: maximum possible node length
750 *
751 * If @max_len is %0, the node has fixed length @len.
752 */
753struct ubifs_node_range {
754 union {
755 int len;
756 int min_len;
757 };
758 int max_len;
759};
760
761/**
762 * struct ubifs_compressor - UBIFS compressor description structure.
763 * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc)
764 * @cc: cryptoapi compressor handle
765 * @comp_mutex: mutex used during compression
766 * @decomp_mutex: mutex used during decompression
767 * @name: compressor name
768 * @capi_name: cryptoapi compressor name
769 */
770struct ubifs_compressor {
771 int compr_type;
772 struct crypto_comp *cc;
773 struct mutex *comp_mutex;
774 struct mutex *decomp_mutex;
775 const char *name;
776 const char *capi_name;
777};
778
779/**
780 * struct ubifs_budget_req - budget requirements of an operation.
781 *
782 * @fast: non-zero if the budgeting should try to acquire budget quickly and
783 * should not try to call write-back
784 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
785 * have to be re-calculated
786 * @new_page: non-zero if the operation adds a new page
787 * @dirtied_page: non-zero if the operation makes a page dirty
788 * @new_dent: non-zero if the operation adds a new directory entry
789 * @mod_dent: non-zero if the operation removes or modifies an existing
790 * directory entry
791 * @new_ino: non-zero if the operation adds a new inode
792 * @new_ino_d: now much data newly created inode contains
793 * @dirtied_ino: how many inodes the operation makes dirty
794 * @dirtied_ino_d: now much data dirtied inode contains
795 * @idx_growth: how much the index will supposedly grow
796 * @data_growth: how much new data the operation will supposedly add
797 * @dd_growth: how much data that makes other data dirty the operation will
798 * supposedly add
799 *
800 * @idx_growth, @data_growth and @dd_growth are not used in budget request. The
801 * budgeting subsystem caches index and data growth values there to avoid
802 * re-calculating them when the budget is released. However, if @idx_growth is
803 * %-1, it is calculated by the release function using other fields.
804 *
805 * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d
806 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
807 * dirty by the re-name operation.
808 */
809struct ubifs_budget_req {
810 unsigned int fast:1;
811 unsigned int recalculate:1;
812 unsigned int new_page:1;
813 unsigned int dirtied_page:1;
814 unsigned int new_dent:1;
815 unsigned int mod_dent:1;
816 unsigned int new_ino:1;
817 unsigned int new_ino_d:13;
818#ifndef UBIFS_DEBUG
819 unsigned int dirtied_ino:4;
820 unsigned int dirtied_ino_d:15;
821#else
822 /* Not bit-fields to check for overflows */
823 unsigned int dirtied_ino;
824 unsigned int dirtied_ino_d;
825#endif
826 int idx_growth;
827 int data_growth;
828 int dd_growth;
829};
830
831/**
832 * struct ubifs_orphan - stores the inode number of an orphan.
833 * @rb: rb-tree node of rb-tree of orphans sorted by inode number
834 * @list: list head of list of orphans in order added
835 * @new_list: list head of list of orphans added since the last commit
836 * @cnext: next orphan to commit
837 * @dnext: next orphan to delete
838 * @inum: inode number
839 * @new: %1 => added since the last commit, otherwise %0
840 */
841struct ubifs_orphan {
842 struct rb_node rb; /* NOTE(review): presumably linked into ubifs_info's @orph_tree - confirm in orphan.c */
843 struct list_head list; /* NOTE(review): presumably linked into ubifs_info's @orph_list - confirm */
844 struct list_head new_list; /* NOTE(review): presumably linked into ubifs_info's @orph_new - confirm */
845 struct ubifs_orphan *cnext;
846 struct ubifs_orphan *dnext;
847 ino_t inum;
848 int new; /* used as a flag: %1 or %0, see above */
849};
850
851/**
852 * struct ubifs_mount_opts - UBIFS-specific mount options information.
853 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
854 */
855struct ubifs_mount_opts {
856 unsigned int unmount_mode:2; /* 2-bit field: can hold 0..3, only 0..2 are documented */
857};
858
859/**
860 * struct ubifs_info - UBIFS file-system description data structure
861 * (per-superblock).
862 * @vfs_sb: VFS @struct super_block object
863 * @bdi: backing device info object to make VFS happy and disable readahead
864 *
865 * @highest_inum: highest used inode number
866 * @vfs_gen: VFS inode generation counter
867 * @max_sqnum: current global sequence number
868 * @cmt_no: commit number (last successfully completed commit)
869 * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters
870 * @fmt_version: UBIFS on-flash format version
871 * @uuid: UUID from super block
872 *
873 * @lhead_lnum: log head logical eraseblock number
874 * @lhead_offs: log head offset
875 * @ltail_lnum: log tail logical eraseblock number (offset is always 0)
876 * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and
877 * @bud_bytes
878 * @min_log_bytes: minimum required number of bytes in the log
879 * @cmt_bud_bytes: used during commit to temporarily store the amount of bytes
880 * in committed buds
881 *
882 * @buds: tree of all buds indexed by bud LEB number
883 * @bud_bytes: how many bytes of flash is used by buds
884 * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud
885 * lists
886 * @jhead_cnt: count of journal heads
887 * @jheads: journal heads (head zero is base head)
888 * @max_bud_bytes: maximum number of bytes allowed in buds
889 * @bg_bud_bytes: number of bud bytes when background commit is initiated
890 * @old_buds: buds to be released after commit ends
891 * @max_bud_cnt: maximum number of buds
892 *
893 * @commit_sem: synchronizes committer with other processes
894 * @cmt_state: commit state
895 * @cs_lock: commit state lock
896 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
897 * @fast_unmount: do not run journal commit before un-mounting
898 * @big_lpt: flag that LPT is too big to write whole during commit
899 * @check_lpt_free: flag that indicates LPT GC may be needed
900 * @nospace: non-zero if the file-system does not have flash space (used as
901 * optimization)
902 * @nospace_rp: the same as @nospace, but additionally means that even reserved
903 * pool is full
904 *
905 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
906 * @calc_idx_sz
907 * @zroot: zbranch which points to the root index node and znode
908 * @cnext: next znode to commit
909 * @enext: next znode to commit to empty space
910 * @gap_lebs: array of LEBs used by the in-gaps commit method
911 * @cbuf: commit buffer
912 * @ileb_buf: buffer for commit in-the-gaps method
913 * @ileb_len: length of data in ileb_buf
914 * @ihead_lnum: LEB number of index head
915 * @ihead_offs: offset of index head
916 * @ilebs: pre-allocated index LEBs
917 * @ileb_cnt: number of pre-allocated index LEBs
918 * @ileb_nxt: next pre-allocated index LEB
919 * @old_idx: tree of index nodes obsoleted since the last commit start
920 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
921 * @new_ihead_lnum: used by debugging to check ihead_lnum
922 * @new_ihead_offs: used by debugging to check ihead_offs
923 *
924 * @mst_node: master node
925 * @mst_offs: offset of valid master node
926 * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
927 *
928 * @log_lebs: number of logical eraseblocks in the log
929 * @log_bytes: log size in bytes
930 * @log_last: last LEB of the log
931 * @lpt_lebs: number of LEBs used for lprops table
932 * @lpt_first: first LEB of the lprops table area
933 * @lpt_last: last LEB of the lprops table area
934 * @orph_lebs: number of LEBs used for the orphan area
935 * @orph_first: first LEB of the orphan area
936 * @orph_last: last LEB of the orphan area
937 * @main_lebs: count of LEBs in the main area
938 * @main_first: first LEB of the main area
939 * @main_bytes: main area size in bytes
940 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
941 *
942 * @key_hash_type: type of the key hash
943 * @key_hash: direntry key hash function
944 * @key_fmt: key format
945 * @key_len: key length
946 * @fanout: fanout of the index tree (number of links per indexing node)
947 *
948 * @min_io_size: minimal input/output unit size
949 * @min_io_shift: number of bits in @min_io_size minus one
950 * @leb_size: logical eraseblock size in bytes
951 * @half_leb_size: half LEB size
952 * @leb_cnt: count of logical eraseblocks
953 * @max_leb_cnt: maximum count of logical eraseblocks
954 * @old_leb_cnt: count of logical eraseblocks before re-size
955 * @ro_media: the underlying UBI volume is read-only
956 *
957 * @dirty_pg_cnt: number of dirty pages (not used)
958 * @dirty_zn_cnt: number of dirty znodes
959 * @clean_zn_cnt: number of clean znodes
960 *
961 * @budg_idx_growth: amount of bytes budgeted for index growth
962 * @budg_data_growth: amount of bytes budgeted for cached data
963 * @budg_dd_growth: amount of bytes budgeted for cached data that will make
964 * other data dirty
965 * @budg_uncommitted_idx: amount of bytes that were budgeted for growth of the
966 * index, but which still have to be taken into account because
967 * the index has not been committed so far
968 * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
969 * @budg_uncommitted_idx, @min_idx_lebs, @old_idx_sz, and @lst
970 * @min_idx_lebs: minimum number of LEBs required for the index
971 * @old_idx_sz: size of index on flash
972 * @calc_idx_sz: temporary variable which is used to calculate new index size
973 * (contains accurate new index size at end of TNC commit start)
974 * @lst: lprops statistics
975 *
976 * @page_budget: budget for a page
977 * @inode_budget: budget for an inode
978 * @dent_budget: budget for a directory entry
979 *
980 * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
981 * I/O unit
982 * @mst_node_alsz: master node aligned size
983 * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
984 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
985 * @max_inode_sz: maximum possible inode size in bytes
986 * @max_znode_sz: size of znode in bytes
987 * @dead_wm: LEB dead space watermark
988 * @dark_wm: LEB dark space watermark
989 * @block_cnt: count of 4KiB blocks on the FS
990 *
991 * @ranges: UBIFS node length ranges
992 * @ubi: UBI volume descriptor
993 * @di: UBI device information
994 * @vi: UBI volume information
995 *
996 * @orph_tree: rb-tree of orphan inode numbers
997 * @orph_list: list of orphan inode numbers in order added
998 * @orph_new: list of orphan inode numbers added since last commit
999 * @orph_cnext: next orphan to commit
1000 * @orph_dnext: next orphan to delete
1001 * @orphan_lock: lock for orph_tree and orph_new
1002 * @orph_buf: buffer for orphan nodes
1003 * @new_orphans: number of orphans since last commit
1004 * @cmt_orphans: number of orphans being committed
1005 * @tot_orphans: number of orphans in the rb_tree
1006 * @max_orphans: maximum number of orphans allowed
1007 * @ohead_lnum: orphan head LEB number
1008 * @ohead_offs: orphan head offset
1009 * @no_orphs: non-zero if there are no orphans
1010 *
1011 * @bgt: UBIFS background thread
1012 * @bgt_name: background thread name
1013 * @need_bgt: if background thread should run
1014 * @need_wbuf_sync: if write-buffers have to be synchronized
1015 *
1016 * @gc_lnum: LEB number used for garbage collection
1017 * @sbuf: a buffer of LEB size used by GC and replay for scanning
1018 * @idx_gc: list of index LEBs that have been garbage collected
1019 * @idx_gc_cnt: number of elements on the idx_gc list
1020 *
1021 * @infos_list: links all 'ubifs_info' objects
1022 * @umount_mutex: serializes shrinker and un-mount
1023 * @shrinker_run_no: shrinker run number
1024 *
1025 * @space_bits: number of bits needed to record free or dirty space
1026 * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT
1027 * @lpt_offs_bits: number of bits needed to record an offset in the LPT
1028 * @lpt_spc_bits: number of bits needed to record space in the LPT
1029 * @pcnt_bits: number of bits needed to record pnode or nnode number
1030 * @lnum_bits: number of bits needed to record LEB number
1031 * @nnode_sz: size of on-flash nnode
1032 * @pnode_sz: size of on-flash pnode
1033 * @ltab_sz: size of on-flash LPT lprops table
1034 * @lsave_sz: size of on-flash LPT save table
1035 * @pnode_cnt: number of pnodes
1036 * @nnode_cnt: number of nnodes
1037 * @lpt_hght: height of the LPT
1038 * @pnodes_have: number of pnodes in memory
1039 *
1040 * @lp_mutex: protects lprops table and all the other lprops-related fields
1041 * @lpt_lnum: LEB number of the root nnode of the LPT
1042 * @lpt_offs: offset of the root nnode of the LPT
1043 * @nhead_lnum: LEB number of LPT head
1044 * @nhead_offs: offset of LPT head
1045 * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
1046 * @dirty_nn_cnt: number of dirty nnodes
1047 * @dirty_pn_cnt: number of dirty pnodes
1048 * @lpt_sz: LPT size
1049 * @lpt_nod_buf: buffer for an on-flash nnode or pnode
1050 * @lpt_buf: buffer of LEB size used by LPT
1051 * @nroot: address in memory of the root nnode of the LPT
1052 * @lpt_cnext: next LPT node to commit
1053 * @lpt_heap: array of heaps of categorized lprops
1054 * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at
1055 * previous commit start
1056 * @uncat_list: list of un-categorized LEBs
1057 * @empty_list: list of empty LEBs
1058 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
1059 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
1060 * @freeable_cnt: number of freeable LEBs in @freeable_list
1061 *
1062 * @ltab_lnum: LEB number of LPT's own lprops table
1063 * @ltab_offs: offset of LPT's own lprops table
1064 * @ltab: LPT's own lprops table
1065 * @ltab_cmt: LPT's own lprops table (commit copy)
1066 * @lsave_cnt: number of LEB numbers in LPT's save table
1067 * @lsave_lnum: LEB number of LPT's save table
1068 * @lsave_offs: offset of LPT's save table
1069 * @lsave: LPT's save table
1070 * @lscan_lnum: LEB number of last LPT scan
1071 *
1072 * @rp_size: size of the reserved pool in bytes
1073 * @report_rp_size: size of the reserved pool reported to user-space
1074 * @rp_uid: reserved pool user ID
1075 * @rp_gid: reserved pool group ID
1076 *
1077 * @empty: if the UBI device is empty
1078 * @replay_tree: temporary tree used during journal replay
1079 * @replay_list: temporary list used during journal replay
1080 * @replay_buds: list of buds to replay
1081 * @cs_sqnum: sequence number of first node in the log (commit start node)
1082 * @replay_sqnum: sequence number of node currently being replayed
1083 * @need_recovery: file-system needs recovery
1084 * @replaying: set to %1 during journal replay
1085 * @unclean_leb_list: LEBs to recover when mounting ro to rw
1086 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
1087 * @size_tree: inode size information for recovery
1088 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
1089 * @mount_opts: UBIFS-specific mount options
1090 *
1091 * @dbg_buf: a buffer of LEB size used for debugging purposes
1092 * @old_zroot: old index root - used by 'dbg_check_old_index()'
1093 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
1094 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
1095 * @failure_mode: failure mode for recovery testing
1096 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
1097 * @fail_timeout: time in jiffies when delay of failure mode expires
1098 * @fail_cnt: current number of calls to failure mode I/O functions
1099 * @fail_cnt_max: number of calls by which to delay failure mode
1100 */
1101struct ubifs_info { /* members are described in the kernel-doc comment above */
1102 struct super_block *vfs_sb;
1103 struct backing_dev_info bdi;
1104
1105 ino_t highest_inum;
1106 unsigned int vfs_gen;
1107 unsigned long long max_sqnum;
1108 unsigned long long cmt_no;
1109 spinlock_t cnt_lock;
1110 int fmt_version;
1111 unsigned char uuid[16];
1112
1113 int lhead_lnum;
1114 int lhead_offs;
1115 int ltail_lnum;
1116 struct mutex log_mutex;
1117 int min_log_bytes;
1118 long long cmt_bud_bytes;
1119
1120 struct rb_root buds;
1121 long long bud_bytes;
1122 spinlock_t buds_lock;
1123 int jhead_cnt;
1124 struct ubifs_jhead *jheads;
1125 long long max_bud_bytes;
1126 long long bg_bud_bytes;
1127 struct list_head old_buds;
1128 int max_bud_cnt;
1129
1130 struct rw_semaphore commit_sem;
1131 int cmt_state;
1132 spinlock_t cs_lock;
1133 wait_queue_head_t cmt_wq;
1134 unsigned int fast_unmount:1; /* this and the four members below are 1-bit flags */
1135 unsigned int big_lpt:1;
1136 unsigned int check_lpt_free:1;
1137 unsigned int nospace:1;
1138 unsigned int nospace_rp:1;
1139
1140 struct mutex tnc_mutex;
1141 struct ubifs_zbranch zroot;
1142 struct ubifs_znode *cnext;
1143 struct ubifs_znode *enext;
1144 int *gap_lebs;
1145 void *cbuf;
1146 void *ileb_buf;
1147 int ileb_len;
1148 int ihead_lnum;
1149 int ihead_offs;
1150 int *ilebs;
1151 int ileb_cnt;
1152 int ileb_nxt;
1153 struct rb_root old_idx;
1154 int *bottom_up_buf;
1155#ifdef CONFIG_UBIFS_FS_DEBUG /* the two members below exist in debug builds only */
1156 int new_ihead_lnum;
1157 int new_ihead_offs;
1158#endif /* CONFIG_UBIFS_FS_DEBUG */
1159
1160 struct ubifs_mst_node *mst_node;
1161 int mst_offs;
1162 struct mutex mst_mutex;
1163
1164 int log_lebs;
1165 long long log_bytes;
1166 int log_last;
1167 int lpt_lebs;
1168 int lpt_first;
1169 int lpt_last;
1170 int orph_lebs;
1171 int orph_first;
1172 int orph_last;
1173 int main_lebs;
1174 int main_first;
1175 long long main_bytes;
1176 int default_compr;
1177
1178 uint8_t key_hash_type;
1179 uint32_t (*key_hash)(const char *str, int len);
1180 int key_fmt;
1181 int key_len;
1182 int fanout;
1183
1184 int min_io_size;
1185 int min_io_shift;
1186 int leb_size;
1187 int half_leb_size;
1188 int leb_cnt;
1189 int max_leb_cnt;
1190 int old_leb_cnt;
1191 int ro_media;
1192
1193 atomic_long_t dirty_pg_cnt;
1194 atomic_long_t dirty_zn_cnt;
1195 atomic_long_t clean_zn_cnt;
1196
1197 long long budg_idx_growth;
1198 long long budg_data_growth;
1199 long long budg_dd_growth;
1200 long long budg_uncommitted_idx;
1201 spinlock_t space_lock;
1202 int min_idx_lebs;
1203 unsigned long long old_idx_sz;
1204 unsigned long long calc_idx_sz;
1205 struct ubifs_lp_stats lst;
1206
1207 int page_budget;
1208 int inode_budget;
1209 int dent_budget;
1210
1211 int ref_node_alsz;
1212 int mst_node_alsz;
1213 int min_idx_node_sz;
1214 int max_idx_node_sz;
1215 long long max_inode_sz;
1216 int max_znode_sz;
1217 int dead_wm;
1218 int dark_wm;
1219 int block_cnt;
1220
1221 struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT];
1222 struct ubi_volume_desc *ubi;
1223 struct ubi_device_info di;
1224 struct ubi_volume_info vi;
1225
1226 struct rb_root orph_tree;
1227 struct list_head orph_list;
1228 struct list_head orph_new;
1229 struct ubifs_orphan *orph_cnext;
1230 struct ubifs_orphan *orph_dnext;
1231 spinlock_t orphan_lock;
1232 void *orph_buf;
1233 int new_orphans;
1234 int cmt_orphans;
1235 int tot_orphans;
1236 int max_orphans;
1237 int ohead_lnum;
1238 int ohead_offs;
1239 int no_orphs;
1240
1241 struct task_struct *bgt;
1242 char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; /* size of the name pattern plus 9 extra bytes */
1243 int need_bgt;
1244 int need_wbuf_sync;
1245
1246 int gc_lnum;
1247 void *sbuf;
1248 struct list_head idx_gc;
1249 int idx_gc_cnt;
1250
1251 struct list_head infos_list;
1252 struct mutex umount_mutex;
1253 unsigned int shrinker_run_no;
1254
1255 int space_bits;
1256 int lpt_lnum_bits;
1257 int lpt_offs_bits;
1258 int lpt_spc_bits;
1259 int pcnt_bits;
1260 int lnum_bits;
1261 int nnode_sz;
1262 int pnode_sz;
1263 int ltab_sz;
1264 int lsave_sz;
1265 int pnode_cnt;
1266 int nnode_cnt;
1267 int lpt_hght;
1268 int pnodes_have;
1269
1270 struct mutex lp_mutex;
1271 int lpt_lnum;
1272 int lpt_offs;
1273 int nhead_lnum;
1274 int nhead_offs;
1275 int lpt_drty_flgs;
1276 int dirty_nn_cnt;
1277 int dirty_pn_cnt;
1278 long long lpt_sz;
1279 void *lpt_nod_buf;
1280 void *lpt_buf;
1281 struct ubifs_nnode *nroot;
1282 struct ubifs_cnode *lpt_cnext;
1283 struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT];
1284 struct ubifs_lpt_heap dirty_idx;
1285 struct list_head uncat_list;
1286 struct list_head empty_list;
1287 struct list_head freeable_list;
1288 struct list_head frdi_idx_list;
1289 int freeable_cnt;
1290
1291 int ltab_lnum;
1292 int ltab_offs;
1293 struct ubifs_lpt_lprops *ltab;
1294 struct ubifs_lpt_lprops *ltab_cmt;
1295 int lsave_cnt;
1296 int lsave_lnum;
1297 int lsave_offs;
1298 int *lsave;
1299 int lscan_lnum;
1300
1301 long long rp_size;
1302 long long report_rp_size;
1303 uid_t rp_uid;
1304 gid_t rp_gid;
1305
1306 /* The fields below are used only during mounting and re-mounting */
1307 int empty;
1308 struct rb_root replay_tree;
1309 struct list_head replay_list;
1310 struct list_head replay_buds;
1311 unsigned long long cs_sqnum;
1312 unsigned long long replay_sqnum;
1313 int need_recovery;
1314 int replaying;
1315 struct list_head unclean_leb_list;
1316 struct ubifs_mst_node *rcvrd_mst_node;
1317 struct rb_root size_tree;
1318 int remounting_rw;
1319 struct ubifs_mount_opts mount_opts;
1320
1321#ifdef CONFIG_UBIFS_FS_DEBUG /* debugging and failure-mode members, debug builds only */
1322 void *dbg_buf;
1323 struct ubifs_zbranch old_zroot;
1324 int old_zroot_level;
1325 unsigned long long old_zroot_sqnum;
1326 int failure_mode;
1327 int fail_delay;
1328 unsigned long fail_timeout;
1329 unsigned int fail_cnt;
1330 unsigned int fail_cnt_max;
1331#endif /* CONFIG_UBIFS_FS_DEBUG */
1332};
1333
1334extern struct list_head ubifs_infos; /* NOTE(review): presumably the list each ubifs_info joins via @infos_list - confirm */
1335extern spinlock_t ubifs_infos_lock; /* NOTE(review): presumably protects ubifs_infos - confirm */
1336extern atomic_long_t ubifs_clean_zn_cnt; /* global; struct ubifs_info also has its own @clean_zn_cnt */
1337extern struct kmem_cache *ubifs_inode_slab;
1338extern struct super_operations ubifs_super_operations;
1339extern struct address_space_operations ubifs_file_address_operations;
1340extern struct file_operations ubifs_file_operations;
1341extern struct inode_operations ubifs_file_inode_operations;
1342extern struct file_operations ubifs_dir_operations;
1343extern struct inode_operations ubifs_dir_inode_operations;
1344extern struct inode_operations ubifs_symlink_inode_operations;
1345extern struct backing_dev_info ubifs_backing_dev_info;
1346extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* UBIFS_COMPR_TYPES_CNT pointer slots */
1347
1348/* io.c - NOTE(review): "_nolock" presumably means the caller already holds the write-buffer lock; confirm in io.c */
1349int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
1350int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
1351 int dtype);
1352int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
1353int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
1354 int lnum, int offs);
1355int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
1356 int lnum, int offs);
1357int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
1358 int offs, int dtype);
1359int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
1360 int offs, int quiet);
1361void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
1362void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
1363int ubifs_io_init(struct ubifs_info *c);
1364void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
1365int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf);
1366int ubifs_bg_wbufs_sync(struct ubifs_info *c);
1367void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum);
1368int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
1369
1370/* scan.c */
1371struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
1372 int offs, void *sbuf);
1373void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); /* NOTE(review): presumably frees what ubifs_scan() returned - confirm */
1374int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
1375 int offs, int quiet);
1376struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
1377 int offs, void *sbuf);
1378void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
1379 int lnum, int offs);
1380int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
1381 void *buf, int offs);
1382void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
1383 void *buf);
1384
1385/* log.c */
1386void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud);
1387void ubifs_create_buds_lists(struct ubifs_info *c);
1388int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs);
1389struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum);
1390struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum);
1391int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum);
1392int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum);
1393int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum);
1394int ubifs_consolidate_log(struct ubifs_info *c);
1395
1396/* journal.c */
1397int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
1398 const struct qstr *nm, const struct inode *inode,
1399 int deletion, int xent);
1400int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
1401 const union ubifs_key *key, const void *buf, int len);
1402int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
1403 int last_reference);
1404int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
1405 const struct dentry *old_dentry,
1406 const struct inode *new_dir,
1407 const struct dentry *new_dentry, int sync);
1408int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1409 loff_t old_size, loff_t new_size);
1410int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
1411 const struct inode *inode, const struct qstr *nm);
1412int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
1413 const struct inode *inode2);
1414
1415/* budget.c */
1416int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req);
1417void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); /* @req carries the growth values cached at budgeting time, see struct ubifs_budget_req */
1418void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
1419 struct ubifs_inode *ui);
1420int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode,
1421 struct ubifs_budget_req *req);
1422void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1423 struct ubifs_budget_req *req);
1424void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1425 struct ubifs_budget_req *req);
1426long long ubifs_budg_get_free_space(struct ubifs_info *c);
1427int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1428void ubifs_convert_page_budget(struct ubifs_info *c);
1429long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1430
1431/* find.c */
1432int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
1433 int squeeze);
1434int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1435int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
1436 int min_space, int pick_free);
1437int ubifs_find_dirty_idx_leb(struct ubifs_info *c);
1438int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
1439
1440/* tnc.c */
1441int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1442 struct ubifs_znode **zn, int *n);
1443int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1444 void *node);
1445int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1446 void *node, const struct qstr *nm);
1447int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1448 void *node, int *lnum, int *offs);
1449int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
1450 int offs, int len);
1451int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
1452 int old_lnum, int old_offs, int lnum, int offs, int len);
1453int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
1454 int lnum, int offs, int len, const struct qstr *nm);
1455int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
1456int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
1457 const struct qstr *nm);
1458int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
1459 union ubifs_key *to_key);
1460int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
1461struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
1462 union ubifs_key *key,
1463 const struct qstr *nm);
1464void ubifs_tnc_close(struct ubifs_info *c);
1465int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
1466 int lnum, int offs, int is_idx);
1467int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
1468 int lnum, int offs);
1469/* Shared by tnc.c for tnc_commit.c. NOTE(review): these three non-static functions lack the "ubifs_" prefix used by the other prototypes visible here */
1470void destroy_old_idx(struct ubifs_info *c);
1471int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
1472 int lnum, int offs);
1473int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
1474
1475/* tnc_misc.c */
1476struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
1477 struct ubifs_znode *znode);
1478int ubifs_search_zbranch(const struct ubifs_info *c,
1479 const struct ubifs_znode *znode,
1480 const union ubifs_key *key, int *n);
1481struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode);
1482struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode);
1483long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); /* NOTE(review): meaning of the long return value is not visible here - check tnc_misc.c */
1484struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
1485 struct ubifs_zbranch *zbr,
1486 struct ubifs_znode *parent, int iip);
1487int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
1488 void *node);
1489
1490/* tnc_commit.c */
1491int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
1492int ubifs_tnc_end_commit(struct ubifs_info *c);
1493
1494/* shrinker.c */
1495int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
1496
1497/* commit.c */
1498int ubifs_bg_thread(void *info); /* NOTE(review): @info is presumably the struct ubifs_info the thread serves (cf. @bgt) - confirm */
1499void ubifs_commit_required(struct ubifs_info *c);
1500void ubifs_request_bg_commit(struct ubifs_info *c);
1501int ubifs_run_commit(struct ubifs_info *c);
1502void ubifs_recovery_commit(struct ubifs_info *c);
1503int ubifs_gc_should_commit(struct ubifs_info *c);
1504void ubifs_wait_for_commit(struct ubifs_info *c);
1505
1506/* master.c */
1507int ubifs_read_master(struct ubifs_info *c);
1508int ubifs_write_master(struct ubifs_info *c);
1509
1510/* sb.c */
1511int ubifs_read_superblock(struct ubifs_info *c);
1512struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); /* NOTE(review): ownership of the returned node is not visible here - check sb.c */
1513int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
1514
1515/* replay.c */
1516int ubifs_validate_entry(struct ubifs_info *c,
1517 const struct ubifs_dent_node *dent);
1518int ubifs_replay_journal(struct ubifs_info *c);
1519
1520/* gc.c */
1521int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
1522int ubifs_gc_start_commit(struct ubifs_info *c);
1523int ubifs_gc_end_commit(struct ubifs_info *c);
1524void ubifs_destroy_idx_gc(struct ubifs_info *c);
1525int ubifs_get_idx_gc_leb(struct ubifs_info *c);
1526int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
1527
1528/* orphan.c */
1529int ubifs_add_orphan(struct ubifs_info *c, ino_t inum);
1530void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
1531int ubifs_orphan_start_commit(struct ubifs_info *c);
1532int ubifs_orphan_end_commit(struct ubifs_info *c);
1533int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
1534
1535/* lpt.c */
1536int ubifs_calc_lpt_geom(struct ubifs_info *c);
1537int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
1538 int *lpt_lebs, int *big_lpt);
1539int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
1540struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
1541struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
1542int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
1543 ubifs_lpt_scan_callback scan_cb, void *data);
1544
1545/* Shared by lpt.c for lpt_commit.c */
1546void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave);
1547void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
1548 struct ubifs_lpt_lprops *ltab);
1549void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
1550 struct ubifs_pnode *pnode);
1551void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
1552 struct ubifs_nnode *nnode);
1553struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
1554 struct ubifs_nnode *parent, int iip);
1555struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
1556 struct ubifs_nnode *parent, int iip);
1557int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
1558void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
1559void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
1560uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); /* @addr and @pos are passed by pointer; NOTE(review): presumably both are advanced - confirm in lpt.c */
1561struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
1562
1563/* lpt_commit.c */
1564int ubifs_lpt_start_commit(struct ubifs_info *c);
1565int ubifs_lpt_end_commit(struct ubifs_info *c);
1566int ubifs_lpt_post_commit(struct ubifs_info *c);
1567void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
1568
1569/* lprops.c */
1570void ubifs_get_lprops(struct ubifs_info *c);
1571const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
1572 const struct ubifs_lprops *lp,
1573 int free, int dirty, int flags,
1574 int idx_gc_cnt);
1575void ubifs_release_lprops(struct ubifs_info *c);
1576void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
1577void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
1578 int cat);
1579void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
1580 struct ubifs_lprops *new_lprops);
1581void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops);
1582int ubifs_categorize_lprops(const struct ubifs_info *c,
1583 const struct ubifs_lprops *lprops);
1584int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
1585 int flags_set, int flags_clean, int idx_gc_cnt);
1586int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
1587 int flags_set, int flags_clean); /* same parameter list as ubifs_change_one_lp() minus @idx_gc_cnt */
1588int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp);
1589const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
1590const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
1591const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
1592const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1593
1594/* file.c */
1595int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
1596int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1597
1598/* dir.c */
1599struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
1600 int mode);
1601int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1602 struct kstat *stat);
1603
1604/* xattr.c */
1605int ubifs_setxattr(struct dentry *dentry, const char *name,
1606 const void *value, size_t size, int flags);
1607ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
1608 size_t size);
1609ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
1610int ubifs_removexattr(struct dentry *dentry, const char *name);
1611
1612/* super.c */
1613struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
1614
1615/* recovery.c */
1616int ubifs_recover_master_node(struct ubifs_info *c);
1617int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
1618struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
1619 int offs, void *sbuf, int grouped);
1620struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
1621 int offs, void *sbuf);
1622int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
1623int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf);
1624int ubifs_rcvry_gc_commit(struct ubifs_info *c);
1625int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
1626 int deletion, loff_t new_size);
1627int ubifs_recover_size(struct ubifs_info *c);
1628void ubifs_destroy_size_tree(struct ubifs_info *c);
1629
1630/* ioctl.c */
1631long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1632void ubifs_set_inode_flags(struct inode *inode);
1633#ifdef CONFIG_COMPAT /* the compat entry point is declared only when CONFIG_COMPAT is set */
1634long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1635#endif /* CONFIG_COMPAT */
1636
1637/* compressor.c */
1638int __init ubifs_compressors_init(void);
1639void __exit ubifs_compressors_exit(void);
1640void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
1641 int *compr_type); /* @out_len and @compr_type are pointers; returns void, so no status is reported via the return value */
1642int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
1643 int compr_type);
1644
1645#include "debug.h" /* NOTE(review): included after all declarations; presumably these headers rely on the types above - confirm */
1646#include "misc.h"
1647#include "key.h"
1648
1649#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
new file mode 100644
index 000000000000..1388a078e1a9
--- /dev/null
+++ b/fs/ubifs/xattr.c
@@ -0,0 +1,581 @@
1/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS extended attributes support.
25 *
26 * Extended attributes are implemented as regular inodes with attached data,
27 * which limits extended attribute size to UBIFS block size (4KiB). Names of
28 * extended attributes are described by extended attribute entries (xentries),
29 * which are almost identical to directory entries, but have different key type.
30 *
31 * In other words, the situation with extended attributes is very similar to
32 * directories. Indeed, any inode (but of course not xattr inodes) may have a
33 * number of associated xentries, just like directory inodes have associated
34 * directory entries. Extended attribute entries store the name of the extended
35 * attribute, the host inode number, and the extended attribute inode number.
36 * Similarly, direntries store the name, the parent and the target inode
37 * numbers. Thus, most of the common UBIFS mechanisms may be re-used for
38 * extended attributes.
39 *
40 * The number of extended attributes is not limited, but there is a Linux
41 * limitation on the maximum possible size of the list of all extended
42 * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure
43 * the sum of all extended attribute names of the inode does not exceed that
44 * limit.
45 *
46 * Extended attributes are synchronous, which means they are written to the
47 * flash media synchronously and there is no write-back for extended attribute
48 * inodes. The extended attribute values are not stored in compressed form on
49 * the media.
50 *
51 * Since extended attributes are represented by regular inodes, they are cached
52 * in the VFS inode cache. The xentries are cached in the LNC cache (see
53 * tnc.c).
54 *
55 * ACL support is not implemented.
56 */
57
58#include <linux/xattr.h>
59#include <linux/posix_acl_xattr.h>
60#include "ubifs.h"
61
62/*
63 * Limit the number of extended attributes per inode so that the total size
64 * (xattr_size) is guaranteed to fit in an 'unsigned int'.
65 */
66#define MAX_XATTRS_PER_INODE 65535
67
68/*
69 * Extended attribute type constants.
70 *
71 * USER_XATTR: user extended attribute ("user.*")
72 * TRUSTED_XATTR: trusted extended attribute ("trusted.*")
73 * SECURITY_XATTR: security extended attribute ("security.*")
74 */
75enum {
76 USER_XATTR,
77 TRUSTED_XATTR,
78 SECURITY_XATTR,
79};
80
81static struct inode_operations none_inode_operations;
82static struct address_space_operations none_address_operations;
83static struct file_operations none_file_operations;
84
85/**
86 * create_xattr - create an extended attribute.
87 * @c: UBIFS file-system description object
88 * @host: host inode
89 * @nm: extended attribute name
90 * @value: extended attribute value
91 * @size: size of extended attribute value
92 *
93 * This is a helper function which creates an extended attribute of name @nm
94 * and value @value for inode @host. The host inode is also updated on flash
95 * because the ctime and extended attribute accounting data changes. This
96 * function returns zero in case of success and a negative error code in case
97 * of failure.
98 */
99static int create_xattr(struct ubifs_info *c, struct inode *host,
100 const struct qstr *nm, const void *value, int size)
101{
102 int err;
103 struct inode *inode;
104 struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
105 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
106 .new_ino_d = size, .dirtied_ino = 1,
107 .dirtied_ino_d = host_ui->data_len};
108
109 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
110 return -ENOSPC;
111 /*
112 * Linux limits the maximum size of the extended attribute names list
113 * to %XATTR_LIST_MAX. This means we should not allow creating more*
114 * extended attributes if the name list becomes larger. This limitation
115 * is artificial for UBIFS, though.
116 */
117 if (host_ui->xattr_names + host_ui->xattr_cnt +
118 nm->len + 1 > XATTR_LIST_MAX)
119 return -ENOSPC;
120
121 err = ubifs_budget_space(c, &req);
122 if (err)
123 return err;
124
125 inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
126 if (IS_ERR(inode)) {
127 err = PTR_ERR(inode);
128 goto out_budg;
129 }
130
131 mutex_lock(&host_ui->ui_mutex);
132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &none_address_operations;
134 inode->i_op = &none_inode_operations;
135 inode->i_fop = &none_file_operations;
136
137 inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
138 ui = ubifs_inode(inode);
139 ui->xattr = 1;
140 ui->flags |= UBIFS_XATTR_FL;
141 ui->data = kmalloc(size, GFP_NOFS);
142 if (!ui->data) {
143 err = -ENOMEM;
144 goto out_unlock;
145 }
146
147 memcpy(ui->data, value, size);
148 host->i_ctime = ubifs_current_time(host);
149 host_ui->xattr_cnt += 1;
150 host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
151 host_ui->xattr_size += CALC_XATTR_BYTES(size);
152 host_ui->xattr_names += nm->len;
153
154 /*
155 * We do not use i_size_write() because nobody can race with us as we
156 * are holding host @host->i_mutex - every xattr operation for this
157 * inode is serialized by it.
158 */
159 inode->i_size = ui->ui_size = size;
160 ui->data_len = size;
161 err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
162 if (err)
163 goto out_cancel;
164 mutex_unlock(&host_ui->ui_mutex);
165
166 ubifs_release_budget(c, &req);
167 insert_inode_hash(inode);
168 iput(inode);
169 return 0;
170
171out_cancel:
172 host_ui->xattr_cnt -= 1;
173 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
174 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
175out_unlock:
176 mutex_unlock(&host_ui->ui_mutex);
177 make_bad_inode(inode);
178 iput(inode);
179out_budg:
180 ubifs_release_budget(c, &req);
181 return err;
182}
183
184/**
185 * change_xattr - change an extended attribute.
186 * @c: UBIFS file-system description object
187 * @host: host inode
188 * @inode: extended attribute inode
189 * @value: extended attribute value
190 * @size: size of extended attribute value
191 *
192 * This helper function changes the value of extended attribute @inode with new
193 * data from @value. Returns zero in case of success and a negative error code
194 * in case of failure.
195 */
196static int change_xattr(struct ubifs_info *c, struct inode *host,
197 struct inode *inode, const void *value, int size)
198{
199 int err;
200 struct ubifs_inode *host_ui = ubifs_inode(host);
201 struct ubifs_inode *ui = ubifs_inode(inode);
202 struct ubifs_budget_req req = { .dirtied_ino = 2,
203 .dirtied_ino_d = size + host_ui->data_len };
204
205 ubifs_assert(ui->data_len == inode->i_size);
206 err = ubifs_budget_space(c, &req);
207 if (err)
208 return err;
209
210 mutex_lock(&host_ui->ui_mutex);
211 host->i_ctime = ubifs_current_time(host);
212 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
213 host_ui->xattr_size += CALC_XATTR_BYTES(size);
214
215 kfree(ui->data);
216 ui->data = kmalloc(size, GFP_NOFS);
217 if (!ui->data) {
218 err = -ENOMEM;
219 goto out_unlock;
220 }
221
222 memcpy(ui->data, value, size);
223 inode->i_size = ui->ui_size = size;
224 ui->data_len = size;
225
226 /*
227 * It is important to write the host inode after the xattr inode
228 * because if the host inode gets synchronized (via 'fsync()'), then
229 * the extended attribute inode gets synchronized, because it goes
230 * before the host inode in the write-buffer.
231 */
232 err = ubifs_jnl_change_xattr(c, inode, host);
233 if (err)
234 goto out_cancel;
235 mutex_unlock(&host_ui->ui_mutex);
236
237 ubifs_release_budget(c, &req);
238 return 0;
239
240out_cancel:
241 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
242 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
243 make_bad_inode(inode);
244out_unlock:
245 mutex_unlock(&host_ui->ui_mutex);
246 ubifs_release_budget(c, &req);
247 return err;
248}
249
250/**
251 * check_namespace - check extended attribute name-space.
252 * @nm: extended attribute name
253 *
254 * This function makes sure the extended attribute name belongs to one of the
255 * supported extended attribute name-spaces. Returns name-space index in case
256 * of success and a negative error code in case of failure.
257 */
258static int check_namespace(const struct qstr *nm)
259{
260 int type;
261
262 if (nm->len > UBIFS_MAX_NLEN)
263 return -ENAMETOOLONG;
264
265 if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
266 XATTR_TRUSTED_PREFIX_LEN)) {
267 if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
268 return -EINVAL;
269 type = TRUSTED_XATTR;
270 } else if (!strncmp(nm->name, XATTR_USER_PREFIX,
271 XATTR_USER_PREFIX_LEN)) {
272 if (nm->name[XATTR_USER_PREFIX_LEN] == '\0')
273 return -EINVAL;
274 type = USER_XATTR;
275 } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
276 XATTR_SECURITY_PREFIX_LEN)) {
277 if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
278 return -EINVAL;
279 type = SECURITY_XATTR;
280 } else
281 return -EOPNOTSUPP;
282
283 return type;
284}
285
286static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
287{
288 struct inode *inode;
289
290 inode = ubifs_iget(c->vfs_sb, inum);
291 if (IS_ERR(inode)) {
292 ubifs_err("dead extended attribute entry, error %d",
293 (int)PTR_ERR(inode));
294 return inode;
295 }
296 if (ubifs_inode(inode)->xattr)
297 return inode;
298 ubifs_err("corrupt extended attribute entry");
299 iput(inode);
300 return ERR_PTR(-EINVAL);
301}
302
303int ubifs_setxattr(struct dentry *dentry, const char *name,
304 const void *value, size_t size, int flags)
305{
306 struct inode *inode, *host = dentry->d_inode;
307 struct ubifs_info *c = host->i_sb->s_fs_info;
308 struct qstr nm = { .name = name, .len = strlen(name) };
309 struct ubifs_dent_node *xent;
310 union ubifs_key key;
311 int err, type;
312
313 dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
314 host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
315
316 if (size > UBIFS_MAX_INO_DATA)
317 return -ERANGE;
318
319 type = check_namespace(&nm);
320 if (type < 0)
321 return type;
322
323 xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
324 if (!xent)
325 return -ENOMEM;
326
327 /*
328 * The extended attribute entries are stored in LNC, so multiple
329 * look-ups do not involve reading the flash.
330 */
331 xent_key_init(c, &key, host->i_ino, &nm);
332 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
333 if (err) {
334 if (err != -ENOENT)
335 goto out_free;
336
337 if (flags & XATTR_REPLACE)
338 /* We are asked not to create the xattr */
339 err = -ENODATA;
340 else
341 err = create_xattr(c, host, &nm, value, size);
342 goto out_free;
343 }
344
345 if (flags & XATTR_CREATE) {
346 /* We are asked not to replace the xattr */
347 err = -EEXIST;
348 goto out_free;
349 }
350
351 inode = iget_xattr(c, le64_to_cpu(xent->inum));
352 if (IS_ERR(inode)) {
353 err = PTR_ERR(inode);
354 goto out_free;
355 }
356
357 err = change_xattr(c, host, inode, value, size);
358 iput(inode);
359
360out_free:
361 kfree(xent);
362 return err;
363}
364
365ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
366 size_t size)
367{
368 struct inode *inode, *host = dentry->d_inode;
369 struct ubifs_info *c = host->i_sb->s_fs_info;
370 struct qstr nm = { .name = name, .len = strlen(name) };
371 struct ubifs_inode *ui;
372 struct ubifs_dent_node *xent;
373 union ubifs_key key;
374 int err;
375
376 dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name,
377 host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
378
379 err = check_namespace(&nm);
380 if (err < 0)
381 return err;
382
383 xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
384 if (!xent)
385 return -ENOMEM;
386
387 mutex_lock(&host->i_mutex);
388 xent_key_init(c, &key, host->i_ino, &nm);
389 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
390 if (err) {
391 if (err == -ENOENT)
392 err = -ENODATA;
393 goto out_unlock;
394 }
395
396 inode = iget_xattr(c, le64_to_cpu(xent->inum));
397 if (IS_ERR(inode)) {
398 err = PTR_ERR(inode);
399 goto out_unlock;
400 }
401
402 ui = ubifs_inode(inode);
403 ubifs_assert(inode->i_size == ui->data_len);
404 ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
405
406 if (buf) {
407 /* If @buf is %NULL we are supposed to return the length */
408 if (ui->data_len > size) {
409 dbg_err("buffer size %zd, xattr len %d",
410 size, ui->data_len);
411 err = -ERANGE;
412 goto out_iput;
413 }
414
415 memcpy(buf, ui->data, ui->data_len);
416 }
417 err = ui->data_len;
418
419out_iput:
420 iput(inode);
421out_unlock:
422 mutex_unlock(&host->i_mutex);
423 kfree(xent);
424 return err;
425}
426
427ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
428{
429 union ubifs_key key;
430 struct inode *host = dentry->d_inode;
431 struct ubifs_info *c = host->i_sb->s_fs_info;
432 struct ubifs_inode *host_ui = ubifs_inode(host);
433 struct ubifs_dent_node *xent, *pxent = NULL;
434 int err, len, written = 0;
435 struct qstr nm = { .name = NULL };
436
437 dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino,
438 dentry->d_name.len, dentry->d_name.name, size);
439
440 len = host_ui->xattr_names + host_ui->xattr_cnt;
441 if (!buffer)
442 /*
443 * We should return the minimum buffer size which will fit a
444 * null-terminated list of all the extended attribute names.
445 */
446 return len;
447
448 if (len > size)
449 return -ERANGE;
450
451 lowest_xent_key(c, &key, host->i_ino);
452
453 mutex_lock(&host->i_mutex);
454 while (1) {
455 int type;
456
457 xent = ubifs_tnc_next_ent(c, &key, &nm);
458 if (unlikely(IS_ERR(xent))) {
459 err = PTR_ERR(xent);
460 break;
461 }
462
463 nm.name = xent->name;
464 nm.len = le16_to_cpu(xent->nlen);
465
466 type = check_namespace(&nm);
467 if (unlikely(type < 0)) {
468 err = type;
469 break;
470 }
471
472 /* Show trusted namespace only for "power" users */
473 if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
474 memcpy(buffer + written, nm.name, nm.len + 1);
475 written += nm.len + 1;
476 }
477
478 kfree(pxent);
479 pxent = xent;
480 key_read(c, &xent->key, &key);
481 }
482 mutex_unlock(&host->i_mutex);
483
484 kfree(pxent);
485 if (err != -ENOENT) {
486 ubifs_err("cannot find next direntry, error %d", err);
487 return err;
488 }
489
490 ubifs_assert(written <= size);
491 return written;
492}
493
494static int remove_xattr(struct ubifs_info *c, struct inode *host,
495 struct inode *inode, const struct qstr *nm)
496{
497 int err;
498 struct ubifs_inode *host_ui = ubifs_inode(host);
499 struct ubifs_inode *ui = ubifs_inode(inode);
500 struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1,
501 .dirtied_ino_d = host_ui->data_len };
502
503 ubifs_assert(ui->data_len == inode->i_size);
504
505 err = ubifs_budget_space(c, &req);
506 if (err)
507 return err;
508
509 mutex_lock(&host_ui->ui_mutex);
510 host->i_ctime = ubifs_current_time(host);
511 host_ui->xattr_cnt -= 1;
512 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
513 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
514 host_ui->xattr_names -= nm->len;
515
516 err = ubifs_jnl_delete_xattr(c, host, inode, nm);
517 if (err)
518 goto out_cancel;
519 mutex_unlock(&host_ui->ui_mutex);
520
521 ubifs_release_budget(c, &req);
522 return 0;
523
524out_cancel:
525 host_ui->xattr_cnt += 1;
526 host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
527 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
528 mutex_unlock(&host_ui->ui_mutex);
529 ubifs_release_budget(c, &req);
530 make_bad_inode(inode);
531 return err;
532}
533
534int ubifs_removexattr(struct dentry *dentry, const char *name)
535{
536 struct inode *inode, *host = dentry->d_inode;
537 struct ubifs_info *c = host->i_sb->s_fs_info;
538 struct qstr nm = { .name = name, .len = strlen(name) };
539 struct ubifs_dent_node *xent;
540 union ubifs_key key;
541 int err;
542
543 dbg_gen("xattr '%s', ino %lu ('%.*s')", name,
544 host->i_ino, dentry->d_name.len, dentry->d_name.name);
545 ubifs_assert(mutex_is_locked(&host->i_mutex));
546
547 err = check_namespace(&nm);
548 if (err < 0)
549 return err;
550
551 xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
552 if (!xent)
553 return -ENOMEM;
554
555 xent_key_init(c, &key, host->i_ino, &nm);
556 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
557 if (err) {
558 if (err == -ENOENT)
559 err = -ENODATA;
560 goto out_free;
561 }
562
563 inode = iget_xattr(c, le64_to_cpu(xent->inum));
564 if (IS_ERR(inode)) {
565 err = PTR_ERR(inode);
566 goto out_free;
567 }
568
569 ubifs_assert(inode->i_nlink == 1);
570 inode->i_nlink = 0;
571 err = remove_xattr(c, host, inode, &nm);
572 if (err)
573 inode->i_nlink = 1;
574
575 /* If @i_nlink is 0, 'iput()' will delete the inode */
576 iput(inode);
577
578out_free:
579 kfree(xent);
580 return err;
581}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
645 if (len == 0) 645 if (len == 0)
646 return -ENOENT; 646 return -ENOENT;
647 647
648 slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL); 648 slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
649 if (slots == NULL) 649 if (slots == NULL)
650 return -ENOMEM; 650 return -ENOMEM;
651 651
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
687 struct dentry *alias; 687 struct dentry *alias;
688 int err, table; 688 int err, table;
689 689
690 lock_kernel(); 690 lock_super(sb);
691 table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; 691 table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
692 dentry->d_op = &vfat_dentry_ops[table]; 692 dentry->d_op = &vfat_dentry_ops[table];
693 693
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
699 inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); 699 inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
700 brelse(sinfo.bh); 700 brelse(sinfo.bh);
701 if (IS_ERR(inode)) { 701 if (IS_ERR(inode)) {
702 unlock_kernel(); 702 unlock_super(sb);
703 return ERR_CAST(inode); 703 return ERR_CAST(inode);
704 } 704 }
705 alias = d_find_alias(inode); 705 alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
708 dput(alias); 708 dput(alias);
709 else { 709 else {
710 iput(inode); 710 iput(inode);
711 unlock_kernel(); 711 unlock_super(sb);
712 return alias; 712 return alias;
713 } 713 }
714 714
715 } 715 }
716error: 716error:
717 unlock_kernel(); 717 unlock_super(sb);
718 dentry->d_op = &vfat_dentry_ops[table]; 718 dentry->d_op = &vfat_dentry_ops[table];
719 dentry->d_time = dentry->d_parent->d_inode->i_version; 719 dentry->d_time = dentry->d_parent->d_inode->i_version;
720 dentry = d_splice_alias(inode, dentry); 720 dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
734 struct timespec ts; 734 struct timespec ts;
735 int err; 735 int err;
736 736
737 lock_kernel(); 737 lock_super(sb);
738 738
739 ts = CURRENT_TIME_SEC; 739 ts = CURRENT_TIME_SEC;
740 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); 740 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
755 dentry->d_time = dentry->d_parent->d_inode->i_version; 755 dentry->d_time = dentry->d_parent->d_inode->i_version;
756 d_instantiate(dentry, inode); 756 d_instantiate(dentry, inode);
757out: 757out:
758 unlock_kernel(); 758 unlock_super(sb);
759 return err; 759 return err;
760} 760}
761 761
762static int vfat_rmdir(struct inode *dir, struct dentry *dentry) 762static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
763{ 763{
764 struct inode *inode = dentry->d_inode; 764 struct inode *inode = dentry->d_inode;
765 struct super_block *sb = dir->i_sb;
765 struct fat_slot_info sinfo; 766 struct fat_slot_info sinfo;
766 int err; 767 int err;
767 768
768 lock_kernel(); 769 lock_super(sb);
769 770
770 err = fat_dir_empty(inode); 771 err = fat_dir_empty(inode);
771 if (err) 772 if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
783 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 784 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
784 fat_detach(inode); 785 fat_detach(inode);
785out: 786out:
786 unlock_kernel(); 787 unlock_super(sb);
787 788
788 return err; 789 return err;
789} 790}
@@ -791,10 +792,11 @@ out:
791static int vfat_unlink(struct inode *dir, struct dentry *dentry) 792static int vfat_unlink(struct inode *dir, struct dentry *dentry)
792{ 793{
793 struct inode *inode = dentry->d_inode; 794 struct inode *inode = dentry->d_inode;
795 struct super_block *sb = dir->i_sb;
794 struct fat_slot_info sinfo; 796 struct fat_slot_info sinfo;
795 int err; 797 int err;
796 798
797 lock_kernel(); 799 lock_super(sb);
798 800
799 err = vfat_find(dir, &dentry->d_name, &sinfo); 801 err = vfat_find(dir, &dentry->d_name, &sinfo);
800 if (err) 802 if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
807 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 809 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
808 fat_detach(inode); 810 fat_detach(inode);
809out: 811out:
810 unlock_kernel(); 812 unlock_super(sb);
811 813
812 return err; 814 return err;
813} 815}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
820 struct timespec ts; 822 struct timespec ts;
821 int err, cluster; 823 int err, cluster;
822 824
823 lock_kernel(); 825 lock_super(sb);
824 826
825 ts = CURRENT_TIME_SEC; 827 ts = CURRENT_TIME_SEC;
826 cluster = fat_alloc_new_dir(dir, &ts); 828 cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
849 dentry->d_time = dentry->d_parent->d_inode->i_version; 851 dentry->d_time = dentry->d_parent->d_inode->i_version;
850 d_instantiate(dentry, inode); 852 d_instantiate(dentry, inode);
851 853
852 unlock_kernel(); 854 unlock_super(sb);
853 return 0; 855 return 0;
854 856
855out_free: 857out_free:
856 fat_free_clusters(dir, cluster); 858 fat_free_clusters(dir, cluster);
857out: 859out:
858 unlock_kernel(); 860 unlock_super(sb);
859 return err; 861 return err;
860} 862}
861 863
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
869 struct timespec ts; 871 struct timespec ts;
870 loff_t dotdot_i_pos, new_i_pos; 872 loff_t dotdot_i_pos, new_i_pos;
871 int err, is_dir, update_dotdot, corrupt = 0; 873 int err, is_dir, update_dotdot, corrupt = 0;
874 struct super_block *sb = old_dir->i_sb;
872 875
873 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 876 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
874 old_inode = old_dentry->d_inode; 877 old_inode = old_dentry->d_inode;
875 new_inode = new_dentry->d_inode; 878 new_inode = new_dentry->d_inode;
876 lock_kernel(); 879 lock_super(sb);
877 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); 880 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
878 if (err) 881 if (err)
879 goto out; 882 goto out;
@@ -951,7 +954,7 @@ out:
951 brelse(sinfo.bh); 954 brelse(sinfo.bh);
952 brelse(dotdot_bh); 955 brelse(dotdot_bh);
953 brelse(old_sinfo.bh); 956 brelse(old_sinfo.bh);
954 unlock_kernel(); 957 unlock_super(sb);
955 958
956 return err; 959 return err;
957 960
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0ee..ad3d26ddfe31 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
2427 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 2427 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2429 2429
2430 /* If I'm the only one writing to this iclog, sync it to disk */ 2430 /*
2431 if (atomic_read(&iclog->ic_refcnt) == 1) { 2431 * If I'm the only one writing to this iclog, sync it to disk.
2432 * We need to do an atomic compare and decrement here to avoid
2433 * racing with concurrent atomic_dec_and_lock() calls in
2434 * xlog_state_release_iclog() when there is more than one
2435 * reference to the iclog.
2436 */
2437 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
2438 /* we are the only one */
2432 spin_unlock(&log->l_icloglock); 2439 spin_unlock(&log->l_icloglock);
2433 if ((error = xlog_state_release_iclog(log, iclog))) 2440 error = xlog_state_release_iclog(log, iclog);
2441 if (error)
2434 return error; 2442 return error;
2435 } else { 2443 } else {
2436 atomic_dec(&iclog->ic_refcnt);
2437 spin_unlock(&log->l_icloglock); 2444 spin_unlock(&log->l_icloglock);
2438 } 2445 }
2439 goto restart; 2446 goto restart;