10 files changed, 247 insertions, 34 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 94d93b1f8b53..b30753cbf431 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -67,6 +67,7 @@ prototypes:
                                struct file *, unsigned open_flag,
                                umode_t create_mode, int *opened);
        int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+        int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
 locking rules:
        all may block
@@ -96,6 +97,7 @@ fiemap:		no
 update_time:    no
 atomic_open:    yes
 tmpfile:        no
+dentry_open:    no
        Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 3a863f692728..88ab81c79109 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -140,7 +140,7 @@ file.
                                     struct dentry *parent,
                                     struct debugfs_regset32 *regset);
-    int debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
+    void debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
                         int nregs, void __iomem *base, char *prefix);
 The "base" argument may be 0, but you may want to build the reg32 array
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 2cca5a25ef89..e0950c483c22 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -122,6 +122,10 @@ disable_ext_identify   Disable the extension list configured by mkfs, so f2fs
 inline_xattr           Enable the inline xattrs feature.
 inline_data            Enable the inline data feature: New created small(<~3.4k)
                       files can be written into inode block.
+inline_dentry          Enable the inline dir feature: data in newly created
+                       directory entries can be written into inode block. The
+                       space of inode block which is used to store inline
+                       dentries is limited to ~3.4k.
 flush_merge            Merge concurrent cache_flush commands as much as possible
                       to eliminate redundant command issues. If the underlying
                       device handles the cache_flush command relatively slowly,
@@ -131,6 +135,9 @@ nobarrier              This option can be used if underlying storage guarantees
                       If this option is set, no cache_flush commands are issued
                       but f2fs still guarantees the write ordering of all the
                       data writes.
+fastboot               This option is used when a system wants to reduce mount
+                       time as much as possible, even though normal performance
+                       can be sacrificed.
 ================================================================================
 DEBUGFS ENTRIES
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index c8f036a9b13f..520a4becb75c 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -72,24 +72,11 @@ c/ Helper routines to allocate anonymous dentries, and to help attach
        DCACHE_DISCONNECTED) dentry is allocated and attached.
      In the case of a directory, care is taken that only one dentry
      can ever be attached.
-    d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
+    d_splice_alias(inode, dentry) will introduce a new dentry into the tree;
-      will introduce a new dentry into the tree; either the passed-in
+      either the passed-in dentry or a preexisting alias for the given inode
-      dentry or a preexisting alias for the given inode (such as an
+      (such as an anonymous one created by d_obtain_alias), if appropriate.
-      anonymous one created by d_obtain_alias), if appropriate.  The two
+      It returns NULL when the passed-in dentry is used, following the calling
-      functions differ in their handling of directories with preexisting
+      convention of ->lookup.
-      aliases:
-        d_splice_alias will use any existing IS_ROOT dentry, but it will
-          return -EIO rather than try to move a dentry with a different
-          parent.  This is appropriate for local filesystems, which
-          should never see such an alias unless the filesystem is
-          corrupted somehow (for example, if two on-disk directory
-          entries refer to the same directory.)
-        d_materialise_unique will attempt to move any dentry.  This is
-          appropriate for distributed filesystems, where finding a
-          directory other than where we last cached it may be a normal
-          consequence of concurrent operations on other hosts.
-      Both functions return NULL when the passed-in dentry is used,
-      following the calling convention of ->lookup.
 
 Filesystem Issues
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
new file mode 100644
index 000000000000..a27c950ece61
--- /dev/null
+++ b/Documentation/filesystems/overlayfs.txt
@@ -0,0 +1,198 @@
+Written by: Neil Brown <neilb@suse.de>
+Overlay Filesystem
+==================
+This document describes a prototype for a new approach to providing
+overlay-filesystem functionality in Linux (sometimes referred to as
+union-filesystems).  An overlay-filesystem tries to present a
+filesystem which is the result of overlaying one filesystem on top
+of the other.
+The result will inevitably fail to look exactly like a normal
+filesystem for various technical reasons.  The expectation is that
+many use cases will be able to ignore these differences.
+This approach is 'hybrid' because the objects that appear in the
+filesystem do not all appear to belong to that filesystem.  In many
+cases an object accessed in the union will be indistinguishable
+from accessing the corresponding object from the original filesystem.
+This is most obvious from the 'st_dev' field returned by stat(2).
+While directories will report an st_dev from the overlay-filesystem,
+all non-directory objects will report an st_dev from the lower or
+upper filesystem that is providing the object.  Similarly st_ino will
+only be unique when combined with st_dev, and both of these can change
+over the lifetime of a non-directory object.  Many applications and
+tools ignore these values and will not be affected.
+Upper and Lower
+---------------
+An overlay filesystem combines two filesystems - an 'upper' filesystem
+and a 'lower' filesystem.  When a name exists in both filesystems, the
+object in the 'upper' filesystem is visible while the object in the
+'lower' filesystem is either hidden or, in the case of directories,
+merged with the 'upper' object.
+It would be more correct to refer to an upper and lower 'directory
+tree' rather than 'filesystem' as it is quite possible for both
+directory trees to be in the same filesystem and there is no
+requirement that the root of a filesystem be given for either upper or
+lower.
+The lower filesystem can be any filesystem supported by Linux and does
+not need to be writable.  The lower filesystem can even be another
+overlayfs.  The upper filesystem will normally be writable and, if it
+is, it must support the creation of trusted.* extended attributes, and
+must provide valid d_type in readdir responses, so NFS is not suitable.
+A read-only overlay of two read-only filesystems may use any
+filesystem type.
+Directories
+-----------
+Overlaying mainly involves directories.  If a given name appears in both
+upper and lower filesystems and refers to a non-directory in either,
+then the lower object is hidden - the name refers only to the upper
+object.
+Where both upper and lower objects are directories, a merged directory
+is formed.
+At mount time, the two directories given as mount options "lowerdir" and
+"upperdir" are combined into a merged directory:
+  mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,\
+workdir=/work /merged
+The "workdir" needs to be an empty directory on the same filesystem
+as upperdir.
+Then whenever a lookup is requested in such a merged directory, the
+lookup is performed in each actual directory and the combined result
+is cached in the dentry belonging to the overlay filesystem.  If both
+actual lookups find directories, both are stored and a merged
+directory is created, otherwise only one is stored: the upper if it
+exists, else the lower.
+Only the lists of names from directories are merged.  Other content
+such as metadata and extended attributes are reported for the upper
+directory only.  These attributes of the lower directory are hidden.
+whiteouts and opaque directories
+--------------------------------
+In order to support rm and rmdir without changing the lower
+filesystem, an overlay filesystem needs to record in the upper filesystem
+that files have been removed.  This is done using whiteouts and opaque
+directories (non-directories are always opaque).
+A whiteout is created as a character device with 0/0 device number.
+When a whiteout is found in the upper level of a merged directory, any
+matching name in the lower level is ignored, and the whiteout itself
+is also hidden.
+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
+to "y".  Where the upper filesystem contains an opaque directory, any
+directory in the lower filesystem with the same name is ignored.
+readdir
+-------
+When a 'readdir' request is made on a merged directory, the upper and
+lower directories are each read and the name lists merged in the
+obvious way (upper is read first, then lower - entries that already
+exist are not re-added).  This merged name list is cached in the
+'struct file' and so remains as long as the file is kept open.  If the
+directory is opened and read by two processes at the same time, they
+will each have separate caches.  A seekdir to the start of the
+directory (offset 0) followed by a readdir will cause the cache to be
+discarded and rebuilt.
+This means that changes to the merged directory do not appear while a
+directory is being read.  This is unlikely to be noticed by many
+programs.
+seek offsets are assigned sequentially when the directories are read.
+Thus if
+  - read part of a directory
+  - remember an offset, and close the directory
+  - re-open the directory some time later
+  - seek to the remembered offset
+there may be little correlation between the old and new locations in
+the list of filenames, particularly if anything has changed in the
+directory.
+Readdir on directories that are not merged is simply handled by the
+underlying directory (upper or lower).
+Non-directories
+---------------
+Objects that are not directories (files, symlinks, device-special
+files etc.) are presented either from the upper or lower filesystem as
+appropriate.  When a file in the lower filesystem is accessed in a way
+that requires write-access, such as opening for write access, changing
+some metadata etc., the file is first copied from the lower filesystem
+to the upper filesystem (copy_up).  Note that creating a hard-link
+also requires copy_up, though of course creation of a symlink does
+not.
+The copy_up may turn out to be unnecessary, for example if the file is
+opened for read-write but the data is not modified.
+The copy_up process first makes sure that the containing directory
+exists in the upper filesystem - creating it and any parents as
+necessary.  It then creates the object with the same metadata (owner,
+mode, mtime, symlink-target etc.) and then if the object is a file, the
+data is copied from the lower to the upper filesystem.  Finally any
+extended attributes are copied up.
+Once the copy_up is complete, the overlay filesystem simply
+provides direct access to the newly created file in the upper
+filesystem - future operations on the file are barely noticed by the
+overlay filesystem (though an operation on the name of the file such as
+rename or unlink will of course be noticed and handled).
+Non-standard behavior
+---------------------
+The copy_up operation essentially creates a new, identical file and
+moves it over to the old name.  The new file may be on a different
+filesystem, so both st_dev and st_ino of the file may change.
+Any open files referring to this inode will access the old data and
+metadata.  Similarly any file locks obtained before copy_up will not
+apply to the copied up file.
+On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and
+fsetxattr(2) will fail with EROFS.
+If a file with multiple hard links is copied up, then this will
+"break" the link.  Changes will not be propagated to other names
+referring to the same inode.
+Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
+object in overlayfs will not contain valid absolute paths, only
+relative paths leading up to the filesystem's root.  This will be
+fixed in the future.
+Some operations are not atomic, for example a crash during copy_up or
+rename will leave the filesystem in an inconsistent state.  This will
+be addressed in the future.
+Changes to underlying filesystems
+---------------------------------
+Offline changes, when the overlay is not mounted, are allowed to either
+the upper or the lower trees.
+Changes to the underlying filesystems while part of a mounted overlay
+filesystem are not allowed.  If the underlying filesystem is changed,
+the behavior of the overlay is undefined, though it will not result in
+a crash or deadlock.
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 0f3a1390bf00..fa2db081505e 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -463,3 +463,11 @@ in your dentry operations instead.
        of the in-tree instances did).  inode_hash_lock is still held,
        of course, so they are still serialized wrt removal from inode hash,
        as well as wrt set() callback of iget5_locked().
+--
+[mandatory]
+        d_materialise_unique() is gone; d_splice_alias() does everything you
+        need now.  Remember that they have opposite orders of arguments ;-/
+--
+[mandatory]
+        f_dentry is gone; use f_path.dentry, or, better yet, see if you can avoid
+        it entirely.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index eb8a10e22f7c..aae9dd13c91f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1272,7 +1272,7 @@ softirq.
 1.9 Ext4 file system parameters
------------------------------
+-------------------------------
 Information about mounted ext4 file systems can be found in
 /proc/fs/ext4.  Each mounted filesystem will have a directory in
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 8ea3e90ace07..b797ed38de46 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -180,23 +180,19 @@ output must be passed to the seq_file code. Some utility functions have
 been defined which make this task easy.
 Most code will simply use seq_printf(), which works pretty much like
-printk(), but which requires the seq_file pointer as an argument. It is
+printk(), but which requires the seq_file pointer as an argument.
-common to ignore the return value from seq_printf(), but a function
-producing complicated output may want to check that value and quit if
-something non-zero is returned; an error return means that the seq_file
-buffer has been filled and further output will be discarded.
 For straight character output, the following functions may be used:
-        int seq_putc(struct seq_file *m, char c);
+        seq_putc(struct seq_file *m, char c);
-        int seq_puts(struct seq_file *m, const char *s);
+        seq_puts(struct seq_file *m, const char *s);
-        int seq_escape(struct seq_file *m, const char *s, const char *esc);
+        seq_escape(struct seq_file *m, const char *s, const char *esc);
 The first two output a single character and a string, just like one would
 expect. seq_escape() is like seq_puts(), except that any character in s
 which is in the string esc will be represented in octal form in the output.
-There is also a pair of functions for printing filenames:
+There are also a pair of functions for printing filenames:
        int seq_path(struct seq_file *m, struct path *path, char *esc);
        int seq_path_root(struct seq_file *m, struct path *path,
@@ -209,6 +205,14 @@ root is desired, it can be used with seq_path_root().  Note that, if it
 turns out that path cannot be reached from root, the value of root will be
 changed in seq_path_root() to a root which *does* work.
+A function producing complicated output may want to check
+        bool seq_has_overflowed(struct seq_file *m);
+and avoid further seq_<output> calls if true is returned.
+A true return from seq_has_overflowed means that the seq_file buffer will
+be discarded and the seq_show function will attempt to allocate a larger
+buffer and retry printing.
 Making it all work
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index 403c090aca39..e5274f84dc56 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -2,10 +2,10 @@ SQUASHFS 4.0 FILESYSTEM
 =======================
 Squashfs is a compressed read-only filesystem for Linux.
-It uses zlib/lzo/xz compression to compress files, inodes and directories.
+It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
-Inodes in the system are very small and all blocks are packed to minimise
+directories.  Inodes in the system are very small and all blocks are packed to
-data overhead. Block sizes greater than 4K are supported up to a maximum
+minimise data overhead. Block sizes greater than 4K are supported up to a
-of 1Mbytes (default block size 128K).
+maximum of 1Mbytes (default block size 128K).
 Squashfs is intended for general read-only filesystem use, for archival
 use (i.e. in cases where a .tar.gz file may be used), and in constrained
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index fceff7c00a3c..43ce0507ee25 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -364,6 +364,7 @@ struct inode_operations {
        int (*atomic_open)(struct inode *, struct dentry *, struct file *,
                        unsigned open_flag, umode_t create_mode, int *opened);
        int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+        int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
 };
 Again, all methods are called without any locks being held, unless
@@ -696,6 +697,12 @@ struct address_space_operations {
        but instead uses bmap to find out where the blocks in the file
        are and uses those addresses directly.
+  dentry_open: *WARNING: probably going away soon, do not use!* This is an
+        alternative to f_op->open(); the difference is that this method may open
+        a file not necessarily originating from the same filesystem as the one
+        i_op->dentry_open() was called on.  It may be useful for stacking filesystems
+        which want to allow native I/O directly on underlying files.
  invalidatepage: If a page has PagePrivate set, then invalidatepage
        will be called when part or all of the page is to be removed
@@ -828,7 +835,7 @@ struct file_operations {
        ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
        int (*setlease)(struct file *, long arg, struct file_lock **, void **);
        long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
-        int (*show_fdinfo)(struct seq_file *m, struct file *f);
+        void (*show_fdinfo)(struct seq_file *m, struct file *f);
 };
 Again, all methods are called without any locks being held, unless