6 files changed, 418 insertions, 28 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index ccec55394380..ec6a9392a173 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -97,8 +97,8 @@ prototypes:
        void (*put_super) (struct super_block *);
        void (*write_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
-        void (*write_super_lockfs) (struct super_block *);
+        int (*freeze_fs) (struct super_block *);
-        void (*unlockfs) (struct super_block *);
+        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*clear_inode) (struct inode *);
@@ -119,8 +119,8 @@ delete_inode:		no
 put_super:              yes     yes     no
 write_super:            no      yes     read
 sync_fs:                no      no      read
-write_super_lockfs:     ?
+freeze_fs:              ?
-unlockfs:               ?
+unfreeze_fs:            ?
 statfs:                 no      no      no
 remount_fs:             yes     yes     maybe           (see below)
 clear_inode:            no
@@ -397,7 +397,7 @@ prototypes:
 };
 locking rules:
-        All except ->poll() may block.
+        All may block.
                        BKL
 llseek:                 no      (see below)
 read:                   no
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
new file mode 100644
index 000000000000..64087c34327f
--- /dev/null
+++ b/Documentation/filesystems/btrfs.txt
@@ -0,0 +1,91 @@
+        BTRFS
+        =====
+Btrfs is a new copy on write filesystem for Linux aimed at
+implementing advanced features while focusing on fault tolerance,
+repair and easy administration. Initially developed by Oracle, Btrfs
+is licensed under the GPL and open for contribution from anyone.
+Linux has a wealth of filesystems to choose from, but we are facing a
+number of challenges with scaling to the large storage subsystems that
+are becoming common in today's data centers. Filesystems need to scale
+in their ability to address and manage large storage, and also in
+their ability to detect, repair and tolerate errors in the data stored
+on disk.  Btrfs is under heavy development, and is not suitable for
+any uses other than benchmarking and review. The Btrfs disk format is
+not yet finalized.
+The main Btrfs features include:
+    * Extent based file storage (2^64 max file size)
+    * Space efficient packing of small files
+    * Space efficient indexed directories
+    * Dynamic inode allocation
+    * Writable snapshots
+    * Subvolumes (separate internal filesystem roots)
+    * Object level mirroring and striping
+    * Checksums on data and metadata (multiple algorithms available)
+    * Compression
+    * Integrated multiple device support, with several raid algorithms
+    * Online filesystem check (not yet implemented)
+    * Very fast offline filesystem check
+    * Efficient incremental backup and FS mirroring (not yet implemented)
+    * Online filesystem defragmentation
+        MAILING LIST
+        ============
+There is a Btrfs mailing list hosted on vger.kernel.org. You can
+find details on how to subscribe here:
+http://vger.kernel.org/vger-lists.html#linux-btrfs
+Mailing list archives are available from gmane:
+http://dir.gmane.org/gmane.comp.file-systems.btrfs
+        IRC
+        ===
+Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
+IRC network.
+        UTILITIES
+        =========
+Userspace tools for creating and manipulating Btrfs file systems are
+available from the git repository at the following location:
+ http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git
+ git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git
+These include the following tools:
+mkfs.btrfs: create a filesystem
+btrfsctl: control program to create snapshots and subvolumes:
+        mount /dev/sda2 /mnt
+        btrfsctl -s new_subvol_name /mnt
+        btrfsctl -s snapshot_of_default /mnt/default
+        btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
+        btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
+        ls /mnt
+        default snapshot_of_a_snapshot snapshot_of_new_subvol
+        new_subvol_name snapshot_of_default
+        Snapshots and subvolumes cannot be deleted right now, but you can
+        rm -rf all the files and directories inside them.
+btrfsck: do a limited check of the FS extent trees.
+btrfs-debug-tree: print all of the FS metadata in text form.  Example:
+        btrfs-debug-tree /dev/sda2 >& big_output_file
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 174eaff7ded9..cec829bc7291 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be
        # mount -t ext4 /dev/hda1 /wherever
-  - When comparing performance with other filesystems, remember that
+  - When comparing performance with other filesystems, it's always
-    ext3/4 by default offers higher data integrity guarantees than most.
+    important to try multiple workloads; very often a subtle change in a
-    So when comparing with a metadata-only journalling filesystem, such
+    workload parameter can completely change the ranking of which
-    as ext3, use `mount -o data=writeback'.  And you might as well use
+    filesystems do well compared to others.  When comparing versus ext3,
-    `mount -o nobh' too along with it.  Making the journal larger than
+    note that ext4 enables write barriers by default, while ext3 does
-    the mke2fs default often helps performance with metadata-intensive
+    not enable write barriers by default.  So it is useful to
-    workloads.
+    explicitly specify whether barriers are enabled or not via the
+    '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems
+    for a fair comparison.  When tuning ext3 for best benchmark numbers,
+    it is often worthwhile to try changing the data journaling mode; '-o
+    data=writeback,nobh' can be faster for some workloads.  (Note
+    however that running mounted with data=writeback can potentially
+    leave stale data exposed in recently written files in case of an
+    unclean shutdown, which could be a security exposure in some
+    situations.)  Configuring the filesystem with a large journal can
+    also be helpful for metadata-intensive workloads.
 2. Features
 ===========
@@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be
 * ability to use filesystems > 16TB (e2fsprogs support not available yet)
 * extent format reduces metadata overhead (RAM, IO for access, transactions)
 * extent format more robust in face of on-disk corruption due to magics,
-* internal redunancy in tree
+* internal redundancy in tree
 * improved file allocation (multi-block alloc)
 * fix 32000 subdirectory limit
 * nsec timestamps for mtime, atime, ctime, create time
@@ -116,10 +125,11 @@ grouping of bitmaps and inode tables.  Some test results available here:
 When mounting an ext4 filesystem, the following option are accepted:
 (*) == default
-extents         (*)     ext4 will use extents to address file data.  The
+ro                      Mount filesystem read only. Note that ext4 will
-                        file system will no longer be mountable by ext3.
+                        replay the journal (and thus write to the
+                        partition) even when mounted "read only". The
-noextents               ext4 will not use extents for newly created files
+                        mount options "ro,noload" can be used to prevent
+                        writes to the filesystem.
 journal_checksum        Enable checksumming of the journal transactions.
                        This will allow the recovery code in e2fsck and the
@@ -134,17 +144,17 @@ journal_async_commit	Commit block can be written to disk without waiting
 journal=update          Update the ext4 file system's journal to the current
                        format.
-journal=inum            When a journal already exists, this option is ignored.
-                        Otherwise, it specifies the number of the inode which
-                        will represent the ext4 file system's journal file.
 journal_dev=devnum      When the external journal device's major/minor numbers
                        have changed, this option allows the user to specify
                        the new journal location.  The journal device is
                        identified through its new major/minor numbers encoded
                        in devnum.
-noload                  Don't load the journal on mounting.
+noload                  Don't load the journal on mounting.  Note that
+                        if the filesystem was not unmounted cleanly,
+                        skipping the journal replay will lead to the
+                        filesystem containing inconsistencies that can
+                        lead to any number of problems.
 data=journal            All data are committed into the journal prior to being
                        written into the main file system.
@@ -219,9 +229,12 @@ minixdf			Make 'df' act like Minix.
 debug                   Extra debugging information is sent to syslog.
-errors=remount-ro(*)    Remount the filesystem read-only on an error.
+errors=remount-ro       Remount the filesystem read-only on an error.
 errors=continue         Keep going on a filesystem error.
 errors=panic            Panic and halt the machine if an error occurs.
+                        (These mount options override the errors behavior
+                        specified in the superblock, which can be configured
+                        using tune2fs)
 data_err=ignore(*)      Just print an error message if an error occurs
                        in a file data buffer in ordered mode.
@@ -261,6 +274,42 @@ delalloc	(*)	Deferring block allocation until write-out time.
 nodelalloc              Disable delayed allocation. Blocks are allocation
                        when data is copied from user to page cache.
+max_batch_time=usec     Maximum amount of time ext4 should wait for
+                        additional filesystem operations to be batched
+                        together with a synchronous write operation.
+                        Since a synchronous write operation is going to
+                        force a commit and then wait for the I/O to
+                        complete, it doesn't cost much, and can be a
+                        huge throughput win, we wait for a small amount
+                        of time to see if any other transactions can
+                        piggyback on the synchronous write.   The
+                        algorithm used is designed to automatically tune
+                        for the speed of the disk, by measuring the
+                        amount of time (on average) that it takes to
+                        finish committing a transaction.  Call this time
+                        the "commit time".  If the time that the
+                        transaction has been running is less than the
+                        commit time, ext4 will try sleeping for the
+                        commit time to see if other operations will join
+                        the transaction.   The commit time is capped by
+                        the max_batch_time, which defaults to 15000us
+                        (15ms).   This optimization can be turned off
+                        entirely by setting max_batch_time to 0.
+min_batch_time=usec     This parameter sets the commit time (as
+                        described above) to be at least min_batch_time.
+                        It defaults to zero microseconds.  Increasing
+                        this parameter may improve the throughput of
+                        multi-threaded, synchronous workloads on very
+                        fast disks, at the cost of increasing latency.
+journal_ioprio=prio     The I/O priority (from 0 to 7, where 0 is the
+                        highest priority) which should be used for I/O
+                        operations submitted by kjournald2 during a
+                        commit operation.  This defaults to 3, which is
+                        a slightly higher priority than the default I/O
+                        priority.
 Data Mode
 =========
 There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 71df353e367c..d105eb45282a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -140,6 +140,7 @@ Table 1-1: Process specific entries in /proc
 statm          Process memory status information
 status         Process status in human readable form
 wchan          If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ stack          Report full stack trace, enable via CONFIG_STACKTRACE
 smaps          Extension based on maps, the rss size for each mapped file
 ..............................................................................
@@ -1385,6 +1386,15 @@ swapcache reclaim.  Decreasing vfs_cache_pressure causes the kernel to prefer
 to retain dentry and inode caches.  Increasing vfs_cache_pressure beyond 100
 causes the kernel to prefer to reclaim dentries and inodes.
+dirty_background_bytes
+----------------------
+Contains the amount of dirty memory at which the pdflush background writeback
+daemon will start writeback.
+If dirty_background_bytes is written, dirty_background_ratio becomes a function
+of its value (dirty_background_bytes / the amount of dirtyable system memory).
 dirty_background_ratio
 ----------------------
@@ -1393,14 +1403,29 @@ pages + file cache, not including locked pages and HugePages), the number of
 pages at which the pdflush background writeback daemon will start writing out
 dirty data.
+If dirty_background_ratio is written, dirty_background_bytes becomes a function
+of its value (dirty_background_ratio * the amount of dirtyable system memory).
+dirty_bytes
+-----------
+Contains the amount of dirty memory at which a process generating disk writes
+will itself start writeback.
+If dirty_bytes is written, dirty_ratio becomes a function of its value
+(dirty_bytes / the amount of dirtyable system memory).
 dirty_ratio
-----------------
+-----------
 Contains, as a percentage of the dirtyable system memory (free pages + mapped
 pages + file cache, not including locked pages and HugePages), the number of
 pages at which a process which is generating disk writes will itself start
 writing out dirty data.
+If dirty_ratio is written, dirty_bytes becomes a function of its value
+(dirty_ratio * the amount of dirtyable system memory).
 dirty_writeback_centisecs
 -------------------------
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
new file mode 100644
index 000000000000..3e79e4a7a392
--- /dev/null
+++ b/Documentation/filesystems/squashfs.txt
@@ -0,0 +1,225 @@
+SQUASHFS 4.0 FILESYSTEM
+=======================
+Squashfs is a compressed read-only filesystem for Linux.
+It uses zlib compression to compress files, inodes and directories.
+Inodes in the system are very small and all blocks are packed to minimise
+data overhead. Block sizes greater than 4K are supported up to a maximum
+of 1Mbytes (default block size 128K).
+Squashfs is intended for general read-only filesystem use, for archival
+use (i.e. in cases where a .tar.gz file may be used), and in constrained
+block device/memory systems (e.g. embedded systems) where low overhead is
+needed.
+Mailing list: squashfs-devel@lists.sourceforge.net
+Web site: www.squashfs.org
+1. FILESYSTEM FEATURES
+----------------------
+Squashfs filesystem features versus Cramfs:
+                                Squashfs                Cramfs
+Max filesystem size:            2^64                    16 MiB
+Max file size:                  ~ 2 TiB                 16 MiB
+Max files:                      unlimited               unlimited
+Max directories:                unlimited               unlimited
+Max entries per directory:      unlimited               unlimited
+Max block size:                 1 MiB                   4 KiB
+Metadata compression:           yes                     no
+Directory indexes:              yes                     no
+Sparse file support:            yes                     no
+Tail-end packing (fragments):   yes                     no
+Exportable (NFS etc.):          yes                     no
+Hard link support:              yes                     no
+"." and ".." in readdir:        yes                     no
+Real inode numbers:             yes                     no
+32-bit uids/gids:               yes                     no
+File creation time:             yes                     no
+Xattr and ACL support:          no                      no
+Squashfs compresses data, inodes and directories.  In addition, inode and
+directory data are highly compacted, and packed on byte boundaries.  Each
+compressed inode is on average 8 bytes in length (the exact length varies with
+file type, i.e. regular file, directory, symbolic link, and block/char device
+inodes have different sizes).
+2. USING SQUASHFS
+-----------------
+As squashfs is a read-only filesystem, the mksquashfs program must be used to
+create populated squashfs filesystems.  This and other squashfs utilities
+can be obtained from http://www.squashfs.org.  Usage instructions can be
+obtained from this site also.
+3. SQUASHFS FILESYSTEM DESIGN
+-----------------------------
+A squashfs filesystem consists of seven parts, packed together on a byte
+alignment:
+         ---------------
+        |  superblock   |
+        |---------------|
+        |  datablocks   |
+        |  & fragments  |
+        |---------------|
+        |  inode table  |
+        |---------------|
+        |   directory   |
+        |     table     |
+        |---------------|
+        |   fragment    |
+        |    table      |
+        |---------------|
+        |    export     |
+        |    table      |
+        |---------------|
+        |    uid/gid    |
+        |  lookup table |
+         ---------------
+Compressed data blocks are written to the filesystem as files are read from
+the source directory, and checked for duplicates.  Once all file data has been
+written the completed inode, directory, fragment, export and uid/gid lookup
+tables are written.
+3.1 Inodes
+----------
+Metadata (inodes and directories) are compressed in 8Kbyte blocks.  Each
+compressed block is prefixed by a two byte length, the top bit is set if the
+block is uncompressed.  A block will be uncompressed if the -noI option is set,
+or if the compressed block was larger than the uncompressed block.
+Inodes are packed into the metadata blocks, and are not aligned to block
+boundaries, therefore inodes overlap compressed blocks.  Inodes are identified
+by a 48-bit number which encodes the location of the compressed metadata block
+containing the inode, and the byte offset into that block where the inode is
+placed (<block, offset>).
+To maximise compression there are different inodes for each file type
+(regular file, directory, device, etc.), the inode contents and length
+varying with the type.
+To further maximise compression, two types of regular file inode and
+directory inode are defined: inodes optimised for frequently occurring
+regular files and directories, and extended types where extra
+information has to be stored.
+3.2 Directories
+---------------
+Like inodes, directories are packed into compressed metadata blocks, stored
+in a directory table.  Directories are accessed using the start address of
+the metablock containing the directory and the offset into the
+decompressed block (<block, offset>).
+Directories are organised in a slightly complex way, and are not simply
+a list of file names.  The organisation takes advantage of the
+fact that (in most cases) the inodes of the files will be in the same
+compressed metadata block, and therefore, can share the start block.
+Directories are therefore organised in a two level list, a directory
+header containing the shared start block value, and a sequence of directory
+entries, each of which share the shared start block.  A new directory header
+is written once/if the inode start block changes.  The directory
+header/directory entry list is repeated as many times as necessary.
+Directories are sorted, and can contain a directory index to speed up
+file lookup.  Directory indexes store one entry per metablock, each entry
+storing the index/filename mapping to the first directory header
+in each metadata block.  Directories are sorted in alphabetical order,
+and at lookup the index is scanned linearly looking for the first filename
+alphabetically larger than the filename being looked up.  At this point the
+location of the metadata block the filename is in has been found.
+The general idea of the index is to ensure only one metadata block needs to be
+decompressed to do a lookup irrespective of the length of the directory.
+This scheme has the advantage that it doesn't require extra memory overhead
+and doesn't require much extra storage on disk.
+3.3 File data
+-------------
+Regular files consist of a sequence of contiguous compressed blocks, and/or a
+compressed fragment block (tail-end packed block).   The compressed size
+of each datablock is stored in a block list contained within the
+file inode.
+To speed up access to datablocks when reading 'large' files (256 Mbytes or
+larger), the code implements an index cache that caches the mapping from
+block index to datablock location on disk.
+The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+retaining a simple and space-efficient block list on disk.  The cache
+is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+The index cache is designed to be memory efficient, and by default uses
+16 KiB.
+3.4 Fragment lookup table
+-------------------------
+Regular files can contain a fragment index which is mapped to a fragment
+location on disk and compressed size using a fragment lookup table.  This
+fragment lookup table is itself stored compressed into metadata blocks.
+A second index table is used to locate these.  This second index table for
+speed of access (and because it is small) is read at mount time and cached
+in memory.
+3.5 Uid/gid lookup table
+------------------------
+For space efficiency regular files store uid and gid indexes, which are
+converted to 32-bit uids/gids using an id look up table.  This table is
+stored compressed into metadata blocks.  A second index table is used to
+locate these.  This second index table for speed of access (and because it
+is small) is read at mount time and cached in memory.
+3.6 Export table
+----------------
+To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems
+can optionally (disabled with the -no-exports Mksquashfs option) contain
+an inode number to inode disk location lookup table.  This is required to
+enable Squashfs to map inode numbers passed in filehandles to the inode
+location on disk, which is necessary when the export code reinstantiates
+expired/flushed inodes.
+This table is stored compressed into metadata blocks.  A second index table is
+used to locate these.  This second index table for speed of access (and because
+it is small) is read at mount time and cached in memory.
+4. TODOS AND OUTSTANDING ISSUES
+-------------------------------
+4.1 Todo list
+-------------
+Implement Xattr and ACL support.  The Squashfs 4.0 filesystem layout has hooks
+for these but the code has not been written.  Once the code has been written
+the existing layout should not require modification.
+4.2 Squashfs internal cache
+---------------------------
+Blocks in Squashfs are compressed.  To avoid repeatedly decompressing
+recently accessed data Squashfs uses two small metadata and fragment caches.
+The cache is not used for file datablocks, these are decompressed and cached in
+the page-cache in the normal way.  The cache is used to temporarily cache
+fragment and metadata blocks which have been read as a result of a metadata
+(i.e. inode or directory) or fragment access.  Because metadata and fragments
+are packed together into blocks (to gain greater compression) the read of a
+particular piece of metadata or fragment will retrieve other metadata/fragments
+which have been packed with it; because of locality of reference, these may be
+read in the near future. Temporarily caching them ensures they are available
+for near future access without requiring an additional read and decompress.
+In the future this internal cache may be replaced with an implementation which
+uses the kernel page cache.  Because the page cache operates on page sized
+units this may introduce additional complexity in terms of locking and
+associated race conditions.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ef19afa186a9..deeeed0faa8f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -210,8 +210,8 @@ struct super_operations {
        void (*put_super) (struct super_block *);
        void (*write_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
-        void (*write_super_lockfs) (struct super_block *);
+        int (*freeze_fs) (struct super_block *);
-        void (*unlockfs) (struct super_block *);
+        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*clear_inode) (struct inode *);
@@ -270,11 +270,11 @@ or bottom half).
        a superblock. The second parameter indicates whether the method
        should wait until the write out has been completed. Optional.
-  write_super_lockfs: called when VFS is locking a filesystem and
+  freeze_fs: called when VFS is locking a filesystem and
        forcing it into a consistent state.  This method is currently
        used by the Logical Volume Manager (LVM).
-  unlockfs: called when VFS is unlocking a filesystem and making it writable
+  unfreeze_fs: called when VFS is unlocking a filesystem and making it writable
        again.
  statfs: called when the VFS needs to get filesystem statistics. This