29 files changed, 609 insertions, 119 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index f15621ee5599..3bae418c6ad3 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -1,7 +1,5 @@
 00-INDEX
        - this file (info on some of the filesystems supported by linux).
-Exporting
-        - explanation of how to make filesystems exportable.
 Locking
        - info on locking rules as they pertain to Linux VFS.
 9p.txt
@@ -34,8 +32,12 @@ dlmfs.txt
        - info on the userspace interface to the OCFS2 DLM.
 dnotify.txt
        - info about directory notification in Linux.
+dnotify_test.c
+        - example program for dnotify
 ecryptfs.txt
        - docs on eCryptfs: stacked cryptographic filesystem for Linux.
+exofs.txt
+        - info, usage, mount options, design about EXOFS.
 ext2.txt
        - info, mount options and specifications for the Ext2 filesystem.
 ext3.txt
@@ -62,16 +64,14 @@ jfs.txt
        - info and mount options for the JFS filesystem.
 locks.txt
        - info on file locking implementations, flock() vs. fcntl(), etc.
+logfs.txt
+        - info on the LogFS flash filesystem.
 mandatory-locking.txt
        - info on the Linux implementation of Sys V mandatory file locking.
 ncpfs.txt
        - info on Novell Netware(tm) filesystem using NCP protocol.
-nfs41-server.txt
+nfs/
-        - info on the Linux server implementation of NFSv4 minor version 1.
+        - nfs-related documentation.
-nfs-rdma.txt
-        - how to install and setup the Linux NFS/RDMA client and server software.
-nfsroot.txt
-        - short guide on setting up a diskless box with NFS root filesystem.
 nilfs2.txt
        - info and mount options for the NILFS2 filesystem.
 ntfs.txt
@@ -90,8 +90,6 @@ relay.txt
        - info on relay, for efficient streaming from kernel to user space.
 romfs.txt
        - description of the ROMFS filesystem.
-rpc-cache.txt
-        - introduction to the caching mechanisms in the sunrpc layer.
 seq_file.txt
        - how to use the seq_file API
 sharedsubtree.txt
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca0630..06bbbed71206 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -460,13 +460,6 @@ in sys_read() and friends.
 --------------------------- dquot_operations -------------------------------
 prototypes:
-        int (*initialize) (struct inode *, int);
-        int (*drop) (struct inode *);
-        int (*alloc_space) (struct inode *, qsize_t, int);
-        int (*alloc_inode) (const struct inode *, unsigned long);
-        int (*free_space) (struct inode *, qsize_t);
-        int (*free_inode) (const struct inode *, unsigned long);
-        int (*transfer) (struct inode *, struct iattr *);
        int (*write_dquot) (struct dquot *);
        int (*acquire_dquot) (struct dquot *);
        int (*release_dquot) (struct dquot *);
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
 What filesystem should expect from the generic quota functions:
                FS recursion    Held locks when called
-initialize:     yes             maybe dqonoff_sem
-drop:           yes             -
-alloc_space:    ->mark_dirty()  -
-alloc_inode:    ->mark_dirty()  -
-free_space:     ->mark_dirty()  -
-free_inode:     ->mark_dirty()  -
-transfer:       yes             -
 write_dquot:    yes             dqonoff_sem or dqptr_sem
 acquire_dquot:  yes             dqonoff_sem or dqptr_sem
 release_dquot:  yes             dqonoff_sem or dqptr_sem
@@ -495,10 +481,6 @@ write_info:	yes		dqonoff_sem
 FS recursion means calling ->quota_read() and ->quota_write() from superblock
 operations.
->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
 More details about quota locking can be found in fs/dquot.c.
 --------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile
new file mode 100644
index 000000000000..a5dd114da14f
--- /dev/null
+++ b/Documentation/filesystems/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+# List of programs to build
+hostprogs-y := dnotify_test
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 9e94b9491d89..a91e2e2095b0 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -235,6 +235,7 @@ proc files.
                neg=N   Number of negative lookups made
                pos=N   Number of positive lookups made
                crt=N   Number of objects created by lookup
+                tmo=N   Number of lookups timed out and requeued
        Updates n=N     Number of update cookie requests seen
                nul=N   Number of upd reqs given a NULL parent
                run=N   Number of upd reqs granted CPU time
@@ -250,8 +251,10 @@ proc files.
                ok=N    Number of successful alloc reqs
                wt=N    Number of alloc reqs that waited on lookup completion
                nbf=N   Number of alloc reqs rejected -ENOBUFS
+                int=N   Number of alloc reqs aborted -ERESTARTSYS
                ops=N   Number of alloc reqs submitted
                owt=N   Number of alloc reqs waited for CPU time
+                abt=N   Number of alloc reqs aborted due to object death
        Retrvls n=N     Number of retrieval (read) requests seen
                ok=N    Number of successful retr reqs
                wt=N    Number of retr reqs that waited on lookup completion
@@ -261,6 +264,7 @@ proc files.
                oom=N   Number of retr reqs failed -ENOMEM
                ops=N   Number of retr reqs submitted
                owt=N   Number of retr reqs waited for CPU time
+                abt=N   Number of retr reqs aborted due to object death
        Stores  n=N     Number of storage (write) requests seen
                ok=N    Number of successful store reqs
                agn=N   Number of store reqs on a page already pending storage
@@ -268,12 +272,37 @@ proc files.
                oom=N   Number of store reqs failed -ENOMEM
                ops=N   Number of store reqs submitted
                run=N   Number of store reqs granted CPU time
+                pgs=N   Number of pages given store req processing time
+                rxd=N   Number of store reqs deleted from tracking tree
+                olm=N   Number of store reqs over store limit
+        VmScan  nos=N   Number of release reqs against pages with no pending store
+                gon=N   Number of release reqs against pages that had been stored by the time the lock was granted
+                bsy=N   Number of release reqs ignored due to in-progress store
+                can=N   Number of page stores cancelled due to release req
        Ops     pend=N  Number of times async ops added to pending queues
                run=N   Number of times async ops given CPU time
                enq=N   Number of times async ops queued for processing
+                can=N   Number of async ops cancelled
+                rej=N   Number of async ops rejected due to object lookup/create failure
                dfr=N   Number of async ops queued for deferred release
                rel=N   Number of async ops released
                gc=N    Number of deferred-release async ops garbage collected
+        CacheOp alo=N   Number of in-progress alloc_object() cache ops
+                luo=N   Number of in-progress lookup_object() cache ops
+                luc=N   Number of in-progress lookup_complete() cache ops
+                gro=N   Number of in-progress grab_object() cache ops
+                upo=N   Number of in-progress update_object() cache ops
+                dro=N   Number of in-progress drop_object() cache ops
+                pto=N   Number of in-progress put_object() cache ops
+                syn=N   Number of in-progress sync_cache() cache ops
+                atc=N   Number of in-progress attr_changed() cache ops
+                rap=N   Number of in-progress read_or_alloc_page() cache ops
+                ras=N   Number of in-progress read_or_alloc_pages() cache ops
+                alp=N   Number of in-progress allocate_page() cache ops
+                als=N   Number of in-progress allocate_pages() cache ops
+                wrp=N   Number of in-progress write_page() cache ops
+                ucp=N   Number of in-progress uncache_page() cache ops
+                dsp=N   Number of in-progress dissociate_pages() cache ops
 (*) /proc/fs/fscache/histogram
@@ -299,6 +328,87 @@ proc files.
     jiffy range covered, and the SECS field the equivalent number of seconds.
+===========
+OBJECT LIST
+===========
+If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
+list of all the objects currently allocated and allow them to be viewed
+through:
+        /proc/fs/fscache/objects
+This will look something like:
+        [root@andromeda ~]# head /proc/fs/fscache/objects
+        OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA       OBJECT_KEY, AUX_DATA
+        ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
+           17e4b        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+           1693a        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+where the first set of columns before the '|' describe the object:
+        COLUMN  DESCRIPTION
+        ======= ===============================================================
+        OBJECT  Object debugging ID (appears as OBJ%x in some debug messages)
+        PARENT  Debugging ID of parent object
+        STAT    Object state
+        CHLDN   Number of child objects of this object
+        OPS     Number of outstanding operations on this object
+        OOP     Number of outstanding child object management operations
+        IPR
+        EX      Number of outstanding exclusive operations
+        READS   Number of outstanding read operations
+        EM      Object's event mask
+        EV      Events raised on this object
+        F       Object flags
+        S       Object slow-work work item flags
+and the second set of columns describe the object's cookie, if present:
+        COLUMN          DESCRIPTION
+        =============== =======================================================
+        NETFS_COOKIE_DEF Name of netfs cookie definition
+        TY              Cookie type (IX - index, DT - data, hex - special)
+        FL              Cookie flags
+        NETFS_DATA      Netfs private data stored in the cookie
+        OBJECT_KEY      Object key      } 1 column, with separating comma
+        AUX_DATA        Object aux data } presence may be configured
+The data shown may be filtered by attaching a key to an appropriate keyring
+before viewing the file.  Something like:
+                keyctl add user fscache:objlist <restrictions> @s
+where <restrictions> are a selection of the following letters:
+        K       Show hexdump of object key (don't show if not given)
+        A       Show hexdump of object aux data (don't show if not given)
+and the following paired letters:
+        C       Show objects that have a cookie
+        c       Show objects that don't have a cookie
+        B       Show objects that are busy
+        b       Show objects that aren't busy
+        W       Show objects that have pending writes
+        w       Show objects that don't have pending writes
+        R       Show objects that have outstanding reads
+        r       Show objects that don't have outstanding reads
+        S       Show objects that have slow work queued
+        s       Show objects that don't have slow work queued
+If neither side of a letter pair is given, then both are implied.  For example:
+        keyctl add user fscache:objlist KB @s
+shows objects that are busy, and lists their object keys, but does not dump
+their auxiliary data.  It also implies "CcWwRrSs", but as 'B' is given, 'b' is
+not implied.
+By default all objects and all fields will be shown.
 =========
 DEBUGGING
 =========
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 2666b1ed5e9e..1902c57b72ef 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -641,7 +641,7 @@ data file must be retired (see the relinquish cookie function below).
 Furthermore, note that this does not cancel the asynchronous read or write
 operation started by the read/alloc and write functions, so the page
-invalidation and release functions must use:
+invalidation functions must use:
        bool fscache_check_page_write(struct fscache_cookie *cookie,
                                      struct page *page);
@@ -654,6 +654,25 @@ to see if a page is being written to the cache, and:
 to wait for it to finish if it is.
+When releasepage() is being implemented, a special FS-Cache function exists to
+manage the heuristics of coping with vmscan trying to eject pages, which may
+conflict with the cache trying to write pages to the cache (which may itself
+need to allocate memory):
+        bool fscache_maybe_release_page(struct fscache_cookie *cookie,
+                                        struct page *page,
+                                        gfp_t gfp);
+This takes the netfs cookie, and the page and gfp arguments as supplied to
+releasepage().  It will return false if the page cannot be released yet for
+some reason and if it returns true, the page has been uncached and can now be
+released.
+To make a page available for release, this function may wait for an outstanding
+storage request to complete, or it may attempt to cancel the storage request -
+in which case the page will not be stored in the cache this time.
 ==========================
 INDEX AND DATA FILE UPDATE
 ==========================
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
index 4c0c575a4012..79334ed5daa7 100644
--- a/Documentation/filesystems/dentry-locking.txt
+++ b/Documentation/filesystems/dentry-locking.txt
@@ -62,7 +62,8 @@ changes are :
 2. Insertion of a dentry into the hash table is done using
   hlist_add_head_rcu() which take care of ordering the writes - the
   writes to the dentry must be visible before the dentry is
-   inserted. This works in conjunction with hlist_for_each_rcu() while
+   inserted. This works in conjunction with hlist_for_each_rcu(),
+   which has since been replaced by hlist_for_each_entry_rcu(), while
   walking the hash chain. The only requirement is that all
   initialization to the dentry must be done before
   hlist_add_head_rcu() since we don't have dcache_lock protection
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt
index 9f5d338ddbb8..6baf88f46859 100644
--- a/Documentation/filesystems/dnotify.txt
+++ b/Documentation/filesystems/dnotify.txt
@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
 Example
 -------
+See Documentation/filesystems/dnotify_test.c for an example.
-        #define _GNU_SOURCE     /* needed to get the defines */
+NOTE
-        #include <fcntl.h>      /* in glibc 2.2 this has the needed
+----
-                                           values defined */
+Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
-        #include <signal.h>
+See Documentation/filesystems/inotify.txt for more information on it.
-        #include <stdio.h>
-        #include <unistd.h>
-        static volatile int event_fd;
-        static void handler(int sig, siginfo_t *si, void *data)
-        {
-                event_fd = si->si_fd;
-        }
-        int main(void)
-        {
-                struct sigaction act;
-                int fd;
-                act.sa_sigaction = handler;
-                sigemptyset(&act.sa_mask);
-                act.sa_flags = SA_SIGINFO;
-                sigaction(SIGRTMIN + 1, &act, NULL);
-                fd = open(".", O_RDONLY);
-                fcntl(fd, F_SETSIG, SIGRTMIN + 1);
-                fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
-                /* we will now be notified if any of the files
-                   in "." is modified or new files are created */
-                while (1) {
-                        pause();
-                        printf("Got event on fd=%d\n", event_fd);
-                }
-        }
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c
new file mode 100644
index 000000000000..8b37b4a1e18d
--- /dev/null
+++ b/Documentation/filesystems/dnotify_test.c
@@ -0,0 +1,34 @@
+#define _GNU_SOURCE     /* needed to get the defines */
+#include <fcntl.h>      /* in glibc 2.2 this has the needed
+                                   values defined */
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+static volatile int event_fd;   /* last fd reported to handler(); printed by main() */
+static void handler(int sig, siginfo_t *si, void *data) /* installed through act.sa_sigaction in main() */
+{
+        event_fd = si->si_fd;   /* record the descriptor carried in the siginfo; sig and data are unused */
+}
+int main(void)  /* dnotify example: watch "." and print the fd of each event; no return value is checked */
+{
+        struct sigaction act;
+        int fd;
+        act.sa_sigaction = handler;
+        sigemptyset(&act.sa_mask);
+        act.sa_flags = SA_SIGINFO;      /* select the three-argument sa_sigaction handler form */
+        sigaction(SIGRTMIN + 1, &act, NULL);
+        fd = open(".", O_RDONLY);       /* the directory to be watched */
+        fcntl(fd, F_SETSIG, SIGRTMIN + 1);      /* same signal number as registered with sigaction() above */
+        fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
+        /* we will now be notified if any of the files
+           in "." is modified or new files are created */
+        while (1) {     /* never exits; the program runs until killed */
+                pause();        /* block until a signal is delivered */
+                printf("Got event on fd=%d\n", event_fd);
+        }
+}
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
index 0ced74c2f73c..abd2a9b5b787 100644
--- a/Documentation/filesystems/exofs.txt
+++ b/Documentation/filesystems/exofs.txt
@@ -60,13 +60,13 @@ USAGE
   mkfs.exofs --pid=65536 --format /dev/osd0
-   The --format is optional if not specified no OSD_FORMAT will be
+   The --format is optional. If not specified, no OSD_FORMAT will be
-   preformed and a clean file system will be created in the specified pid,
+   performed and a clean file system will be created in the specified pid,
   in the available space of the target. (Use --format=size_in_meg to limit
   the total LUN space available)
-   If pid already exist it will be deleted and a new one will be created in it's
+   If pid already exists, it will be deleted and a new one will be created in
-   place. Be careful.
+   its place. Be careful.
   An exofs lives inside a single OSD partition. You can create multiple exofs
   filesystems on the same device using multiple pids.
@@ -81,7 +81,7 @@ USAGE
 7. For reference (See do-exofs example script):
        do-exofs start - an example of how to perform the above steps.
-        do-exofs stop -  an example of how to unmount the file system.
+        do-exofs stop - an example of how to unmount the file system.
        do-exofs format - an example of how to format and mkfs a new exofs.
 8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
@@ -104,8 +104,8 @@ Where:
    exofs specific options: Options are separated by commas (,)
                pid=<integer> - The partition number to mount/create as
                                container of the filesystem.
-                                This option is mandatory
+                                This option is mandatory.
-                to=<integer>  - Timeout in ticks for a single command
+                to=<integer>  - Timeout in ticks for a single command.
                                default is (60 * HZ) [for debugging only]
 ===============================================================================
@@ -116,7 +116,7 @@ DESIGN
  with a special ID (defined in common.h).
  Information included in the file system control block is used to fill the
  in-memory superblock structure at mount time. This object is created before
-  the file system is used by mkexofs.c It contains information such as:
+  the file system is used by mkexofs.c. It contains information such as:
        - The file system's magic number
        - The next inode number to be allocated
@@ -134,8 +134,8 @@ DESIGN
  attributes. This applies to both regular files and other types (directories,
  device files, symlinks, etc.).
-* Credentials are generated per object (inode and superblock) when they is
+* Credentials are generated per object (inode and superblock) when they are
-  created in memory (read off disk or created). The credential works for all
+  created in memory (read from disk or created). The credential works for all
  operations and is used as long as the object remains in memory.
 * Async OSD operations are used whenever possible, but the target may execute
@@ -145,7 +145,8 @@ DESIGN
  from executing in reverse order:
        - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
          flags. OBJ_CREATED is set when we know the object exists on the OSD -
-          in create's callback function, and when we successfully do a read_inode.
+          in create's callback function, and when we successfully do a
+          read_inode.
          OBJ_2BCREATED is set in the beginning of the create function, so we
          know that we should wait.
                - create/delete: delete should wait until the object is created
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 570f9bd9be2b..867c5b50cb42 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -32,8 +32,8 @@ journal_dev=devnum	When the external journal device's major/minor numbers
                        identified through its new major/minor numbers encoded
                        in devnum.
-noload                  Don't load the journal on mounting. Note that this forces
+norecovery              Don't load the journal on mounting. Note that this forces
-                        mount of inconsistent filesystem, which can lead to
+noload                  mount of inconsistent filesystem, which can lead to
                        various problems.
 data=journal            All data are committed into the journal prior to being
@@ -123,10 +123,18 @@ resuid=n		The user ID which may use the reserved blocks.
 sb=n                    Use alternate superblock at this location.
-quota
+quota                   These options are ignored by the filesystem. They
-noquota
+noquota                 are used only by quota tools to recognize volumes
-grpquota
+grpquota                where quota should be turned on. See documentation
-usrquota
+usrquota                in the quota-tools package for more details
+                        (http://sourceforge.net/projects/linuxquota).
+jqfmt=<quota type>      These options tell filesystem details about quota
+usrjquota=<file>        so that quota information can be properly updated
+grpjquota=<file>        during journal replay. They replace the above
+                        quota options. See documentation in the quota-tools
+                        package for more details
+                        (http://sourceforge.net/projects/linuxquota).
 bh              (*)     ext3 associates buffer heads to data pages to
 nobh                    (a) cache disk block mapping information
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 18b5ec8cea45..e1def1786e50 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -134,9 +134,15 @@ ro                   	Mount filesystem read only. Note that ext4 will
                        mount options "ro,noload" can be used to prevent
                        writes to the filesystem.
+journal_checksum        Enable checksumming of the journal transactions.
+                        This will allow the recovery code in e2fsck and the
+                        kernel to detect corruption in the journal.  It is a
+                        compatible change and will be ignored by older kernels.
 journal_async_commit    Commit block can be written to disk without waiting
                        for descriptor blocks. If enabled older kernels cannot
-                        mount the device.
+                        mount the device. This will enable 'journal_checksum'
+                        internally.
 journal=update          Update the ext4 file system's journal to the current
                        format.
@@ -147,8 +153,8 @@ journal_dev=devnum	When the external journal device's major/minor numbers
                        identified through its new major/minor numbers encoded
                        in devnum.
-noload                  Don't load the journal on mounting.  Note that
+norecovery              Don't load the journal on mounting.  Note that
-                        if the filesystem was not unmounted cleanly,
+noload                  if the filesystem was not unmounted cleanly,
                        skipping the journal replay will lead to the
                        filesystem containing inconsistencies that can
                        lead to any number of problems.
@@ -190,7 +196,7 @@ nobarrier		This also requires an IO stack which can support
                        also be used to enable or disable barriers, for
                        consistency with other ext4 mount options.
-inode_readahead=n       This tuning parameter controls the maximum
+inode_readahead_blks=n  This tuning parameter controls the maximum
                        number of inode table blocks that ext4's inode
                        table readahead algorithm will pre-read into
                        the buffer cache.  The default value is 32 blocks.
@@ -282,9 +288,16 @@ stripe=n		Number of filesystem blocks that mballoc will try
                        to use for allocation size and alignment. For RAID5/6
                        systems this should be the number of data
                        disks *  RAID chunk size in file system blocks.
-delalloc        (*)     Deferring block allocation until write-out time.
-nodelalloc              Disable delayed allocation. Blocks are allocation
+delalloc        (*)     Defer block allocation until just before ext4
-                        when data is copied from user to page cache.
+                        writes out the block(s) in question.  This
+                        allows ext4 to make better allocation decisions
+                        more efficiently.
+nodelalloc              Disable delayed allocation.  Blocks are allocated
+                        when the data is copied from userspace to the
+                        page cache, either via the write(2) system call
+                        or when an mmap'ed page which was previously
+                        unallocated is written for the first time.
 max_batch_time=usec     Maximum amount of time ext4 should wait for
                        additional filesystem operations to be batch
@@ -340,6 +353,12 @@ noauto_da_alloc		replacing existing files via patterns such as
                        system crashes before the delayed allocation
                        blocks are forced to disk.
+discard         Controls whether ext4 should issue discard/TRIM
+nodiscard(*)            commands to the underlying block device when
+                        blocks are freed.  This is useful for SSD devices
+                        and sparse/thinly-provisioned LUNs, but it is off
+                        by default until sufficient testing has been done.
 Data Mode
 =========
 There are 3 different data modes:
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 000000000000..e64c94ba401a
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
+The LogFS Flash Filesystem
+==========================
+Specification
+=============
+Superblocks
+-----------
+Two superblocks exist at the beginning and end of the filesystem.
+Each superblock is 256 Bytes large, with another 3840 Bytes reserved
+for future purposes, making a total of 4096 Bytes.
+Superblock locations may differ for MTD and block devices.  On MTD the
+first non-bad block contains a superblock in the first 4096 Bytes and
+the last non-bad block contains a superblock in the last 4096 Bytes.
+On block devices, the first 4096 Bytes of the device contain the first
+superblock and the last aligned 4096 Byte-block contains the second
+superblock.
+For the most part, the superblocks can be considered read-only.  They
+are written only to correct errors detected within the superblocks,
+move the journal and change the filesystem parameters through tunefs.
+As a result, the superblock does not contain any fields that require
+constant updates, like the amount of free space, etc.
+Segments
+--------
+The space in the device is split up into equal-sized segments.
+Segments are the primary write unit of LogFS.  Within each segment,
+writes happen from front (low addresses) to back (high addresses).  If
+only a partial segment has been written, the segment number, the
+current position within and optionally a write buffer are stored in
+the journal.
+Segments are erased as a whole.  Therefore Garbage Collection may be
+required to completely free a segment before doing so.
+Journal
+--------
+The journal contains all global information about the filesystem that
+is subject to frequent change.  At mount time, it has to be scanned
+for the most recent commit entry, which contains a list of pointers to
+all currently valid entries.
+Object Store
+------------
+All space except for the superblocks and journal is part of the object
+store.  Each segment contains a segment header and a number of
+objects, each consisting of the object header and the payload.
+Objects are either inodes, directory entries (dentries), file data
+blocks or indirect blocks.
+Levels
+------
+Garbage collection (GC) may fail if all data is written
+indiscriminately.  One requirement of GC is that data is separated
+roughly according to the distance between the tree root and the data.
+Effectively that means all file data is on level 0, indirect blocks
+are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
+respectively.  Inode file data is on level 6 for the inodes and 7-11
+for indirect blocks.
+Each segment contains objects of a single level only.  As a result,
+each level requires its own separate segment to be open for writing.
+Inode File
+----------
+All inodes are stored in a special file, the inode file.  Single
+exception is the inode file's inode (master inode) which for obvious
+reasons is stored in the journal instead.  Instead of data blocks, the
+leaf nodes of the inode files are inodes.
+Aliases
+-------
+Writes in LogFS are done by means of a wandering tree.  A naïve
+implementation would require that for each write of a block, all
+parent blocks are written as well, since the block pointers have
+changed.  Such an implementation would not be very efficient.
+In LogFS, the block pointer changes are cached in the journal by means
+of alias entries.  Each alias consists of its logical address - inode
+number, block index, level and child number (index into block) - and
+the changed data.  Any 8-byte word can be changed in this manner.
+Currently aliases are used for block pointers, file size, file used
+bytes and the height of an inode's indirect tree.
+Segment Aliases
+---------------
+Related to regular aliases, these are used to handle bad blocks.
+Initially, bad blocks are handled by moving the affected segment
+content to a spare segment and noting this move in the journal with a
+segment alias, a simple (to, from) tuple.  GC will later empty this
+segment and the alias can be removed again.  This is used on MTD only.
+Vim
+---
+By cleverly predicting the life time of data, it is possible to
+separate long-living data from short-living data and thereby reduce
+the GC overhead later.  Each type of distinct life expectancy (vim) can
+have a separate segment open for writing.  Each (level, vim) tuple can
+be open just once.  If an open segment with unknown vim is encountered
+at mount time, it is closed and ignored henceforth.
+Indirect Tree
+-------------
+Inodes in LogFS are similar to FFS-style filesystems with direct and
+indirect block pointers.  One difference is that LogFS uses a single
+indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
+A height field in the inode defines the height of the indirect tree
+and thereby the indirection of the pointer.
+Another difference is the addressing of indirect blocks.  In LogFS,
+the first 16 pointers in the first indirect block are left empty,
+corresponding to the 16 direct pointers in the inode.  In ext2 (maybe
+others as well) the first pointer in the first indirect block
+corresponds to logical block 12, skipping the 12 direct pointers.
+So where ext2 is using arithmetic to better utilize space, LogFS keeps
+arithmetic simple and uses compression to save space.
+Compression
+-----------
+Both file data and metadata can be compressed.  Compression for file
+data can be enabled with chattr +c and disabled with chattr -c.  Doing
+so has no effect on existing data, but new data will be stored
+accordingly.  New inodes will inherit the compression flag of the
+parent directory.
+Metadata is always compressed.  However, the space accounting ignores
+this and charges for the uncompressed size.  Failing to do so could
+result in GC failures when, after moving some data, indirect blocks
+compress worse than previously.  Even on a 100% full medium, GC may
+not consume any extra space, so the compression gains are lost space
+to the user.
+However, they are not lost space to the filesystem internals.  By
+cheating the user for those bytes, the filesystem gained some slack
+space and GC will run less often and faster.
+Garbage Collection and Wear Leveling
+------------------------------------
+Garbage collection is invoked whenever the number of free segments
+falls below a threshold.  The best (known) candidate is picked based
+on the least amount of valid data contained in the segment.  All
+remaining valid data is copied elsewhere, thereby invalidating it.
+The GC code also checks for aliases and writes them back if their
+number gets too large.
+Wear leveling is done by occasionally picking a suboptimal segment for
+garbage collection.  If a stale segment's erase count is significantly
+lower than the active segments' erase counts, it will be picked.  Wear
+leveling is rate limited, so it will never monopolize the device for
+more than one segment worth at a time.
+Values for "occasionally", "significantly lower" are compile time
+constants.
+Hashed directories
+------------------
+To satisfy efficient lookup(), directory entries are hashed and
+located based on the hash.  In order to both support large directories
+and not be overly inefficient for small directories, several hash
+tables of increasing size are used.  For each table, the hash value
+modulo the table size gives the table index.
+Table sizes are chosen to limit the number of indirect blocks with a
+fully populated table to 0, 1, 2 or 3 respectively.  So the first
+table contains 16 entries, the second 512-16, etc.
+The last table is special in several ways.  First its size depends on
+the effective 32bit limit on telldir/seekdir cookies.  Since logfs
+uses the upper half of the address space for indirect blocks, the size
+is limited to 2^31.  Secondly the table contains hash buckets with 16
+entries each.
+Using single-entry buckets would result in birthday "attacks".  At
+just 2^16 used entries, hash collisions would be likely (P >= 0.5).
+My math skills are insufficient to do the combinatorics for the 17x
+collisions necessary to overflow a bucket, but testing showed that in
+10,000 runs the lowest directory fill before a bucket overflow was
+188,057,130 entries with an average of 315,149,915 entries.  So for
+directory sizes of up to a million, bucket overflows should be
+virtually impossible under normal circumstances.
+With carefully chosen filenames, it is obviously possible to cause an
+overflow with just 21 entries (4 higher tables + 16 entries + 1).  So
+there may be a security concern if a malicious user has write access
+to a directory.
+Open For Discussion
+===================
+Device Address Space
+--------------------
+A device address space is used for caching.  Both block devices and
+MTD provide functions to either read a single page or write a segment.
+Partial segments may be written for data integrity, but where possible
+complete segments are written for performance on simple block device
+flash media.
+Meta Inodes
+-----------
+Inodes are stored in the inode file, which is just a regular file for
+most purposes.  At umount time, however, the inode file needs to
+remain open until all dirty inodes are written.  So
+generic_shutdown_super() may not close this inode, but shouldn't
+complain about remaining inodes due to the inode file either.  Same
+goes for mapping inode of the device address space.
+Currently logfs uses a hack that essentially copies part of fs/inode.c
+code over.  A general solution would be preferred.
+Indirect block mapping
+----------------------
+With compression, the block device (or mapping inode) cannot be used
+to cache indirect blocks.  Some other place is required.  Currently
+logfs uses the top half of each inode's address space.  The low 8TB
+(on 32bit) are filled with file data, the high 8TB are used for
+indirect blocks.
+One problem is that 16TB files created on 64bit systems actually have
+data in the top 8TB.  But files >16TB would cause problems anyway, so
+only the limit has changed.
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
new file mode 100644
index 000000000000..2f68cd688769
--- /dev/null
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+        - this file (nfs-related documentation).
+Exporting
+        - explanation of how to make filesystems exportable.
+knfsd-stats.txt
+        - statistics which the NFS server makes available to user space.
+nfs.txt
+        - nfs client, and DNS resolution for fs_locations.
+nfs41-server.txt
+        - info on the Linux server implementation of NFSv4 minor version 1.
+nfs-rdma.txt
+        - how to install and setup the Linux NFS/RDMA client and server software.
+nfsroot.txt
+        - short guide on setting up a diskless box with NFS root filesystem.
+rpc-cache.txt
+        - introduction to the caching mechanisms in the sunrpc layer.
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/nfs/Exporting
index 87019d2b5981..87019d2b5981 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/nfs/knfsd-stats.txt
index 64ced5149d37..64ced5149d37 100644
--- a/Documentation/filesystems/knfsd-stats.txt
+++ b/Documentation/filesystems/nfs/knfsd-stats.txt
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index e386f7e4bcee..e386f7e4bcee 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs/nfs.txt
index f50f26ce6cd0..f50f26ce6cd0 100644
--- a/Documentation/filesystems/nfs.txt
+++ b/Documentation/filesystems/nfs/nfs.txt
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 5920fe26e6ff..6a53a84afc72 100644
--- a/Documentation/filesystems/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
 on or off; rpc.nfsd does this correctly.)
 The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
-on the latest NFSv4.1 Internet Draft:
+on RFC 5661.
-http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
 From the many new features in NFSv4.1 the current implementation
 focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@@ -41,10 +40,10 @@ interoperability problems with future clients.  Known issues:
          conformant with the spec (for example, we don't use kerberos
          on the backchannel correctly).
        - no trunking support: no clients currently take advantage of
-          trunking, but this is a mandatory failure, and its use is
+          trunking, but this is a mandatory feature, and its use is
          recommended to clients in a number of places.  (E.g. to ensure
          timely renewal in case an existing connection's retry timeouts
-          have gotten too long; see section 8.3 of the draft.)
+          have gotten too long; see section 8.3 of the RFC.)
          Therefore, lack of this feature may cause future clients to
          fail.
        - Incomplete backchannel support: incomplete backchannel gss
@@ -213,3 +212,10 @@ The following cases aren't supported yet:
  DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
 * DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+Nonstandard compound limitations:
+* No support for a sessions fore channel RPC compound that requires both a
+  ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
+  fail to live up to the promise we made in CREATE_SESSION fore channel
+  negotiation.
+* No more than one IO operation (read, write, readdir) allowed per
+  compound.
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 3ba0b945aaf8..3ba0b945aaf8 100644
--- a/Documentation/filesystems/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
diff --git a/Documentation/filesystems/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index 8a382bea6808..8a382bea6808 100644
--- a/Documentation/filesystems/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 01539f410676..cf6d0d85ca82 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -28,7 +28,7 @@ described in the man pages included in the package.
 Project web page:    http://www.nilfs.org/en/
 Download page:       http://www.nilfs.org/en/download.html
 Git tree web page:   http://www.nilfs.org/git/
-NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users
+List info:           http://vger.kernel.org/vger-lists.html#linux-nilfs
 Caveats
 =======
@@ -49,8 +49,7 @@ Mount options
 NILFS2 supports the following mount options:
 (*) == default
-barrier=on(*)           This enables/disables barriers. barrier=off disables
+nobarrier               Disables barriers.
-                        it, barrier=on enables it.
 errors=continue(*)      Keep going on a filesystem error.
 errors=remount-ro       Remount the filesystem read-only on an error.
 errors=panic            Panic and halt the machine if an error occurs.
@@ -71,6 +70,13 @@ order=strict		Apply strict in-order semantics that preserves sequence
                        blocks.  That means, it is guaranteed that no
                        overtaking of events occurs in the recovered file
                        system after a crash.
+norecovery              Disable recovery of the filesystem on mount.
+                        This disables every write access on the device for
+                        read-only mounts or snapshots.  This option will fail
+                        for r/w mounts on an unclean volume.
+discard                 Issue discard/TRIM commands to the underlying block
+                        device when blocks are freed.  This is useful for SSD
+                        devices and sparse/thinly-provisioned LUNs.
 NILFS2 usage
 ============
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index c2a0871280a0..c58b9f5ba002 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -20,15 +20,16 @@ Lots of code taken from ext3 and other projects.
 Authors in alphabetical order:
 Joel Becker   <joel.becker@oracle.com>
 Zach Brown    <zach.brown@oracle.com>
-Mark Fasheh   <mark.fasheh@oracle.com>
+Mark Fasheh   <mfasheh@suse.com>
 Kurt Hackel   <kurt.hackel@oracle.com>
+Tao Ma        <tao.ma@oracle.com>
 Sunil Mushran <sunil.mushran@oracle.com>
 Manish Singh  <manish.singh@oracle.com>
+Tiger Yang    <tiger.yang@oracle.com>
 Caveats
 =======
 Features which OCFS2 does not support yet:
-        - quotas
        - Directory change notification (F_NOTIFY)
        - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
@@ -70,7 +71,6 @@ commit=nrsec	(*)	Ocfs2 can be told to sync all its data and metadata
                        performance.
 localalloc=8(*)         Allows custom localalloc size in MB. If the value is too
                        large, the fs will silently revert it to the default.
-                        Localalloc is not enabled for local mounts.
 localflocks             This disables cluster aware flock.
 inode64                 Indicates that Ocfs2 is allowed to create inodes at
                        any location in the filesystem, including those which
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 92b888d540a6..a7e9746ee7ea 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -140,7 +140,7 @@ Callers of notify_change() need ->i_mutex now.
 New super_block field "struct export_operations *s_export_op" for
 explicit support for exporting, e.g. via NFS.  The structure is fully
 documented at its declaration in include/linux/fs.h, and in
-Documentation/filesystems/Exporting.
+Documentation/filesystems/nfs/Exporting.
 Briefly it allows for the definition of decode_fh and encode_fh operations
 to encode and decode filehandles, and allows the filesystem to use
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index b5aee7838a00..a4f30faa4f1f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -38,6 +38,7 @@ Table of Contents
  3.3   /proc/<pid>/io - Display the IO accounting fields
  3.4   /proc/<pid>/coredump_filter - Core dump filtering settings
  3.5   /proc/<pid>/mountinfo - Information about mounts
+  3.6   /proc/<pid>/comm  & /proc/<pid>/task/<tid>/comm
 ------------------------------------------------------------------------------
@@ -163,6 +164,7 @@ read the file /proc/PID/status:
  VmExe:        68 kB
  VmLib:      1412 kB
  VmPTE:        20 kb
+  VmSwap:        0 kB
  Threads:        1
  SigQ:   0/28578
  SigPnd: 0000000000000000
@@ -176,7 +178,6 @@ read the file /proc/PID/status:
  CapBnd: ffffffffffffffff
  voluntary_ctxt_switches:        0
  nonvoluntary_ctxt_switches:     1
-  Stack usage:    12 kB
 This shows you nearly the same information you would get if you viewed it with
 the ps  command.  In  fact,  ps  uses  the  proc  file  system  to  obtain its
@@ -188,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3.  The stat file
 contains details information about the process itself.  Its fields are
 explained in Table 1-4.
-Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
+(for SMP CONFIG users)
+For making accounting scalable, RSS related information is handled in an
+asynchronous manner and the value may not be very precise. To see a precise
+snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
+It's slow but very precise.
+Table 1-2: Contents of the status files (as of 2.6.30-rc7)
 ..............................................................................
 Field                       Content
 Name                        filename of the executable
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 VmExe                       size of text segment
 VmLib                       size of shared library code
 VmPTE                       size of page table entries
+ VmSwap                      size of swap usage (the number of referred swapents)
 Threads                     number of threads
 SigQ                        number of signals queued/max. number for queue
 SigPnd                      bitmap of pending signals for the thread
@@ -230,7 +238,6 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 Mems_allowed_list           Same as previous, but in "list format"
 voluntary_ctxt_switches     number of voluntary context switches
 nonvoluntary_ctxt_switches  number of non voluntary context switches
- Stack usage:                stack usage high water mark (round up to page size)
 ..............................................................................
 Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
@@ -431,6 +438,7 @@ Table 1-5: Kernel info in /proc
 modules     List of loaded modules                            
 mounts      Mounted filesystems                               
 net         Networking info (see text)                        
+ pagetypeinfo Additional page allocator information (see text)  (2.5)
 partitions  Table of partitions known to the system           
 pci         Deprecated info of PCI bus (new way -> /proc/bus/pci/,
             decoupled by lspci                                 (2.4)
@@ -585,7 +593,7 @@ Node 0, zone      DMA      0      4      5      4      4      3 ...
 Node 0, zone   Normal      1      0      0      1    101      8 ...
 Node 0, zone  HighMem      2      0      0      1      1      0 ...
-Memory fragmentation is a problem under some workloads, and buddyinfo is a 
+External fragmentation is a problem under some workloads, and buddyinfo is a
 useful tool for helping diagnose these problems.  Buddyinfo will give you a 
 clue as to how big an area you can safely allocate, or why a previous
 allocation failed.
@@ -595,6 +603,48 @@ available.  In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
 ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE 
 available in ZONE_NORMAL, etc... 
+More information relevant to external fragmentation can be found in
+pagetypeinfo.
+> cat /proc/pagetypeinfo
+Page block order: 9
+Pages per block:  512
+Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
+Node    0, zone      DMA, type    Unmovable      0      0      0      1      1      1      1      1      1      1      0
+Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone      DMA, type      Movable      1      1      2      1      2      1      1      0      1      0      2
+Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      1      0
+Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone    DMA32, type    Unmovable    103     54     77      1      1      1     11      8      7      1      9
+Node    0, zone    DMA32, type  Reclaimable      0      0      2      1      0      0      0      0      1      0      0
+Node    0, zone    DMA32, type      Movable    169    152    113     91     77     54     39     13      6      1    452
+Node    0, zone    DMA32, type      Reserve      1      2      2      2      2      0      1      1      1      1      0
+Node    0, zone    DMA32, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+Number of blocks type     Unmovable  Reclaimable      Movable      Reserve      Isolate
+Node 0, zone      DMA            2            0            5            1            0
+Node 0, zone    DMA32           41            6          967            2            0
+Fragmentation avoidance in the kernel works by grouping pages of different
+migrate types into the same contiguous regions of memory called page blocks.
+A page block is typically the size of the default hugepage size e.g. 2MB on
+X86-64. By keeping pages grouped based on their ability to move, the kernel
+can reclaim pages within a page block to satisfy a high-order allocation.
+The pagetypeinfo begins with information on the size of a page block. It
+then gives the same type of information as buddyinfo except broken down
+by migrate-type and finishes with details on how many page blocks of each
+type exist.
+If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
+from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
+make an estimate of the likely number of huge pages that can be allocated
+at a given point in time. All the "Movable" blocks should be allocatable
+unless memory has been mlock()'d. Some of the Reclaimable blocks should
+also be allocatable although a lot of filesystem metadata may have to be
+reclaimed to achieve this.
 ..............................................................................
 meminfo:
@@ -1072,7 +1122,8 @@ second).  The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
@@ -1088,8 +1139,8 @@ The "processes" line gives the number  of processes and threads created, which
 includes (but  is not limited  to) those  created by  calls to the  fork() and
 clone() system calls.
-The  "procs_running" line gives the  number of processes  currently running on
+The "procs_running" line gives the total number of threads that are
-CPUs.
+running or ready to run (i.e., the total number of runnable threads).
 The   "procs_blocked" line gives  the  number of  processes currently blocked,
 waiting for I/O to complete.
@@ -1113,7 +1164,6 @@ Table 1-12: Files in /proc/fs/ext4/<devname>
 ..............................................................................
 File            Content                                        
 mb_groups       details of multiblock allocator buddy cache of free blocks
- mb_history      multiblock allocation history
 ..............................................................................
@@ -1409,3 +1459,11 @@ For more information on mount propagation see:
  Documentation/filesystems/sharedsubtree.txt
+3.6     /proc/<pid>/comm  & /proc/<pid>/task/<tid>/comm
+--------------------------------------------------------
+These files provide a method to access a task's comm value. It also allows for
+a task to set its own or one of its thread siblings' comm value. The comm value
+is limited in size compared to the cmdline value, so writing anything longer
+than the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
+comm value.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 0d15ebccf5b0..a1e2e0dda907 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -248,9 +248,7 @@ code, that is done in the initialization code in the usual way:
        {
                struct proc_dir_entry *entry;
-                entry = create_proc_entry("sequence", 0, NULL);
+                proc_create("sequence", 0, NULL, &ct_file_ops);
-                if (entry)
-                        entry->proc_fops = &ct_file_ops;
                return 0;
        }
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 23a181074f94..fc0e39af43c3 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -837,6 +837,9 @@ replicas continue to be exactly same.
         individual lists does not affect propagation or the way propagation
         tree is modified by operations.
+        All vfsmounts in a peer group have the same ->mnt_master.  If it is
+        non-NULL, they form a contiguous (ordered) segment of slave list.
        A example propagation tree looks as shown in the figure below.
        [ NOTE: Though it looks like a forest, if we consider all the shared
        mounts as a conceptual entity called 'pnode', it becomes a tree]
@@ -874,8 +877,19 @@ replicas continue to be exactly same.
        NOTE: The propagation tree is orthogonal to the mount tree.
+8B Locking:
+        ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
+        by namespace_sem (exclusive for modifications, shared for reading).
+        Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
+        There are two exceptions: do_add_mount() and clone_mnt().
+        The former modifies a vfsmount that has not been visible in any shared
+        data structures yet.
+        The latter holds namespace_sem and the only references to vfsmount
+        are in lists that can't be traversed without namespace_sem.
-8B Algorithm:
+8C Algorithm:
        The crux of the implementation resides in rbind/move operation.
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index b245d524d568..931c806642c5 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -91,8 +91,8 @@ struct device_attribute {
                         const char *buf, size_t count);
 };
-int device_create_file(struct device *, struct device_attribute *);
+int device_create_file(struct device *, const struct device_attribute *);
-void device_remove_file(struct device *, struct device_attribute *);
+void device_remove_file(struct device *, const struct device_attribute *);
 It also defines this helper for defining device attributes: 
@@ -316,8 +316,8 @@ DEVICE_ATTR(_name, _mode, _show, _store);
 Creation/Removal:
-int device_create_file(struct device *device, struct device_attribute * attr);
+int device_create_file(struct device *dev, const struct device_attribute * attr);
-void device_remove_file(struct device * dev, struct device_attribute * attr);
+void device_remove_file(struct device *dev, const struct device_attribute * attr);
 - bus drivers (include/linux/device.h)
@@ -358,7 +358,7 @@ DRIVER_ATTR(_name, _mode, _show, _store)
 Creation/Removal:
-int driver_create_file(struct device_driver *, struct driver_attribute *);
+int driver_create_file(struct device_driver *, const struct driver_attribute *);
-void driver_remove_file(struct device_driver *, struct driver_attribute *);
+void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index b58b84b50fa2..eed520fd0c8e 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -102,7 +102,7 @@ shortname=lower|win95|winnt|mixed
                 winnt: emulate the Windows NT rule for display/create.
                 mixed: emulate the Windows NT rule for display,
                        emulate the Windows 95 rule for create.
-                 Default setting is `lower'.
+                 Default setting is `mixed'.
 tz=UTC        -- Interpret timestamps as UTC rather than local time.
                 This option disables the conversion of timestamps
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 623f094c9d8d..3de2f32edd90 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -472,7 +472,7 @@ __sync_single_inode) to check if ->writepages has been successful in
 writing out the whole address_space.
 The Writeback tag is used by filemap*wait* and sync_page* functions,
-via wait_on_page_writeback_range, to wait for all writeback to
+via filemap_fdatawait_range, to wait for all writeback to
 complete.  While waiting ->sync_page (if defined) will be called on
 each page that is found to require writeback.