Merge branch 'linus' into timers/hpet

author: Ingo Molnar <mingo@elte.hu> 2008-07-31 12:43:41 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-07-31 12:43:41 -0400
commit: 85e9ca333d03fbd56b9e123c8456f0d98e20faad (patch)
tree: 7bb15ada5f536950efa23ad60ea9eea60380ca1c /Documentation/filesystems
parent: a300bec952127d9a15e666b391bb35c9aecb3002 (diff)
parent: 6e86841d05f371b5b9b86ce76c02aaee83352298 (diff)
13 files changed, 623 insertions, 117 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 8b22d7d8b991..680fb566b928 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -510,6 +510,7 @@ prototypes:
        void (*close)(struct vm_area_struct*);
        int (*fault)(struct vm_area_struct*, struct vm_fault *);
        int (*page_mkwrite)(struct vm_area_struct *, struct page *);
+        int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
 locking rules:
                BKL     mmap_sem        PageLocked(page)
@@ -517,6 +518,7 @@ open:		no	yes
 close:          no      yes
 fault:          no      yes
 page_mkwrite:   no      yes             no
+access:         no      yes
        ->page_mkwrite() is called when a previously read-only page is
 about to become writeable. The file system is responsible for
@@ -525,6 +527,11 @@ taking to lock out truncate, the page range should be verified to be
 within i_size. The page mapping should also be checked that it is not
 NULL.
+        ->access() is called when get_user_pages() fails in
+access_process_vm(), typically used to debug a process through
+/proc/pid/mem or ptrace.  This function is needed only for
+VM_IO | VM_PFNMAP VMAs.
 ================================================================================
                        Dubious stuff
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt
index ea825e178e79..78043d5a8fc3 100644
--- a/Documentation/filesystems/bfs.txt
+++ b/Documentation/filesystems/bfs.txt
@@ -26,11 +26,11 @@ You can simplify mounting by just typing:
 this will allocate the first available loopback device (and load loop.o 
 kernel module if necessary) automatically. If the loopback driver is not
-loaded automatically, make sure that your kernel is compiled with kmod 
+loaded automatically, make sure that you have compiled the module and
-support (CONFIG_KMOD) enabled. Beware that umount will not
+that modprobe is functioning. Beware that umount will not deallocate
-deallocate /dev/loopN device if /etc/mtab file on your system is a
+/dev/loopN device if /etc/mtab file on your system is a symbolic link to
-symbolic link to /proc/mounts. You will need to do it manually using
+/proc/mounts. You will need to do it manually using "-d" switch of
-"-d" switch of losetup(8). Read losetup(8) manpage for more info.
+losetup(8). Read losetup(8) manpage for more info.
 To create the BFS image under UnixWare you need to find out first which
 slice contains it. The command prtvtoc(1M) is your friend:
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index 25151fd5c2c6..039648791701 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -279,7 +279,7 @@ static struct config_item *simple_children_make_item(struct config_group *group,
        simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
        if (!simple_child)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
        config_item_init_type_name(&simple_child->item, name,
@@ -366,7 +366,7 @@ static struct config_group *group_children_make_group(struct config_group *group
        simple_children = kzalloc(sizeof(struct simple_children),
                                  GFP_KERNEL);
        if (!simple_children)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
        config_group_init_type_name(&simple_children->group, name,
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db8352..80e193d82e2e 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
 1. Quick usage instructions:
 ===========================
-  - Grab updated e2fsprogs from
+  - Compile and install the latest version of e2fsprogs (as of this
-    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
+    writing version 1.41) from:
-    This is a patchset on top of e2fsprogs-1.39, which can be found at
+    http://sourceforge.net/project/showfiles.php?group_id=2406
+        
+        or
    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
-  - It's still mke2fs -j /dev/hda1
+        or grab the latest git repository from:
+    git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+  - Create a new filesystem using the ext4dev filesystem type:
+        # mke2fs -t ext4dev /dev/hda1
+    Or configure an existing ext3 filesystem to support extents and set
+    the test_fs flag to indicate that it's ok for an in-development
+    filesystem to touch this filesystem:
-  - mount /dev/hda1 /wherever -t ext4dev
+        # tune2fs -O extents -E test_fs /dev/hda1
-  - To enable extents,
+    If the filesystem was created with 128 byte inodes, it can be
+    converted to use 256 byte for greater efficiency via:
-        mount /dev/hda1 /wherever -t ext4dev -o extents
+        # tune2fs -I 256 /dev/hda1
-  - The filesystem is compatible with the ext3 driver until you add a file
+    (Note: we currently do not have tools to convert an ext4dev
-    which has extents (ie: `mount -o extents', then create a file).
+    filesystem back to ext3; so please do not try this on production
+    filesystems.)
-    NOTE: The "extents" mount flag is temporary.  It will soon go away and
+  - Mounting:
-    extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+        # mount -t ext4dev /dev/hda1 /wherever
  - When comparing performance with other filesystems, remember that
-    ext3/4 by default offers higher data integrity guarantees than most.  So
+    ext3/4 by default offers higher data integrity guarantees than most.
-    when comparing with a metadata-only journalling filesystem, use `mount -o
+    So when comparing with a metadata-only journalling filesystem, such
-    data=writeback'.  And you might as well use `mount -o nobh' too along
+    as ext3, use `mount -o data=writeback'.  And you might as well use
-    with it.  Making the journal larger than the mke2fs default often helps
+    `mount -o nobh' too along with it.  Making the journal larger than
-    performance with metadata-intensive workloads.
+    the mke2fs default often helps performance with metadata-intensive
+    workloads.
 2. Features
 ===========
 2.1 Currently available
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
 * extent format reduces metadata overhead (RAM, IO for access, transactions)
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redunancy in tree
+* improved file allocation (multi-block alloc)
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
-* dir_index and resize inode will be on by default
+* inode version field on disk (NFSv4, Lustre)
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g. for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+  flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
+  the ordering)
 2.2 Candidate features for future inclusion
-There are several under discussion, whether they all make it in is
+* Online defrag (patches available but not well tested)
-partly a function of how much time everyone has to work on them:
+* reduced mke2fs time via lazy itable initialization in conjunction with
+  the uninit_bg feature (capability to do this is available in e2fsprogs
+  but a kernel thread to do lazy zeroing of unused inode table blocks
+  after filesystem is first mounted is required for safety)
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
+There are several others under discussion, whether they all make it in is
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
+partly a function of how much time everyone has to work on them. Features like
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
+metadata checksumming have been discussed and planned for a bit but no patches
-  needs some e2fsck work)
+exist yet so I'm not sure they're in the near-term roadmap.
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
-Features like metadata checksumming have been discussed and planned for
+The big performance win will come with mballoc, delalloc and flex_bg
-a bit but no patches exist yet so I'm not sure they're in the near-term
+grouping of bitmaps and inode tables.  Some test results available here:
-roadmap.
-The big performance win will come with mballoc and delalloc.  CFS has
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
-been using mballoc for a few years already with Lustre, and IBM + Bull
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
-did a lot of benchmarking on it.  The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes.  I believe
-Alex is working on a new set of patches right now.
 3. Options
 ==========
@@ -222,9 +243,11 @@ stripe=n		Number of filesystem blocks that mballoc will try
                        to use for allocation size and alignment. For RAID5/6
                        systems this should be the number of data
                        disks *  RAID chunk size in file system blocks.
+delalloc        (*)     Deferring block allocation until write-out time.
+nodelalloc              Disable delayed allocation. Blocks are allocated
+                        when data is copied from user to page cache.
 Data Mode
---------
+=========
 There are 3 different data modes:
 * writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
 * ordered mode
 In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction.  When
+groups metadata information related to data changes with the data blocks into a
-it's time to write the new metadata out to disk, the associated data blocks
+single unit called a transaction.  When it's time to write the new metadata
-are written first.  In general, this mode performs slightly slower than
+out to disk, the associated data blocks are written first.  In general,
-writeback but significantly faster than journal mode.
+this mode performs slightly slower than writeback but significantly faster than journal mode.
 * journal mode
 data=journal mode provides full data and metadata journaling.  All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
 In the event of a crash, the journal can be replayed, bringing both data and
 metadata into a consistent state.  This mode is the slowest except when data
 needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all other modes.  Currently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
 References
 ==========
@@ -256,7 +280,8 @@ kernel source:	<file:fs/ext4/>
                <file:fs/jbd2/>
 programs:       http://e2fsprogs.sourceforge.net/
-                http://ext2resize.sourceforge.net
 useful links:   http://fedoraproject.org/wiki/ext3-devel
                http://www.bullopensource.org/ext4/
+                http://ext4.wiki.kernel.org/index.php/Main_Page
+                http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
new file mode 100644
index 000000000000..4dae9a3840bf
--- /dev/null
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -0,0 +1,114 @@
+                   Glock internal locking rules
+                  ------------------------------
+This documents the basic principles of the glock state machine
+internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h)
+has two main (internal) locks:
+ 1. A spinlock (gl_spin) which protects the internal state such
+    as gl_state, gl_target and the list of holders (gl_holders)
+ 2. A non-blocking bit lock, GLF_LOCK, which is used to prevent other
+    threads from making calls to the DLM, etc. at the same time. If a
+    thread takes this lock, it must then call run_queue (usually via the
+    workqueue) when it releases it in order to ensure any pending tasks
+    are completed.
+The gl_holders list contains all the queued lock requests (not
+just the holders) associated with the glock. If there are any
+held locks, then they will be contiguous entries at the head
+of the list. Locks are granted in strictly the order that they
+are queued, except for those marked LM_FLAG_PRIORITY which are
+used only during recovery, and even then only for journal locks.
+There are three lock states that users of the glock layer can request,
+namely shared (SH), deferred (DF) and exclusive (EX). Those translate
+to the following DLM lock modes:
+Glock mode    | DLM lock mode
+------------------------------
+    UN        |    IV/NL  Unlocked (no DLM lock associated with glock) or NL
+    SH        |    PR     (Protected read)
+    DF        |    CW     (Concurrent write)
+    EX        |    EX     (Exclusive)
+Thus DF is basically a shared mode which is incompatible with the "normal"
+shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O
+operations. The glocks are basically a lock plus some routines which deal
+with cache management. The following rules apply for the cache:
+Glock mode   |  Cache data | Cache Metadata | Dirty Data | Dirty Metadata
+--------------------------------------------------------------------------
+    UN       |     No      |       No       |     No     |      No
+    SH       |     Yes     |       Yes      |     No     |      No
+    DF       |     No      |       Yes      |     No     |      No
+    EX       |     Yes     |       Yes      |     Yes    |      Yes
+These rules are implemented using the various glock operations which
+are defined for each type of glock. Not all types of glocks use
+all the modes. Only inode glocks use the DF mode for example.
+Table of glock operations and per type constants:
+Field            | Purpose
+----------------------------------------------------------------------------
+go_xmote_th      | Called before remote state change (e.g. to sync dirty data)
+go_xmote_bh      | Called after remote state change (e.g. to refill cache)
+go_inval         | Called if remote state change requires invalidating the cache
+go_demote_ok     | Returns boolean value of whether it's ok to demote a glock
+                 | (e.g. checks timeout, and that there is no cached data)
+go_lock          | Called for the first local holder of a lock
+go_unlock        | Called on the final local unlock of a lock
+go_dump          | Called to print content of object for debugfs file, or on
+                 | error to dump glock to the log.
+go_type          | The type of the glock, LM_TYPE_.....
+go_min_hold_time | The minimum hold time
+The minimum hold time for each lock is the time after a remote lock
+grant for which we ignore remote demote requests. This is in order to
+prevent a situation where locks are being bounced around the cluster
+from node to node with none of the nodes making any progress. This
+tends to show up most with shared mmaped files which are being written
+to by multiple nodes. By delaying the demotion in response to a
+remote callback, that gives the userspace program time to make
+some progress before the pages are unmapped.
+There is a plan to try and remove the go_lock and go_unlock callbacks
+if possible, in order to try and speed up the fast path through the locking.
+Also, eventually we hope to make the glock "EX" mode locally shared
+such that any local locking will be done with the i_mutex as required
+rather than via the glock.
+Locking rules for glock operations:
+Operation     |  GLF_LOCK bit lock held |  gl_spin spinlock held
+-----------------------------------------------------------------
+go_xmote_th   |       Yes               |       No
+go_xmote_bh   |       Yes               |       No
+go_inval      |       Yes               |       No
+go_demote_ok  |       Sometimes         |       Yes
+go_lock       |       Yes               |       No
+go_unlock     |       Yes               |       No
+go_dump       |       Sometimes         |       Yes
+N.B. Operations must not drop either the bit lock or the spinlock
+if it's held on entry. go_dump and go_demote_ok must never block.
+Note that go_dump will only be called if the glock's state
+indicates that it is caching uptodate data.
+Glock locking order within GFS2:
+ 1. i_mutex (if required)
+ 2. Rename glock (for rename only)
+ 3. Inode glock(s)
+    (Parents before children, inodes at "same level" with same parent in
+     lock number order)
+ 4. Rgrp glock(s) (for (de)allocation operations)
+ 5. Transaction glock (via gfs2_trans_begin) for non-read operations
+ 6. Page lock  (always last, very important!)
+There are two glocks per inode. One deals with access to the inode
+itself (locking order as above), and the other, known as the iopen
+glock is used in conjunction with the i_nlink field in the inode to
+determine the lifetime of the inode in question. Locking of inodes
+is on a per-inode basis. Locking of rgrps is on a per rgrp basis.
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt
index d0ec45ae4e7d..44bd766f2e5d 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs-rdma.txt
@@ -5,7 +5,7 @@
 ################################################################################
 Author: NetApp and Open Grid Computing
- Date: April 15, 2008
+ Date: May 29, 2008
 Table of Contents
 ~~~~~~~~~~~~~~~~~
@@ -60,16 +60,18 @@ Installation
    The procedures described in this document have been tested with
    distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
-  - Install nfs-utils-1.1.1 or greater on the client
+  - Install nfs-utils-1.1.2 or greater on the client
-    An NFS/RDMA mount point can only be obtained by using the mount.nfs
+    An NFS/RDMA mount point can be obtained by using the mount.nfs command in
-    command in nfs-utils-1.1.1 or greater. To see which version of mount.nfs
+    nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
-    you are using, type:
+    version with support for NFS/RDMA mounts, but for various reasons we
+    recommend using nfs-utils-1.1.2 or greater). To see which version of
+    mount.nfs you are using, type:
-    > /sbin/mount.nfs -V
+    $ /sbin/mount.nfs -V
-    If the version is less than 1.1.1 or the command does not exist,
+    If the version is less than 1.1.2 or the command does not exist,
-    then you will need to install the latest version of nfs-utils.
+    you should install the latest version of nfs-utils.
    Download the latest package from:
@@ -77,22 +79,33 @@ Installation
    Uncompress the package and follow the installation instructions.
-    If you will not be using GSS and NFSv4, the installation process
+    If you will not need the idmapper and gssd executables (you do not need
-    can be simplified by disabling these features when running configure:
+    these to create an NFS/RDMA enabled mount command), the installation
+    process can be simplified by disabling these features when running
+    configure:
-    > ./configure --disable-gss --disable-nfsv4
+    $ ./configure --disable-gss --disable-nfsv4
-    For more information on this see the package's README and INSTALL files.
+    To build nfs-utils you will need the tcp_wrappers package installed. For
+    more information on this see the package's README and INSTALL files.
    After building the nfs-utils package, there will be a mount.nfs binary in
    the utils/mount directory. This binary can be used to initiate NFS v2, v3,
-    or v4 mounts. To initiate a v4 mount, the binary must be called mount.nfs4.
+    or v4 mounts. To initiate a v4 mount, the binary must be called
-    The standard technique is to create a symlink called mount.nfs4 to mount.nfs.
+    mount.nfs4.  The standard technique is to create a symlink called
+    mount.nfs4 to mount.nfs.
-    NOTE: mount.nfs and therefore nfs-utils-1.1.1 or greater is only needed
+    This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
+    $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
+    In this location, mount.nfs will be invoked automatically for NFS mounts
+    by the system mount command.
+    NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
    on the NFS client machine. You do not need this specific version of
    nfs-utils on the server. Furthermore, only the mount.nfs command from
-    nfs-utils-1.1.1 is needed on the client.
+    nfs-utils-1.1.2 is needed on the client.
  - Install a Linux kernel with NFS/RDMA
@@ -156,8 +169,8 @@ Check RDMA and NFS Setup
    this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
    card:
-    > modprobe ib_mthca
+    $ modprobe ib_mthca
-    > modprobe ib_ipoib
+    $ modprobe ib_ipoib
    If you are using InfiniBand, make sure there is a Subnet Manager (SM)
    running on the network. If your IB switch has an embedded SM, you can
@@ -166,7 +179,7 @@ Check RDMA and NFS Setup
    If an SM is running on your network, you should see the following:
-    > cat /sys/class/infiniband/driverX/ports/1/state
+    $ cat /sys/class/infiniband/driverX/ports/1/state
    4: ACTIVE
    where driverX is mthca0, ipath5, ehca3, etc.
@@ -174,10 +187,10 @@ Check RDMA and NFS Setup
    To further test the InfiniBand software stack, use IPoIB (this
    assumes you have two IB hosts named host1 and host2):
-    host1> ifconfig ib0 a.b.c.x
+    host1$ ifconfig ib0 a.b.c.x
-    host2> ifconfig ib0 a.b.c.y
+    host2$ ifconfig ib0 a.b.c.y
-    host1> ping a.b.c.y
+    host1$ ping a.b.c.y
-    host2> ping a.b.c.x
+    host2$ ping a.b.c.x
    For other device types, follow the appropriate procedures.
@@ -202,11 +215,11 @@ NFS/RDMA Setup
    /vol0   192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
    /vol0   192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
-    The IP address(es) is(are) the client's IPoIB address for an InfiniBand HCA or the
+    The IP address(es) is(are) the client's IPoIB address for an InfiniBand
-    cleint's iWARP address(es) for an RNIC.
+    HCA or the client's iWARP address(es) for an RNIC.
-    NOTE: The "insecure" option must be used because the NFS/RDMA client does not
+    NOTE: The "insecure" option must be used because the NFS/RDMA client does
-    use a reserved port.
+    not use a reserved port.
 Each time a machine boots:
@@ -214,43 +227,45 @@ NFS/RDMA Setup
    For InfiniBand using a Mellanox adapter:
-    > modprobe ib_mthca
+    $ modprobe ib_mthca
-    > modprobe ib_ipoib
+    $ modprobe ib_ipoib
-    > ifconfig ib0 a.b.c.d
+    $ ifconfig ib0 a.b.c.d
    NOTE: use unique addresses for the client and server
  - Start the NFS server
-    If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config),
+    If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
-    load the RDMA transport module:
+    kernel config), load the RDMA transport module:
-    > modprobe svcrdma
+    $ modprobe svcrdma
-    Regardless of how the server was built (module or built-in), start the server:
+    Regardless of how the server was built (module or built-in), start the
+    server:
-    > /etc/init.d/nfs start
+    $ /etc/init.d/nfs start
    or
-    > service nfs start
+    $ service nfs start
    Instruct the server to listen on the RDMA transport:
-    > echo rdma 2050 > /proc/fs/nfsd/portlist
+    $ echo rdma 2050 > /proc/fs/nfsd/portlist
  - On the client system
-    If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config),
+    If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
-    load the RDMA client module:
+    kernel config), load the RDMA client module:
-    > modprobe xprtrdma.ko
+    $ modprobe xprtrdma
-    Regardless of how the client was built (module or built-in), issue the mount.nfs command:
+    Regardless of how the client was built (module or built-in), use this
+    command to mount the NFS/RDMA server:
-    > /path/to/your/mount.nfs <IPoIB-server-name-or-address>:/<export> /mnt -i -o rdma,port=2050
+    $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt
-    To verify that the mount is using RDMA, run "cat /proc/mounts" and check the
+    To verify that the mount is using RDMA, run "cat /proc/mounts" and check
-    "proto" field for the given mount.
+    the "proto" field for the given mount.
  Congratulations! You're using NFS/RDMA!
diff --git a/Documentation/filesystems/omfs.txt b/Documentation/filesystems/omfs.txt
new file mode 100644
index 000000000000..1d0d41ff5c65
--- /dev/null
+++ b/Documentation/filesystems/omfs.txt
@@ -0,0 +1,106 @@
+Optimized MPEG Filesystem (OMFS)
+Overview
+========
+OMFS is a filesystem created by SonicBlue for use in the ReplayTV DVR
+and Rio Karma MP3 player.  The filesystem is extent-based, utilizing
+block sizes from 2k to 8k, with hash-based directories.  This
+filesystem driver may be used to read and write disks from these
+devices.
+Note, it is not recommended that this FS be used in place of a general
+filesystem for your own streaming media device.  Native Linux filesystems
+will likely perform better.
+More information is available at:
+    http://linux-karma.sf.net/
+Various utilities, including mkomfs and omfsck, are included with
+omfsprogs, available at:
+    http://bobcopeland.com/karma/
+Instructions are included in its README.
+Options
+=======
+OMFS supports the following mount-time options:
+    uid=n        - make all files owned by specified user
+    gid=n        - make all files owned by specified group
+    umask=xxx    - set permission umask to xxx
+    fmask=xxx    - set umask to xxx for files
+    dmask=xxx    - set umask to xxx for directories
+Disk format
+===========
+OMFS discriminates between "sysblocks" and normal data blocks.  The sysblock
+group consists of super block information, file metadata, directory structures,
+and extents.  Each sysblock has a header containing CRCs of the entire
+sysblock, and may be mirrored in successive blocks on the disk.  A sysblock may
+have a smaller size than a data block, but since they are both addressed by the
+same 64-bit block number, any remaining space in the smaller sysblock is
+unused.
+Sysblock header information:
+struct omfs_header {
+        __be64 h_self;                  /* FS block where this is located */
+        __be32 h_body_size;             /* size of useful data after header */
+        __be16 h_crc;                   /* crc-ccitt of body_size bytes */
+        char h_fill1[2];
+        u8 h_version;                   /* version, always 1 */
+        char h_type;                    /* OMFS_INODE_X */
+        u8 h_magic;                     /* OMFS_IMAGIC */
+        u8 h_check_xor;                 /* XOR of header bytes before this */
+        __be32 h_fill2;
+};
+Files and directories are both represented by omfs_inode:
+struct omfs_inode {
+        struct omfs_header i_head;      /* header */
+        __be64 i_parent;                /* parent containing this inode */
+        __be64 i_sibling;               /* next inode in hash bucket */
+        __be64 i_ctime;                 /* ctime, in milliseconds */
+        char i_fill1[35];
+        char i_type;                    /* OMFS_[DIR,FILE] */
+        __be32 i_fill2;
+        char i_fill3[64];
+        char i_name[OMFS_NAMELEN];      /* filename */
+        __be64 i_size;                  /* size of file, in bytes */
+};
+Directories in OMFS are implemented as a large hash table.  Filenames are
+hashed then prepended into the bucket list beginning at OMFS_DIR_START.
+Lookup requires hashing the filename, then seeking across i_sibling pointers
+until a match is found on i_name.  Empty buckets are represented by block
+pointers with all-1s (~0).
+A file is an omfs_inode structure followed by an extent table beginning at
+OMFS_EXTENT_START:
+struct omfs_extent_entry {
+        __be64 e_cluster;               /* start location of a set of blocks */
+        __be64 e_blocks;                /* number of blocks after e_cluster */
+};
+struct omfs_extent {
+        __be64 e_next;                  /* next extent table location */
+        __be32 e_extent_count;          /* total # extents in this table */
+        __be32 e_fill;
+        struct omfs_extent_entry e_entry;       /* start of extent entries */
+};
+Each extent holds the block offset followed by number of blocks allocated to
+the extent.  The final extent in each table is a terminator with e_cluster
+being ~0 and e_blocks being ones'-complement of the total number of blocks
+in the table.
+If this table overflows, a continuation inode is written and pointed to by
+e_next.  These have a header but lack the rest of the inode structure.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dbc3c6a3650f..64557821ee59 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -296,6 +296,7 @@ Table 1-4: Kernel info in /proc
 uptime      System uptime                                     
 version     Kernel version                                    
 video       bttv info of video resources                       (2.4)
+ vmallocinfo Show vmalloced areas
 ..............................................................................
 You can,  for  example,  check  which interrupts are currently in use and what
@@ -380,28 +381,35 @@ i386 and x86_64 platforms support the new IRQ vector displays.
 Of some interest is the introduction of the /proc/irq directory to 2.4.
 It could be used to set IRQ to CPU affinity, this means that you can "hook" an
 IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the
-irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask
+irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and
+prof_cpu_mask.
 For example 
  > ls /proc/irq/
  0  10  12  14  16  18  2  4  6  8  prof_cpu_mask
-  1  11  13  15  17  19  3  5  7  9
+  1  11  13  15  17  19  3  5  7  9  default_smp_affinity
  > ls /proc/irq/0/
  smp_affinity
-The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ
+smp_affinity is a bitmask, in which you can specify which CPUs can handle the
-is the same by default:
+IRQ, you can set it by doing:
-  > cat /proc/irq/0/smp_affinity 
+  > echo 1 > /proc/irq/10/smp_affinity
-  ffffffff
+This means that only the first CPU will handle the IRQ, but you can also echo
+5 which means that only the first and third CPU can handle the IRQ.
+The contents of each smp_affinity file is the same by default:
-It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can
+  > cat /proc/irq/0/smp_affinity
-set it by doing:
+  ffffffff
-  > echo 1 > /proc/irq/prof_cpu_mask
+The default_smp_affinity mask applies to all non-active IRQs, which are the
+IRQs which have not yet been allocated/activated, and hence which lack a
+/proc/irq/[0-9]* directory.
-This means that only the first CPU will handle the IRQ, but you can also echo 5
+prof_cpu_mask specifies which CPUs are to be profiled by the system wide
-which means that only the first and fourth CPU can handle the IRQ.
+profiler. Default value is ffffffff (all cpus).
 The way IRQs are routed is handled by the IO-APIC, and it's Round Robin
 between all the CPUs which are allowed to handle it. As usual the kernel has
@@ -550,6 +558,49 @@ VmallocTotal: total size of vmalloc memory area
 VmallocUsed: amount of vmalloc area which is used
 VmallocChunk: largest contigious block of vmalloc area which is free
+..............................................................................
+vmallocinfo:
+Provides information about vmalloced/vmapped areas. One line per area,
+containing the virtual address range of the area, size in bytes,
+caller information of the creator, and optional information depending
+on the kind of area :
+ pages=nr    number of pages
+ phys=addr   if a physical address was specified
+ ioremap     I/O mapping (ioremap() and friends)
+ vmalloc     vmalloc() area
+ vmap        vmap()ed pages
+ user        VM_USERMAP area
+ vpages      buffer for page pointers was vmalloced (huge area)
+ N<node>=nr  (Only on NUMA kernels)
+             Number of pages allocated on memory node <node>
+> cat /proc/vmallocinfo
+0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204 ...
+  /0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128
+0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204 ...
+  /0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64
+0xffffc20000302000-0xffffc20000304000    8192 acpi_tb_verify_table+0x21/0x4f...
+  phys=7fee8000 ioremap
+0xffffc20000304000-0xffffc20000307000   12288 acpi_tb_verify_table+0x21/0x4f...
+  phys=7fee7000 ioremap
+0xffffc2000031d000-0xffffc2000031f000    8192 init_vdso_vars+0x112/0x210
+0xffffc2000031f000-0xffffc2000032b000   49152 cramfs_uncompress_init+0x2e ...
+  /0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3
+0xffffc2000033a000-0xffffc2000033d000   12288 sys_swapon+0x640/0xac0      ...
+  pages=2 vmalloc N1=2
+0xffffc20000347000-0xffffc2000034c000   20480 xt_alloc_table_info+0xfe ...
+  /0x130 [x_tables] pages=4 vmalloc N0=4
+0xffffffffa0000000-0xffffffffa000f000   61440 sys_init_module+0xc27/0x1d00 ...
+   pages=14 vmalloc N2=14
+0xffffffffa000f000-0xffffffffa0014000   20480 sys_init_module+0xc27/0x1d00 ...
+   pages=4 vmalloc N1=4
+0xffffffffa0014000-0xffffffffa0017000   12288 sys_init_module+0xc27/0x1d00 ...
+   pages=2 vmalloc N1=2
+0xffffffffa0017000-0xffffffffa0022000   45056 sys_init_module+0xc27/0x1d00 ...
+   pages=10 vmalloc N0=10
 1.3 IDE devices in /proc/ide
 ----------------------------
@@ -880,7 +931,7 @@ group_prealloc  max_to_scan  mb_groups  mb_history  min_to_scan  order2_req
 stats  stream_req
 mb_groups:
-This file gives the details of mutiblock allocator buddy cache of free blocks
+This file gives the details of multiblock allocator buddy cache of free blocks
 mb_history:
 Multiblock allocation history.
@@ -1423,7 +1474,7 @@ used because pages_free(1355) is smaller than watermark + protection[2]
 normal page requirement. If requirement is DMA zone(index=0), protection[0]
 (=0) is used.
-zone[i]'s protection[j] is calculated by following exprssion.
+zone[i]'s protection[j] is calculated by following expression.
 (i < j):
  zone[i]->protection[j]
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
index 094f2d2f38b1..510b722667ac 100644
--- a/Documentation/filesystems/relay.txt
+++ b/Documentation/filesystems/relay.txt
@@ -294,6 +294,16 @@ user-defined data with a channel, and is immediately available
 (including in create_buf_file()) via chan->private_data or
 buf->chan->private_data.
+Buffer-only channels
+--------------------
+These channels have no files associated and can be created with
+relay_open(NULL, NULL, ...). Such channels are useful in scenarios such
+as when doing early tracing in the kernel, before the VFS is up. In these
+cases, one may open a buffer-only channel and then call
+relay_late_setup_files() when the kernel is ready to handle files,
+to expose the buffered data to the userspace.
 Channel 'modes'
 ---------------
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 7f27b8f840d0..9e9c348275a9 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -248,6 +248,7 @@ The top level sysfs directory looks like:
 block/
 bus/
 class/
+dev/
 devices/
 firmware/
 net/
@@ -274,6 +275,11 @@ fs/ contains a directory for some filesystems.  Currently each
 filesystem wanting to export attributes must create its own hierarchy
 below fs/ (see ./fuse.txt for an example).
+dev/ contains two directories char/ and block/. Inside these two
+directories there are symlinks named <major>:<minor>.  These symlinks
+point to the sysfs directory for the given device.  /sys/dev provides a
+quick way to look up the sysfs interface for a device from the result of
+a stat(2) operation.
 More information can driver-model specific features can be found in
 Documentation/driver-model/. 
diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt
new file mode 100644
index 000000000000..540e9e7f59c5
--- /dev/null
+++ b/Documentation/filesystems/ubifs.txt
@@ -0,0 +1,164 @@
+Introduction
+=============
+UBIFS file-system stands for UBI File System. UBI stands for "Unsorted
+Block Images". UBIFS is a flash file system, which means it is designed
+to work with flash devices. It is important to understand that UBIFS
+is completely different to any traditional file-system in Linux, like
+Ext2, XFS, JFS, etc. UBIFS represents a separate class of file-systems
+which work with MTD devices, not block devices. The other Linux
+file-system of this class is JFFS2.
+To make it more clear, here is a small comparison of MTD devices and
+block devices.
+1 MTD devices represent flash devices and they consist of eraseblocks of
+  rather large size, typically about 128KiB. Block devices consist of
+  small blocks, typically 512 bytes.
+2 MTD devices support 3 main operations - read from some offset within an
+  eraseblock, write to some offset within an eraseblock, and erase a whole
+  eraseblock. Block devices support 2 main operations - read a whole
+  block and write a whole block.
+3 The whole eraseblock has to be erased before it becomes possible to
+  re-write its contents. Blocks may be just re-written.
+4 Eraseblocks become worn out after some number of erase cycles -
+  typically 100K-1G for SLC NAND and NOR flashes, and 1K-10K for MLC
+  NAND flashes. Blocks do not have the wear-out property.
+5 Eraseblocks may become bad (only on NAND flashes) and software should
+  deal with this. Blocks on hard drives typically do not become bad,
+  because hardware has mechanisms to substitute bad blocks, at least in
+  modern LBA disks.
+It should be quite obvious why UBIFS is very different to traditional
+file-systems.
+UBIFS works on top of UBI. UBI is a separate software layer which may be
+found in drivers/mtd/ubi. UBI is basically a volume management and
+wear-leveling layer. It provides so-called UBI volumes, which are a higher
+level abstraction than a MTD device. The programming model of UBI devices
+is very similar to MTD devices - they still consist of large eraseblocks,
+they have read/write/erase operations, but UBI devices are devoid of
+limitations like wear and bad blocks (items 4 and 5 in the above list).
+In a sense, UBIFS is a next generation of JFFS2 file-system, but it is
+very different and incompatible to JFFS2. The following are the main
+differences.
+* JFFS2 works on top of MTD devices, UBIFS depends on UBI and works on
+  top of UBI volumes.
+* JFFS2 does not have on-media index and has to build it while mounting,
+  which requires full media scan. UBIFS maintains the FS indexing
+  information on the flash media and does not require full media scan,
+  so it mounts many times faster than JFFS2.
+* JFFS2 is a write-through file-system, while UBIFS supports write-back,
+  which makes UBIFS much faster on writes.
+Similarly to JFFS2, UBIFS supports on-the-fly compression which makes
+it possible to fit quite a lot of data to the flash.
+Similarly to JFFS2, UBIFS is tolerant of unclean reboots and power-cuts.
+It does not need stuff like fsck.ext2. UBIFS automatically replays its
+journal and recovers from crashes, ensuring that the on-flash data
+structures are consistent.
+UBIFS scales logarithmically (most of the data structures it uses are
+trees), so the mount time and memory consumption do not linearly depend
+on the flash size, like in case of JFFS2. This is because UBIFS
+maintains the FS index on the flash media. However, UBIFS depends on
+UBI, which scales linearly. So overall UBI/UBIFS stack scales linearly.
+Nevertheless, UBI/UBIFS scales considerably better than JFFS2.
+The authors of UBIFS believe that it is possible to develop UBI2 which
+would scale logarithmically as well. UBI2 would support the same API as UBI,
+but it would be binary incompatible to UBI. So UBIFS would not need to be
+changed to use UBI2.
+Mount options
+=============
+(*) == default.
+norm_unmount (*)        commit on unmount; the journal is committed
+                        when the file-system is unmounted so that the
+                        next mount does not have to replay the journal
+                        and it becomes very fast;
+fast_unmount            do not commit on unmount; this option makes
+                        unmount faster, but the next mount slower
+                        because of the need to replay the journal.
+Quick usage instructions
+========================
+The UBI volume to mount is specified using "ubiX_Y" or "ubiX:NAME" syntax,
+where "X" is UBI device number, "Y" is UBI volume number, and "NAME" is
+UBI volume name.
+Mount volume 0 on UBI device 0 to /mnt/ubifs:
+$ mount -t ubifs ubi0_0 /mnt/ubifs
+Mount "rootfs" volume of UBI device 0 to /mnt/ubifs ("rootfs" is volume
+name):
+$ mount -t ubifs ubi0:rootfs /mnt/ubifs
+The following is an example of the kernel boot arguments to attach mtd0
+to UBI and mount volume "rootfs":
+ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs
+Module Parameters for Debugging
+===============================
+When UBIFS has been compiled with debugging enabled, there are 3 module
+parameters that are available to control aspects of testing and debugging.
+The parameters are unsigned integers where each bit controls an option.
+The parameters are:
+debug_msgs      Selects which debug messages to display, as follows:
+                Message Type                            Flag value
+                General messages                        1
+                Journal messages                        2
+                Mount messages                          4
+                Commit messages                         8
+                LEB search messages                     16
+                Budgeting messages                      32
+                Garbage collection messages             64
+                Tree Node Cache (TNC) messages          128
+                LEB properties (lprops) messages        256
+                Input/output messages                   512
+                Log messages                            1024
+                Scan messages                           2048
+                Recovery messages                       4096
+debug_chks      Selects extra checks that UBIFS can do while running:
+                Check                                   Flag value
+                General checks                          1
+                Check Tree Node Cache (TNC)             2
+                Check indexing tree size                4
+                Check orphan area                       8
+                Check old indexing tree                 16
+                Check LEB properties (lprops)           32
+                Check leaf nodes and inodes             64
+debug_tsts      Selects a mode of testing, as follows:
+                Test mode                               Flag value
+                Force in-the-gaps method                2
+                Failure mode for recovery testing       4
+For example, set debug_msgs to 5 to display General messages and Mount
+messages.
+References
+==========
+UBIFS documentation and FAQ/HOWTO at the MTD web site:
+http://www.linux-mtd.infradead.org/doc/ubifs.html
+http://www.linux-mtd.infradead.org/faq/ubifs.html
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 2d5e1e582e13..bbac4f1d9056 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -96,6 +96,14 @@ shortname=lower|win95|winnt|mixed
                        emulate the Windows 95 rule for create.
                 Default setting is `lower'.
+tz=UTC        -- Interpret timestamps as UTC rather than local time.
+                 This option disables the conversion of timestamps
+                 between local time (as used by Windows on FAT) and UTC
+                 (which Linux uses internally).  This is particularly
+                 useful when mounting devices (like digital cameras)
+                 that are set to UTC in order to avoid the pitfalls of
+                 local time.
 <bool>: 0,1,yes,no,true,false
 TODO
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index b7522c6cbae3..c4d348dabe94 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -143,7 +143,7 @@ struct file_system_type {
 The get_sb() method has the following arguments:
-  struct file_system_type *fs_type: decribes the filesystem, partly initialized
+  struct file_system_type *fs_type: describes the filesystem, partly initialized
        by the specific filesystem code
  int flags: mount flags
@@ -895,9 +895,9 @@ struct dentry_operations {
        iput() yourself
  d_dname: called when the pathname of a dentry should be generated.
-        Usefull for some pseudo filesystems (sockfs, pipefs, ...) to delay
+        Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay
        pathname generation. (Instead of doing it when dentry is created,
-        its done only when the path is needed.). Real filesystems probably
+        it's done only when the path is needed.). Real filesystems probably
        dont want to use it, because their dentries are present in global
        dcache hash, so their hash should be an invariant. As no lock is
        held, d_dname() should not try to modify the dentry itself, unless
author	Ingo Molnar <mingo@elte.hu>	2008-07-31 12:43:41 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-31 12:43:41 -0400
commit	85e9ca333d03fbd56b9e123c8456f0d98e20faad (patch)
tree	7bb15ada5f536950efa23ad60ea9eea60380ca1c /Documentation/filesystems
parent	a300bec952127d9a15e666b391bb35c9aecb3002 (diff)
parent	6e86841d05f371b5b9b86ce76c02aaee83352298 (diff)