12 files changed, 417 insertions, 241 deletions
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c
index 1d2c010bae12..e7823ffb1ca0 100644
--- a/Documentation/auxdisplay/cfag12864b-example.c
+++ b/Documentation/auxdisplay/cfag12864b-example.c
@@ -194,7 +194,6 @@ static void cfag12864b_blit(void)
 */
 #include <stdio.h>
-#include <string.h>
 #define EXAMPLES        6
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97e88ce..455d4e6d346d 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:
 # echo 0 > tasks
+2.3 Mounting hierarchies by name
+--------------------------------
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+The name should match [\w.-]+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a hierarchy a name.
+The name of the hierarchy appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
 3. Kernel API
 =============
@@ -501,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct task_struct *task)
+               struct task_struct *task, bool threadgroup)
 (cgroup_mutex held by caller)
 Called prior to moving a task into a cgroup; if the subsystem
@@ -509,14 +529,20 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex.
+remain valid while the caller holds cgroup_mutex. If threadgroup is
+true, then a successful result indicates that all threads in the given
+thread's threadgroup can be moved together.
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-            struct cgroup *old_cgrp, struct task_struct *task)
+            struct cgroup *old_cgrp, struct task_struct *task,
+            bool threadgroup)
 (cgroup_mutex held by caller)
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
+If threadgroup is true, the subsystem should take care of all threads
+in the specified thread's threadgroup. Currently does not support any
+subsystem that might need the old_cgrp for every thread in the group.
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 23d1262c0775..b871f2552b45 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
 2. Locking
 The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
@@ -375,7 +379,42 @@ cgroups created below it.
 NOTE2: This feature can be enabled/disabled per subtree.
-7. TODO
+7. Soft limits
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+a. There is no memory contention
+b. They do not exceed their hard limit
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+Please note that soft limits is a best effort feature, it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is setup such that
+it gets invoked from balance_pgdat (kswapd).
+7.1 Interface
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 megabytes)
+# echo 256M > memory.soft_limit_in_bytes
+If we want to change this to 1G, we can at any time use
+# echo 1G > memory.soft_limit_in_bytes
+NOTE1: Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2: It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+8. TODO
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 9f59fcbf5d82..ba046b8fa92f 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -54,20 +54,23 @@ features surfaced as a result:
 3.1 General format of the API:
 struct dma_async_tx_descriptor *
-async_<operation>(<op specific parameters>,
+async_<operation>(<op specific parameters>, struct async_submit_ctl *submit)
-                  enum async_tx_flags flags,
-                  struct dma_async_tx_descriptor *dependency,
-                  dma_async_tx_callback callback_routine,
-                  void *callback_parameter);
 3.2 Supported operations:
-memcpy       - memory copy between a source and a destination buffer
+memcpy  - memory copy between a source and a destination buffer
-memset       - fill a destination buffer with a byte value
+memset  - fill a destination buffer with a byte value
-xor          - xor a series of source buffers and write the result to a
+xor     - xor a series of source buffers and write the result to a
-               destination buffer
+          destination buffer
-xor_zero_sum - xor a series of source buffers and set a flag if the
+xor_val - xor a series of source buffers and set a flag if the
-               result is zero.  The implementation attempts to prevent
+          result is zero.  The implementation attempts to prevent
-               writes to memory
+          writes to memory
+pq      - generate the p+q (raid6 syndrome) from a series of source buffers
+pq_val  - validate that a p and/or q buffer is in sync with a given series of
+          sources
+datap   - (raid6_datap_recov) recover a raid6 data block and the p block
+          from the given sources
+2data   - (raid6_2data_recov) recover 2 raid6 data blocks from the given
+          sources
 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
@@ -80,8 +83,8 @@ acknowledged by the application before the offload engine driver is allowed to
 recycle (or free) the descriptor.  A descriptor can be acked by one of the
 following methods:
 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
-2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
+2/ submitting an unacknowledged descriptor as a dependency to another
-   descriptor of a new operation.
+   async_tx call will implicitly set the acknowledged state.
 3/ calling async_tx_ack() on the descriptor.
 3.4 When does the operation execute?
@@ -119,30 +122,42 @@ of an operation.
 Perform a xor->copy->xor operation where each operation depends on the
 result from the previous operation:
-void complete_xor_copy_xor(void *param)
+void callback(void *param)
 {
-        printk("complete\n");
+        struct completion *cmp = param;
+        complete(cmp);
 }
-int run_xor_copy_xor(struct page **xor_srcs,
+void run_xor_copy_xor(struct page **xor_srcs,
-                     int xor_src_cnt,
+                      int xor_src_cnt,
-                     struct page *xor_dest,
+                      struct page *xor_dest,
-                     size_t xor_len,
+                      size_t xor_len,
-                     struct page *copy_src,
+                      struct page *copy_src,
-                     struct page *copy_dest,
+                      struct page *copy_dest,
-                     size_t copy_len)
+                      size_t copy_len)
 {
        struct dma_async_tx_descriptor *tx;
+        addr_conv_t addr_conv[xor_src_cnt];
+        struct async_submit_ctl submit;
+        struct completion cmp;
+        init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
+                          addr_conv);
+        tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
-        tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
+        submit.depend_tx = tx;
-                       ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
+        tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);
-        tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
-                          ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+        init_completion(&cmp);
-        tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
+        init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
-                       ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
+                          callback, &cmp, addr_conv);
-                       tx, complete_xor_copy_xor, NULL);
+        tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
        async_tx_issue_pending_all();
+        wait_for_completion(&cmp);
 }
 See include/linux/async_tx.h for more information on the flags.  See the
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 736540045dc7..23a181074f94 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -4,7 +4,7 @@ Shared Subtrees
 Contents:
        1) Overview
        2) Features
-        3) smount command
+        3) Setting mount states
        4) Use-case
        5) Detailed semantics
        6) Quiz
@@ -41,14 +41,14 @@ replicas continue to be exactly same.
        Here is an example:
-        Lets say /mnt has a mount that is shared.
+        Let's say /mnt has a mount that is shared.
        mount --make-shared /mnt
-        note: mount command does not yet support the --make-shared flag.
+        Note: mount(8) command now supports the --make-shared flag,
-        I have included a small C program which does the same by executing
+        so the sample 'smount' program is no longer needed and has been
-        'smount /mnt shared'
+        removed.
-        #mount --bind /mnt /tmp
+        # mount --bind /mnt /tmp
        The above command replicates the mount at /mnt to the mountpoint /tmp
        and the contents of both the mounts remain identical.
@@ -58,8 +58,8 @@ replicas continue to be exactly same.
        #ls /tmp
        a b c
-        Now lets say we mount a device at /tmp/a
+        Now let's say we mount a device at /tmp/a
-        #mount /dev/sd0  /tmp/a
+        # mount /dev/sd0  /tmp/a
        #ls /tmp/a
        t1 t2 t2
@@ -80,21 +80,20 @@ replicas continue to be exactly same.
        Here is an example:
-        Lets say /mnt has a mount which is shared.
+        Let's say /mnt has a mount which is shared.
-        #mount --make-shared /mnt
+        # mount --make-shared /mnt
-        Lets bind mount /mnt to /tmp
+        Let's bind mount /mnt to /tmp
-        #mount --bind /mnt /tmp
+        # mount --bind /mnt /tmp
        the new mount at /tmp becomes a shared mount and it is a replica of
        the mount at /mnt.
-        Now lets make the mount at /tmp; a slave of /mnt
+        Now let's make the mount at /tmp a slave of /mnt
-        #mount --make-slave /tmp
+        # mount --make-slave /tmp
-        [or smount /tmp slave]
-        lets mount /dev/sd0 on /mnt/a
+        let's mount /dev/sd0 on /mnt/a
-        #mount /dev/sd0 /mnt/a
+        # mount /dev/sd0 /mnt/a
        #ls /mnt/a
        t1 t2 t3
@@ -104,9 +103,9 @@ replicas continue to be exactly same.
        Note the mount event has propagated to the mount at /tmp
-        However lets see what happens if we mount something on the mount at /tmp
+        However let's see what happens if we mount something on the mount at /tmp
-        #mount /dev/sd1 /tmp/b
+        # mount /dev/sd1 /tmp/b
        #ls /tmp/b
        s1 s2 s3
@@ -124,12 +123,11 @@ replicas continue to be exactly same.
 2d) A unbindable mount is a unbindable private mount
-        lets say we have a mount at /mnt and we make is unbindable
+        let's say we have a mount at /mnt and we make it unbindable
-        #mount --make-unbindable /mnt
+        # mount --make-unbindable /mnt
-         [ smount /mnt  unbindable ]
-         Lets try to bind mount this mount somewhere else.
+         Let's try to bind mount this mount somewhere else.
         # mount --bind /mnt /tmp
         mount: wrong fs type, bad option, bad superblock on /mnt,
                or too many mounted file systems
@@ -137,149 +135,15 @@ replicas continue to be exactly same.
        Binding a unbindable mount is a invalid operation.
-3) smount command
+3) Setting mount states
-        Currently the mount command is not aware of shared subtree features.
+        The mount command (util-linux package) can be used to set mount
-        Work is in progress to add the support in mount ( util-linux package ).
+        states:
-        Till then use the following program.
-        ------------------------------------------------------------------------
+        mount --make-shared mountpoint
-        //
+        mount --make-slave mountpoint
-        //this code was developed my Miklos Szeredi <miklos@szeredi.hu>
+        mount --make-private mountpoint
-        //and modified by Ram Pai <linuxram@us.ibm.com>
+        mount --make-unbindable mountpoint
-        // sample usage:
-        //              smount /tmp shared
-        //
-        #include <stdio.h>
-        #include <stdlib.h>
-        #include <unistd.h>
-        #include <string.h>
-        #include <sys/mount.h>
-        #include <sys/fsuid.h>
-        #ifndef MS_REC
-        #define MS_REC          0x4000  /* 16384: Recursive loopback */
-        #endif
-        #ifndef MS_SHARED
-        #define MS_SHARED               1<<20   /* Shared */
-        #endif
-        #ifndef MS_PRIVATE
-        #define MS_PRIVATE              1<<18   /* Private */
-        #endif
-        #ifndef MS_SLAVE
-        #define MS_SLAVE                1<<19   /* Slave */
-        #endif
-        #ifndef MS_UNBINDABLE
-        #define MS_UNBINDABLE           1<<17   /* Unbindable */
-        #endif
-        int main(int argc, char *argv[])
-        {
-                int type;
-                if(argc != 3) {
-                        fprintf(stderr, "usage: %s dir "
-                        "<rshared|rslave|rprivate|runbindable|shared|slave"
-                        "|private|unbindable>\n" , argv[0]);
-                        return 1;
-                }
-                fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
-                if (strcmp(argv[2],"rshared")==0)
-                        type=(MS_SHARED|MS_REC);
-                else if (strcmp(argv[2],"rslave")==0)
-                        type=(MS_SLAVE|MS_REC);
-                else if (strcmp(argv[2],"rprivate")==0)
-                        type=(MS_PRIVATE|MS_REC);
-                else if (strcmp(argv[2],"runbindable")==0)
-                        type=(MS_UNBINDABLE|MS_REC);
-                else if (strcmp(argv[2],"shared")==0)
-                        type=MS_SHARED;
-                else if (strcmp(argv[2],"slave")==0)
-                        type=MS_SLAVE;
-                else if (strcmp(argv[2],"private")==0)
-                        type=MS_PRIVATE;
-                else if (strcmp(argv[2],"unbindable")==0)
-                        type=MS_UNBINDABLE;
-                else {
-                        fprintf(stderr, "invalid operation: %s\n", argv[2]);
-                        return 1;
-                }
-                setfsuid(getuid());
-                if(mount("", argv[1], "dontcare", type, "") == -1) {
-                        perror("mount");
-                        return 1;
-                }
-                return 0;
-        }
-        -----------------------------------------------------------------------
-        Copy the above code snippet into smount.c
-        gcc -o smount smount.c
-        (i) To mark all the mounts under /mnt as shared execute the following
-        command:
-                smount /mnt rshared
-                the corresponding syntax planned for mount command is
-                mount --make-rshared /mnt
-            just to mark a mount /mnt as shared, execute the following
-            command:
-                smount /mnt shared
-                the corresponding syntax planned for mount command is
-                mount --make-shared /mnt
-        (ii) To mark all the shared mounts under /mnt as slave execute the
-        following
-             command:
-                smount /mnt rslave
-                the corresponding syntax planned for mount command is
-                mount --make-rslave /mnt
-            just to mark a mount /mnt as slave, execute the following
-            command:
-                smount /mnt slave
-                the corresponding syntax planned for mount command is
-                mount --make-slave /mnt
-        (iii) To mark all the mounts under /mnt as private execute the
-        following command:
-                smount /mnt rprivate
-                the corresponding syntax planned for mount command is
-                mount --make-rprivate /mnt
-            just to mark a mount /mnt as private, execute the following
-            command:
-                smount /mnt private
-                the corresponding syntax planned for mount command is
-                mount --make-private /mnt
-              NOTE: by default all the mounts are created as private. But if
-              you want to change some shared/slave/unbindable  mount as
-              private at a later point in time, this command can help.
-        (iv) To mark all the mounts under /mnt as unbindable execute the
-        following
-             command:
-                smount /mnt runbindable
-                the corresponding syntax planned for mount command is
-                mount --make-runbindable /mnt
-            just to mark a mount /mnt as unbindable, execute the following
-            command:
-                smount /mnt unbindable
-                the corresponding syntax planned for mount command is
-                mount --make-unbindable /mnt
 4) Use cases
@@ -350,7 +214,7 @@ replicas continue to be exactly same.
                mount --rbind / /view/v3
                mount --rbind / /view/v4
-                and if /usr has a versioning filesystem mounted, than that
+                and if /usr has a versioning filesystem mounted, then that
                mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
                /view/v4/usr too
@@ -390,7 +254,7 @@ replicas continue to be exactly same.
                For example:
                        mount --make-shared /mnt
-                        mount --bin /mnt /tmp
+                        mount --bind /mnt /tmp
                The mount at /mnt and that at /tmp are both shared and belong
                to the same peer group. Anything mounted or unmounted under
@@ -558,7 +422,7 @@ replicas continue to be exactly same.
        then the subtree under the unbindable mount is pruned in the new
        location.
-        eg: lets say we have the following mount tree.
+        eg: let's say we have the following mount tree.
                A
              /   \
@@ -566,7 +430,7 @@ replicas continue to be exactly same.
             / \ / \
             D E F G
-             Lets say all the mount except the mount C in the tree are
+             Let's say all the mount except the mount C in the tree are
             of a type other than unbindable.
             If this tree is rbound to say Z
@@ -683,13 +547,13 @@ replicas continue to be exactly same.
        'b' on mounts that receive propagation from mount 'B' and does not have
        sub-mounts within them are unmounted.
-        Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to
+        Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
        each other.
-        lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
+        let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
        'B1', 'B2' and 'B3' respectively.
-        lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+        let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
        mount 'B1', 'B2' and 'B3' respectively.
        if 'C1' is unmounted, all the mounts that are most-recently-mounted on
@@ -710,7 +574,7 @@ replicas continue to be exactly same.
        A cloned namespace contains all the mounts as that of the parent
        namespace.
-        Lets say 'A' and 'B' are the corresponding mounts in the parent and the
+        Let's say 'A' and 'B' are the corresponding mounts in the parent and the
        child namespace.
        If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
@@ -759,11 +623,11 @@ replicas continue to be exactly same.
                mount --make-slave /mnt
                At this point we have the first mount at /tmp and
-                its root dentry is 1. Lets call this mount 'A'
+                its root dentry is 1. Let's call this mount 'A'
                And then we have a second mount at /tmp1 with root
-                dentry 2. Lets call this mount 'B'
+                dentry 2. Let's call this mount 'B'
                Next we have a third mount at /mnt with root dentry
-                mnt. Lets call this mount 'C'
+                mnt. Let's call this mount 'C'
                'B' is the slave of 'A' and 'C' is a slave of 'B'
                A -> B -> C
@@ -794,7 +658,7 @@ replicas continue to be exactly same.
        Q3 Why is unbindable mount needed?
-                Lets say we want to replicate the mount tree at multiple
+                Let's say we want to replicate the mount tree at multiple
                locations within the same subtree.
                if one rbind mounts a tree within the same subtree 'n' times
@@ -803,7 +667,7 @@ replicas continue to be exactly same.
                mounts. Here is a example.
                step 1:
-                   lets say the root tree has just two directories with
+                   let's say the root tree has just two directories with
                   one vfsmount.
                                    root
                                   /    \
@@ -875,7 +739,7 @@ replicas continue to be exactly same.
                Unclonable mounts come in handy here.
                step 1:
-                   lets say the root tree has just two directories with
+                   let's say the root tree has just two directories with
                   one vfsmount.
                                    root
                                   /    \
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f49eecf2e573..623f094c9d8d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -536,6 +536,7 @@ struct address_space_operations {
        /* migrate the contents of a page to the specified target */
        int (*migratepage) (struct page *, struct page *);
        int (*launder_page) (struct page *);
+        int (*error_remove_page) (struct address_space *mapping, struct page *page);
 };
  writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
        prevent redirtying the page, it is kept locked during the whole
        operation.
+  error_remove_page: normally set to generic_error_remove_page if truncation
+        is ok for this address space. Used for memory failure handling.
+        Setting this implies you deal with pages going away under you,
+        unless you have them locked or reference counts increased.
 The File Object
 ===============
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index aafca0a8f66a..947374977ca5 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -135,6 +135,7 @@ Code	Seq#	Include File		Comments
                                        <http://mikonos.dia.unisa.it/tcfs>
 'l'     40-7F   linux/udf_fs_i.h        in development:
                                        <http://sourceforge.net/projects/linux-udf/>
+'m'     00-09   linux/mmtimer.h
 'm'     all     linux/mtio.h            conflict!
 'm'     all     linux/soundcard.h       conflict!
 'm'     all     linux/synclink.h        conflict!
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 1458448436cc..62682500878a 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -96,13 +96,16 @@ handles that the Linux kernel will allocate. When you get lots
 of error messages about running out of file handles, you might
 want to increase this limit.
-The three values in file-nr denote the number of allocated
+Historically, the three values in file-nr denoted the number of
-file handles, the number of unused file handles and the maximum
+allocated file handles, the number of allocated but unused file
-number of file handles. When the allocated file handles come
+handles, and the maximum number of file handles. Linux 2.6 always
-close to the maximum, but the number of unused file handles is
+reports 0 as the number of free file handles -- this is not an
-significantly greater than 0, you've encountered a peak in your 
+error, it just means that the number of allocated file handles
-usage of file handles and you don't need to increase the maximum.
+exactly matches the number of used file handles.
+Attempts to allocate more file descriptors than file-max are
+reported with printk, look for "VFS: file-max limit <number>
+reached".
 ==============================================================
 nr_open:
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index b3d8b4922740..a028b92001ed 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -22,6 +22,7 @@ show up in /proc/sys/kernel:
 - callhome                   [ S390 only ]
 - auto_msgmni
 - core_pattern
+- core_pipe_limit
 - core_uses_pid
 - ctrl-alt-del
 - dentry-state
@@ -135,6 +136,27 @@ core_pattern is used to specify a core dumpfile pattern name.
 ==============================================================
+core_pipe_limit:
+This sysctl is only applicable when core_pattern is configured to pipe core
+files to a user space helper (when the first character of core_pattern is a '|',
+see above).  When collecting cores via a pipe to an application, it is
+occasionally useful for the collecting application to gather data about the
+crashing process from its /proc/pid directory.  In order to do this safely, the
+kernel must wait for the collecting process to exit, so as not to remove the
+crashing process's proc files prematurely.  This in turn creates the possibility
+that a misbehaving userspace collecting process can block the reaping of a
+crashed process simply by never exiting.  This sysctl defends against that.  It
+defines how many concurrent crashing processes may be piped to user space
+applications in parallel.  If this value is exceeded, then those crashing
+processes above that value are noted via the kernel log and their cores are
+skipped.  0 is a special value, indicating that unlimited processes may be
+captured in parallel, but that no waiting will take place (i.e. the collecting
+process is not guaranteed access to /proc/<crashing pid>/).  This value defaults
+to 0.
+==============================================================
 core_uses_pid:
 The default coredump filename is "core".  By setting
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e6fb1ec2744b..a6e360d2055c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - zone_reclaim_mode
 ==============================================================
 block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
 The default value is 65536.
+=============================================================
+memory_failure_early_kill:
+Control how to kill processes when an uncorrected memory error (typically
+a 2bit error in a memory module) is detected in the background by hardware
+that cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other uptodate copy of the data it will kill to prevent any data
+corruptions from propagating.
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected.  Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+Applications can override this setting individually with the PR_MCE_KILL prctl
+==============================================================
+memory_failure_recovery
+Enable memory failure recovery (when supported by the platform)
+1: Attempt recovery.
+0: Always panic on a memory failure.
 ==============================================================
 min_free_kbytes:
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
index 33e8a023df02..09b164a5700f 100644
--- a/Documentation/vm/.gitignore
+++ b/Documentation/vm/.gitignore
@@ -1 +1,2 @@
+page-types
 slabinfo
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 3eda8ea00852..fa1a30d9e9d5 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -5,6 +5,7 @@
 * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
 */
+#define _LARGEFILE64_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -13,12 +14,33 @@
 #include <string.h>
 #include <getopt.h>
 #include <limits.h>
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 /*
+ * pagemap kernel ABI bits
+ */
+#define PM_ENTRY_BYTES      sizeof(uint64_t)   /* size in bytes of one pagemap entry */
+#define PM_STATUS_BITS      3
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)      /* status field occupies the top bits, 61..63 */
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)        /* NOTE(review): 7LL << 61 shifts into the sign bit of a signed long long; an unsigned constant would avoid that */
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
+#define PM_PSHIFT_BITS      6
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)        /* 6-bit field directly below the status field, bits 55..60 */
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) /* NOTE(review): casts to u64, which is not defined in the visible part of this file (uint64_t is used elsewhere); macro looks unused here - confirm */
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)    /* low 55 bits of an entry */
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)     /* keep only the page-frame field of an entry */
+#define PM_PRESENT          PM_STATUS(4LL)     /* bit 63 */
+#define PM_SWAP             PM_STATUS(2LL)     /* bit 62 */
+/*
 * kernel page flags
 */
@@ -126,6 +148,14 @@ static int		nr_addr_ranges;
 static unsigned long    opt_offset[MAX_ADDR_RANGES];
 static unsigned long    opt_size[MAX_ADDR_RANGES];
+#define MAX_VMAS        10240   /* capacity of pg_start[] and pg_end[] */
+static int              nr_vmas;        /* number of entries filled in by parse_pid() */
+static unsigned long    pg_start[MAX_VMAS];     /* vm_start / page_size for each parsed maps line */
+static unsigned long    pg_end[MAX_VMAS];       /* vm_end / page_size for each parsed maps line */
+static unsigned long    voffset;        /* virtual page offset currently being walked; set by walk_task() */
+static int              pagemap_fd;     /* /proc/<pid>/pagemap, opened by parse_pid() */
 #define MAX_BIT_FILTERS 64
 static int              nr_bit_filters;
 static uint64_t         opt_mask[MAX_BIT_FILTERS];
@@ -135,7 +165,6 @@ static int		page_size;
 #define PAGES_BATCH     (64 << 10)      /* 64k pages */
 static int              kpageflags_fd;
-static uint64_t         kpageflags_buf[KPF_BYTES * PAGES_BATCH];
 #define HASH_SHIFT      13
 #define HASH_SIZE       (1 << HASH_SHIFT)
@@ -158,6 +187,11 @@ static uint64_t 	page_flags[HASH_SIZE];
        type __min2 = (y);                      \
        __min1 < __min2 ? __min1 : __min2; })
+#define max_t(type, x, y) ({                    \
+        type __max1 = (x);                      \
+        type __max2 = (y);                      \
+        __max1 > __max2 ? __max1 : __max2; })   /* larger of x and y after converting both to 'type'; each argument is evaluated once */
 static unsigned long pages2mb(unsigned long pages)
 {
        return (pages * page_size) >> 20;
@@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags)
 static void show_page_range(unsigned long offset, uint64_t flags)
 {
         static uint64_t      flags0;
+        static unsigned long voff;     /* value of voffset when the current run began */
         static unsigned long index;
         static unsigned long count;
-        if (flags == flags0 && offset == index + count) {
+        if (flags == flags0 && offset == index + count &&
+            (!opt_pid || voffset == voff + count)) {   /* with opt_pid set, the virtual offset must be contiguous as well */
                 count++;
                 return;
         }
-        if (count)
+        if (count) {
-                printf("%lu\t%lu\t%s\n",
+                if (opt_pid)
+                        printf("%lx\t", voff); /* extra leading column: virtual offset of the run */
+                printf("%lx\t%lx\t%s\n",       /* offset and length are printed in hex */
                                 index, count, page_flag_name(flags0));
+        }
         flags0 = flags;
         index  = offset;
+        voff   = voffset;      /* a new run starts at the current virtual offset */
         count  = 1;
 }
 static void show_page(unsigned long offset, uint64_t flags)
 {
-        printf("%lu\t%s\n", offset, page_flag_name(flags));
+        if (opt_pid)
+                printf("%lx\t", voffset);      /* extra leading column: current virtual page offset */
+        printf("%lx\t%s\n", offset, page_flag_name(flags));    /* offset is printed in hex */
 }
 static void show_summary(void)
@@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count)
        lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
        while (count) {
+                uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
                batch = min_t(unsigned long, count, PAGES_BATCH);
                n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
                if (n == 0)
@@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count)
        }
 }
+#define PAGEMAP_BATCH   4096    /* number of pagemap entries fetched per read() */
+static unsigned long task_pfn(unsigned long pgoff)     /* look up the page frame for virtual page offset pgoff through pagemap_fd; 0 means none */
+{
+        static uint64_t buf[PAGEMAP_BATCH];    /* cached window of pagemap entries */
+        static unsigned long start;    /* virtual page offset of buf[0] */
+        static long count;     /* number of valid entries in buf */
+        uint64_t pfn;
+        if (pgoff < start || pgoff >= start + count) { /* pgoff is outside the cached window: refill from pgoff */
+                if (lseek64(pagemap_fd,
+                            (uint64_t)pgoff * PM_ENTRY_BYTES,  /* byte offset of the entry for pgoff */
+                            SEEK_SET) < 0) {
+                        perror("pagemap seek");
+                        exit(EXIT_FAILURE);
+                }
+                count = read(pagemap_fd, buf, sizeof(buf));
+                if (count == 0)
+                        return 0;      /* nothing was read: report "no frame" */
+                if (count < 0) {
+                        perror("pagemap read");
+                        exit(EXIT_FAILURE);
+                }
+                if (count % PM_ENTRY_BYTES) {
+                        fatal("pagemap read not aligned.\n");
+                        exit(EXIT_FAILURE);    /* NOTE(review): redundant if fatal() already terminates - confirm */
+                }
+                count /= PM_ENTRY_BYTES;       /* bytes read -> entries read */
+                start = pgoff;
+        }
+        pfn = buf[pgoff - start];
+        if (pfn & PM_PRESENT)
+                pfn = PM_PFRAME(pfn);  /* strip the status/shift bits, keep the frame field */
+        else
+                pfn = 0;       /* PM_PRESENT not set: walk_task() skips a 0 result */
+        return pfn;
+}
+static void walk_task(unsigned long index, unsigned long count)        /* walk the virtual page range [index, index + count), restricted to the ranges in pg_start[]/pg_end[] */
+{
+        int i = 0;     /* position in pg_start[]/pg_end[] */
+        const unsigned long end = index + count;
+        while (index < end) {
+                while (pg_end[i] <= index)     /* skip ranges ending at or before index; assumes the arrays are in ascending order - confirm */
+                        if (++i >= nr_vmas)
+                                return;        /* no ranges left */
+                if (pg_start[i] >= end)
+                        return;        /* next range starts beyond the requested window */
+                voffset = max_t(unsigned long, pg_start[i], index);    /* first page of the overlap */
+                index   = min_t(unsigned long, pg_end[i], end);        /* one past the last page of the overlap */
+                assert(voffset < index);
+                for (; voffset < index; voffset++) {
+                        unsigned long pfn = task_pfn(voffset);
+                        if (pfn)       /* 0 is task_pfn()'s "no frame" result */
+                                walk_pfn(pfn, 1);      /* process that single frame */
+                }
+        }
+}
+static void add_addr_range(unsigned long offset, unsigned long size)   /* append one (offset, size) pair to opt_offset[]/opt_size[] */
+{
+        if (nr_addr_ranges >= MAX_ADDR_RANGES)
+                fatal("too many addr ranges\n");
+        opt_offset[nr_addr_ranges] = offset;
+        opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);       /* clamp so that offset + size cannot exceed ULONG_MAX */
+        nr_addr_ranges++;
+}
 static void walk_addr_ranges(void)
 {
        int i;
@@ -415,10 +534,13 @@ static void walk_addr_ranges(void)
        }
        if (!nr_addr_ranges)
-                walk_pfn(0, ULONG_MAX);
+                add_addr_range(0, ULONG_MAX);
        for (i = 0; i < nr_addr_ranges; i++)
-                walk_pfn(opt_offset[i], opt_size[i]);
+                if (!opt_pid)
+                        walk_pfn(opt_offset[i], opt_size[i]);
+                else
+                        walk_task(opt_offset[i], opt_size[i]);
        close(kpageflags_fd);
 }
@@ -446,8 +568,8 @@ static void usage(void)
 "            -r|--raw                  Raw mode, for kernel developers\n"
 "            -a|--addr    addr-spec    Walk a range of pages\n"
 "            -b|--bits    bits-spec    Walk pages with specified bits\n"
-#if 0 /* planned features */
 "            -p|--pid     pid          Walk process address space\n"
+#if 0 /* planned features */
 "            -f|--file    filename     Walk file address space\n"
 #endif
 "            -l|--list                 Show page details in ranges\n"
@@ -459,7 +581,7 @@ static void usage(void)
 "            N+M                       pages range from N to N+M-1\n"
 "            N,M                       pages range from N to M-1\n"
 "            N,                        pages range from N to end\n"
-"            ,M                        pages range from 0 to M\n"
+"            ,M                        pages range from 0 to M-1\n"
 "bits-spec:\n"
 "            bit1,bit2                 (flags & (bit1|bit2)) != 0\n"
 "            bit1,bit2=bit1            (flags & (bit1|bit2)) == bit1\n"
@@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str)
 static void parse_pid(const char *str)
 {
+        FILE *file;
+        char buf[5000];        /* holds the /proc paths first, then each line read from maps */
         opt_pid = parse_number(str);
-}
-static void parse_file(const char *name)
+        sprintf(buf, "/proc/%d/pagemap", opt_pid);
-{
+        pagemap_fd = open(buf, O_RDONLY);      /* kept open; task_pfn() reads from it */
+        if (pagemap_fd < 0) {
+                perror(buf);
+                exit(EXIT_FAILURE);
+        }
+        sprintf(buf, "/proc/%d/maps", opt_pid);
+        file = fopen(buf, "r");
+        if (!file) {
+                perror(buf);
+                exit(EXIT_FAILURE);
+        }
+        while (fgets(buf, sizeof(buf), file) != NULL) {        /* one pg_start[]/pg_end[] entry per well-formed line */
+                unsigned long vm_start;
+                unsigned long vm_end;
+                unsigned long long pgoff;
+                int major, minor;
+                char r, w, x, s;
+                unsigned long ino;
+                int n;
+                n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",     /* start-end, four flag characters, offset, major:minor, inode */
+                           &vm_start,
+                           &vm_end,
+                           &r, &w, &x, &s,
+                           &pgoff,
+                           &major, &minor,
+                           &ino);
+                if (n < 10) {  /* all ten conversions must succeed */
+                        fprintf(stderr, "unexpected line: %s\n", buf); /* buf normally still ends in the newline kept by fgets() */
+                        continue;
+                }
+                pg_start[nr_vmas] = vm_start / page_size;      /* byte address -> page offset */
+                pg_end[nr_vmas] = vm_end / page_size;
+                if (++nr_vmas >= MAX_VMAS) {   /* NOTE(review): also fires when exactly MAX_VMAS entries fit; remaining lines are left unread */
+                        fprintf(stderr, "too many VMAs\n");
+                        break;
+                }
+        }
+        fclose(file);
 }
-static void add_addr_range(unsigned long offset, unsigned long size)
+static void parse_file(const char *name)       /* empty body for now; presumably reserved for the planned -f|--file option - confirm */
 {
-        if (nr_addr_ranges >= MAX_ADDR_RANGES)
-                fatal("too much addr ranges\n");
-        opt_offset[nr_addr_ranges] = offset;
-        opt_size[nr_addr_ranges] = size;
-        nr_addr_ranges++;
 }
 static void parse_addr_range(const char *optarg)
@@ -676,8 +834,10 @@ int main(int argc, char *argv[])
                }
        }
+        if (opt_list && opt_pid)
+                printf("voffset\t");
        if (opt_list == 1)
-                printf("offset\tcount\tflags\n");
+                printf("offset\tlen\tflags\n");
        if (opt_list == 2)
                printf("offset\tflags\n");