11 files changed, 446 insertions, 40 deletions
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
new file mode 100644
index 000000000000..49b82cad7003
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -0,0 +1,7 @@
+What:           /sys/devices/system/node/nodeX
+Date:           October 2002
+Contact:        Linux Memory Management list <linux-mm@kvack.org>
+Description:
+                When CONFIG_NUMA is enabled, this is a directory containing
+                information on node X such as what CPUs are local to the
+                node.
diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt
new file mode 100644
index 000000000000..7a9d3d81525b
--- /dev/null
+++ b/Documentation/fault-injection/provoke-crashes.txt
@@ -0,0 +1,38 @@
+The lkdtm module provides an interface to crash or injure the kernel at
+predefined crashpoints to evaluate the reliability of crash dumps obtained
+using different dumping solutions. The module uses KPROBEs to instrument
+crashing points, but can also crash the kernel directly without KRPOBE
+support.
+You can provide the way either through module arguments when inserting
+the module, or through a debugfs interface.
+Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
+                                [cpoint_count={>0}]
+  recur_count : Recursion level for the stack overflow test. Default is 10.
+  cpoint_name : Crash point where the kernel is to be crashed. It can be
+         one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
+         FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
+         IDE_CORE_CP, DIRECT
+  cpoint_type : Indicates the action to be taken on hitting the crash point.
+     It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
+     CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
+     WRITE_AFTER_FREE,
+  cpoint_count : Indicates the number of times the crash point is to be hit
+    to trigger an action. The default is 10.
+You can also induce failures by mounting debugfs and writing the type to
+<mountpoint>/provoke-crash/<crashpoint>. E.g.,
+  mount -t debugfs debugfs /mnt
+  echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
+A special file is `DIRECT' which will induce the crash directly without
+KPROBE instrumentation. This mode is the only one available when the module
+is built on a kernel without KPROBEs support.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 8debdd625e1a..a5cc0db63d7a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -550,3 +550,35 @@ Why:	udev fully replaces this special file system that only contains CAPI
        NCCI TTY device nodes. User space (pppdcapiplugin) works without
        noticing the difference.
 Who:    Jan Kiszka <jan.kiszka@web.de>
+----------------------------
+What:   KVM memory aliases support
+When:   July 2010
+Why:    Memory aliasing support is used for speeding up guest vga access
+        through the vga windows.
+        Modern userspace no longer uses this feature, so it's just bitrotted
+        code and can be removed with no impact.
+Who:    Avi Kivity <avi@redhat.com>
+----------------------------
+What:   KVM kernel-allocated memory slots
+When:   July 2010
+Why:    Since 2.6.25, kvm supports user-allocated memory slots, which are
+        much more flexible than kernel-allocated slots.  All current userspace
+        supports the newer interface and this code can be removed with no
+        impact.
+Who:    Avi Kivity <avi@redhat.com>
+----------------------------
+What:   KVM paravirt mmu host support
+When:   January 2011
+Why:    The paravirt mmu host support is slower than non-paravirt mmu, both
+        on newer and older hardware.  It is already not exposed to the guest,
+        and kept only for live migration purposes.
+Who:    Avi Kivity <avi@redhat.com>
+----------------------------
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca0630..06bbbed71206 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -460,13 +460,6 @@ in sys_read() and friends.
 --------------------------- dquot_operations -------------------------------
 prototypes:
-        int (*initialize) (struct inode *, int);
-        int (*drop) (struct inode *);
-        int (*alloc_space) (struct inode *, qsize_t, int);
-        int (*alloc_inode) (const struct inode *, unsigned long);
-        int (*free_space) (struct inode *, qsize_t);
-        int (*free_inode) (const struct inode *, unsigned long);
-        int (*transfer) (struct inode *, struct iattr *);
        int (*write_dquot) (struct dquot *);
        int (*acquire_dquot) (struct dquot *);
        int (*release_dquot) (struct dquot *);
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
 What filesystem should expect from the generic quota functions:
                FS recursion    Held locks when called
-initialize:     yes             maybe dqonoff_sem
-drop:           yes             -
-alloc_space:    ->mark_dirty()  -
-alloc_inode:    ->mark_dirty()  -
-free_space:     ->mark_dirty()  -
-free_inode:     ->mark_dirty()  -
-transfer:       yes             -
 write_dquot:    yes             dqonoff_sem or dqptr_sem
 acquire_dquot:  yes             dqonoff_sem or dqptr_sem
 release_dquot:  yes             dqonoff_sem or dqptr_sem
@@ -495,10 +481,6 @@ write_info:	yes		dqonoff_sem
 FS recursion means calling ->quota_read() and ->quota_write() from superblock
 operations.
->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
 More details about quota locking can be found in fs/dquot.c.
 --------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 1bd0d0c05171..6a53a84afc72 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
 on or off; rpc.nfsd does this correctly.)
 The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
-on the latest NFSv4.1 Internet Draft:
+on RFC 5661.
-http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
 From the many new features in NFSv4.1 the current implementation
 focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@@ -44,7 +43,7 @@ interoperability problems with future clients.  Known issues:
          trunking, but this is a mandatory feature, and its use is
          recommended to clients in a number of places.  (E.g. to ensure
          timely renewal in case an existing connection's retry timeouts
-          have gotten too long; see section 8.3 of the draft.)
+          have gotten too long; see section 8.3 of the RFC.)
          Therefore, lack of this feature may cause future clients to
          fail.
        - Incomplete backchannel support: incomplete backchannel gss
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0d07513a67a6..96a44dd95e03 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -164,6 +164,7 @@ read the file /proc/PID/status:
  VmExe:        68 kB
  VmLib:      1412 kB
  VmPTE:        20 kb
+  VmSwap:        0 kB
  Threads:        1
  SigQ:   0/28578
  SigPnd: 0000000000000000
@@ -188,6 +189,12 @@ memory usage. Its seven fields are explained in Table 1-3.  The stat file
 contains details information about the process itself.  Its fields are
 explained in Table 1-4.
+(for SMP CONFIG users)
+For making accounting scalable, RSS related information are handled in
+asynchronous manner and the vaule may not be very precise. To see a precise
+snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
+It's slow but very precise.
 Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 ..............................................................................
 Field                       Content
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 VmExe                       size of text segment
 VmLib                       size of shared library code
 VmPTE                       size of page table entries
+ VmSwap                      size of swap usage (the number of referred swapents)
 Threads                     number of threads
 SigQ                        number of signals queued/max. number for queue
 SigPnd                      bitmap of pending signals for the thread
@@ -430,6 +438,7 @@ Table 1-5: Kernel info in /proc
 modules     List of loaded modules                            
 mounts      Mounted filesystems                               
 net         Networking info (see text)                        
+ pagetypeinfo Additional page allocator information (see text)  (2.5)
 partitions  Table of partitions known to the system           
 pci         Deprecated info of PCI bus (new way -> /proc/bus/pci/,
             decoupled by lspci                                 (2.4)
@@ -584,7 +593,7 @@ Node 0, zone      DMA      0      4      5      4      4      3 ...
 Node 0, zone   Normal      1      0      0      1    101      8 ...
 Node 0, zone  HighMem      2      0      0      1      1      0 ...
-Memory fragmentation is a problem under some workloads, and buddyinfo is a 
+External fragmentation is a problem under some workloads, and buddyinfo is a
 useful tool for helping diagnose these problems.  Buddyinfo will give you a 
 clue as to how big an area you can safely allocate, or why a previous
 allocation failed.
@@ -594,6 +603,48 @@ available.  In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
 ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE 
 available in ZONE_NORMAL, etc... 
+More information relevant to external fragmentation can be found in
+pagetypeinfo.
+> cat /proc/pagetypeinfo
+Page block order: 9
+Pages per block:  512
+Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
+Node    0, zone      DMA, type    Unmovable      0      0      0      1      1      1      1      1      1      1      0
+Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone      DMA, type      Movable      1      1      2      1      2      1      1      0      1      0      2
+Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      1      0
+Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone    DMA32, type    Unmovable    103     54     77      1      1      1     11      8      7      1      9
+Node    0, zone    DMA32, type  Reclaimable      0      0      2      1      0      0      0      0      1      0      0
+Node    0, zone    DMA32, type      Movable    169    152    113     91     77     54     39     13      6      1    452
+Node    0, zone    DMA32, type      Reserve      1      2      2      2      2      0      1      1      1      1      0
+Node    0, zone    DMA32, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+Number of blocks type     Unmovable  Reclaimable      Movable      Reserve      Isolate
+Node 0, zone      DMA            2            0            5            1            0
+Node 0, zone    DMA32           41            6          967            2            0
+Fragmentation avoidance in the kernel works by grouping pages of different
+migrate types into the same contiguous regions of memory called page blocks.
+A page block is typically the size of the default hugepage size e.g. 2MB on
+X86-64. By keeping pages grouped based on their ability to move, the kernel
+can reclaim pages within a page block to satisfy a high-order allocation.
+The pagetypinfo begins with information on the size of a page block. It
+then gives the same type of information as buddyinfo except broken down
+by migrate-type and finishes with details on how many page blocks of each
+type exist.
+If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
+from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
+make an estimate of the likely number of huge pages that can be allocated
+at a given point in time. All the "Movable" blocks should be allocatable
+unless memory has been mlock()'d. Some of the Reclaimable blocks should
+also be allocatable although a lot of filesystem metadata may have to be
+reclaimed to achieve this.
 ..............................................................................
 meminfo:
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 1866c27eec69..c2c6e9b39bbe 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -253,6 +253,70 @@ pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
 Also note that it's your responsibility to have stopped using a GPIO
 before you free it.
+Considering in most cases GPIOs are actually configured right after they
+are claimed, three additional calls are defined:
+        /* request a single GPIO, with initial configuration specified by
+         * 'flags', identical to gpio_request() wrt other arguments and
+         * return value
+         */
+        int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
+        /* request multiple GPIOs in a single call
+         */
+        int gpio_request_array(struct gpio *array, size_t num);
+        /* release multiple GPIOs in a single call
+         */
+        void gpio_free_array(struct gpio *array, size_t num);
+where 'flags' is currently defined to specify the following properties:
+        * GPIOF_DIR_IN          - to configure direction as input
+        * GPIOF_DIR_OUT         - to configure direction as output
+        * GPIOF_INIT_LOW        - as output, set initial level to LOW
+        * GPIOF_INIT_HIGH       - as output, set initial level to HIGH
+since GPIOF_INIT_* are only valid when configured as output, so group valid
+combinations as:
+        * GPIOF_IN              - configure as input
+        * GPIOF_OUT_INIT_LOW    - configured as output, initial level LOW
+        * GPIOF_OUT_INIT_HIGH   - configured as output, initial level HIGH
+In the future, these flags can be extended to support more properties such
+as open-drain status.
+Further more, to ease the claim/release of multiple GPIOs, 'struct gpio' is
+introduced to encapsulate all three fields as:
+        struct gpio {
+                unsigned        gpio;
+                unsigned long   flags;
+                const char      *label;
+        };
+A typical example of usage:
+        static struct gpio leds_gpios[] = {
+                { 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */
+                { 33, GPIOF_OUT_INIT_LOW,  "Green LED" }, /* default to OFF */
+                { 34, GPIOF_OUT_INIT_LOW,  "Red LED"   }, /* default to OFF */
+                { 35, GPIOF_OUT_INIT_LOW,  "Blue LED"  }, /* default to OFF */
+                { ... },
+        };
+        err = gpio_request_one(31, GPIOF_IN, "Reset Button");
+        if (err)
+                ...
+        err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
+        if (err)
+                ...
+        gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
 GPIOs mapped to IRQs
 --------------------
diff --git a/Documentation/init.txt b/Documentation/init.txt
new file mode 100644
index 000000000000..535ad5e82b98
--- /dev/null
+++ b/Documentation/init.txt
@@ -0,0 +1,49 @@
+Explaining the dreaded "No init found." boot hang message
+=========================================================
+OK, so you've got this pretty unintuitive message (currently located
+in init/main.c) and are wondering what the H*** went wrong.
+Some high-level reasons for failure (listed roughly in order of execution)
+to load the init binary are:
+A) Unable to mount root FS
+B) init binary doesn't exist on rootfs
+C) broken console device
+D) binary exists but dependencies not available
+E) binary cannot be loaded
+Detailed explanations:
+0) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE)
+   to get more detailed kernel messages.
+A) make sure you have the correct root FS type
+   (and root= kernel parameter points to the correct partition),
+   required drivers such as storage hardware (such as SCSI or USB!)
+   and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules,
+   to be pre-loaded by an initrd)
+C) Possibly a conflict in console= setup --> initial console unavailable.
+   E.g. some serial consoles are unreliable due to serial IRQ issues (e.g.
+   missing interrupt-based configuration).
+   Try using a different console= device or e.g. netconsole= .
+D) e.g. required library dependencies of the init binary such as
+   /lib/ld-linux.so.2 missing or broken. Use readelf -d <INIT>|grep NEEDED
+   to find out which libraries are required.
+E) make sure the binary's architecture matches your hardware.
+   E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware.
+   In case you tried loading a non-binary file here (shell script?),
+   you should make sure that the script specifies an interpreter in its shebang
+   header line (#!/...) that is fully working (including its library
+   dependencies). And before tackling scripts, better first test a simple
+   non-script binary such as /bin/sh and confirm its successful execution.
+   To find out more, add code to init/main.c to display kernel_execve()s
+   return values.
+Please extend this explanation whenever you find new failure causes
+(after all loading the init binary is a CRITICAL and hard transition step
+which needs to be made as painless as possible), then submit patch to LKML.
+Further TODOs:
+- Implement the various run_init_process() invocations via a struct array
+  which can then store the kernel_execve() result value and on failure
+  log it all by iterating over _all_ results (very important usability fix).
+- try to make the implementation itself more helpful in general,
+  e.g. by providing additional error messages at affected places.
+Andreas Mohr <andi at lisas period de>
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 053037a1fe6d..2f9115c0ae62 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -1,6 +1,7 @@
 Title   : Kernel Probes (Kprobes)
 Authors : Jim Keniston <jkenisto@us.ibm.com>
-        : Prasanna S Panchamukhi <prasanna@in.ibm.com>
+        : Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com>
+        : Masami Hiramatsu <mhiramat@redhat.com>
 CONTENTS
@@ -15,6 +16,7 @@ CONTENTS
 9. Jprobes Example
 10. Kretprobes Example
 Appendix A: The kprobes debugfs interface
+Appendix B: The kprobes sysctl interface
 1. Concepts: Kprobes, Jprobes, Return Probes
@@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions
 can speed up unregistration process when you have to unregister
 a lot of probes at once.
-The next three subsections explain how the different types of
+The next four subsections explain how the different types of
-probes work.  They explain certain things that you'll need to
+probes work and how jump optimization works.  They explain certain
-know in order to make the best use of Kprobes -- e.g., the
+things that you'll need to know in order to make the best use of
-difference between a pre_handler and a post_handler, and how
+Kprobes -- e.g., the difference between a pre_handler and
-to use the maxactive and nmissed fields of a kretprobe.  But
+a post_handler, and how to use the maxactive and nmissed fields of
-if you're in a hurry to start using Kprobes, you can skip ahead
+a kretprobe.  But if you're in a hurry to start using Kprobes, you
-to section 2.
+can skip ahead to section 2.
 1.1 How Does a Kprobe Work?
@@ -161,13 +163,125 @@ In case probed function is entered but there is no kretprobe_instance
 object available, then in addition to incrementing the nmissed count,
 the user entry_handler invocation is also skipped.
+1.4 How Does Jump Optimization Work?
+If you configured your kernel with CONFIG_OPTPROBES=y (currently
+this option is supported on x86/x86-64, non-preemptive kernel) and
+the "debug.kprobes_optimization" kernel parameter is set to 1 (see
+sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
+instruction instead of a breakpoint instruction at each probepoint.
+1.4.1 Init a Kprobe
+When a probe is registered, before attempting this optimization,
+Kprobes inserts an ordinary, breakpoint-based kprobe at the specified
+address. So, even if it's not possible to optimize this particular
+probepoint, there'll be a probe there.
+1.4.2 Safety Check
+Before optimizing a probe, Kprobes performs the following safety checks:
+- Kprobes verifies that the region that will be replaced by the jump
+instruction (the "optimized region") lies entirely within one function.
+(A jump instruction is multiple bytes, and so may overlay multiple
+instructions.)
+- Kprobes analyzes the entire function and verifies that there is no
+jump into the optimized region.  Specifically:
+  - the function contains no indirect jump;
+  - the function contains no instruction that causes an exception (since
+  the fixup code triggered by the exception could jump back into the
+  optimized region -- Kprobes checks the exception tables to verify this);
+  and
+  - there is no near jump to the optimized region (other than to the first
+  byte).
+- For each instruction in the optimized region, Kprobes verifies that
+the instruction can be executed out of line.
+1.4.3 Preparing Detour Buffer
+Next, Kprobes prepares a "detour" buffer, which contains the following
+instruction sequence:
+- code to push the CPU's registers (emulating a breakpoint trap)
+- a call to the trampoline code which calls user's probe handlers.
+- code to restore registers
+- the instructions from the optimized region
+- a jump back to the original execution path.
+1.4.4 Pre-optimization
+After preparing the detour buffer, Kprobes verifies that none of the
+following situations exist:
+- The probe has either a break_handler (i.e., it's a jprobe) or a
+post_handler.
+- Other instructions in the optimized region are probed.
+- The probe is disabled.
+In any of the above cases, Kprobes won't start optimizing the probe.
+Since these are temporary situations, Kprobes tries to start
+optimizing it again if the situation is changed.
+If the kprobe can be optimized, Kprobes enqueues the kprobe to an
+optimizing list, and kicks the kprobe-optimizer workqueue to optimize
+it.  If the to-be-optimized probepoint is hit before being optimized,
+Kprobes returns control to the original instruction path by setting
+the CPU's instruction pointer to the copied code in the detour buffer
+-- thus at least avoiding the single-step.
+1.4.5 Optimization
+The Kprobe-optimizer doesn't insert the jump instruction immediately;
+rather, it calls synchronize_sched() for safety first, because it's
+possible for a CPU to be interrupted in the middle of executing the
+optimized region(*).  As you know, synchronize_sched() can ensure
+that all interruptions that were active when synchronize_sched()
+was called are done, but only if CONFIG_PREEMPT=n.  So, this version
+of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**)
+After that, the Kprobe-optimizer calls stop_machine() to replace
+the optimized region with a jump instruction to the detour buffer,
+using text_poke_smp().
+1.4.6 Unoptimization
+When an optimized kprobe is unregistered, disabled, or blocked by
+another kprobe, it will be unoptimized.  If this happens before
+the optimization is complete, the kprobe is just dequeued from the
+optimized list.  If the optimization has been done, the jump is
+replaced with the original code (except for an int3 breakpoint in
+the first byte) by using text_poke_smp().
+(*)Please imagine that the 2nd instruction is interrupted and then
+the optimizer replaces the 2nd instruction with the jump *address*
+while the interrupt handler is running. When the interrupt
+returns to original address, there is no valid instruction,
+and it causes an unexpected result.
+(**)This optimization-safety checking may be replaced with the
+stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y
+kernel.
+NOTE for geeks:
+The jump optimization changes the kprobe's pre_handler behavior.
+Without optimization, the pre_handler can change the kernel's execution
+path by changing regs->ip and returning 1.  However, when the probe
+is optimized, that modification is ignored.  Thus, if you want to
+tweak the kernel's execution path, you need to suppress optimization,
+using one of the following techniques:
+- Specify an empty function for the kprobe's post_handler or break_handler.
+ or
+- Config CONFIG_OPTPROBES=n.
+ or
+- Execute 'sysctl -w debug.kprobes_optimization=n'
 2. Architectures Supported
 Kprobes, jprobes, and return probes are implemented on the following
 architectures:
- i386
+- i386 (Supports jump optimization)
- x86_64 (AMD-64, EM64T)
+- x86_64 (AMD-64, EM64T) (Supports jump optimization)
 - ppc64
 - ia64 (Does not support probes on instruction slot1.)
 - sparc64 (Return probes not yet implemented.)
@@ -193,6 +307,10 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
 so you can use "objdump -d -l vmlinux" to see the source-to-object
 code mapping.
+If you want to reduce probing overhead, set "Kprobes jump optimization
+support" (CONFIG_OPTPROBES) to "y". You can find this option under the
+"Kprobes" line.
 4. API Reference
 The Kprobes API includes a "register" function and an "unregister"
@@ -389,7 +507,10 @@ the probe which has been registered.
 Kprobes allows multiple probes at the same address.  Currently,
 however, there cannot be multiple jprobes on the same function at
-the same time.
+the same time.  Also, a probepoint for which there is a jprobe or
+a post_handler cannot be optimized.  So if you install a jprobe,
+or a kprobe with a post_handler, at an optimized probepoint, the
+probepoint will be unoptimized automatically.
 In general, you can install a probe anywhere in the kernel.
 In particular, you can probe interrupt handlers.  Known exceptions
@@ -453,6 +574,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes)
 on the x86_64 version of __switch_to(); the registration functions
 return -EINVAL.
+On x86/x86-64, since the Jump Optimization of Kprobes modifies
+instructions widely, there are some limitations to optimization. To
+explain it, we introduce some terminology. Imagine a 3-instruction
+sequence consisting of a two 2-byte instructions and one 3-byte
+instruction.
+        IA
+         |
+[-2][-1][0][1][2][3][4][5][6][7]
+        [ins1][ins2][  ins3 ]
+        [<-     DCR       ->]
+           [<- JTPR ->]
+ins1: 1st Instruction
+ins2: 2nd Instruction
+ins3: 3rd Instruction
+IA:  Insertion Address
+JTPR: Jump Target Prohibition Region
+DCR: Detoured Code Region
+The instructions in DCR are copied to the out-of-line buffer
+of the kprobe, because the bytes in DCR are replaced by
+a 5-byte jump instruction. So there are several limitations.
+a) The instructions in DCR must be relocatable.
+b) The instructions in DCR must not include a call instruction.
+c) JTPR must not be targeted by any jump or call instruction.
+d) DCR must not straddle the border betweeen functions.
+Anyway, these limitations are checked by the in-kernel instruction
+decoder, so you don't need to worry about that.
 6. Probe Overhead
 On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0
@@ -476,6 +629,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
 ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
 k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
+6.1 Optimized Probe Overhead
+Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to
+process. Here are sample overhead figures (in usec) for x86 architectures.
+k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe,
+r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe.
+i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
+k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33
+x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
+k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30
 7. TODO
 a. SystemTap (http://sourceware.org/systemtap): Provides a simplified
@@ -523,7 +689,8 @@ is also specified. Following columns show probe status. If the probe is on
 a virtual address that is no longer valid (module init sections, module
 virtual addresses that correspond to modules that've been unloaded),
 such probes are marked with [GONE]. If the probe is temporarily disabled,
-such probes are marked with [DISABLED].
+such probes are marked with [DISABLED]. If the probe is optimized, it is
+marked with [OPTIMIZED].
 /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
@@ -533,3 +700,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this
 file. Note that this knob just disarms and arms all kprobes and doesn't
 change each probe's disabling state. This means that disabled kprobes (marked
 [DISABLED]) will be not enabled if you turn ON all kprobes by this knob.
+Appendix B: The kprobes sysctl interface
+/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF.
+When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides
+a knob to globally and forcibly turn jump optimization (see section
+1.4) ON or OFF. By default, jump optimization is allowed (ON).
+If you echo "0" to this file or set "debug.kprobes_optimization" to
+0 via sysctl, all optimized probes will be unoptimized, and any new
+probes registered after that will not be optimized.  Note that this
+knob *changes* the optimized state. This means that optimized probes
+(marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be
+removed). If the knob is turned on, they will be optimized again.
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 2811e452f756..c6416a398163 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -23,12 +23,12 @@ of a virtual machine.  The ioctls belong to three classes
   Only run vcpu ioctls from the same thread that was used to create the
   vcpu.
-2. File descritpors
+2. File descriptors
 The kvm API is centered around file descriptors.  An initial
 open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
 can be used to issue system ioctls.  A KVM_CREATE_VM ioctl on this
-handle will create a VM file descripror which can be used to issue VM
+handle will create a VM file descriptor which can be used to issue VM
 ioctls.  A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
 and return a file descriptor pointing to it.  Finally, ioctls on a vcpu
 fd can be used to control the vcpu, including the important task of
@@ -643,7 +643,7 @@ Type: vm ioctl
 Parameters: struct kvm_clock_data (in)
 Returns: 0 on success, -1 on error
-Sets the current timestamp of kvmclock to the valued specific in its parameter.
+Sets the current timestamp of kvmclock to the value specified in its parameter.
 In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios
 such as migration.
@@ -795,11 +795,11 @@ Unused.
                        __u64 data_offset; /* relative to kvm_run start */
                } io;
-If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has
+If exit_reason is KVM_EXIT_IO, then the vcpu has
 executed a port I/O instruction which could not be satisfied by kvm.
 data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
 where kvm expects application code to place the data for the next
-KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a patcked array.
+KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a packed array.
                struct {
                        struct kvm_debug_exit_arch arch;
@@ -815,7 +815,7 @@ Unused.
                        __u8  is_write;
                } mmio;
-If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has
+If exit_reason is KVM_EXIT_MMIO, then the vcpu has
 executed a memory-mapped I/O instruction which could not be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index b37300edf27c..07375e73981a 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -41,6 +41,7 @@ Possible debug options are
        P               Poisoning (object and padding)
        U               User tracking (free and alloc)
        T               Trace (please only use on single slabs)
+        A               Toggle failslab filter mark for the cache
        O               Switch debugging off for caches that would have
                        caused higher minimum slab orders
        -               Switch all debugging off (useful if the kernel is