16 files changed, 563 insertions, 257 deletions
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
index b0756d0fd579..8bca1d5cec09 100644
--- a/Documentation/DocBook/tracepoint.tmpl
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -86,4 +86,9 @@
 !Iinclude/trace/events/irq.h
  </chapter>
+  <chapter id="signal">
+   <title>SIGNAL</title>
+!Iinclude/trace/events/signal.h
+  </chapter>
 </book>
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 187bbf10c923..8608fd85e921 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,185 +1,10 @@
 CONFIG_RCU_TRACE debugfs Files and Formats
-The rcupreempt and rcutree implementations of RCU provide debugfs trace
+The rcutree implementation of RCU provides debugfs trace output that
-output that summarizes counters and state.  This information is useful for
+summarizes counters and state.  This information is useful for debugging
-debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+RCU itself, and can sometimes also help to debug abuses of RCU.
-Note that the rcuclassic implementation of RCU does not provide debugfs
+The following sections describe the debugfs files and formats.
-trace output.
-The following sections describe the debugfs files and formats for
-preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
-Preemptable RCU debugfs Files and Formats
-This implementation of RCU provides three debugfs files under the
-top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
-counters used by preemptable RCU) rcu/rcugp (which displays grace-period
-counters), and rcu/rcustats (which internal counters for debugging RCU).
-The output of "cat rcu/rcuctrs" looks as follows:
-CPU last cur F M
-  0    5  -5 0 0
-  1   -1   0 0 0
-  2    0   1 0 0
-  3    0   1 0 0
-  4    0   1 0 0
-  5    0   1 0 0
-  6    0   2 0 0
-  7    0  -1 0 0
-  8    0   1 0 0
-ggp = 26226, state = waitzero
-The per-CPU fields are as follows:
-o       "CPU" gives the CPU number.  Offline CPUs are not displayed.
-o       "last" gives the value of the counter that is being decremented
-        for the current grace period phase.  In the example above,
-        the counters sum to 4, indicating that there are still four
-        RCU read-side critical sections still running that started
-        before the last counter flip.
-o       "cur" gives the value of the counter that is currently being
-        both incremented (by rcu_read_lock()) and decremented (by
-        rcu_read_unlock()).  In the example above, the counters sum to
-        1, indicating that there is only one RCU read-side critical section
-        still running that started after the last counter flip.
-o       "F" indicates whether RCU is waiting for this CPU to acknowledge
-        a counter flip.  In the above example, RCU is not waiting on any,
-        which is consistent with the state being "waitzero" rather than
-        "waitack".
-o       "M" indicates whether RCU is waiting for this CPU to execute a
-        memory barrier.  In the above example, RCU is not waiting on any,
-        which is consistent with the state being "waitzero" rather than
-        "waitmb".
-o       "ggp" is the global grace-period counter.
-o       "state" is the RCU state, which can be one of the following:
-        o       "idle": there is no grace period in progress.
-        o       "waitack": RCU just incremented the global grace-period
-                counter, which has the effect of reversing the roles of
-                the "last" and "cur" counters above, and is waiting for
-                all the CPUs to acknowledge the flip.  Once the flip has
-                been acknowledged, CPUs will no longer be incrementing
-                what are now the "last" counters, so that their sum will
-                decrease monotonically down to zero.
-        o       "waitzero": RCU is waiting for the sum of the "last" counters
-                to decrease to zero.
-        o       "waitmb": RCU is waiting for each CPU to execute a memory
-                barrier, which ensures that instructions from a given CPU's
-                last RCU read-side critical section cannot be reordered
-                with instructions following the memory-barrier instruction.
-The output of "cat rcu/rcugp" looks as follows:
-oldggp=48870  newggp=48873
-Note that reading from this file provokes a synchronize_rcu().  The
-"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
-executing the synchronize_rcu(), and the "newggp" value is also the
-"ggp" value, but taken after the synchronize_rcu() command returns.
-The output of "cat rcu/rcugp" looks as follows:
-na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
-1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
-z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
-These are counters tracking internal preemptable-RCU events, however,
-some of them may be useful for debugging algorithms using RCU.  In
-particular, the "nl", "wl", and "dl" values track the number of RCU
-callbacks in various states.  The fields are as follows:
-o       "na" is the total number of RCU callbacks that have been enqueued
-        since boot.
-o       "nl" is the number of RCU callbacks waiting for the previous
-        grace period to end so that they can start waiting on the next
-        grace period.
-o       "wa" is the total number of RCU callbacks that have started waiting
-        for a grace period since boot.  "na" should be roughly equal to
-        "nl" plus "wa".
-o       "wl" is the number of RCU callbacks currently waiting for their
-        grace period to end.
-o       "da" is the total number of RCU callbacks whose grace periods
-        have completed since boot.  "wa" should be roughly equal to
-        "wl" plus "da".
-o       "dr" is the total number of RCU callbacks that have been removed
-        from the list of callbacks ready to invoke.  "dr" should be roughly
-        equal to "da".
-o       "di" is the total number of RCU callbacks that have been invoked
-        since boot.  "di" should be roughly equal to "da", though some
-        early versions of preemptable RCU had a bug so that only the
-        last CPU's count of invocations was displayed, rather than the
-        sum of all CPU's counts.
-o       "1" is the number of calls to rcu_try_flip().  This should be
-        roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1"
-        described below.  In other words, the number of times that
-        the state machine is visited should be equal to the sum of the
-        number of times that each state is visited plus the number of
-        times that the state-machine lock acquisition failed.
-o       "e1" is the number of times that rcu_try_flip() was unable to
-        acquire the fliplock.
-o       "i1" is the number of calls to rcu_try_flip_idle().
-o       "ie1" is the number of times rcu_try_flip_idle() exited early
-        due to the calling CPU having no work for RCU.
-o       "g1" is the number of times that rcu_try_flip_idle() decided
-        to start a new grace period.  "i1" should be roughly equal to
-        "ie1" plus "g1".
-o       "a1" is the number of calls to rcu_try_flip_waitack().
-o       "ae1" is the number of times that rcu_try_flip_waitack() found
-        that at least one CPU had not yet acknowledge the new grace period
-        (AKA "counter flip").
-o       "a2" is the number of time rcu_try_flip_waitack() found that
-        all CPUs had acknowledged.  "a1" should be roughly equal to
-        "ae1" plus "a2".  (This particular output was collected on
-        a 128-CPU machine, hence the smaller-than-usual fraction of
-        calls to rcu_try_flip_waitack() finding all CPUs having already
-        acknowledged.)
-o       "z1" is the number of calls to rcu_try_flip_waitzero().
-o       "ze1" is the number of times that rcu_try_flip_waitzero() found
-        that not all of the old RCU read-side critical sections had
-        completed.
-o       "z2" is the number of times that rcu_try_flip_waitzero() finds
-        the sum of the counters equal to zero, in other words, that
-        all of the old RCU read-side critical sections had completed.
-        The value of "z1" should be roughly equal to "ze1" plus
-        "z2".
-o       "m1" is the number of calls to rcu_try_flip_waitmb().
-o       "me1" is the number of times that rcu_try_flip_waitmb() finds
-        that at least one CPU has not yet executed a memory barrier.
-o       "m2" is the number of times that rcu_try_flip_waitmb() finds that
-        all CPUs have executed a memory barrier.
 Hierarchical RCU debugfs Files and Formats
@@ -210,9 +35,10 @@ rcu_bh:
  6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
  7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
-The first section lists the rcu_data structures for rcu, the second for
+The first section lists the rcu_data structures for rcu_sched, the second
-rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
+for rcu_bh.  Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
-The fields are as follows:
+additional section for rcu_preempt.  Each section has one line per CPU,
+or eight for this 8-CPU system.  The fields are as follows:
 o       The number at the beginning of each line is the CPU number.
        CPUs numbers followed by an exclamation mark are offline,
@@ -223,9 +49,9 @@ o	The number at the beginning of each line is the CPU number.
 o       "c" is the count of grace periods that this CPU believes have
        completed.  CPUs in dynticks idle mode may lag quite a ways
-        behind, for example, CPU 4 under "rcu" above, which has slept
+        behind, for example, CPU 4 under "rcu_sched" above, which has
-        through the past 25 RCU grace periods.  It is not unusual to
+        slept through the past 25 RCU grace periods.  It is not unusual
-        see CPUs lagging by thousands of grace periods.
+        to see CPUs lagging by thousands of grace periods.
 o       "g" is the count of grace periods that this CPU believes have
        started.  Again, CPUs in dynticks idle mode may lag behind.
@@ -308,8 +134,10 @@ The output of "cat rcu/rcugp" looks as follows:
 rcu_sched: completed=33062  gpnum=33063
 rcu_bh: completed=464  gpnum=464
-Again, this output is for both "rcu" and "rcu_bh".  The fields are
+Again, this output is for both "rcu_sched" and "rcu_bh".  Note that
-taken from the rcu_state structure, and are as follows:
+kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional
+"rcu_preempt" line.  The fields are taken from the rcu_state structure,
+and are as follows:
 o       "completed" is the number of grace periods that have completed.
        It is comparable to the "c" field from rcu/rcudata in that a
@@ -324,23 +152,24 @@ o	"gpnum" is the number of grace periods that have started.  It is
        If these two fields are equal (as they are for "rcu_bh" above),
        then there is no grace period in progress, in other words, RCU
        is idle.  On the other hand, if the two fields differ (as they
-        do for "rcu" above), then an RCU grace period is in progress.
+        do for "rcu_sched" above), then an RCU grace period is in progress.
 The output of "cat rcu/rcuhier" looks as follows, with very long lines:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
-1/1 0:127 ^0    
+1/1 .>. 0:127 ^0    
-3/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
-3/3f 0:5 ^0    2/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3    
 rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
-0/1 0:127 ^0    
+0/1 .>. 0:127 ^0    
-0/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
-0/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3
+0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
-This is once again split into "rcu" and "rcu_bh" portions.  The fields are
+This is once again split into "rcu_sched" and "rcu_bh" portions,
-as follows:
+and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
+"rcu_preempt" section.  The fields are as follows:
 o       "c" is exactly the same as "completed" under rcu/rcugp.
@@ -372,6 +201,11 @@ o	"fqlh" is the number of calls to force_quiescent_state() that
        exited immediately (without even being counted in nfqs above)
        due to contention on ->fqslock.
+o       "oqlen" is the number of callbacks on the "orphan" callback
+        list.  RCU callbacks are placed on this list by CPUs going
+        offline, and are "adopted" either by the CPU helping the outgoing
+        CPU or by the next rcu_barrier*() call, whichever comes first.
 o       Each element of the form "1/1 0:127 ^0" represents one struct
        rcu_node.  Each line represents one level of the hierarchy, from
        root to leaves.  It is best to think of the rcu_data structures
@@ -379,7 +213,7 @@ o	Each element of the form "1/1 0:127 ^0" represents one struct
        might be either one, two, or three levels of rcu_node structures,
        depending on the relationship between CONFIG_RCU_FANOUT and
        CONFIG_NR_CPUS.
-        
        o       The numbers separated by the "/" are the qsmask followed
                by the qsmaskinit.  The qsmask will have one bit
                set for each entity in the next lower level that
@@ -389,10 +223,19 @@ o	Each element of the form "1/1 0:127 ^0" represents one struct
                The value of qsmaskinit is assigned to that of qsmask
                at the beginning of each grace period.
-                For example, for "rcu", the qsmask of the first entry
+                For example, for "rcu_sched", the qsmask of the first
-                of the lowest level is 0x14, meaning that we are still
+                entry of the lowest level is 0x14, meaning that we
-                waiting for CPUs 2 and 4 to check in for the current
+                are still waiting for CPUs 2 and 4 to check in for the
-                grace period.
+                current grace period.
+        o       The characters separated by the ">" indicate the state
+                of the blocked-tasks lists.  A "T" preceding the ">"
+                indicates that at least one task blocked in an RCU
+                read-side critical section blocks the current grace
+                period, while a "." preceding the ">" indicates otherwise.
+                The character following the ">" indicates similarly for
+                the next grace period.  A "T" should appear in this
+                field only for rcu_preempt.
        o       The numbers separated by the ":" are the range of CPUs
                served by this struct rcu_node.  This can be helpful
@@ -431,8 +274,9 @@ rcu_bh:
  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
-As always, this is once again split into "rcu" and "rcu_bh" portions.
+As always, this is once again split into "rcu_sched" and "rcu_bh"
-The fields are as follows:
+portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
+"rcu_preempt" section.  The fields are as follows:
 o       "np" is the number of times that __rcu_pending() has been invoked
        for the corresponding flavor of RCU.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index e41a7fecf0d3..d542ca243b80 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -830,7 +830,7 @@ sched:	Critical sections	Grace period		Barrier
 SRCU:   Critical sections       Grace period            Barrier
        srcu_read_lock          synchronize_srcu        N/A
-        srcu_read_unlock
+        srcu_read_unlock        synchronize_srcu_expedited
 SRCU:   Initialization/cleanup
        init_srcu_struct
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index e1efc400bed6..e151b2a36267 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -65,6 +65,7 @@ aicdb.h*
 asm-offsets.h
 asm_offsets.h
 autoconf.h*
+av_permissions.h
 bbootsect
 bin2c
 binkernel.spec
@@ -95,12 +96,14 @@ docproc
 elf2ecoff
 elfconfig.h*
 fixdep
+flask.h
 fore200e_mkfirm
 fore200e_pca_fw.c*
 gconf
 gen-devlist
 gen_crc32table
 gen_init_cpio
+genheaders
 genksyms
 *_gray256.c
 ihex2fw
diff --git a/Documentation/fb/framebuffer.txt b/Documentation/fb/framebuffer.txt
index b3e3a0356839..fe79e3c8847d 100644
--- a/Documentation/fb/framebuffer.txt
+++ b/Documentation/fb/framebuffer.txt
@@ -312,10 +312,8 @@ and to the following documentation:
 8. Mailing list
 ---------------
-There are several frame buffer device related mailing lists at SourceForge:
+There is a frame buffer device related mailing list at kernel.org:
-  - linux-fbdev-announce@lists.sourceforge.net, for announcements,
+linux-fbdev@vger.kernel.org.
-  - linux-fbdev-user@lists.sourceforge.net, for generic user support,
-  - linux-fbdev-devel@lists.sourceforge.net, for project developers.
 Point your web browser to http://sourceforge.net/projects/linux-fbdev/ for
 subscription information and archive browsing.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index bc693fffabe0..f613df8ec7bf 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,21 @@ be removed from this file.
 ---------------------------
+What:   USER_SCHED
+When:   2.6.34
+Why:    USER_SCHED was implemented as a proof of concept for group scheduling.
+        The effect of USER_SCHED can already be achieved from userspace with
+        the help of libcgroup. The removal of USER_SCHED will also simplify
+        the scheduler code with the removal of one major ifdef. There are also
+        issues USER_SCHED has with USER_NS. A decision was taken not to fix
+        those and instead remove USER_SCHED. Also new group scheduling
+        features will not be implemented for USER_SCHED.
+Who:    Dhaval Giani <dhaval@linux.vnet.ibm.com>
+---------------------------
 What:   PRISM54
 When:   2.6.34
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 9e94b9491d89..a91e2e2095b0 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -235,6 +235,7 @@ proc files.
                neg=N   Number of negative lookups made
                pos=N   Number of positive lookups made
                crt=N   Number of objects created by lookup
+                tmo=N   Number of lookups timed out and requeued
        Updates n=N     Number of update cookie requests seen
                nul=N   Number of upd reqs given a NULL parent
                run=N   Number of upd reqs granted CPU time
@@ -250,8 +251,10 @@ proc files.
                ok=N    Number of successful alloc reqs
                wt=N    Number of alloc reqs that waited on lookup completion
                nbf=N   Number of alloc reqs rejected -ENOBUFS
+                int=N   Number of alloc reqs aborted -ERESTARTSYS
                ops=N   Number of alloc reqs submitted
                owt=N   Number of alloc reqs waited for CPU time
+                abt=N   Number of alloc reqs aborted due to object death
        Retrvls n=N     Number of retrieval (read) requests seen
                ok=N    Number of successful retr reqs
                wt=N    Number of retr reqs that waited on lookup completion
@@ -261,6 +264,7 @@ proc files.
                oom=N   Number of retr reqs failed -ENOMEM
                ops=N   Number of retr reqs submitted
                owt=N   Number of retr reqs waited for CPU time
+                abt=N   Number of retr reqs aborted due to object death
        Stores  n=N     Number of storage (write) requests seen
                ok=N    Number of successful store reqs
                agn=N   Number of store reqs on a page already pending storage
@@ -268,12 +272,37 @@ proc files.
                oom=N   Number of store reqs failed -ENOMEM
                ops=N   Number of store reqs submitted
                run=N   Number of store reqs granted CPU time
+                pgs=N   Number of pages given store req processing time
+                rxd=N   Number of store reqs deleted from tracking tree
+                olm=N   Number of store reqs over store limit
+        VmScan  nos=N   Number of release reqs against pages with no pending store
+                gon=N   Number of release reqs against pages stored by the time the lock was granted
+                bsy=N   Number of release reqs ignored due to in-progress store
+                can=N   Number of page stores cancelled due to release req
        Ops     pend=N  Number of times async ops added to pending queues
                run=N   Number of times async ops given CPU time
                enq=N   Number of times async ops queued for processing
+                can=N   Number of async ops cancelled
+                rej=N   Number of async ops rejected due to object lookup/create failure
                dfr=N   Number of async ops queued for deferred release
                rel=N   Number of async ops released
                gc=N    Number of deferred-release async ops garbage collected
+        CacheOp alo=N   Number of in-progress alloc_object() cache ops
+                luo=N   Number of in-progress lookup_object() cache ops
+                luc=N   Number of in-progress lookup_complete() cache ops
+                gro=N   Number of in-progress grab_object() cache ops
+                upo=N   Number of in-progress update_object() cache ops
+                dro=N   Number of in-progress drop_object() cache ops
+                pto=N   Number of in-progress put_object() cache ops
+                syn=N   Number of in-progress sync_cache() cache ops
+                atc=N   Number of in-progress attr_changed() cache ops
+                rap=N   Number of in-progress read_or_alloc_page() cache ops
+                ras=N   Number of in-progress read_or_alloc_pages() cache ops
+                alp=N   Number of in-progress allocate_page() cache ops
+                als=N   Number of in-progress allocate_pages() cache ops
+                wrp=N   Number of in-progress write_page() cache ops
+                ucp=N   Number of in-progress uncache_page() cache ops
+                dsp=N   Number of in-progress dissociate_pages() cache ops
 (*) /proc/fs/fscache/histogram
@@ -299,6 +328,87 @@ proc files.
     jiffy range covered, and the SECS field the equivalent number of seconds.
+===========
+OBJECT LIST
+===========
+If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
+list of all the objects currently allocated and allow them to be viewed
+through:
+        /proc/fs/fscache/objects
+This will look something like:
+        [root@andromeda ~]# head /proc/fs/fscache/objects
+        OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA       OBJECT_KEY, AUX_DATA
+        ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
+           17e4b        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+           1693a        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+where the first set of columns before the '|' describe the object:
+        COLUMN  DESCRIPTION
+        ======= ===============================================================
+        OBJECT  Object debugging ID (appears as OBJ%x in some debug messages)
+        PARENT  Debugging ID of parent object
+        STAT    Object state
+        CHLDN   Number of child objects of this object
+        OPS     Number of outstanding operations on this object
+        OOP     Number of outstanding child object management operations
+        IPR
+        EX      Number of outstanding exclusive operations
+        READS   Number of outstanding read operations
+        EM      Object's event mask
+        EV      Events raised on this object
+        F       Object flags
+        S       Object slow-work work item flags
+and the second set of columns describe the object's cookie, if present:
+        COLUMN          DESCRIPTION
+        =============== =======================================================
+        NETFS_COOKIE_DEF Name of netfs cookie definition
+        TY              Cookie type (IX - index, DT - data, hex - special)
+        FL              Cookie flags
+        NETFS_DATA      Netfs private data stored in the cookie
+        OBJECT_KEY      Object key      } 1 column, with separating comma
+        AUX_DATA        Object aux data } presence may be configured
+The data shown may be filtered by attaching a key to an appropriate keyring
+before viewing the file.  Something like:
+                keyctl add user fscache:objlist <restrictions> @s
+where <restrictions> are a selection of the following letters:
+        K       Show hexdump of object key (don't show if not given)
+        A       Show hexdump of object aux data (don't show if not given)
+and the following paired letters:
+        C       Show objects that have a cookie
+        c       Show objects that don't have a cookie
+        B       Show objects that are busy
+        b       Show objects that aren't busy
+        W       Show objects that have pending writes
+        w       Show objects that don't have pending writes
+        R       Show objects that have outstanding reads
+        r       Show objects that don't have outstanding reads
+        S       Show objects that have slow work queued
+        s       Show objects that don't have slow work queued
+If neither side of a letter pair is given, then both are implied.  For example:
+        keyctl add user fscache:objlist KB @s
+shows objects that are busy, and lists their object keys, but does not dump
+their auxiliary data.  It also implies "CcWwRrSs", but as 'B' is given, 'b' is
+not implied.
+By default all objects and all fields will be shown.
 =========
 DEBUGGING
 =========
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 2666b1ed5e9e..1902c57b72ef 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -641,7 +641,7 @@ data file must be retired (see the relinquish cookie function below).
 Furthermore, note that this does not cancel the asynchronous read or write
 operation started by the read/alloc and write functions, so the page
-invalidation and release functions must use:
+invalidation functions must use:
        bool fscache_check_page_write(struct fscache_cookie *cookie,
                                      struct page *page);
@@ -654,6 +654,25 @@ to see if a page is being written to the cache, and:
 to wait for it to finish if it is.
+When releasepage() is being implemented, a special FS-Cache function exists to
+manage the heuristics of coping with vmscan trying to eject pages, which may
+conflict with the cache trying to write pages to the cache (which may itself
+need to allocate memory):
+        bool fscache_maybe_release_page(struct fscache_cookie *cookie,
+                                        struct page *page,
+                                        gfp_t gfp);
+This takes the netfs cookie, and the page and gfp arguments as supplied to
+releasepage().  It will return false if the page cannot be released yet for
+some reason and if it returns true, the page has been uncached and can now be
+released.
+To make a page available for release, this function may wait for an outstanding
+storage request to complete, or it may attempt to cancel the storage request -
+in which case the page will not be stored in the cache this time.
 ==========================
 INDEX AND DATA FILE UPDATE
 ==========================
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index c2a0871280a0..c58b9f5ba002 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -20,15 +20,16 @@ Lots of code taken from ext3 and other projects.
 Authors in alphabetical order:
 Joel Becker   <joel.becker@oracle.com>
 Zach Brown    <zach.brown@oracle.com>
-Mark Fasheh   <mark.fasheh@oracle.com>
+Mark Fasheh   <mfasheh@suse.com>
 Kurt Hackel   <kurt.hackel@oracle.com>
+Tao Ma        <tao.ma@oracle.com>
 Sunil Mushran <sunil.mushran@oracle.com>
 Manish Singh  <manish.singh@oracle.com>
+Tiger Yang    <tiger.yang@oracle.com>
 Caveats
 =======
 Features which OCFS2 does not support yet:
-        - quotas
        - Directory change notification (F_NOTIFY)
        - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
@@ -70,7 +71,6 @@ commit=nrsec	(*)	Ocfs2 can be told to sync all its data and metadata
                        performance.
 localalloc=8(*)         Allows custom localalloc size in MB. If the value is too
                        large, the fs will silently revert it to the default.
-                        Localalloc is not enabled for local mounts.
 localflocks             This disables cluster aware flock.
 inode64                 Indicates that Ocfs2 is allowed to create inodes at
                        any location in the filesystem, including those which
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f945546b..4af0018533f2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1072,7 +1072,8 @@ second).  The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b387e91f..fce5b5e516cc 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -85,7 +85,6 @@ parameter is applicable:
        PPT     Parallel port support is enabled.
        PS2     Appropriate PS/2 support is enabled.
        RAM     RAM disk support is enabled.
-        ROOTPLUG The example Root Plug LSM is enabled.
        S390    S390 architecture is enabled.
        SCSI    Appropriate SCSI support is enabled.
                        A lot of drivers has their options described inside of
@@ -345,6 +344,15 @@ and is between 256 and 4096 characters. It is defined in the file
                        Change the amount of debugging information output
                        when initialising the APIC and IO-APIC components.
+        show_lapic=     [APIC,X86] Advanced Programmable Interrupt Controller
+                        Limit apic dumping. The parameter defines the maximal
+                        number of local apics being dumped. It is also possible
+                        to set it to "all", meaning no limit.
+                        Format: { 1 (default) | 2 | ... | all }.
+                        The parameter is valid only if apic=debug or
+                        apic=verbose is specified.
+                        Example: apic=debug show_lapic=all
        apm=            [APM] Advanced Power Management
                        See header of arch/x86/kernel/apm_32.c.
@@ -779,6 +787,13 @@ and is between 256 and 4096 characters. It is defined in the file
                        by the set_ftrace_notrace file in the debugfs
                        tracing directory.
+        ftrace_graph_filter=[function-list]
+                        [FTRACE] Limit the top-level caller functions traced
+                        by the function graph tracer at boot up.
+                        function-list is a comma separated list of functions
+                        that can be changed at run time by the
+                        set_graph_function file in the debugfs tracing directory.
        gamecon.map[2|3]=
                        [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
                        support via parallel port (up to 5 devices per port)
@@ -2032,8 +2047,15 @@ and is between 256 and 4096 characters. It is defined in the file
        print-fatal-signals=
                        [KNL] debug: print fatal signals
-                        print-fatal-signals=1: print segfault info to
-                        the kernel console.
+                        If enabled, warn about various signal handling
+                        related application anomalies: too many signals,
+                        too many POSIX.1 timers, fatal signals causing a
+                        coredump - etc.
+                        If you hit the warning due to signal overflow,
+                        you might want to try "ulimit -i unlimited".
                        default: off.
        printk.time=    Show timing data prefixed to each printk message line
@@ -2164,15 +2186,6 @@ and is between 256 and 4096 characters. It is defined in the file
                        Useful for devices that are detected asynchronously
                        (e.g. USB and MMC devices).
-        root_plug.vendor_id=
-                        [ROOTPLUG] Override the default vendor ID
-        root_plug.product_id=
-                        [ROOTPLUG] Override the default product ID
-        root_plug.debug=
-                        [ROOTPLUG] Enable debugging output
        rw              [KNL] Mount root device read-write on boot
        S               [KNL] Run init in single mode
@@ -2182,6 +2195,8 @@ and is between 256 and 4096 characters. It is defined in the file
        sbni=           [NET] Granch SBNI12 leased line adapter
+        sched_debug     [KNL] Enables verbose scheduler debug messages.
        sc1200wdt=      [HW,WDT] SC1200 WDT (watchdog) driver
                        Format: <io>[,<timeout>[,<isapnp>]]
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 059934363caf..446f43b309df 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,5 +1,17 @@
 This file details changes in 2.6 which affect PCMCIA card driver authors:
+* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33)
+   Instead of the cs_error() callback or the CS_CHECK() macro, please use
+   Linux-style checking of return values, and -- if necessary -- debug
+   messages using "dev_dbg()" or "pr_debug()".
+* New CIS tuple access (as of 2.6.33)
+   Instead of pcmcia_get_{first,next}_tuple(), pcmcia_get_tuple_data() and
+   pcmcia_parse_tuple(), a driver shall use "pcmcia_get_tuple()" if it is
+   only interested in one (raw) tuple, or "pcmcia_loop_tuple()" if it is
+   interested in all tuples of one type. To decode the MAC from CISTPL_FUNCE,
+   a new helper "pcmcia_get_mac_from_cis()" was added.
 * New configuration loop helper (as of 2.6.28)
   By calling pcmcia_loop_config(), a driver can iterate over all available
   configuration options. During a driver's probe() phase, one doesn't need
diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index ebc50f808ea4..9dbf4470c7e1 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -41,6 +41,13 @@ expand files, provided the time taken to do so isn't too long.
 Operations of both types may sleep during execution, thus tying up the thread
 loaned to it.
+A further class of work item is available, based on the slow work item class:
+ (*) Delayed slow work items.
+These are slow work items that have a timer to defer queueing of the item for
+a while.
 THREAD-TO-CLASS ALLOCATION
 --------------------------
@@ -64,9 +71,11 @@ USING SLOW WORK ITEMS
 Firstly, a module or subsystem wanting to make use of slow work items must
 register its interest:
-         int ret = slow_work_register_user();
+         int ret = slow_work_register_user(struct module *module);
-This will return 0 if successful, or a -ve error upon failure.
+This will return 0 if successful, or a -ve error upon failure.  The module
+pointer should be the module interested in using this facility (almost
+certainly THIS_MODULE).
 Slow work items may then be set up by:
@@ -93,6 +102,10 @@ Slow work items may then be set up by:
     or:
+        delayed_slow_work_init(&myitem, &myitem_ops);
+     or:
        vslow_work_init(&myitem, &myitem_ops);
     depending on its class.
@@ -102,15 +115,92 @@ A suitably set up work item can then be enqueued for processing:
        int ret = slow_work_enqueue(&myitem);
 This will return a -ve error if the thread pool is unable to gain a reference
-on the item, 0 otherwise.
+on the item, 0 otherwise.  Delayed work items are instead enqueued with:
+        int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay);
 The items are reference counted, so there ought to be no need for a flush
-operation.  When all a module's slow work items have been processed, and the
+operation.  But as the reference counting is optional, means to cancel
+existing work items are also included:
+        cancel_slow_work(&myitem);
+        cancel_delayed_slow_work(&myitem);
+can be used to cancel pending work.  The above cancel functions wait for
+existing work to have been executed (or prevent its execution, depending
+on timing).
+When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
-        slow_work_unregister_user();
+        slow_work_unregister_user(struct module *module);
+The module pointer is used to wait for all outstanding work items for that
+module before completing the unregistration.  This prevents the put_ref() code
+from being taken away before it completes.  module should almost certainly be
+THIS_MODULE.
+================
+HELPER FUNCTIONS
+================
+The slow-work facility provides a function by which it can be determined
+whether or not an item is queued for later execution:
+        bool queued = slow_work_is_queued(struct slow_work *work);
+If it returns false, then the item is not on the queue (it may be executing
+with a requeue pending).  This can be used to work out whether an item on which
+another depends is on the queue, thus allowing a dependent item to be queued
+after it.
+If the above shows an item on which another depends not to be queued, then the
+owner of the dependent item might need to wait.  However, to avoid locking up
+the threads unnecessarily by sleeping in them, it can make sense under some
+circumstances to return the work item to the queue, thus deferring it until
+some other items have had a chance to make use of the yielded thread.
+To yield a thread and defer an item, the work function should simply enqueue
+the work item again and return.  However, this doesn't work if there's nothing
+actually on the queue, as the thread just vacated will jump straight back into
+the item's work function, thus busy waiting on a CPU.
+Instead, the item should use the thread to wait for the dependency to go away,
+but rather than using schedule() or schedule_timeout() to sleep, it should use
+the following function:
+        bool requeue = slow_work_sleep_till_thread_needed(
+                        struct slow_work *work,
+                        signed long *_timeout);
+This will add a second wait and then sleep, such that it will be woken up if
+either something appears on the queue that could usefully make use of the
+thread - and behind which this item can be queued, or if the event the caller
+set up to wait for happens.  True will be returned if something else appeared
+on the queue and this work function should perhaps return, or false if
+something else woke it up.  The timeout is as for schedule_timeout().
+For example:
+        wq = bit_waitqueue(&my_flags, MY_BIT);
+        init_wait(&wait);
+        requeue = false;
+        do {
+                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+                if (!test_bit(MY_BIT, &my_flags))
+                        break;
+                requeue = slow_work_sleep_till_thread_needed(&my_work,
+                                                             &timeout);
+        } while (timeout > 0 && !requeue);
+        finish_wait(wq, &wait);
+        if (!test_bit(MY_BIT, &my_flags))
+                goto do_my_thing;
+        if (requeue)
+                return; // to slow_work
 ===============
@@ -118,7 +208,8 @@ ITEM OPERATIONS
 ===============
 Each work item requires a table of operations of type struct slow_work_ops.
-All members are required:
+Only ->execute() is required; the getting and putting of a reference and the
+describing of an item are all optional.
 (*) Get a reference on an item:
@@ -148,6 +239,16 @@ All members are required:
     This should perform the work required of the item.  It may sleep, it may
     perform disk I/O and it may wait for locks.
+ (*) View an item through debugfs:
+        void (*desc)(struct slow_work *work, struct seq_file *m);
+     If supplied, this should print to 'm' a small string describing the work
+     the item is to do.  This should be no more than about 40 characters, and
+     shouldn't include a newline character.
+     See the 'Viewing executing and queued items' section below.
 ==================
 POOL CONFIGURATION
@@ -172,3 +273,50 @@ The slow-work thread pool has a number of configurables:
     is bounded to between 1 and one fewer than the number of active threads.
     This ensures there is always at least one thread that can process very
     slow work items, and always at least one thread that won't.
+==================================
+VIEWING EXECUTING AND QUEUED ITEMS
+==================================
+If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available:
+        /sys/kernel/debug/slow_work/runqueue
+through which the list of work items being executed and the queues of items to
+be executed may be viewed.  The owner of a work item is given the chance to
+add some information of its own.
+The contents look something like the following:
+    THR PID   ITEM ADDR        FL MARK  DESC
+    === ===== ================ == ===== ==========
+      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
+      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
+      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
+      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
+      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
+      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
+      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
+      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
+    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
+    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
+    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
+    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
+    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
+    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
+    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
+    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
+    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
+    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
+    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
+    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
+    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
+    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK
+In the 'THR' column, executing items show the thread they're occupying and
+queued items indicate which queue they're on.  'PID' shows the process ID of
+a slow-work thread that's executing something.  'FL' shows the work item flags.
+'MARK' indicates how long since an item was queued or began executing.  Lastly,
+the 'DESC' column permits the owner of an item to give some information.
diff --git a/Documentation/sysctl/ctl_unnumbered.txt b/Documentation/sysctl/ctl_unnumbered.txt
deleted file mode 100644
index 23003a8ea3e7..000000000000
--- a/Documentation/sysctl/ctl_unnumbered.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-Except for a few extremely rare exceptions user space applications do not use
-the binary sysctl interface.  Instead everyone uses /proc/sys/...  with
-readable ascii names.
-Recently the kernel has started supporting setting the binary sysctl value to
-CTL_UNNUMBERED so we no longer need to assign a binary sysctl path to allow
-sysctls to show up in /proc/sys.
-Assigning binary sysctl numbers is an endless source of conflicts in sysctl.h,
-breaking of the user space ABI (because of those conflicts), and maintenance
-problems.  A complete pass through all of the sysctl users revealed multiple
-instances where the sysctl binary interface was broken and had gone undetected
-for years.
-So please do not add new binary sysctl numbers.  They are unneeded and
-problematic.
-If you really need a new binary sysctl number please first merge your sysctl
-into the kernel and then as a separate patch allocate a binary sysctl number.
-(ebiederm@xmission.com, June 2007)
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 7003e10f10f5..641a1ef2a7ff 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -213,10 +213,19 @@ If you can't trace NMI functions, then skip this option.
 <details to be filled>
-HAVE_FTRACE_SYSCALLS
+HAVE_SYSCALL_TRACEPOINTS
 ---------------------
-<details to be filled>
+You need very few things to get syscall tracing working in an arch.
+- Have a NR_syscalls variable in <asm/unistd.h> that provides the number
+  of syscalls supported by the arch.
+- Implement arch_syscall_addr() that resolves a syscall address from a
+  syscall number.
+- Support the TIF_SYSCALL_TRACEPOINT thread flag.
+- Put the trace_sys_enter() and trace_sys_exit() tracepoint calls in the
+  ptrace syscall tracing path.
+- Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
 HAVE_FTRACE_MCOUNT_RECORD
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
new file mode 100644
index 000000000000..47aabeebbdf6
--- /dev/null
+++ b/Documentation/trace/kprobetrace.txt
@@ -0,0 +1,149 @@
+                        Kprobe-based Event Tracing
+                        ==========================
+                 Documentation is written by Masami Hiramatsu
+Overview
+--------
+These events are similar to tracepoint based events. Instead of Tracepoint,
+this is based on kprobes (kprobe and kretprobe). So it can probe wherever
+kprobes can probe (this means all function bodies except those of __kprobes
+functions). Unlike the Tracepoint based event, this can be added and removed
+dynamically, on the fly.
+To enable this feature, build your kernel with CONFIG_KPROBE_TRACING=y.
+Similar to the events tracer, this doesn't need to be activated via
+current_tracer. Instead of that, add probe points via
+/sys/kernel/debug/tracing/kprobe_events, and enable it via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable.
+Synopsis of kprobe_events
+-------------------------
+  p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]     : Set a probe
+  r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS]                : Set a return probe
+ GRP            : Group name. If omitted, use "kprobes" for it.
+ EVENT          : Event name. If omitted, the event name is generated
+                  based on SYMBOL+offs or MEMADDR.
+ SYMBOL[+offs]  : Symbol+offset where the probe is inserted.
+ MEMADDR        : Address where the probe is inserted.
+ FETCHARGS      : Arguments. Each probe can have up to 128 args.
+  %REG          : Fetch register REG
+  @ADDR         : Fetch memory at ADDR (ADDR should be in kernel)
+  @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
+  $stackN       : Fetch Nth entry of stack (N >= 0)
+  $stack        : Fetch stack address.
+  $argN         : Fetch function argument. (N >= 0)(*)
+  $retval       : Fetch return value.(**)
+  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
+  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+  (*) $argN may not be correct for asmlinkage functions or in the middle of
+      a function body.
+  (**) only for return probe.
+  (***) this is useful for fetching a field of data structures.
+Per-Probe Event Filtering
+-------------------------
+ The per-probe event filtering feature allows you to set a different filter on
+each probe and tells you which arguments will be shown in the trace buffer. If
+an event name is specified right after 'p:' or 'r:' in kprobe_events, an event
+is added under tracing/events/kprobes/<EVENT>; in that directory you can see
+'id', 'enable', 'format' and 'filter'.
+enable:
+  You can enable/disable the probe by writing 1 or 0 on it.
+format:
+  This shows the format of this probe event.
+filter:
+  You can write filtering rules of this event.
+id:
+  This shows the id of this probe event.
+Event Profiling
+---------------
+ You can check the total number of probe hits and probe miss-hits via
+/sys/kernel/debug/tracing/kprobe_profile.
+ The first column is event name, the second is the number of probe hits,
+the third is the number of probe miss-hits.
+Usage examples
+--------------
+To add a probe as a new event, write a new definition to kprobe_events
+as below.
+  echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
+ This sets a kprobe at the top of the do_sys_open() function, recording its
+1st to 4th arguments as the "myprobe" event. As this example shows, users can
+choose more familiar names for each argument.
+  echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
+ This sets a kretprobe on the return point of the do_sys_open() function,
+recording the return value as the "myretprobe" event.
+ You can see the format of these events via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
+  cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
+name: myprobe
+ID: 75
+format:
+        field:unsigned short common_type;       offset:0;       size:2;
+        field:unsigned char common_flags;       offset:2;       size:1;
+        field:unsigned char common_preempt_count;       offset:3;       size:1;
+        field:int common_pid;   offset:4;       size:4;
+        field:int common_tgid;  offset:8;       size:4;
+        field: unsigned long ip;        offset:16;      size:8;
+        field: int nargs;       offset:24;      size:4;
+        field: unsigned long dfd;       offset:32;      size:8;
+        field: unsigned long filename;  offset:40;      size:8;
+        field: unsigned long flags;     offset:48;      size:8;
+        field: unsigned long mode;      offset:56;      size:8;
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
+ You can see that the event has 4 arguments as in the expressions you specified.
+  echo > /sys/kernel/debug/tracing/kprobe_events
+ This clears all probe points.
+ Right after definition, each event is disabled by default. For tracing these
+events, you need to enable them.
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
+ And you can see the traced information via /sys/kernel/debug/tracing/trace.
+  cat /sys/kernel/debug/tracing/trace
+# tracer: nop
+#
+#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+#              | |       |          |         |
+           <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $retval=fffffffffffffffe
+           <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+           <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+ Each line shows when the kernel hits an event, and <- SYMBOL means the kernel
+returns from SYMBOL (e.g. "sys_open+0x1b/0x1d <- do_sys_open" means the kernel
+returns from do_sys_open to sys_open+0x1b).