diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/ABI/stable/sysfs-devices-node | 7 | ||||
-rw-r--r-- | Documentation/fault-injection/provoke-crashes.txt | 38 | ||||
-rw-r--r-- | Documentation/feature-removal-schedule.txt | 32 | ||||
-rw-r--r-- | Documentation/filesystems/Locking | 18 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/nfs41-server.txt | 5 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 53 | ||||
-rw-r--r-- | Documentation/gpio.txt | 64 | ||||
-rw-r--r-- | Documentation/init.txt | 49 | ||||
-rw-r--r-- | Documentation/kprobes.txt | 207 | ||||
-rw-r--r-- | Documentation/kvm/api.txt | 12 | ||||
-rw-r--r-- | Documentation/vm/slub.txt | 1 |
11 files changed, 446 insertions, 40 deletions
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node new file mode 100644 index 000000000000..49b82cad7003 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-devices-node | |||
@@ -0,0 +1,7 @@ | |||
1 | What: /sys/devices/system/node/nodeX | ||
2 | Date: October 2002 | ||
3 | Contact: Linux Memory Management list <linux-mm@kvack.org> | ||
4 | Description: | ||
5 | When CONFIG_NUMA is enabled, this is a directory containing | ||
6 | information on node X such as what CPUs are local to the | ||
7 | node. | ||
diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt new file mode 100644 index 000000000000..7a9d3d81525b --- /dev/null +++ b/Documentation/fault-injection/provoke-crashes.txt | |||
@@ -0,0 +1,38 @@ | |||
1 | The lkdtm module provides an interface to crash or injure the kernel at | ||
2 | predefined crashpoints to evaluate the reliability of crash dumps obtained | ||
3 | using different dumping solutions. The module uses KPROBEs to instrument | ||
4 | crashing points, but can also crash the kernel directly without KPROBE | ||
5 | support. | ||
6 | |||
7 | |||
8 | You can specify the crash to provoke either through module arguments when | ||
9 | inserting the module, or through a debugfs interface. | ||
10 | |||
11 | Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> | ||
12 | [cpoint_count={>0}] | ||
13 | |||
14 | recur_count : Recursion level for the stack overflow test. Default is 10. | ||
15 | |||
16 | cpoint_name : Crash point where the kernel is to be crashed. It can be | ||
17 | one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, | ||
18 | FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, | ||
19 | IDE_CORE_CP, DIRECT | ||
20 | |||
21 | cpoint_type : Indicates the action to be taken on hitting the crash point. | ||
22 | It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, | ||
23 | CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, | ||
24 | WRITE_AFTER_FREE. | ||
25 | |||
26 | cpoint_count : Indicates the number of times the crash point is to be hit | ||
27 | to trigger an action. The default is 10. | ||
28 | |||
29 | You can also induce failures by mounting debugfs and writing the type to | ||
30 | <mountpoint>/provoke-crash/<crashpoint>. E.g., | ||
31 | |||
32 | mount -t debugfs debugfs /mnt | ||
33 | echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY | ||
34 | |||
35 | |||
36 | A special file is `DIRECT' which will induce the crash directly without | ||
37 | KPROBE instrumentation. This mode is the only one available when the module | ||
38 | is built on a kernel without KPROBEs support. | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 8debdd625e1a..a5cc0db63d7a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -550,3 +550,35 @@ Why: udev fully replaces this special file system that only contains CAPI | |||
550 | NCCI TTY device nodes. User space (pppdcapiplugin) works without | 550 | NCCI TTY device nodes. User space (pppdcapiplugin) works without |
551 | noticing the difference. | 551 | noticing the difference. |
552 | Who: Jan Kiszka <jan.kiszka@web.de> | 552 | Who: Jan Kiszka <jan.kiszka@web.de> |
553 | |||
554 | ---------------------------- | ||
555 | |||
556 | What: KVM memory aliases support | ||
557 | When: July 2010 | ||
558 | Why: Memory aliasing support is used for speeding up guest vga access | ||
559 | through the vga windows. | ||
560 | |||
561 | Modern userspace no longer uses this feature, so it's just bitrotted | ||
562 | code and can be removed with no impact. | ||
563 | Who: Avi Kivity <avi@redhat.com> | ||
564 | |||
565 | ---------------------------- | ||
566 | |||
567 | What: KVM kernel-allocated memory slots | ||
568 | When: July 2010 | ||
569 | Why: Since 2.6.25, kvm supports user-allocated memory slots, which are | ||
570 | much more flexible than kernel-allocated slots. All current userspace | ||
571 | supports the newer interface and this code can be removed with no | ||
572 | impact. | ||
573 | Who: Avi Kivity <avi@redhat.com> | ||
574 | |||
575 | ---------------------------- | ||
576 | |||
577 | What: KVM paravirt mmu host support | ||
578 | When: January 2011 | ||
579 | Why: The paravirt mmu host support is slower than non-paravirt mmu, both | ||
580 | on newer and older hardware. It is already not exposed to the guest, | ||
581 | and kept only for live migration purposes. | ||
582 | Who: Avi Kivity <avi@redhat.com> | ||
583 | |||
584 | ---------------------------- | ||
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca0630..06bbbed71206 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -460,13 +460,6 @@ in sys_read() and friends. | |||
460 | 460 | ||
461 | --------------------------- dquot_operations ------------------------------- | 461 | --------------------------- dquot_operations ------------------------------- |
462 | prototypes: | 462 | prototypes: |
463 | int (*initialize) (struct inode *, int); | ||
464 | int (*drop) (struct inode *); | ||
465 | int (*alloc_space) (struct inode *, qsize_t, int); | ||
466 | int (*alloc_inode) (const struct inode *, unsigned long); | ||
467 | int (*free_space) (struct inode *, qsize_t); | ||
468 | int (*free_inode) (const struct inode *, unsigned long); | ||
469 | int (*transfer) (struct inode *, struct iattr *); | ||
470 | int (*write_dquot) (struct dquot *); | 463 | int (*write_dquot) (struct dquot *); |
471 | int (*acquire_dquot) (struct dquot *); | 464 | int (*acquire_dquot) (struct dquot *); |
472 | int (*release_dquot) (struct dquot *); | 465 | int (*release_dquot) (struct dquot *); |
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. | |||
479 | What filesystem should expect from the generic quota functions: | 472 | What filesystem should expect from the generic quota functions: |
480 | 473 | ||
481 | FS recursion Held locks when called | 474 | FS recursion Held locks when called |
482 | initialize: yes maybe dqonoff_sem | ||
483 | drop: yes - | ||
484 | alloc_space: ->mark_dirty() - | ||
485 | alloc_inode: ->mark_dirty() - | ||
486 | free_space: ->mark_dirty() - | ||
487 | free_inode: ->mark_dirty() - | ||
488 | transfer: yes - | ||
489 | write_dquot: yes dqonoff_sem or dqptr_sem | 475 | write_dquot: yes dqonoff_sem or dqptr_sem |
490 | acquire_dquot: yes dqonoff_sem or dqptr_sem | 476 | acquire_dquot: yes dqonoff_sem or dqptr_sem |
491 | release_dquot: yes dqonoff_sem or dqptr_sem | 477 | release_dquot: yes dqonoff_sem or dqptr_sem |
@@ -495,10 +481,6 @@ write_info: yes dqonoff_sem | |||
495 | FS recursion means calling ->quota_read() and ->quota_write() from superblock | 481 | FS recursion means calling ->quota_read() and ->quota_write() from superblock |
496 | operations. | 482 | operations. |
497 | 483 | ||
498 | ->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called | ||
499 | only directly by the filesystem and do not call any fs functions only | ||
500 | the ->mark_dirty() operation. | ||
501 | |||
502 | More details about quota locking can be found in fs/dquot.c. | 484 | More details about quota locking can be found in fs/dquot.c. |
503 | 485 | ||
504 | --------------------------- vm_operations_struct ----------------------------- | 486 | --------------------------- vm_operations_struct ----------------------------- |
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 1bd0d0c05171..6a53a84afc72 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt | |||
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4 | |||
17 | on or off; rpc.nfsd does this correctly.) | 17 | on or off; rpc.nfsd does this correctly.) |
18 | 18 | ||
19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based | 19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based |
20 | on the latest NFSv4.1 Internet Draft: | 20 | on RFC 5661. |
21 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 | ||
22 | 21 | ||
23 | From the many new features in NFSv4.1 the current implementation | 22 | From the many new features in NFSv4.1 the current implementation |
24 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing | 23 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing |
@@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues: | |||
44 | trunking, but this is a mandatory feature, and its use is | 43 | trunking, but this is a mandatory feature, and its use is |
45 | recommended to clients in a number of places. (E.g. to ensure | 44 | recommended to clients in a number of places. (E.g. to ensure |
46 | timely renewal in case an existing connection's retry timeouts | 45 | timely renewal in case an existing connection's retry timeouts |
47 | have gotten too long; see section 8.3 of the draft.) | 46 | have gotten too long; see section 8.3 of the RFC.) |
48 | Therefore, lack of this feature may cause future clients to | 47 | Therefore, lack of this feature may cause future clients to |
49 | fail. | 48 | fail. |
50 | - Incomplete backchannel support: incomplete backchannel gss | 49 | - Incomplete backchannel support: incomplete backchannel gss |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a6..96a44dd95e03 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -164,6 +164,7 @@ read the file /proc/PID/status: | |||
164 | VmExe: 68 kB | 164 | VmExe: 68 kB |
165 | VmLib: 1412 kB | 165 | VmLib: 1412 kB |
166 | VmPTE: 20 kb | 166 | VmPTE: 20 kb |
167 | VmSwap: 0 kB | ||
167 | Threads: 1 | 168 | Threads: 1 |
168 | SigQ: 0/28578 | 169 | SigQ: 0/28578 |
169 | SigPnd: 0000000000000000 | 170 | SigPnd: 0000000000000000 |
@@ -188,6 +189,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file | |||
188 | contains details information about the process itself. Its fields are | 189 | contains details information about the process itself. Its fields are |
189 | explained in Table 1-4. | 190 | explained in Table 1-4. |
190 | 191 | ||
192 | (for SMP CONFIG users) | ||
193 | For making accounting scalable, RSS related information is handled in | ||
194 | asynchronous manner and the value may not be very precise. To see a precise | ||
195 | snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. | ||
196 | It's slow but very precise. | ||
197 | |||
191 | Table 1-2: Contents of the statm files (as of 2.6.30-rc7) | 198 | Table 1-2: Contents of the statm files (as of 2.6.30-rc7) |
192 | .............................................................................. | 199 | .............................................................................. |
193 | Field Content | 200 | Field Content |
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) | |||
213 | VmExe size of text segment | 220 | VmExe size of text segment |
214 | VmLib size of shared library code | 221 | VmLib size of shared library code |
215 | VmPTE size of page table entries | 222 | VmPTE size of page table entries |
223 | VmSwap size of swap usage (the number of referred swapents) | ||
216 | Threads number of threads | 224 | Threads number of threads |
217 | SigQ number of signals queued/max. number for queue | 225 | SigQ number of signals queued/max. number for queue |
218 | SigPnd bitmap of pending signals for the thread | 226 | SigPnd bitmap of pending signals for the thread |
@@ -430,6 +438,7 @@ Table 1-5: Kernel info in /proc | |||
430 | modules List of loaded modules | 438 | modules List of loaded modules |
431 | mounts Mounted filesystems | 439 | mounts Mounted filesystems |
432 | net Networking info (see text) | 440 | net Networking info (see text) |
441 | pagetypeinfo Additional page allocator information (see text) (2.5) | ||
433 | partitions Table of partitions known to the system | 442 | partitions Table of partitions known to the system |
434 | pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, | 443 | pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, |
435 | decoupled by lspci (2.4) | 444 | decoupled by lspci (2.4) |
@@ -584,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ... | |||
584 | Node 0, zone Normal 1 0 0 1 101 8 ... | 593 | Node 0, zone Normal 1 0 0 1 101 8 ... |
585 | Node 0, zone HighMem 2 0 0 1 1 0 ... | 594 | Node 0, zone HighMem 2 0 0 1 1 0 ... |
586 | 595 | ||
587 | Memory fragmentation is a problem under some workloads, and buddyinfo is a | 596 | External fragmentation is a problem under some workloads, and buddyinfo is a |
588 | useful tool for helping diagnose these problems. Buddyinfo will give you a | 597 | useful tool for helping diagnose these problems. Buddyinfo will give you a |
589 | clue as to how big an area you can safely allocate, or why a previous | 598 | clue as to how big an area you can safely allocate, or why a previous |
590 | allocation failed. | 599 | allocation failed. |
@@ -594,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in | |||
594 | ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE | 603 | ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE |
595 | available in ZONE_NORMAL, etc... | 604 | available in ZONE_NORMAL, etc... |
596 | 605 | ||
606 | More information relevant to external fragmentation can be found in | ||
607 | pagetypeinfo. | ||
608 | |||
609 | > cat /proc/pagetypeinfo | ||
610 | Page block order: 9 | ||
611 | Pages per block: 512 | ||
612 | |||
613 | Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 | ||
614 | Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0 | ||
615 | Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 | ||
616 | Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2 | ||
617 | Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0 | ||
618 | Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 | ||
619 | Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9 | ||
620 | Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0 | ||
621 | Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452 | ||
622 | Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0 | ||
623 | Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 | ||
624 | |||
625 | Number of blocks type Unmovable Reclaimable Movable Reserve Isolate | ||
626 | Node 0, zone DMA 2 0 5 1 0 | ||
627 | Node 0, zone DMA32 41 6 967 2 0 | ||
628 | |||
629 | Fragmentation avoidance in the kernel works by grouping pages of different | ||
630 | migrate types into the same contiguous regions of memory called page blocks. | ||
631 | A page block is typically the size of the default hugepage size e.g. 2MB on | ||
632 | X86-64. By keeping pages grouped based on their ability to move, the kernel | ||
633 | can reclaim pages within a page block to satisfy a high-order allocation. | ||
634 | |||
635 | The pagetypeinfo begins with information on the size of a page block. It | ||
636 | then gives the same type of information as buddyinfo except broken down | ||
637 | by migrate-type and finishes with details on how many page blocks of each | ||
638 | type exist. | ||
639 | |||
640 | If min_free_kbytes has been tuned correctly (recommendations made by hugeadm | ||
641 | from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can | ||
642 | make an estimate of the likely number of huge pages that can be allocated | ||
643 | at a given point in time. All the "Movable" blocks should be allocatable | ||
644 | unless memory has been mlock()'d. Some of the Reclaimable blocks should | ||
645 | also be allocatable although a lot of filesystem metadata may have to be | ||
646 | reclaimed to achieve this. | ||
647 | |||
597 | .............................................................................. | 648 | .............................................................................. |
598 | 649 | ||
599 | meminfo: | 650 | meminfo: |
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index 1866c27eec69..c2c6e9b39bbe 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt | |||
@@ -253,6 +253,70 @@ pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown). | |||
253 | Also note that it's your responsibility to have stopped using a GPIO | 253 | Also note that it's your responsibility to have stopped using a GPIO |
254 | before you free it. | 254 | before you free it. |
255 | 255 | ||
256 | Considering in most cases GPIOs are actually configured right after they | ||
257 | are claimed, three additional calls are defined: | ||
258 | |||
259 | /* request a single GPIO, with initial configuration specified by | ||
260 | * 'flags', identical to gpio_request() wrt other arguments and | ||
261 | * return value | ||
262 | */ | ||
263 | int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); | ||
264 | |||
265 | /* request multiple GPIOs in a single call | ||
266 | */ | ||
267 | int gpio_request_array(struct gpio *array, size_t num); | ||
268 | |||
269 | /* release multiple GPIOs in a single call | ||
270 | */ | ||
271 | void gpio_free_array(struct gpio *array, size_t num); | ||
272 | |||
273 | where 'flags' is currently defined to specify the following properties: | ||
274 | |||
275 | * GPIOF_DIR_IN - to configure direction as input | ||
276 | * GPIOF_DIR_OUT - to configure direction as output | ||
277 | |||
278 | * GPIOF_INIT_LOW - as output, set initial level to LOW | ||
279 | * GPIOF_INIT_HIGH - as output, set initial level to HIGH | ||
280 | |||
281 | Since GPIOF_INIT_* are only valid when configured as output, the valid | ||
282 | combinations are grouped as: | ||
283 | |||
284 | * GPIOF_IN - configure as input | ||
285 | * GPIOF_OUT_INIT_LOW - configured as output, initial level LOW | ||
286 | * GPIOF_OUT_INIT_HIGH - configured as output, initial level HIGH | ||
287 | |||
288 | In the future, these flags can be extended to support more properties such | ||
289 | as open-drain status. | ||
290 | |||
291 | Furthermore, to ease the claim/release of multiple GPIOs, 'struct gpio' is | ||
292 | introduced to encapsulate all three fields as: | ||
293 | |||
294 | struct gpio { | ||
295 | unsigned gpio; | ||
296 | unsigned long flags; | ||
297 | const char *label; | ||
298 | }; | ||
299 | |||
300 | A typical example of usage: | ||
301 | |||
302 | static struct gpio leds_gpios[] = { | ||
303 | { 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */ | ||
304 | { 33, GPIOF_OUT_INIT_LOW, "Green LED" }, /* default to OFF */ | ||
305 | { 34, GPIOF_OUT_INIT_LOW, "Red LED" }, /* default to OFF */ | ||
306 | { 35, GPIOF_OUT_INIT_LOW, "Blue LED" }, /* default to OFF */ | ||
307 | { ... }, | ||
308 | }; | ||
309 | |||
310 | err = gpio_request_one(31, GPIOF_IN, "Reset Button"); | ||
311 | if (err) | ||
312 | ... | ||
313 | |||
314 | err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios)); | ||
315 | if (err) | ||
316 | ... | ||
317 | |||
318 | gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios)); | ||
319 | |||
256 | 320 | ||
257 | GPIOs mapped to IRQs | 321 | GPIOs mapped to IRQs |
258 | -------------------- | 322 | -------------------- |
diff --git a/Documentation/init.txt b/Documentation/init.txt new file mode 100644 index 000000000000..535ad5e82b98 --- /dev/null +++ b/Documentation/init.txt | |||
@@ -0,0 +1,49 @@ | |||
1 | Explaining the dreaded "No init found." boot hang message | ||
2 | ========================================================= | ||
3 | |||
4 | OK, so you've got this pretty unintuitive message (currently located | ||
5 | in init/main.c) and are wondering what the H*** went wrong. | ||
6 | Some high-level reasons for failure (listed roughly in order of execution) | ||
7 | to load the init binary are: | ||
8 | A) Unable to mount root FS | ||
9 | B) init binary doesn't exist on rootfs | ||
10 | C) broken console device | ||
11 | D) binary exists but dependencies not available | ||
12 | E) binary cannot be loaded | ||
13 | |||
14 | Detailed explanations: | ||
15 | 0) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE) | ||
16 | to get more detailed kernel messages. | ||
17 | A) make sure you have the correct root FS type | ||
18 | (and root= kernel parameter points to the correct partition), | ||
19 | required drivers such as storage hardware (such as SCSI or USB!) | ||
20 | and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules, | ||
21 | to be pre-loaded by an initrd) | ||
22 | C) Possibly a conflict in console= setup --> initial console unavailable. | ||
23 | E.g. some serial consoles are unreliable due to serial IRQ issues (e.g. | ||
24 | missing interrupt-based configuration). | ||
25 | Try using a different console= device or e.g. netconsole= . | ||
26 | D) e.g. required library dependencies of the init binary such as | ||
27 | /lib/ld-linux.so.2 missing or broken. Use readelf -d <INIT>|grep NEEDED | ||
28 | to find out which libraries are required. | ||
29 | E) make sure the binary's architecture matches your hardware. | ||
30 | E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware. | ||
31 | In case you tried loading a non-binary file here (shell script?), | ||
32 | you should make sure that the script specifies an interpreter in its shebang | ||
33 | header line (#!/...) that is fully working (including its library | ||
34 | dependencies). And before tackling scripts, better first test a simple | ||
35 | non-script binary such as /bin/sh and confirm its successful execution. | ||
36 | To find out more, add code to init/main.c to display kernel_execve()s | ||
37 | return values. | ||
38 | |||
39 | Please extend this explanation whenever you find new failure causes | ||
40 | (after all loading the init binary is a CRITICAL and hard transition step | ||
41 | which needs to be made as painless as possible), then submit a patch to LKML. | ||
42 | Further TODOs: | ||
43 | - Implement the various run_init_process() invocations via a struct array | ||
44 | which can then store the kernel_execve() result value and on failure | ||
45 | log it all by iterating over _all_ results (very important usability fix). | ||
46 | - try to make the implementation itself more helpful in general, | ||
47 | e.g. by providing additional error messages at affected places. | ||
48 | |||
49 | Andreas Mohr <andi at lisas period de> | ||
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 053037a1fe6d..2f9115c0ae62 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt | |||
@@ -1,6 +1,7 @@ | |||
1 | Title : Kernel Probes (Kprobes) | 1 | Title : Kernel Probes (Kprobes) |
2 | Authors : Jim Keniston <jkenisto@us.ibm.com> | 2 | Authors : Jim Keniston <jkenisto@us.ibm.com> |
3 | : Prasanna S Panchamukhi <prasanna@in.ibm.com> | 3 | : Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com> |
4 | : Masami Hiramatsu <mhiramat@redhat.com> | ||
4 | 5 | ||
5 | CONTENTS | 6 | CONTENTS |
6 | 7 | ||
@@ -15,6 +16,7 @@ CONTENTS | |||
15 | 9. Jprobes Example | 16 | 9. Jprobes Example |
16 | 10. Kretprobes Example | 17 | 10. Kretprobes Example |
17 | Appendix A: The kprobes debugfs interface | 18 | Appendix A: The kprobes debugfs interface |
19 | Appendix B: The kprobes sysctl interface | ||
18 | 20 | ||
19 | 1. Concepts: Kprobes, Jprobes, Return Probes | 21 | 1. Concepts: Kprobes, Jprobes, Return Probes |
20 | 22 | ||
@@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions | |||
42 | can speed up unregistration process when you have to unregister | 44 | can speed up unregistration process when you have to unregister |
43 | a lot of probes at once. | 45 | a lot of probes at once. |
44 | 46 | ||
45 | The next three subsections explain how the different types of | 47 | The next four subsections explain how the different types of |
46 | probes work. They explain certain things that you'll need to | 48 | probes work and how jump optimization works. They explain certain |
47 | know in order to make the best use of Kprobes -- e.g., the | 49 | things that you'll need to know in order to make the best use of |
48 | difference between a pre_handler and a post_handler, and how | 50 | Kprobes -- e.g., the difference between a pre_handler and |
49 | to use the maxactive and nmissed fields of a kretprobe. But | 51 | a post_handler, and how to use the maxactive and nmissed fields of |
50 | if you're in a hurry to start using Kprobes, you can skip ahead | 52 | a kretprobe. But if you're in a hurry to start using Kprobes, you |
51 | to section 2. | 53 | can skip ahead to section 2. |
52 | 54 | ||
53 | 1.1 How Does a Kprobe Work? | 55 | 1.1 How Does a Kprobe Work? |
54 | 56 | ||
@@ -161,13 +163,125 @@ In case probed function is entered but there is no kretprobe_instance | |||
161 | object available, then in addition to incrementing the nmissed count, | 163 | object available, then in addition to incrementing the nmissed count, |
162 | the user entry_handler invocation is also skipped. | 164 | the user entry_handler invocation is also skipped. |
163 | 165 | ||
166 | 1.4 How Does Jump Optimization Work? | ||
167 | |||
168 | If you configured your kernel with CONFIG_OPTPROBES=y (currently | ||
169 | this option is supported on x86/x86-64, non-preemptive kernel) and | ||
170 | the "debug.kprobes_optimization" kernel parameter is set to 1 (see | ||
171 | sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump | ||
172 | instruction instead of a breakpoint instruction at each probepoint. | ||
173 | |||
174 | 1.4.1 Init a Kprobe | ||
175 | |||
176 | When a probe is registered, before attempting this optimization, | ||
177 | Kprobes inserts an ordinary, breakpoint-based kprobe at the specified | ||
178 | address. So, even if it's not possible to optimize this particular | ||
179 | probepoint, there'll be a probe there. | ||
180 | |||
181 | 1.4.2 Safety Check | ||
182 | |||
183 | Before optimizing a probe, Kprobes performs the following safety checks: | ||
184 | |||
185 | - Kprobes verifies that the region that will be replaced by the jump | ||
186 | instruction (the "optimized region") lies entirely within one function. | ||
187 | (A jump instruction is multiple bytes, and so may overlay multiple | ||
188 | instructions.) | ||
189 | |||
190 | - Kprobes analyzes the entire function and verifies that there is no | ||
191 | jump into the optimized region. Specifically: | ||
192 | - the function contains no indirect jump; | ||
193 | - the function contains no instruction that causes an exception (since | ||
194 | the fixup code triggered by the exception could jump back into the | ||
195 | optimized region -- Kprobes checks the exception tables to verify this); | ||
196 | and | ||
197 | - there is no near jump to the optimized region (other than to the first | ||
198 | byte). | ||
199 | |||
200 | - For each instruction in the optimized region, Kprobes verifies that | ||
201 | the instruction can be executed out of line. | ||
202 | |||
203 | 1.4.3 Preparing Detour Buffer | ||
204 | |||
205 | Next, Kprobes prepares a "detour" buffer, which contains the following | ||
206 | instruction sequence: | ||
207 | - code to push the CPU's registers (emulating a breakpoint trap) | ||
208 | - a call to the trampoline code which calls user's probe handlers. | ||
209 | - code to restore registers | ||
210 | - the instructions from the optimized region | ||
211 | - a jump back to the original execution path. | ||
212 | |||
213 | 1.4.4 Pre-optimization | ||
214 | |||
215 | After preparing the detour buffer, Kprobes verifies that none of the | ||
216 | following situations exist: | ||
217 | - The probe has either a break_handler (i.e., it's a jprobe) or a | ||
218 | post_handler. | ||
219 | - Other instructions in the optimized region are probed. | ||
220 | - The probe is disabled. | ||
221 | In any of the above cases, Kprobes won't start optimizing the probe. | ||
222 | Since these are temporary situations, Kprobes tries to start | ||
223 | optimizing it again if the situation is changed. | ||
224 | |||
225 | If the kprobe can be optimized, Kprobes enqueues the kprobe to an | ||
226 | optimizing list, and kicks the kprobe-optimizer workqueue to optimize | ||
227 | it. If the to-be-optimized probepoint is hit before being optimized, | ||
228 | Kprobes returns control to the original instruction path by setting | ||
229 | the CPU's instruction pointer to the copied code in the detour buffer | ||
230 | -- thus at least avoiding the single-step. | ||
231 | |||
232 | 1.4.5 Optimization | ||
233 | |||
234 | The Kprobe-optimizer doesn't insert the jump instruction immediately; | ||
235 | rather, it calls synchronize_sched() for safety first, because it's | ||
236 | possible for a CPU to be interrupted in the middle of executing the | ||
237 | optimized region(*). As you know, synchronize_sched() can ensure | ||
238 | that all interruptions that were active when synchronize_sched() | ||
239 | was called are done, but only if CONFIG_PREEMPT=n. So, this version | ||
240 | of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**) | ||
241 | |||
242 | After that, the Kprobe-optimizer calls stop_machine() to replace | ||
243 | the optimized region with a jump instruction to the detour buffer, | ||
244 | using text_poke_smp(). | ||
245 | |||
246 | 1.4.6 Unoptimization | ||
247 | |||
248 | When an optimized kprobe is unregistered, disabled, or blocked by | ||
249 | another kprobe, it will be unoptimized. If this happens before | ||
250 | the optimization is complete, the kprobe is just dequeued from the | ||
251 | optimized list. If the optimization has been done, the jump is | ||
252 | replaced with the original code (except for an int3 breakpoint in | ||
253 | the first byte) by using text_poke_smp(). | ||
254 | |||
255 | (*)Please imagine that the 2nd instruction is interrupted and then | ||
256 | the optimizer replaces the 2nd instruction with the jump *address* | ||
257 | while the interrupt handler is running. When the interrupt | ||
258 | returns to original address, there is no valid instruction, | ||
259 | and it causes an unexpected result. | ||
260 | |||
261 | (**)This optimization-safety checking may be replaced with the | ||
262 | stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y | ||
263 | kernel. | ||
264 | |||
265 | NOTE for geeks: | ||
266 | The jump optimization changes the kprobe's pre_handler behavior. | ||
267 | Without optimization, the pre_handler can change the kernel's execution | ||
268 | path by changing regs->ip and returning 1. However, when the probe | ||
269 | is optimized, that modification is ignored. Thus, if you want to | ||
270 | tweak the kernel's execution path, you need to suppress optimization, | ||
271 | using one of the following techniques: | ||
272 | - Specify an empty function for the kprobe's post_handler or break_handler. | ||
273 | or | ||
274 | - Config CONFIG_OPTPROBES=n. | ||
275 | or | ||
276 | - Execute 'sysctl -w debug.kprobes_optimization=n' | ||
277 | |||
164 | 2. Architectures Supported | 278 | 2. Architectures Supported |
165 | 279 | ||
166 | Kprobes, jprobes, and return probes are implemented on the following | 280 | Kprobes, jprobes, and return probes are implemented on the following |
167 | architectures: | 281 | architectures: |
168 | 282 | ||
169 | - i386 | 283 | - i386 (Supports jump optimization) |
170 | - x86_64 (AMD-64, EM64T) | 284 | - x86_64 (AMD-64, EM64T) (Supports jump optimization) |
171 | - ppc64 | 285 | - ppc64 |
172 | - ia64 (Does not support probes on instruction slot1.) | 286 | - ia64 (Does not support probes on instruction slot1.) |
173 | - sparc64 (Return probes not yet implemented.) | 287 | - sparc64 (Return probes not yet implemented.) |
@@ -193,6 +307,10 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO), | |||
193 | so you can use "objdump -d -l vmlinux" to see the source-to-object | 307 | so you can use "objdump -d -l vmlinux" to see the source-to-object |
194 | code mapping. | 308 | code mapping. |
195 | 309 | ||
310 | If you want to reduce probing overhead, set "Kprobes jump optimization | ||
311 | support" (CONFIG_OPTPROBES) to "y". You can find this option under the | ||
312 | "Kprobes" line. | ||
313 | |||
196 | 4. API Reference | 314 | 4. API Reference |
197 | 315 | ||
198 | The Kprobes API includes a "register" function and an "unregister" | 316 | The Kprobes API includes a "register" function and an "unregister" |
@@ -389,7 +507,10 @@ the probe which has been registered. | |||
389 | 507 | ||
390 | Kprobes allows multiple probes at the same address. Currently, | 508 | Kprobes allows multiple probes at the same address. Currently, |
391 | however, there cannot be multiple jprobes on the same function at | 509 | however, there cannot be multiple jprobes on the same function at |
392 | the same time. | 510 | the same time. Also, a probepoint for which there is a jprobe or |
511 | a post_handler cannot be optimized. So if you install a jprobe, | ||
512 | or a kprobe with a post_handler, at an optimized probepoint, the | ||
513 | probepoint will be unoptimized automatically. | ||
393 | 514 | ||
394 | In general, you can install a probe anywhere in the kernel. | 515 | In general, you can install a probe anywhere in the kernel. |
395 | In particular, you can probe interrupt handlers. Known exceptions | 516 | In particular, you can probe interrupt handlers. Known exceptions |
@@ -453,6 +574,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes) | |||
453 | on the x86_64 version of __switch_to(); the registration functions | 574 | on the x86_64 version of __switch_to(); the registration functions |
454 | return -EINVAL. | 575 | return -EINVAL. |
455 | 576 | ||
577 | On x86/x86-64, since the Jump Optimization of Kprobes modifies | ||
578 | instructions widely, there are some limitations to optimization. To | ||
579 | explain it, we introduce some terminology. Imagine a 3-instruction | ||
580 | sequence consisting of two 2-byte instructions and one 3-byte | ||
581 | instruction. | ||
582 | |||
583 | IA | ||
584 | | | ||
585 | [-2][-1][0][1][2][3][4][5][6][7] | ||
586 | [ins1][ins2][ ins3 ] | ||
587 | [<- DCR ->] | ||
588 | [<- JTPR ->] | ||
589 | |||
590 | ins1: 1st Instruction | ||
591 | ins2: 2nd Instruction | ||
592 | ins3: 3rd Instruction | ||
593 | IA: Insertion Address | ||
594 | JTPR: Jump Target Prohibition Region | ||
595 | DCR: Detoured Code Region | ||
596 | |||
597 | The instructions in DCR are copied to the out-of-line buffer | ||
598 | of the kprobe, because the bytes in DCR are replaced by | ||
599 | a 5-byte jump instruction. So there are several limitations. | ||
600 | |||
601 | a) The instructions in DCR must be relocatable. | ||
602 | b) The instructions in DCR must not include a call instruction. | ||
603 | c) JTPR must not be targeted by any jump or call instruction. | ||
604 | d) DCR must not straddle the border between functions. | ||
605 | |||
606 | Anyway, these limitations are checked by the in-kernel instruction | ||
607 | decoder, so you don't need to worry about that. | ||
608 | |||
456 | 6. Probe Overhead | 609 | 6. Probe Overhead |
457 | 610 | ||
458 | On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 | 611 | On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 |
@@ -476,6 +629,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07 | |||
476 | ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) | 629 | ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) |
477 | k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 | 630 | k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 |
478 | 631 | ||
632 | 6.1 Optimized Probe Overhead | ||
633 | |||
634 | Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to | ||
635 | process. Here are sample overhead figures (in usec) for x86 architectures. | ||
636 | k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe, | ||
637 | r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe. | ||
638 | |||
639 | i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips | ||
640 | k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33 | ||
641 | |||
642 | x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips | ||
643 | k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30 | ||
644 | |||
479 | 7. TODO | 645 | 7. TODO |
480 | 646 | ||
481 | a. SystemTap (http://sourceware.org/systemtap): Provides a simplified | 647 | a. SystemTap (http://sourceware.org/systemtap): Provides a simplified |
@@ -523,7 +689,8 @@ is also specified. Following columns show probe status. If the probe is on | |||
523 | a virtual address that is no longer valid (module init sections, module | 689 | a virtual address that is no longer valid (module init sections, module |
524 | virtual addresses that correspond to modules that've been unloaded), | 690 | virtual addresses that correspond to modules that've been unloaded), |
525 | such probes are marked with [GONE]. If the probe is temporarily disabled, | 691 | such probes are marked with [GONE]. If the probe is temporarily disabled, |
526 | such probes are marked with [DISABLED]. | 692 | such probes are marked with [DISABLED]. If the probe is optimized, it is |
693 | marked with [OPTIMIZED]. | ||
527 | 694 | ||
528 | /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. | 695 | /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. |
529 | 696 | ||
@@ -533,3 +700,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this | |||
533 | file. Note that this knob just disarms and arms all kprobes and doesn't | 700 | file. Note that this knob just disarms and arms all kprobes and doesn't |
534 | change each probe's disabling state. This means that disabled kprobes (marked | 701 | change each probe's disabling state. This means that disabled kprobes (marked |
535 | [DISABLED]) will not be enabled if you turn ON all kprobes by this knob. | 702 | [DISABLED]) will not be enabled if you turn ON all kprobes by this knob. |
703 | |||
704 | |||
705 | Appendix B: The kprobes sysctl interface | ||
706 | |||
707 | /proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF. | ||
708 | |||
709 | When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides | ||
710 | a knob to globally and forcibly turn jump optimization (see section | ||
711 | 1.4) ON or OFF. By default, jump optimization is allowed (ON). | ||
712 | If you echo "0" to this file or set "debug.kprobes_optimization" to | ||
713 | 0 via sysctl, all optimized probes will be unoptimized, and any new | ||
714 | probes registered after that will not be optimized. Note that this | ||
715 | knob *changes* the optimized state. This means that optimized probes | ||
716 | (marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be | ||
717 | removed). If the knob is turned on, they will be optimized again. | ||
718 | |||
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 2811e452f756..c6416a398163 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt | |||
@@ -23,12 +23,12 @@ of a virtual machine. The ioctls belong to three classes | |||
23 | Only run vcpu ioctls from the same thread that was used to create the | 23 | Only run vcpu ioctls from the same thread that was used to create the |
24 | vcpu. | 24 | vcpu. |
25 | 25 | ||
26 | 2. File descritpors | 26 | 2. File descriptors |
27 | 27 | ||
28 | The kvm API is centered around file descriptors. An initial | 28 | The kvm API is centered around file descriptors. An initial |
29 | open("/dev/kvm") obtains a handle to the kvm subsystem; this handle | 29 | open("/dev/kvm") obtains a handle to the kvm subsystem; this handle |
30 | can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this | 30 | can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this |
31 | handle will create a VM file descripror which can be used to issue VM | 31 | handle will create a VM file descriptor which can be used to issue VM |
32 | ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu | 32 | ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu |
33 | and return a file descriptor pointing to it. Finally, ioctls on a vcpu | 33 | and return a file descriptor pointing to it. Finally, ioctls on a vcpu |
34 | fd can be used to control the vcpu, including the important task of | 34 | fd can be used to control the vcpu, including the important task of |
@@ -643,7 +643,7 @@ Type: vm ioctl | |||
643 | Parameters: struct kvm_clock_data (in) | 643 | Parameters: struct kvm_clock_data (in) |
644 | Returns: 0 on success, -1 on error | 644 | Returns: 0 on success, -1 on error |
645 | 645 | ||
646 | Sets the current timestamp of kvmclock to the valued specific in its parameter. | 646 | Sets the current timestamp of kvmclock to the value specified in its parameter. |
647 | In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios | 647 | In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios |
648 | such as migration. | 648 | such as migration. |
649 | 649 | ||
@@ -795,11 +795,11 @@ Unused. | |||
795 | __u64 data_offset; /* relative to kvm_run start */ | 795 | __u64 data_offset; /* relative to kvm_run start */ |
796 | } io; | 796 | } io; |
797 | 797 | ||
798 | If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has | 798 | If exit_reason is KVM_EXIT_IO, then the vcpu has |
799 | executed a port I/O instruction which could not be satisfied by kvm. | 799 | executed a port I/O instruction which could not be satisfied by kvm. |
800 | data_offset describes where the data is located (KVM_EXIT_IO_OUT) or | 800 | data_offset describes where the data is located (KVM_EXIT_IO_OUT) or |
801 | where kvm expects application code to place the data for the next | 801 | where kvm expects application code to place the data for the next |
802 | KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array. | 802 | KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. |
803 | 803 | ||
804 | struct { | 804 | struct { |
805 | struct kvm_debug_exit_arch arch; | 805 | struct kvm_debug_exit_arch arch; |
@@ -815,7 +815,7 @@ Unused. | |||
815 | __u8 is_write; | 815 | __u8 is_write; |
816 | } mmio; | 816 | } mmio; |
817 | 817 | ||
818 | If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has | 818 | If exit_reason is KVM_EXIT_MMIO, then the vcpu has |
819 | executed a memory-mapped I/O instruction which could not be satisfied | 819 | executed a memory-mapped I/O instruction which could not be satisfied |
820 | by kvm. The 'data' member contains the written data if 'is_write' is | 820 | by kvm. The 'data' member contains the written data if 'is_write' is |
821 | true, and should be filled by application code otherwise. | 821 | true, and should be filled by application code otherwise. |
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index b37300edf27c..07375e73981a 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt | |||
@@ -41,6 +41,7 @@ Possible debug options are | |||
41 | P Poisoning (object and padding) | 41 | P Poisoning (object and padding) |
42 | U User tracking (free and alloc) | 42 | U User tracking (free and alloc) |
43 | T Trace (please only use on single slabs) | 43 | T Trace (please only use on single slabs) |
44 | A Toggle failslab filter mark for the cache | ||
44 | O Switch debugging off for caches that would have | 45 | O Switch debugging off for caches that would have |
45 | caused higher minimum slab orders | 46 | caused higher minimum slab orders |
46 | - Switch all debugging off (useful if the kernel is | 47 | - Switch all debugging off (useful if the kernel is |