diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/DocBook/uio-howto.tmpl | 4 | ||||
-rw-r--r-- | Documentation/accounting/getdelays.c | 2 | ||||
-rw-r--r-- | Documentation/dvb/get_dvb_firmware | 24 | ||||
-rw-r--r-- | Documentation/fb/pvr2fb.txt | 22 | ||||
-rw-r--r-- | Documentation/i386/zero-page.txt | 10 | ||||
-rw-r--r-- | Documentation/kbuild/kconfig-language.txt | 9 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 24 | ||||
-rw-r--r-- | Documentation/lguest/Makefile | 4 | ||||
-rw-r--r-- | Documentation/memory-hotplug.txt | 322 | ||||
-rw-r--r-- | Documentation/sched-design-CFS.txt | 2 | ||||
-rw-r--r-- | Documentation/sched-nice-design.txt | 108 | ||||
-rw-r--r-- | Documentation/sysrq.txt | 4 | ||||
-rw-r--r-- | Documentation/thinkpad-acpi.txt | 4 | ||||
-rw-r--r-- | Documentation/vm/numa_memory_policy.txt | 332 | ||||
-rw-r--r-- | Documentation/vm/slabinfo.c | 2 | ||||
-rw-r--r-- | Documentation/watchdog/00-INDEX | 10 |
16 files changed, 849 insertions, 34 deletions
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl index e3bb29a8d8dd..c119484258b8 100644 --- a/Documentation/DocBook/uio-howto.tmpl +++ b/Documentation/DocBook/uio-howto.tmpl | |||
@@ -133,10 +133,6 @@ interested in translating it, please email me | |||
133 | <para>updates of your driver can take place without recompiling | 133 | <para>updates of your driver can take place without recompiling |
134 | the kernel.</para> | 134 | the kernel.</para> |
135 | </listitem> | 135 | </listitem> |
136 | <listitem> | ||
137 | <para>if you need to keep some parts of your driver closed source, | ||
138 | you can do so without violating the GPL license on the kernel.</para> | ||
139 | </listitem> | ||
140 | </itemizedlist> | 136 | </itemizedlist> |
141 | 137 | ||
142 | <sect1 id="how_uio_works"> | 138 | <sect1 id="how_uio_works"> |
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index 24c5aade8998..cbee3a27f768 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c | |||
@@ -196,7 +196,7 @@ void print_delayacct(struct taskstats *t) | |||
196 | "IO %15s%15s\n" | 196 | "IO %15s%15s\n" |
197 | " %15llu%15llu\n" | 197 | " %15llu%15llu\n" |
198 | "MEM %15s%15s\n" | 198 | "MEM %15s%15s\n" |
199 | " %15llu%15llu\n" | 199 | " %15llu%15llu\n", |
200 | "count", "real total", "virtual total", "delay total", | 200 | "count", "real total", "virtual total", "delay total", |
201 | t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total, | 201 | t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total, |
202 | t->cpu_delay_total, | 202 | t->cpu_delay_total, |
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware index b4d306ae9234..f2e908d7f90d 100644 --- a/Documentation/dvb/get_dvb_firmware +++ b/Documentation/dvb/get_dvb_firmware | |||
@@ -111,21 +111,21 @@ sub tda10045 { | |||
111 | } | 111 | } |
112 | 112 | ||
113 | sub tda10046 { | 113 | sub tda10046 { |
114 | my $sourcefile = "tt_budget_217g.zip"; | 114 | my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip"; |
115 | my $url = "http://www.technotrend.de/new/217g/$sourcefile"; | 115 | my $url = "http://technotrend-online.com/download/software/219/$sourcefile"; |
116 | my $hash = "6a7e1e2f2644b162ff0502367553c72d"; | 116 | my $hash = "6a7e1e2f2644b162ff0502367553c72d"; |
117 | my $outfile = "dvb-fe-tda10046.fw"; | 117 | my $outfile = "dvb-fe-tda10046.fw"; |
118 | my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); | 118 | my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); |
119 | 119 | ||
120 | checkstandard(); | 120 | checkstandard(); |
121 | 121 | ||
122 | wgetfile($sourcefile, $url); | 122 | wgetfile($sourcefile, $url); |
123 | unzip($sourcefile, $tmpdir); | 123 | unzip($sourcefile, $tmpdir); |
124 | extract("$tmpdir/software/OEM/PCI/App/ttlcdacc.dll", 0x3f731, 24478, "$tmpdir/fwtmp"); | 124 | extract("$tmpdir/TT_PCI_2.19h_28_11_2006/software/OEM/PCI/App/ttlcdacc.dll", 0x65389, 24478, "$tmpdir/fwtmp"); |
125 | verify("$tmpdir/fwtmp", $hash); | 125 | verify("$tmpdir/fwtmp", $hash); |
126 | copy("$tmpdir/fwtmp", $outfile); | 126 | copy("$tmpdir/fwtmp", $outfile); |
127 | 127 | ||
128 | $outfile; | 128 | $outfile; |
129 | } | 129 | } |
130 | 130 | ||
131 | sub tda10046lifeview { | 131 | sub tda10046lifeview { |
diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt index 2bf6c2321c2d..36bdeff585e2 100644 --- a/Documentation/fb/pvr2fb.txt +++ b/Documentation/fb/pvr2fb.txt | |||
@@ -9,14 +9,13 @@ one found in the Dreamcast. | |||
9 | Advantages: | 9 | Advantages: |
10 | 10 | ||
11 | * It provides a nice large console (128 cols + 48 lines with 1024x768) | 11 | * It provides a nice large console (128 cols + 48 lines with 1024x768) |
12 | without using tiny, unreadable fonts. | 12 | without using tiny, unreadable fonts (NOT on the Dreamcast) |
13 | * You can run XF86_FBDev on top of /dev/fb0 | 13 | * You can run XF86_FBDev on top of /dev/fb0 |
14 | * Most important: boot logo :-) | 14 | * Most important: boot logo :-) |
15 | 15 | ||
16 | Disadvantages: | 16 | Disadvantages: |
17 | 17 | ||
18 | * Driver is currently limited to the Dreamcast PowerVR 2 implementation | 18 | * Driver is largely untested on non-Dreamcast systems. |
19 | at the time of this writing. | ||
20 | 19 | ||
21 | Configuration | 20 | Configuration |
22 | ============= | 21 | ============= |
@@ -29,11 +28,16 @@ Accepted options: | |||
29 | font:X - default font to use. All fonts are supported, including the | 28 | font:X - default font to use. All fonts are supported, including the |
30 | SUN12x22 font which is very nice at high resolutions. | 29 | SUN12x22 font which is very nice at high resolutions. |
31 | 30 | ||
32 | mode:X - default video mode. The following video modes are supported: | ||
33 | 640x240-60, 640x480-60. | ||
34 | 31 | ||
32 | mode:X - default video mode with format [xres]x[yres]-<bpp>@<refresh rate> | ||
33 | The following video modes are supported: | ||
34 | 640x480-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast | ||
35 | defaults to 640x480-16@60. At the time of writing the | ||
36 | 24bpp and 32bpp modes function poorly. Work to fix that is | ||
37 | ongoing. | ||
38 | |||
35 | Note: the 640x240 mode is currently broken, and should not be | 39 | Note: the 640x240 mode is currently broken, and should not be |
36 | used for any reason. It is only mentioned as a reference. | 40 | used for any reason. It is only mentioned here as a reference. |
37 | 41 | ||
38 | inverse - invert colors on screen (for LCD displays) | 42 | inverse - invert colors on screen (for LCD displays) |
39 | 43 | ||
@@ -52,10 +56,10 @@ output:X - output type. This can be any of the following: pal, ntsc, and | |||
52 | X11 | 56 | X11 |
53 | === | 57 | === |
54 | 58 | ||
55 | XF86_FBDev should work, in theory. At the time of this writing it is | 59 | XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet |
56 | totally untested and may or may not even portray the beginnings of | 60 | on any 2.6 series kernel. |
57 | working. If you end up testing this, please let me know! | ||
58 | 61 | ||
59 | -- | 62 | -- |
60 | Paul Mundt <lethal@linuxdc.org> | 63 | Paul Mundt <lethal@linuxdc.org> |
64 | Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk> | ||
61 | 65 | ||
diff --git a/Documentation/i386/zero-page.txt b/Documentation/i386/zero-page.txt index 75b3680c41eb..6c0817c45683 100644 --- a/Documentation/i386/zero-page.txt +++ b/Documentation/i386/zero-page.txt | |||
@@ -1,3 +1,13 @@ | |||
1 | --------------------------------------------------------------------------- | ||
2 | !!!!!!!!!!!!!!!WARNING!!!!!!!! | ||
3 | The zero page is a kernel internal data structure, not a stable ABI. It might change | ||
4 | without warning and the kernel has no way to detect old versions of it. | ||
5 | If you're writing some external code like a boot loader you should only use | ||
6 | the stable versioned real mode boot protocol described in boot.txt. Otherwise the kernel | ||
7 | might break you at any time. | ||
8 | !!!!!!!!!!!!!WARNING!!!!!!!!!!! | ||
9 | ---------------------------------------------------------------------------- | ||
10 | |||
1 | Summary of boot_params layout (kernel point of view) | 11 | Summary of boot_params layout (kernel point of view) |
2 | ( collected by Hans Lermen and Martin Mares ) | 12 | ( collected by Hans Lermen and Martin Mares ) |
3 | 13 | ||
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 536d5bfbdb8d..fe8b0c4892cf 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt | |||
@@ -98,6 +98,15 @@ applicable everywhere (see syntax). | |||
98 | times, the limit is set to the largest selection. | 98 | times, the limit is set to the largest selection. |
99 | Reverse dependencies can only be used with boolean or tristate | 99 | Reverse dependencies can only be used with boolean or tristate |
100 | symbols. | 100 | symbols. |
101 | Note: | ||
102 | select is evil.... select will by brute force set a symbol | ||
103 | equal to 'y' without visiting the dependencies. So abusing | ||
104 | select you are able to select a symbol FOO even if FOO depends | ||
105 | on BAR that is not set. In general use select only for | ||
106 | non-visible symbols (no prompts anywhere) and for symbols with | ||
107 | no dependencies. That will limit the usefulness but on the | ||
108 | other hand avoid the illegal configurations all over. kconfig | ||
109 | should one day warn about such things. | ||
101 | 110 | ||
102 | - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] | 111 | - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] |
103 | This allows to limit the range of possible input values for int | 112 | This allows to limit the range of possible input values for int |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index efdb42fd3fb8..975f029be25c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -163,6 +163,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
163 | acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA | 163 | acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA |
164 | Format: <irq>,<irq>... | 164 | Format: <irq>,<irq>... |
165 | 165 | ||
166 | acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT | ||
167 | |||
166 | acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS | 168 | acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS |
167 | Format: To spoof as Windows 98: ="Microsoft Windows" | 169 | Format: To spoof as Windows 98: ="Microsoft Windows" |
168 | 170 | ||
@@ -1820,6 +1822,26 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1820 | thash_entries= [KNL,NET] | 1822 | thash_entries= [KNL,NET] |
1821 | Set number of hash buckets for TCP connection | 1823 | Set number of hash buckets for TCP connection |
1822 | 1824 | ||
1825 | thermal.act= [HW,ACPI] | ||
1826 | -1: disable all active trip points in all thermal zones | ||
1827 | <degrees C>: override all lowest active trip points | ||
1828 | |||
1829 | thermal.nocrt= [HW,ACPI] | ||
1830 | Set to disable actions on ACPI thermal zone | ||
1831 | critical and hot trip points. | ||
1832 | |||
1833 | thermal.off= [HW,ACPI] | ||
1834 | 1: disable ACPI thermal control | ||
1835 | |||
1836 | thermal.psv= [HW,ACPI] | ||
1837 | -1: disable all passive trip points | ||
1838 | <degrees C>: override all passive trip points to this value | ||
1839 | |||
1840 | thermal.tzp= [HW,ACPI] | ||
1841 | Specify global default ACPI thermal zone polling rate | ||
1842 | <deci-seconds>: poll at this frequency | ||
1843 | 0: no polling (default) | ||
1844 | |||
1823 | time Show timing data prefixed to each printk message line | 1845 | time Show timing data prefixed to each printk message line |
1824 | [deprecated, see 'printk.time'] | 1846 | [deprecated, see 'printk.time'] |
1825 | 1847 | ||
@@ -1922,7 +1944,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1922 | See header of drivers/scsi/wd7000.c. | 1944 | See header of drivers/scsi/wd7000.c. |
1923 | 1945 | ||
1924 | wdt= [WDT] Watchdog | 1946 | wdt= [WDT] Watchdog |
1925 | See Documentation/watchdog/watchdog.txt. | 1947 | See Documentation/watchdog/wdt.txt. |
1926 | 1948 | ||
1927 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. | 1949 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. |
1928 | xd_geo= See header of drivers/block/xd.c. | 1950 | xd_geo= See header of drivers/block/xd.c. |
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index 31e794ef5f98..c0b7a4556390 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile | |||
@@ -13,7 +13,9 @@ LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) | |||
13 | 13 | ||
14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds | 14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds |
15 | LDLIBS:=-lz | 15 | LDLIBS:=-lz |
16 | 16 | # Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and | |
17 | # not others (eg. FC7). | ||
18 | LDFLAGS+=-static | ||
17 | all: lguest.lds lguest | 19 | all: lguest.lds lguest |
18 | 20 | ||
19 | # The linker script on x86 is so complex the only way of creating one | 21 | # The linker script on x86 is so complex the only way of creating one |
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt new file mode 100644 index 000000000000..5fbcc22c98e9 --- /dev/null +++ b/Documentation/memory-hotplug.txt | |||
@@ -0,0 +1,322 @@ | |||
1 | ============== | ||
2 | Memory Hotplug | ||
3 | ============== | ||
4 | |||
5 | Last Updated: Jul 28 2007 | ||
6 | |||
7 | This document is about memory hotplug including how-to-use and current status. | ||
8 | Because Memory Hotplug is still under development, contents of this text will | ||
9 | be changed often. | ||
10 | |||
11 | 1. Introduction | ||
12 | 1.1 purpose of memory hotplug | ||
13 | 1.2. Phases of memory hotplug | ||
14 | 1.3. Unit of Memory online/offline operation | ||
15 | 2. Kernel Configuration | ||
16 | 3. sysfs files for memory hotplug | ||
17 | 4. Physical memory hot-add phase | ||
18 | 4.1 Hardware(Firmware) Support | ||
19 | 4.2 Notify memory hot-add event by hand | ||
20 | 5. Logical Memory hot-add phase | ||
21 | 5.1. State of memory | ||
22 | 5.2. How to online memory | ||
23 | 6. Logical memory remove | ||
24 | 6.1 Memory offline and ZONE_MOVABLE | ||
25 | 6.2. How to offline memory | ||
26 | 7. Physical memory remove | ||
27 | 8. Future Work List | ||
28 | |||
29 | Note(1): x86_64 has a special implementation for memory hotplug. | ||
30 | This text does not describe it. | ||
31 | Note(2): This text assumes that sysfs is mounted at /sys. | ||
32 | |||
33 | |||
34 | --------------- | ||
35 | 1. Introduction | ||
36 | --------------- | ||
37 | |||
38 | 1.1 purpose of memory hotplug | ||
39 | ------------ | ||
40 | Memory Hotplug allows users to increase/decrease the amount of memory. | ||
41 | Generally, there are two purposes. | ||
42 | |||
43 | (A) For changing the amount of memory. | ||
44 | This is to allow a feature like capacity on demand. | ||
45 | (B) For installing/removing DIMMs or NUMA-nodes physically. | ||
46 | This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc. | ||
47 | |||
48 | (A) is required by highly virtualized environments and (B) is required by | ||
49 | hardware which supports memory power management. | ||
50 | |||
51 | Linux memory hotplug is designed for both purposes. | ||
52 | |||
53 | |||
54 | 1.2. Phases of memory hotplug | ||
55 | --------------- | ||
56 | There are 2 phases in Memory Hotplug. | ||
57 | 1) Physical Memory Hotplug phase | ||
58 | 2) Logical Memory Hotplug phase. | ||
59 | |||
60 | The first phase is to communicate with hardware/firmware and make/erase | ||
61 | environment for hotplugged memory. Basically, this phase is necessary | ||
62 | for the purpose (B), but this is good phase for communication between | ||
63 | highly virtualized environments too. | ||
64 | |||
65 | When memory is hotplugged, the kernel recognizes new memory, makes new memory | ||
66 | management tables, and makes sysfs files for new memory's operation. | ||
67 | |||
68 | If firmware supports notification of connection of new memory to OS, | ||
69 | this phase is triggered automatically. ACPI can notify this event. If not, | ||
70 | "probe" operation by system administration is used instead. | ||
71 | (see Section 4.). | ||
72 | |||
73 | Logical Memory Hotplug phase is to change memory state into | ||
74 | available/unavailable for users. Amount of memory from user's view is | ||
75 | changed by this phase. The kernel makes all memory in it as free pages | ||
76 | when a memory range is available. | ||
77 | |||
78 | In this document, this phase is described as online/offline. | ||
79 | |||
80 | Logical Memory Hotplug phase is triggered by write of sysfs file by system | ||
81 | administrator. For the hot-add case, it must be executed after Physical Hotplug | ||
82 | phase by hand. | ||
83 | (However, if you write udev's hotplug scripts for memory hotplug, these | ||
84 | phases can be executed in a seamless way.) | ||
85 | |||
86 | |||
87 | 1.3. Unit of Memory online/offline operation | ||
88 | ------------ | ||
89 | Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory | ||
90 | into chunks of the same size. The chunk is called a "section". The size of | ||
91 | a section is architecture dependent. For example, power uses 16MiB, ia64 uses | ||
92 | 1GiB. The unit of online/offline operation is "one section". (see Section 3.) | ||
93 | |||
94 | To determine the size of sections, please read this file: | ||
95 | |||
96 | /sys/devices/system/memory/block_size_bytes | ||
97 | |||
98 | This file shows the size of sections in bytes. | ||
99 | |||
100 | ----------------------- | ||
101 | 2. Kernel Configuration | ||
102 | ----------------------- | ||
103 | To use memory hotplug feature, kernel must be compiled with following | ||
104 | config options. | ||
105 | |||
106 | - For all memory hotplug | ||
107 | Memory model -> Sparse Memory (CONFIG_SPARSEMEM) | ||
108 | Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG) | ||
109 | |||
110 | - To enable memory removal, the followings are also necessary | ||
111 | Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE) | ||
112 | Page Migration (CONFIG_MIGRATION) | ||
113 | |||
114 | - For ACPI memory hotplug, the followings are also necessary | ||
115 | Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY) | ||
116 | This option can be kernel module. | ||
117 | |||
118 | - As a related configuration, if your box has a feature of NUMA-node hotplug | ||
119 | via ACPI, then this option is necessary too. | ||
120 | ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu) | ||
121 | (CONFIG_ACPI_CONTAINER). | ||
122 | This option can be kernel module too. | ||
123 | |||
124 | -------------------------------- | ||
125 | 3. sysfs files for memory hotplug | ||
126 | -------------------------------- | ||
127 | All sections have their device information under /sys/devices/system/memory as | ||
128 | |||
129 | /sys/devices/system/memory/memoryXXX | ||
130 | (XXX is section id.) | ||
131 | |||
132 | Now, XXX is defined as start_address_of_section / section_size. | ||
133 | |||
134 | For example, assume 1GiB section size. A device for a memory starting at | ||
135 | 0x100000000 is /sys/devices/system/memory/memory4 | ||
136 | (0x100000000 / 1GiB = 4) | ||
137 | This device covers address range [0x100000000 ... 0x140000000) | ||
138 | |||
139 | Under each section, you can see 3 files. | ||
140 | |||
141 | /sys/devices/system/memory/memoryXXX/phys_index | ||
142 | /sys/devices/system/memory/memoryXXX/phys_device | ||
143 | /sys/devices/system/memory/memoryXXX/state | ||
144 | |||
145 | 'phys_index' : read-only and contains section id, same as XXX. | ||
146 | 'state' : read-write | ||
147 | at read: contains online/offline state of memory. | ||
148 | at write: user can specify "online", "offline" command | ||
149 | 'phys_device': read-only: designed to show the name of physical memory device. | ||
150 | This is not well implemented now. | ||
151 | |||
152 | NOTE: | ||
153 | These directories/files appear after physical memory hotplug phase. | ||
154 | |||
155 | |||
156 | -------------------------------- | ||
157 | 4. Physical memory hot-add phase | ||
158 | -------------------------------- | ||
159 | |||
160 | 4.1 Hardware(Firmware) Support | ||
161 | ------------ | ||
162 | On x86_64/ia64 platform, memory hotplug by ACPI is supported. | ||
163 | |||
164 | In general, the firmware (ACPI) which supports memory hotplug defines | ||
165 | memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80, | ||
166 | Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev | ||
167 | script. This will be done automatically. | ||
168 | |||
169 | But scripts for memory hotplug are not contained in generic udev package(now). | ||
170 | You may have to write it by yourself or online/offline memory by hand. | ||
171 | Please see "How to online memory", "How to offline memory" in this text. | ||
172 | |||
173 | If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004", | ||
174 | "PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler | ||
175 | calls hotplug code for all of objects which are defined in it. | ||
176 | If memory device is found, memory hotplug code will be called. | ||
177 | |||
178 | |||
179 | 4.2 Notify memory hot-add event by hand | ||
180 | ------------ | ||
181 | In some environments, especially virtualized environment, firmware will not | ||
182 | notify memory hotplug event to the kernel. For such environment, "probe" | ||
183 | interface is supported. This interface depends on CONFIG_ARCH_MEMORY_PROBE. | ||
184 | |||
185 | Now, CONFIG_ARCH_MEMORY_PROBE is supported only by powerpc but it does not | ||
186 | contain highly architecture-dependent code. Please add config if you need "probe" | ||
187 | interface. | ||
188 | |||
189 | Probe interface is located at | ||
190 | /sys/devices/system/memory/probe | ||
191 | |||
192 | You can tell the physical address of new memory to the kernel by | ||
193 | |||
194 | % echo start_address_of_new_memory > /sys/devices/system/memory/probe | ||
195 | |||
196 | Then, [start_address_of_new_memory, start_address_of_new_memory + section_size) | ||
197 | memory range is hot-added. In this case, hotplug script is not called (in | ||
198 | current implementation). You'll have to online memory by yourself. | ||
199 | Please see "How to online memory" in this text. | ||
200 | |||
201 | |||
202 | |||
203 | ------------------------------ | ||
204 | 5. Logical Memory hot-add phase | ||
205 | ------------------------------ | ||
206 | |||
207 | 5.1. State of memory | ||
208 | ------------ | ||
209 | To see (online/offline) state of memory section, read 'state' file. | ||
210 | |||
211 | % cat /sys/devices/system/memory/memoryXXX/state | ||
212 | |||
213 | |||
214 | If the memory section is online, you'll read "online". | ||
215 | If the memory section is offline, you'll read "offline". | ||
216 | |||
217 | |||
218 | 5.2. How to online memory | ||
219 | ------------ | ||
220 | Even if the memory is hot-added, it is not at ready-to-use state. | ||
221 | For using newly added memory, you have to "online" the memory section. | ||
222 | |||
223 | For onlining, you have to write "online" to the section's state file as: | ||
224 | |||
225 | % echo online > /sys/devices/system/memory/memoryXXX/state | ||
226 | |||
227 | After this, section memoryXXX's state will be 'online' and the amount of | ||
228 | available memory will be increased. | ||
229 | |||
230 | Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA). | ||
231 | This may be changed in future. | ||
232 | |||
233 | |||
234 | |||
235 | ------------------------ | ||
236 | 6. Logical memory remove | ||
237 | ------------------------ | ||
238 | |||
239 | 6.1 Memory offline and ZONE_MOVABLE | ||
240 | ------------ | ||
241 | Memory offlining is more complicated than memory online. Because memory offline | ||
242 | has to make the whole memory section be unused, memory offline can fail if | ||
243 | the section includes memory which cannot be freed. | ||
244 | |||
245 | In general, memory offline can use 2 techniques. | ||
246 | |||
247 | (1) reclaim and free all memory in the section. | ||
248 | (2) migrate all pages in the section. | ||
249 | |||
250 | In the current implementation, Linux's memory offline uses method (2), freeing | ||
251 | all pages in the section by page migration. But not all pages are | ||
252 | migratable. Under current Linux, migratable pages are anonymous pages and | ||
253 | page caches. For offlining a section by migration, the kernel has to guarantee | ||
254 | that the section contains only migratable pages. | ||
255 | |||
256 | Now, a boot option for making a section which consists of migratable pages is | ||
257 | supported. By specifying "kernelcore=" or "movablecore=" boot option, you can | ||
258 | create ZONE_MOVABLE...a zone which is just used for movable pages. | ||
259 | (See also Documentation/kernel-parameters.txt) | ||
260 | |||
261 | Assume the system has "TOTAL" amount of memory at boot time, this boot option | ||
262 | creates ZONE_MOVABLE as following. | ||
263 | |||
264 | 1) When kernelcore=YYYY boot option is used, | ||
265 | Size of memory not for movable pages (not for offline) is YYYY. | ||
266 | Size of memory for movable pages (for offline) is TOTAL-YYYY. | ||
267 | |||
268 | 2) When movablecore=ZZZZ boot option is used, | ||
269 | Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ. | ||
270 | Size of memory for movable pages (for offline) is ZZZZ. | ||
271 | |||
272 | |||
273 | Note) Unfortunately, there is no information to show which section belongs | ||
274 | to ZONE_MOVABLE. This is TBD. | ||
275 | |||
276 | |||
277 | 6.2. How to offline memory | ||
278 | ------------ | ||
279 | You can offline a section by using the same sysfs interface that was used in | ||
280 | memory onlining. | ||
281 | |||
282 | % echo offline > /sys/devices/system/memory/memoryXXX/state | ||
283 | |||
284 | If offline succeeds, the state of the memory section is changed to be "offline". | ||
285 | If it fails, some error code (like -EBUSY) will be returned by the kernel. | ||
286 | Even if a section does not belong to ZONE_MOVABLE, you can try to offline it. | ||
287 | If it doesn't contain 'unmovable' memory, you'll get success. | ||
288 | |||
289 | A section under ZONE_MOVABLE is considered to be able to be offlined easily. | ||
290 | But under some busy state, it may return -EBUSY. Even if a memory section | ||
291 | cannot be offlined due to -EBUSY, you can retry offlining it and may be able to | ||
292 | offline it (or not). | ||
293 | (For example, a page is referred to by some kernel internal call and released | ||
294 | soon.) | ||
295 | |||
296 | Consideration: | ||
297 | Memory hotplug's design direction is to make the possibility of memory offlining | ||
298 | higher and to guarantee unplugging memory under any situation. But it needs | ||
299 | more work. Returning -EBUSY under some situation may be good because the user | ||
300 | can decide to retry more or not by himself. Currently, memory offlining code | ||
301 | does some amount of retry with 120 seconds timeout. | ||
302 | |||
303 | ------------------------- | ||
304 | 7. Physical memory remove | ||
305 | ------------------------- | ||
306 | Need more implementation yet.... | ||
307 | - Notification of completion of remove work from OS to firmware. | ||
308 | - Guard from remove if not yet. | ||
309 | |||
310 | -------------- | ||
311 | 8. Future Work | ||
312 | -------------- | ||
313 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like | ||
314 | sysctl or new control file. | ||
315 | - showing memory section and physical device relationship. | ||
316 | - showing memory section and node relationship (maybe good for NUMA) | ||
317 | - showing memory section is under ZONE_MOVABLE or not | ||
318 | - test and improve memory offlining. | ||
319 | - support HugeTLB page migration and offlining. | ||
320 | - memmap removing at memory offline. | ||
321 | - physical remove memory. | ||
322 | |||
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 16feebb7bdc0..84901e7c0508 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt | |||
@@ -83,7 +83,7 @@ Some implementation details: | |||
83 | CFS uses nanosecond granularity accounting and does not rely on any | 83 | CFS uses nanosecond granularity accounting and does not rely on any |
84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of | 84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of |
85 | 'timeslices' and has no heuristics whatsoever. There is only one | 85 | 'timeslices' and has no heuristics whatsoever. There is only one |
86 | central tunable: | 86 | central tunable (you have to switch on CONFIG_SCHED_DEBUG): |
87 | 87 | ||
88 | /proc/sys/kernel/sched_granularity_ns | 88 | /proc/sys/kernel/sched_granularity_ns |
89 | 89 | ||
diff --git a/Documentation/sched-nice-design.txt b/Documentation/sched-nice-design.txt new file mode 100644 index 000000000000..e2bae5a577e3 --- /dev/null +++ b/Documentation/sched-nice-design.txt | |||
@@ -0,0 +1,108 @@ | |||
1 | This document explains the thinking about the revamped and streamlined | ||
2 | nice-levels implementation in the new Linux scheduler. | ||
3 | |||
4 | Nice levels were always pretty weak under Linux and people continuously | ||
5 | pestered us to make nice +19 tasks use up much less CPU time. | ||
6 | |||
7 | Unfortunately that was not that easy to implement under the old | ||
8 | scheduler, (otherwise we'd have done it long ago) because nice level | ||
9 | support was historically coupled to timeslice length, and timeslice | ||
10 | units were driven by the HZ tick, so the smallest timeslice was 1/HZ. | ||
11 | |||
12 | In the O(1) scheduler (in 2003) we changed negative nice levels to be | ||
13 | much stronger than they were before in 2.4 (and people were happy about | ||
14 | that change), and we also intentionally calibrated the linear timeslice | ||
15 | rule so that nice +19 level would be _exactly_ 1 jiffy. To better | ||
16 | understand it, the timeslice graph went like this (cheesy ASCII art | ||
17 | alert!): | ||
18 | |||
19 | |||
20 | A | ||
21 | \ | [timeslice length] | ||
22 | \ | | ||
23 | \ | | ||
24 | \ | | ||
25 | \ | | ||
26 | \|___100msecs | ||
27 | |^ . _ | ||
28 | | ^ . _ | ||
29 | | ^ . _ | ||
30 | -*----------------------------------*-----> [nice level] | ||
31 | -20 | +19 | ||
32 | | | ||
33 | | | ||
34 | |||
35 | So that if someone wanted to really renice tasks, +19 would give a much | ||
36 | bigger hit than the normal linear rule would do. (The solution of | ||
37 | changing the ABI to extend priorities was discarded early on.) | ||
38 | |||
39 | This approach worked to some degree for some time, but later on with | ||
40 | HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which | ||
41 | we felt to be a bit excessive. Excessive _not_ because it's too small of | ||
42 | a CPU utilization, but because it causes too frequent (once per | ||
43 | millisec) rescheduling. (and would thus trash the cache, etc. Remember, | ||
44 | this was long ago when hardware was weaker and caches were smaller, and | ||
45 | people were running number crunching apps at nice +19.) | ||
46 | |||
47 | So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the | ||
48 | right minimal granularity - and this translates to 5% CPU utilization. | ||
49 | But the fundamental HZ-sensitive property for nice+19 still remained, | ||
50 | and we never got a single complaint about nice +19 being too _weak_ in | ||
51 | terms of CPU utilization, we only got complaints about it (still) being | ||
52 | too _strong_ :-) | ||
53 | |||
54 | To sum it up: we always wanted to make nice levels more consistent, but | ||
55 | within the constraints of HZ and jiffies and their nasty design level | ||
56 | coupling to timeslices and granularity it was not really viable. | ||
57 | |||
58 | The second (less frequent but still periodically occurring) complaint | ||
59 | about Linux's nice level support was its asymmetry around the origo | ||
60 | (which you can see demonstrated in the picture above), or more | ||
61 | accurately: the fact that nice level behavior depended on the _absolute_ | ||
62 | nice level as well, while the nice API itself is fundamentally | ||
63 | "relative": | ||
64 | |||
65 | int nice(int inc); | ||
66 | |||
67 | asmlinkage long sys_nice(int increment) | ||
68 | |||
69 | (the first one is the glibc API, the second one is the syscall API.) | ||
70 | Note that the 'inc' is relative to the current nice level. Tools like | ||
71 | bash's "nice" command mirror this relative API. | ||
72 | |||
73 | With the old scheduler, if you for example started a niced task with +1 | ||
74 | and another task with +2, the CPU split between the two tasks would | ||
75 | depend on the nice level of the parent shell - if it was at nice -10 the | ||
76 | CPU split was different than if it was at +5 or +10. | ||
77 | |||
78 | A third complaint against Linux's nice level support was that negative | ||
79 | nice levels were not 'punchy enough', so lots of people had to resort to | ||
80 | run audio (and other multimedia) apps under RT priorities such as | ||
81 | SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation | ||
82 | proof, and a buggy SCHED_FIFO app can also lock up the system for good. | ||
83 | |||
84 | The new scheduler in v2.6.23 addresses all three types of complaints: | ||
85 | |||
86 | To address the first complaint (of nice levels being not "punchy" | ||
87 | enough), the scheduler was decoupled from 'time slice' and HZ concepts | ||
88 | (and granularity was made a separate concept from nice levels) and thus | ||
89 | it was possible to implement better and more consistent nice +19 | ||
90 | support: with the new scheduler nice +19 tasks get a HZ-independent | ||
91 | 1.5%, instead of the variable 3%-5%-9% range they got in the old | ||
92 | scheduler. | ||
93 | |||
94 | To address the second complaint (of nice levels not being consistent), | ||
95 | the new scheduler makes nice(1) have the same CPU utilization effect on | ||
96 | tasks, regardless of their absolute nice levels. So on the new | ||
97 | scheduler, running a nice +10 and a nice +11 task has the same CPU | ||
98 | utilization "split" between them as running a nice -5 and a nice -4 | ||
99 | task. (one will get 55% of the CPU, the other 45%.) That is why nice | ||
100 | levels were changed to be "multiplicative" (or exponential) - that way | ||
101 | it does not matter which nice level you start out from, the 'relative | ||
102 | result' will always be the same. | ||
103 | |||
104 | The third complaint (of negative nice levels not being "punchy" enough | ||
105 | and forcing audio apps to run under the more dangerous SCHED_FIFO | ||
106 | scheduling policy) is addressed by the new scheduler almost | ||
107 | automatically: stronger negative nice levels are an automatic | ||
108 | side-effect of the recalibrated dynamic range of nice levels. | ||
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index ba328f255417..ef19142896ca 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | Linux Magic System Request Key Hacks | 1 | Linux Magic System Request Key Hacks |
2 | Documentation for sysrq.c | 2 | Documentation for sysrq.c |
3 | Last update: 2007-MAR-14 | 3 | Last update: 2007-AUG-04 |
4 | 4 | ||
5 | * What is the magic SysRq key? | 5 | * What is the magic SysRq key? |
6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
@@ -78,7 +78,7 @@ On all - write a character to /proc/sysrq-trigger. e.g.: | |||
78 | 'g' - Used by kgdb on ppc and sh platforms. | 78 | 'g' - Used by kgdb on ppc and sh platforms. |
79 | 79 | ||
80 | 'h' - Will display help (actually any other key than those listed | 80 | 'h' - Will display help (actually any other key than those listed |
81 | above will display help. but 'h' is easy to remember :-) | 81 | here will display help. but 'h' is easy to remember :-) |
82 | 82 | ||
83 | 'i' - Send a SIGKILL to all processes, except for init. | 83 | 'i' - Send a SIGKILL to all processes, except for init. |
84 | 84 | ||
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt index 6711fbcf4080..eb2f5986e1eb 100644 --- a/Documentation/thinkpad-acpi.txt +++ b/Documentation/thinkpad-acpi.txt | |||
@@ -105,10 +105,10 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver | |||
105 | as a driver attribute (see below). | 105 | as a driver attribute (see below). |
106 | 106 | ||
107 | Sysfs driver attributes are on the driver's sysfs attribute space, | 107 | Sysfs driver attributes are on the driver's sysfs attribute space, |
108 | for 2.6.20 this is /sys/bus/platform/drivers/thinkpad-acpi/. | 108 | for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/. |
109 | 109 | ||
110 | Sysfs device attributes are on the driver's sysfs attribute space, | 110 | Sysfs device attributes are on the driver's sysfs attribute space, |
111 | for 2.6.20 this is /sys/devices/platform/thinkpad-acpi/. | 111 | for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/. |
112 | 112 | ||
113 | Driver version | 113 | Driver version |
114 | -------------- | 114 | -------------- |
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt new file mode 100644 index 000000000000..8242f52d0f22 --- /dev/null +++ b/Documentation/vm/numa_memory_policy.txt | |||
@@ -0,0 +1,332 @@ | |||
1 | |||
2 | What is Linux Memory Policy? | ||
3 | |||
4 | In the Linux kernel, "memory policy" determines from which node the kernel will | ||
5 | allocate memory in a NUMA system or in an emulated NUMA system. Linux has | ||
6 | supported platforms with Non-Uniform Memory Access architectures since 2.4.?. | ||
7 | The current memory policy support was added to Linux 2.6 around May 2004. This | ||
8 | document attempts to describe the concepts and APIs of the 2.6 memory policy | ||
9 | support. | ||
10 | |||
11 | Memory policies should not be confused with cpusets (Documentation/cpusets.txt) | ||
12 | which is an administrative mechanism for restricting the nodes from which | ||
13 | memory may be allocated by a set of processes. Memory policies are a | ||
14 | programming interface that a NUMA-aware application can take advantage of. When | ||
15 | both cpusets and policies are applied to a task, the restrictions of the cpuset | ||
16 | take priority. See "MEMORY POLICIES AND CPUSETS" below for more details. | ||
17 | |||
18 | MEMORY POLICY CONCEPTS | ||
19 | |||
20 | Scope of Memory Policies | ||
21 | |||
22 | The Linux kernel supports _scopes_ of memory policy, described here from | ||
23 | most general to most specific: | ||
24 | |||
25 | System Default Policy: this policy is "hard coded" into the kernel. It | ||
26 | is the policy that governs all page allocations that aren't controlled | ||
27 | by one of the more specific policy scopes discussed below. When the | ||
28 | system is "up and running", the system default policy will use "local | ||
29 | allocation" described below. However, during boot up, the system | ||
30 | default policy will be set to interleave allocations across all nodes | ||
31 | with "sufficient" memory, so as not to overload the initial boot node | ||
32 | with boot-time allocations. | ||
33 | |||
34 | Task/Process Policy: this is an optional, per-task policy. When defined | ||
35 | for a specific task, this policy controls all page allocations made by or | ||
36 | on behalf of the task that aren't controlled by a more specific scope. | ||
37 | If a task does not define a task policy, then all page allocations that | ||
38 | would have been controlled by the task policy "fall back" to the System | ||
39 | Default Policy. | ||
40 | |||
41 | The task policy applies to the entire address space of a task. Thus, | ||
42 | it is inheritable, and indeed is inherited, across both fork() | ||
43 | [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task | ||
44 | to establish the task policy for a child task exec()'d from an | ||
45 | executable image that has no awareness of memory policy. See the | ||
46 | MEMORY POLICY APIS section, below, for an overview of the system call | ||
47 | that a task may use to set/change its task/process policy. | ||
48 | |||
49 | In a multi-threaded task, task policies apply only to the thread | ||
50 | [Linux kernel task] that installs the policy and any threads | ||
51 | subsequently created by that thread. Any sibling threads existing | ||
52 | at the time a new task policy is installed retain their current | ||
53 | policy. | ||
54 | |||
55 | A task policy applies only to pages allocated after the policy is | ||
56 | installed. Any pages already faulted in by the task when the task | ||
57 | changes its task policy remain where they were allocated based on | ||
58 | the policy at the time they were allocated. | ||
59 | |||
60 | VMA Policy: A "VMA" or "Virtual Memory Area" refers to a range of a task's | ||
61 | virtual address space. A task may define a specific policy for a range | ||
62 | of its virtual address space. See the MEMORY POLICY APIs section, | ||
63 | below, for an overview of the mbind() system call used to set a VMA | ||
64 | policy. | ||
65 | |||
66 | A VMA policy will govern the allocation of pages that back this region of | ||
67 | the address space. Any regions of the task's address space that don't | ||
68 | have an explicit VMA policy will fall back to the task policy, which may | ||
69 | itself fall back to the System Default Policy. | ||
70 | |||
71 | VMA policies have a few complicating details: | ||
72 | |||
73 | VMA policy applies ONLY to anonymous pages. These include pages | ||
74 | allocated for anonymous segments, such as the task stack and heap, and | ||
75 | any regions of the address space mmap()ed with the MAP_ANONYMOUS flag. | ||
76 | If a VMA policy is applied to a file mapping, it will be ignored if | ||
77 | the mapping used the MAP_SHARED flag. If the file mapping used the | ||
78 | MAP_PRIVATE flag, the VMA policy will only be applied when an | ||
79 | anonymous page is allocated on an attempt to write to the mapping-- | ||
80 | i.e., at Copy-On-Write. | ||
81 | |||
82 | VMA policies are shared between all tasks that share a virtual address | ||
83 | space--a.k.a. threads--independent of when the policy is installed; and | ||
84 | they are inherited across fork(). However, because VMA policies refer | ||
85 | to a specific region of a task's address space, and because the address | ||
86 | space is discarded and recreated on exec*(), VMA policies are NOT | ||
87 | inheritable across exec(). Thus, only NUMA-aware applications may | ||
88 | use VMA policies. | ||
89 | |||
90 | A task may install a new VMA policy on a sub-range of a previously | ||
91 | mmap()ed region. When this happens, Linux splits the existing virtual | ||
92 | memory area into 2 or 3 VMAs, each with its own policy. | ||
93 | |||
94 | By default, VMA policy applies only to pages allocated after the policy | ||
95 | is installed. Any pages already faulted into the VMA range remain | ||
96 | where they were allocated based on the policy at the time they were | ||
97 | allocated. However, since 2.6.16, Linux supports page migration via | ||
98 | the mbind() system call, so that page contents can be moved to match | ||
99 | a newly installed policy. | ||
100 | |||
101 | Shared Policy: Conceptually, shared policies apply to "memory objects" | ||
102 | mapped shared into one or more tasks' distinct address spaces. An | ||
103 | application installs shared policies the same way as VMA policies--using | ||
104 | the mbind() system call specifying a range of virtual addresses that map | ||
105 | the shared object. However, unlike VMA policies, which can be considered | ||
106 | to be an attribute of a range of a task's address space, shared policies | ||
107 | apply directly to the shared object. Thus, all tasks that attach to the | ||
108 | object share the policy, and all pages allocated for the shared object, | ||
109 | by any task, will obey the shared policy. | ||
110 | |||
111 | As of 2.6.22, only shared memory segments, created by shmget() or | ||
112 | mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared | ||
113 | policy support was added to Linux, the associated data structures were | ||
114 | added to hugetlbfs shmem segments. At the time, hugetlbfs did not | ||
115 | support allocation at fault time--a.k.a lazy allocation--so hugetlbfs | ||
116 | shmem segments were never "hooked up" to the shared policy support. | ||
117 | Although hugetlbfs segments now support lazy allocation, their support | ||
118 | for shared policy has not been completed. | ||
119 | |||
120 | As mentioned above [re: VMA policies], allocations of page cache | ||
121 | pages for regular files mmap()ed with MAP_SHARED ignore any VMA | ||
122 | policy installed on the virtual address range backed by the shared | ||
123 | file mapping. Rather, shared page cache pages, including pages backing | ||
124 | private mappings that have not yet been written by the task, follow | ||
125 | task policy, if any, else System Default Policy. | ||
126 | |||
127 | The shared policy infrastructure supports different policies on subset | ||
128 | ranges of the shared object. However, Linux still splits the VMA of | ||
129 | the task that installs the policy for each range of distinct policy. | ||
130 | Thus, different tasks that attach to a shared memory segment can have | ||
131 | different VMA configurations mapping that one shared object. This | ||
132 | can be seen by examining the /proc/<pid>/numa_maps of tasks sharing | ||
133 | a shared memory region, when one task has installed shared policy on | ||
134 | one or more ranges of the region. | ||
135 | |||
136 | Components of Memory Policies | ||
137 | |||
138 | A Linux memory policy is a tuple consisting of a "mode" and an optional set | ||
139 | of nodes. The mode determines the behavior of the policy, while the | ||
140 | optional set of nodes can be viewed as the arguments to the behavior. | ||
141 | |||
142 | Internally, memory policies are implemented by a reference counted | ||
143 | structure, struct mempolicy. Details of this structure will be discussed | ||
144 | in context, below, as required to explain the behavior. | ||
145 | |||
146 | Note: in some functions AND in the struct mempolicy itself, the mode | ||
147 | is called "policy". However, to avoid confusion with the policy tuple, | ||
148 | this document will continue to use the term "mode". | ||
149 | |||
150 | Linux memory policy supports the following 4 behavioral modes: | ||
151 | |||
152 | Default Mode--MPOL_DEFAULT: The behavior specified by this mode is | ||
153 | context or scope dependent. | ||
154 | |||
155 | As mentioned in the Policy Scope section above, during normal | ||
156 | system operation, the System Default Policy is hard coded to | ||
157 | contain the Default mode. | ||
158 | |||
159 | In this context, default mode means "local" allocation--that is | ||
160 | attempt to allocate the page from the node associated with the cpu | ||
161 | where the fault occurs. If the "local" node has no memory, or the | ||
162 | node's memory can be exhausted [no free pages available], local | ||
163 | allocation will "fallback to"--attempt to allocate pages from-- | ||
164 | "nearby" nodes, in order of increasing "distance". | ||
165 | |||
166 | Implementation detail -- subject to change: "Fallback" uses | ||
167 | a per node list of sibling nodes--called zonelists--built at | ||
168 | boot time, or when nodes or memory are added or removed from | ||
169 | the system [memory hotplug]. These per node zonelists are | ||
170 | constructed with nodes in order of increasing distance based | ||
171 | on information provided by the platform firmware. | ||
172 | |||
173 | When a task/process policy or a shared policy contains the Default | ||
174 | mode, this also means "local allocation", as described above. | ||
175 | |||
176 | In the context of a VMA, Default mode means "fall back to task | ||
177 | policy"--which may or may not specify Default mode. Thus, Default | ||
178 | mode can not be counted on to mean local allocation when used | ||
179 | on a non-shared region of the address space. However, see | ||
180 | MPOL_PREFERRED below. | ||
181 | |||
182 | The Default mode does not use the optional set of nodes. | ||
183 | |||
184 | MPOL_BIND: This mode specifies that memory must come from the | ||
185 | set of nodes specified by the policy. | ||
186 | |||
187 | The memory policy APIs do not specify an order in which the nodes | ||
188 | will be searched. However, unlike "local allocation", the Bind | ||
189 | policy does not consider the distance between the nodes. Rather, | ||
190 | allocations will fallback to the nodes specified by the policy in | ||
191 | order of numeric node id. Like everything in Linux, this is subject | ||
192 | to change. | ||
193 | |||
194 | MPOL_PREFERRED: This mode specifies that the allocation should be | ||
195 | attempted from the single node specified in the policy. If that | ||
196 | allocation fails, the kernel will search other nodes, exactly as | ||
197 | it would for a local allocation that started at the preferred node | ||
198 | in increasing distance from the preferred node. "Local" allocation | ||
199 | policy can be viewed as a Preferred policy that starts at the node | ||
200 | containing the cpu where the allocation takes place. | ||
201 | |||
202 | Internally, the Preferred policy uses a single node--the | ||
203 | preferred_node member of struct mempolicy. A "distinguished" | ||
204 | value of this preferred_node, currently '-1', is interpreted | ||
205 | as "the node containing the cpu where the allocation takes | ||
206 | place"--local allocation. This is the way to specify | ||
207 | local allocation for a specific range of addresses--i.e. for | ||
208 | VMA policies. | ||
209 | |||
210 | MPOL_INTERLEAVE: This mode specifies that page allocations be | ||
211 | interleaved, on a page granularity, across the nodes specified in | ||
212 | the policy. This mode also behaves slightly differently, based on | ||
213 | the context where it is used: | ||
214 | |||
215 | For allocation of anonymous pages and shared memory pages, | ||
216 | Interleave mode indexes the set of nodes specified by the policy | ||
217 | using the page offset of the faulting address into the segment | ||
218 | [VMA] containing the address modulo the number of nodes specified | ||
219 | by the policy. It then attempts to allocate a page, starting at | ||
220 | the selected node, as if the node had been specified by a Preferred | ||
221 | policy or had been selected by a local allocation. That is, | ||
222 | allocation will follow the per node zonelist. | ||
223 | |||
224 | For allocation of page cache pages, Interleave mode indexes the set | ||
225 | of nodes specified by the policy using a node counter maintained | ||
226 | per task. This counter wraps around to the lowest specified node | ||
227 | after it reaches the highest specified node. This will tend to | ||
228 | spread the pages out over the nodes specified by the policy based | ||
229 | on the order in which they are allocated, rather than based on any | ||
230 | page offset into an address range or file. During system boot up, | ||
231 | the temporary interleaved system default policy works in this | ||
232 | mode. | ||
233 | |||
234 | MEMORY POLICY APIs | ||
235 | |||
236 | Linux supports 3 system calls for controlling memory policy. These APIs | ||
237 | always affect only the calling task, the calling task's address space, or | ||
238 | some shared object mapped into the calling task's address space. | ||
239 | |||
240 | Note: the headers that define these APIs and the parameter data types | ||
241 | for user space applications reside in a package that is not part of | ||
242 | the Linux kernel. The kernel system call interfaces, with the 'sys_' | ||
243 | prefix, are defined in <linux/syscalls.h>; the mode and flag | ||
244 | definitions are defined in <linux/mempolicy.h>. | ||
245 | |||
246 | Set [Task] Memory Policy: | ||
247 | |||
248 | long set_mempolicy(int mode, const unsigned long *nmask, | ||
249 | unsigned long maxnode); | ||
250 | |||
251 | Sets the calling task's "task/process memory policy" to the mode | ||
252 | specified by the 'mode' argument and the set of nodes defined | ||
253 | by 'nmask'. 'nmask' points to a bit mask of node ids containing | ||
254 | at least 'maxnode' ids. | ||
255 | |||
256 | See the set_mempolicy(2) man page for more details | ||
257 | |||
258 | |||
259 | Get [Task] Memory Policy or Related Information | ||
260 | |||
261 | long get_mempolicy(int *mode, | ||
262 | const unsigned long *nmask, unsigned long maxnode, | ||
263 | void *addr, int flags); | ||
264 | |||
265 | Queries the "task/process memory policy" of the calling task, or | ||
266 | the policy or location of a specified virtual address, depending | ||
267 | on the 'flags' argument. | ||
268 | |||
269 | See the get_mempolicy(2) man page for more details | ||
270 | |||
271 | |||
272 | Install VMA/Shared Policy for a Range of Task's Address Space | ||
273 | |||
274 | long mbind(void *start, unsigned long len, int mode, | ||
275 | const unsigned long *nmask, unsigned long maxnode, | ||
276 | unsigned flags); | ||
277 | |||
278 | mbind() installs the policy specified by (mode, nmask, maxnode) as | ||
279 | a VMA policy for the range of the calling task's address space | ||
280 | specified by the 'start' and 'len' arguments. Additional actions | ||
281 | may be requested via the 'flags' argument. | ||
282 | |||
283 | See the mbind(2) man page for more details. | ||
284 | |||
285 | MEMORY POLICY COMMAND LINE INTERFACE | ||
286 | |||
287 | Although not strictly part of the Linux implementation of memory policy, | ||
288 | a command line tool, numactl(8), exists that allows one to: | ||
289 | |||
290 | + set the task policy for a specified program via set_mempolicy(2), fork(2) and | ||
291 | exec(2) | ||
292 | |||
293 | + set the shared policy for a shared memory segment via mbind(2) | ||
294 | |||
295 | The numactl(8) tool is packaged with the run-time version of the library | ||
296 | containing the memory policy system call wrappers. Some distributions | ||
297 | package the headers and compile-time libraries in a separate development | ||
298 | package. | ||
299 | |||
300 | |||
301 | MEMORY POLICIES AND CPUSETS | ||
302 | |||
303 | Memory policies work within cpusets as described above. For memory policies | ||
304 | that require a node or set of nodes, the nodes are restricted to the set of | ||
305 | nodes whose memories are allowed by the cpuset constraints. If the | ||
306 | intersection of the set of nodes specified for the policy and the set of nodes | ||
307 | allowed by the cpuset is the empty set, the policy is considered invalid and | ||
308 | cannot be installed. | ||
309 | |||
310 | The interaction of memory policies and cpusets can be problematic for a | ||
311 | couple of reasons: | ||
312 | |||
313 | 1) the memory policy APIs take physical node id's as arguments. However, the | ||
314 | memory policy APIs do not provide a way to determine what nodes are valid | ||
315 | in the context where the application is running. An application MAY consult | ||
316 | the cpuset file system [directly or via an out of tree, and not generally | ||
317 | available, libcpuset API] to obtain this information, but then the | ||
318 | application must be aware that it is running in a cpuset and use what are | ||
319 | intended primarily as administrative APIs. | ||
320 | |||
321 | However, as long as the policy specifies at least one node that is valid | ||
322 | in the controlling cpuset, the policy can be used. | ||
323 | |||
324 | 2) when tasks in two cpusets share access to a memory region, such as shared | ||
325 | memory segments created by shmget() or mmap() with the MAP_ANONYMOUS and | ||
326 | MAP_SHARED flags, and any of the tasks install shared policy on the region, | ||
327 | only nodes whose memories are allowed in both cpusets may be used in the | ||
328 | policies. Again, obtaining this information requires "stepping outside" | ||
329 | the memory policy APIs, as well as knowing in what cpusets other tasks might | ||
330 | be attaching to the shared region, to use the cpuset information. | ||
331 | Furthermore, if the cpusets' allowed memory sets are disjoint, "local" | ||
332 | allocation is the only valid policy. | ||
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index d4f21ffd1404..1af7bd5a2183 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c | |||
@@ -396,7 +396,7 @@ void report(struct slabinfo *s) | |||
396 | if (strcmp(s->name, "*") == 0) | 396 | if (strcmp(s->name, "*") == 0) |
397 | return; | 397 | return; |
398 | 398 | ||
399 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %d\n", | 399 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n", |
400 | s->name, s->aliases, s->order, s->objects); | 400 | s->name, s->aliases, s->order, s->objects); |
401 | if (s->hwcache_align) | 401 | if (s->hwcache_align) |
402 | printf("** Hardware cacheline aligned\n"); | 402 | printf("** Hardware cacheline aligned\n"); |
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX new file mode 100644 index 000000000000..c3ea47e507fe --- /dev/null +++ b/Documentation/watchdog/00-INDEX | |||
@@ -0,0 +1,10 @@ | |||
1 | 00-INDEX | ||
2 | - this file. | ||
3 | pcwd-watchdog.txt | ||
4 | - documentation for Berkshire Products PC Watchdog ISA cards. | ||
5 | src/ | ||
6 | - directory holding watchdog related example programs. | ||
7 | watchdog-api.txt | ||
8 | - description of the Linux Watchdog driver API. | ||
9 | wdt.txt | ||
10 | - description of the Watchdog Timer Interfaces for Linux. | ||