diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/DocBook/uio-howto.tmpl | 4 | ||||
-rw-r--r-- | Documentation/fb/pvr2fb.txt | 22 | ||||
-rw-r--r-- | Documentation/i386/zero-page.txt | 10 | ||||
-rw-r--r-- | Documentation/kbuild/kconfig-language.txt | 9 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 2 | ||||
-rw-r--r-- | Documentation/lguest/Makefile | 4 | ||||
-rw-r--r-- | Documentation/memory-hotplug.txt | 322 | ||||
-rw-r--r-- | Documentation/sched-design-CFS.txt | 2 | ||||
-rw-r--r-- | Documentation/sched-nice-design.txt | 108 | ||||
-rw-r--r-- | Documentation/sysrq.txt | 4 | ||||
-rw-r--r-- | Documentation/vm/slabinfo.c | 2 |
11 files changed, 470 insertions, 19 deletions
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl index e3bb29a8d8dd..c119484258b8 100644 --- a/Documentation/DocBook/uio-howto.tmpl +++ b/Documentation/DocBook/uio-howto.tmpl | |||
@@ -133,10 +133,6 @@ interested in translating it, please email me | |||
133 | <para>updates of your driver can take place without recompiling | 133 | <para>updates of your driver can take place without recompiling |
134 | the kernel.</para> | 134 | the kernel.</para> |
135 | </listitem> | 135 | </listitem> |
136 | <listitem> | ||
137 | <para>if you need to keep some parts of your driver closed source, | ||
138 | you can do so without violating the GPL license on the kernel.</para> | ||
139 | </listitem> | ||
140 | </itemizedlist> | 136 | </itemizedlist> |
141 | 137 | ||
142 | <sect1 id="how_uio_works"> | 138 | <sect1 id="how_uio_works"> |
diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt index 2bf6c2321c2d..36bdeff585e2 100644 --- a/Documentation/fb/pvr2fb.txt +++ b/Documentation/fb/pvr2fb.txt | |||
@@ -9,14 +9,13 @@ one found in the Dreamcast. | |||
9 | Advantages: | 9 | Advantages: |
10 | 10 | ||
11 | * It provides a nice large console (128 cols + 48 lines with 1024x768) | 11 | * It provides a nice large console (128 cols + 48 lines with 1024x768) |
12 | without using tiny, unreadable fonts. | 12 | without using tiny, unreadable fonts (NOT on the Dreamcast) |
13 | * You can run XF86_FBDev on top of /dev/fb0 | 13 | * You can run XF86_FBDev on top of /dev/fb0 |
14 | * Most important: boot logo :-) | 14 | * Most important: boot logo :-) |
15 | 15 | ||
16 | Disadvantages: | 16 | Disadvantages: |
17 | 17 | ||
18 | * Driver is currently limited to the Dreamcast PowerVR 2 implementation | 18 | * Driver is largely untested on non-Dreamcast systems. |
19 | at the time of this writing. | ||
20 | 19 | ||
21 | Configuration | 20 | Configuration |
22 | ============= | 21 | ============= |
@@ -29,11 +28,16 @@ Accepted options: | |||
29 | font:X - default font to use. All fonts are supported, including the | 28 | font:X - default font to use. All fonts are supported, including the |
30 | SUN12x22 font which is very nice at high resolutions. | 29 | SUN12x22 font which is very nice at high resolutions. |
31 | 30 | ||
32 | mode:X - default video mode. The following video modes are supported: | ||
33 | 640x240-60, 640x480-60. | ||
34 | 31 | ||
32 | mode:X - default video mode with format [xres]x[yres]-<bpp>@<refresh rate> | ||
33 | The following video modes are supported: | ||
34 | 640x480-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast | ||
35 | defaults to 640x480-16@60. At the time of writing the | ||
36 | 24bpp and 32bpp modes function poorly. Work to fix that is | ||
37 | ongoing | ||
38 | |||
35 | Note: the 640x240 mode is currently broken, and should not be | 39 | Note: the 640x240 mode is currently broken, and should not be |
36 | used for any reason. It is only mentioned as a reference. | 40 | used for any reason. It is only mentioned here as a reference. |
37 | 41 | ||
38 | inverse - invert colors on screen (for LCD displays) | 42 | inverse - invert colors on screen (for LCD displays) |
39 | 43 | ||
@@ -52,10 +56,10 @@ output:X - output type. This can be any of the following: pal, ntsc, and | |||
52 | X11 | 56 | X11 |
53 | === | 57 | === |
54 | 58 | ||
55 | XF86_FBDev should work, in theory. At the time of this writing it is | 59 | XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet |
56 | totally untested and may or may not even portray the beginnings of | 60 | on any 2.6 series kernel. |
57 | working. If you end up testing this, please let me know! | ||
58 | 61 | ||
59 | -- | 62 | -- |
60 | Paul Mundt <lethal@linuxdc.org> | 63 | Paul Mundt <lethal@linuxdc.org> |
64 | Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk> | ||
61 | 65 | ||
diff --git a/Documentation/i386/zero-page.txt b/Documentation/i386/zero-page.txt index 75b3680c41eb..6c0817c45683 100644 --- a/Documentation/i386/zero-page.txt +++ b/Documentation/i386/zero-page.txt | |||
@@ -1,3 +1,13 @@ | |||
1 | --------------------------------------------------------------------------- | ||
2 | !!!!!!!!!!!!!!!WARNING!!!!!!!! | ||
3 | The zero page is a kernel internal data structure, not a stable ABI. It might change | ||
4 | without warning and the kernel has no way to detect old versions of it. | ||
5 | If you're writing some external code like a boot loader you should only use | ||
6 | the stable versioned real mode boot protocol described in boot.txt. Otherwise the kernel | ||
7 | might break you at any time. | ||
8 | !!!!!!!!!!!!!WARNING!!!!!!!!!!! | ||
9 | ---------------------------------------------------------------------------- | ||
10 | |||
1 | Summary of boot_params layout (kernel point of view) | 11 | Summary of boot_params layout (kernel point of view) |
2 | ( collected by Hans Lermen and Martin Mares ) | 12 | ( collected by Hans Lermen and Martin Mares ) |
3 | 13 | ||
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 536d5bfbdb8d..fe8b0c4892cf 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt | |||
@@ -98,6 +98,15 @@ applicable everywhere (see syntax). | |||
98 | times, the limit is set to the largest selection. | 98 | times, the limit is set to the largest selection. |
99 | Reverse dependencies can only be used with boolean or tristate | 99 | Reverse dependencies can only be used with boolean or tristate |
100 | symbols. | 100 | symbols. |
101 | Note: | ||
102 | select is evil.... select will by brute force set a symbol | ||
103 | equal to 'y' without visiting the dependencies. So abusing | ||
104 | select you are able to select a symbol FOO even if FOO depends | ||
105 | on BAR that is not set. In general use select only for | ||
106 | non-visible symbols (no prompts anywhere) and for symbols with | ||
107 | no dependencies. That will limit the usefulness but on the | ||
108 | other hand avoid the illegal configurations all over. kconfig | ||
109 | should one day warn about such things. | ||
101 | 110 | ||
102 | - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] | 111 | - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] |
103 | This allows to limit the range of possible input values for int | 112 | This allows to limit the range of possible input values for int |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index efdb42fd3fb8..a326487a3ab5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1922,7 +1922,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1922 | See header of drivers/scsi/wd7000.c. | 1922 | See header of drivers/scsi/wd7000.c. |
1923 | 1923 | ||
1924 | wdt= [WDT] Watchdog | 1924 | wdt= [WDT] Watchdog |
1925 | See Documentation/watchdog/watchdog.txt. | 1925 | See Documentation/watchdog/wdt.txt. |
1926 | 1926 | ||
1927 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. | 1927 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. |
1928 | xd_geo= See header of drivers/block/xd.c. | 1928 | xd_geo= See header of drivers/block/xd.c. |
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index 31e794ef5f98..c0b7a4556390 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile | |||
@@ -13,7 +13,9 @@ LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) | |||
13 | 13 | ||
14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds | 14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds |
15 | LDLIBS:=-lz | 15 | LDLIBS:=-lz |
16 | 16 | # Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and | |
17 | # not others (eg. FC7). | ||
18 | LDFLAGS+=-static | ||
17 | all: lguest.lds lguest | 19 | all: lguest.lds lguest |
18 | 20 | ||
19 | # The linker script on x86 is so complex the only way of creating one | 21 | # The linker script on x86 is so complex the only way of creating one |
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt new file mode 100644 index 000000000000..5fbcc22c98e9 --- /dev/null +++ b/Documentation/memory-hotplug.txt | |||
@@ -0,0 +1,322 @@ | |||
1 | ============== | ||
2 | Memory Hotplug | ||
3 | ============== | ||
4 | |||
5 | Last Updated: Jul 28 2007 | ||
6 | |||
7 | This document is about memory hotplug including how-to-use and current status. | ||
8 | Because Memory Hotplug is still under development, contents of this text will | ||
9 | be changed often. | ||
10 | |||
11 | 1. Introduction | ||
12 | 1.1 purpose of memory hotplug | ||
13 | 1.2. Phases of memory hotplug | ||
14 | 1.3. Unit of Memory online/offline operation | ||
15 | 2. Kernel Configuration | ||
16 | 3. sysfs files for memory hotplug | ||
17 | 4. Physical memory hot-add phase | ||
18 | 4.1 Hardware(Firmware) Support | ||
19 | 4.2 Notify memory hot-add event by hand | ||
20 | 5. Logical Memory hot-add phase | ||
21 | 5.1. State of memory | ||
22 | 5.2. How to online memory | ||
23 | 6. Logical memory remove | ||
24 | 6.1 Memory offline and ZONE_MOVABLE | ||
25 | 6.2. How to offline memory | ||
26 | 7. Physical memory remove | ||
27 | 8. Future Work List | ||
28 | |||
29 | Note(1): x86_64 has a special implementation for memory hotplug. | ||
30 | This text does not describe it. | ||
31 | Note(2): This text assumes that sysfs is mounted at /sys. | ||
32 | |||
33 | |||
34 | --------------- | ||
35 | 1. Introduction | ||
36 | --------------- | ||
37 | |||
38 | 1.1 purpose of memory hotplug | ||
39 | ------------ | ||
40 | Memory Hotplug allows users to increase/decrease the amount of memory. | ||
41 | Generally, there are two purposes. | ||
42 | |||
43 | (A) For changing the amount of memory. | ||
44 | This is to allow a feature like capacity on demand. | ||
45 | (B) For installing/removing DIMMs or NUMA-nodes physically. | ||
46 | This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc. | ||
47 | |||
48 | (A) is required by highly virtualized environments and (B) is required by | ||
49 | hardware which supports memory power management. | ||
50 | |||
51 | Linux memory hotplug is designed for both purposes. | ||
52 | |||
53 | |||
54 | 1.2. Phases of memory hotplug | ||
55 | --------------- | ||
56 | There are 2 phases in Memory Hotplug. | ||
57 | 1) Physical Memory Hotplug phase | ||
58 | 2) Logical Memory Hotplug phase. | ||
59 | |||
60 | The First phase is to communicate hardware/firmware and make/erase | ||
61 | environment for hotplugged memory. Basically, this phase is necessary | ||
62 | for the purpose (B), but this is good phase for communication between | ||
63 | highly virtualized environments too. | ||
64 | |||
65 | When memory is hotplugged, the kernel recognizes new memory, makes new memory | ||
66 | management tables, and makes sysfs files for new memory's operation. | ||
67 | |||
68 | If firmware supports notification of connection of new memory to OS, | ||
69 | this phase is triggered automatically. ACPI can notify this event. If not, | ||
70 | the "probe" operation by the system administrator is used instead. | ||
71 | (see Section 4.). | ||
72 | |||
73 | Logical Memory Hotplug phase is to change memory state into | ||
74 | available/unavailable for users. Amount of memory from user's view is | ||
75 | changed by this phase. The kernel makes all memory in it as free pages | ||
76 | when a memory range is available. | ||
77 | |||
78 | In this document, this phase is described as online/offline. | ||
79 | |||
80 | Logical Memory Hotplug phase is triggered by a write to a sysfs file by the system | ||
81 | administrator. For the hot-add case, it must be executed after Physical Hotplug | ||
82 | phase by hand. | ||
83 | (However, if you write udev's hotplug scripts for memory hotplug, these | ||
84 | phases can be executed in a seamless way.) | ||
85 | |||
86 | |||
87 | 1.3. Unit of Memory online/offline operation | ||
88 | ------------ | ||
89 | Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory | ||
90 | into chunks of the same size. The chunk is called a "section". The size of | ||
91 | a section is architecture dependent. For example, power uses 16MiB, ia64 uses | ||
92 | 1GiB. The unit of online/offline operation is "one section". (see Section 3.) | ||
93 | |||
94 | To determine the size of sections, please read this file: | ||
95 | |||
96 | /sys/devices/system/memory/block_size_bytes | ||
97 | |||
98 | This file shows the size of sections in bytes. | ||
99 | |||
100 | ----------------------- | ||
101 | 2. Kernel Configuration | ||
102 | ----------------------- | ||
103 | To use the memory hotplug feature, the kernel must be compiled with the following | ||
104 | config options. | ||
105 | |||
106 | - For all memory hotplug | ||
107 | Memory model -> Sparse Memory (CONFIG_SPARSEMEM) | ||
108 | Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG) | ||
109 | |||
110 | - To enable memory removal, the following are also necessary | ||
111 | Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE) | ||
112 | Page Migration (CONFIG_MIGRATION) | ||
113 | |||
114 | - For ACPI memory hotplug, the following are also necessary | ||
115 | Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY) | ||
116 | This option can be kernel module. | ||
117 | |||
118 | - As a related configuration, if your box has a feature of NUMA-node hotplug | ||
119 | via ACPI, then this option is necessary too. | ||
120 | ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu) | ||
121 | (CONFIG_ACPI_CONTAINER). | ||
122 | This option can be kernel module too. | ||
123 | |||
124 | -------------------------------- | ||
125 | 3 sysfs files for memory hotplug | ||
126 | -------------------------------- | ||
127 | All sections have their device information under /sys/devices/system/memory as | ||
128 | |||
129 | /sys/devices/system/memory/memoryXXX | ||
130 | (XXX is section id.) | ||
131 | |||
132 | Now, XXX is defined as start_address_of_section / section_size. | ||
133 | |||
134 | For example, assume 1GiB section size. A device for a memory starting at | ||
135 | 0x100000000 is /sys/devices/system/memory/memory4 | ||
136 | (0x100000000 / 1GiB = 4) | ||
137 | This device covers address range [0x100000000 ... 0x140000000) | ||
138 | |||
139 | Under each section, you can see 3 files. | ||
140 | |||
141 | /sys/devices/system/memory/memoryXXX/phys_index | ||
142 | /sys/devices/system/memory/memoryXXX/phys_device | ||
143 | /sys/devices/system/memory/memoryXXX/state | ||
144 | |||
145 | 'phys_index' : read-only and contains section id, same as XXX. | ||
146 | 'state' : read-write | ||
147 | at read: contains online/offline state of memory. | ||
148 | at write: user can specify "online", "offline" command | ||
149 | 'phys_device': read-only: designed to show the name of physical memory device. | ||
150 | This is not well implemented now. | ||
151 | |||
152 | NOTE: | ||
153 | These directories/files appear after physical memory hotplug phase. | ||
154 | |||
155 | |||
156 | -------------------------------- | ||
157 | 4. Physical memory hot-add phase | ||
158 | -------------------------------- | ||
159 | |||
160 | 4.1 Hardware(Firmware) Support | ||
161 | ------------ | ||
162 | On x86_64/ia64 platform, memory hotplug by ACPI is supported. | ||
163 | |||
164 | In general, the firmware (ACPI) which supports memory hotplug defines | ||
165 | memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80, | ||
166 | Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev | ||
167 | script. This will be done automatically. | ||
168 | |||
169 | But scripts for memory hotplug are not contained in generic udev package(now). | ||
170 | You may have to write it by yourself or online/offline memory by hand. | ||
171 | Please see "How to online memory", "How to offline memory" in this text. | ||
172 | |||
173 | If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004", | ||
174 | "PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler | ||
175 | calls hotplug code for all of objects which are defined in it. | ||
176 | If memory device is found, memory hotplug code will be called. | ||
177 | |||
178 | |||
179 | 4.2 Notify memory hot-add event by hand | ||
180 | ------------ | ||
181 | In some environments, especially virtualized environment, firmware will not | ||
182 | notify memory hotplug event to the kernel. For such environment, "probe" | ||
183 | interface is supported. This interface depends on CONFIG_ARCH_MEMORY_PROBE. | ||
184 | |||
185 | Now, CONFIG_ARCH_MEMORY_PROBE is supported only by powerpc, but it does not | ||
186 | contain highly architecture-specific code. Please add the config if you need the "probe" | ||
187 | interface. | ||
188 | |||
189 | Probe interface is located at | ||
190 | /sys/devices/system/memory/probe | ||
191 | |||
192 | You can tell the physical address of new memory to the kernel by | ||
193 | |||
194 | % echo start_address_of_new_memory > /sys/devices/system/memory/probe | ||
195 | |||
196 | Then, [start_address_of_new_memory, start_address_of_new_memory + section_size) | ||
197 | memory range is hot-added. In this case, hotplug script is not called (in | ||
198 | current implementation). You'll have to online memory by yourself. | ||
199 | Please see "How to online memory" in this text. | ||
200 | |||
201 | |||
202 | |||
203 | ------------------------------ | ||
204 | 5. Logical Memory hot-add phase | ||
205 | ------------------------------ | ||
206 | |||
207 | 5.1. State of memory | ||
208 | ------------ | ||
209 | To see (online/offline) state of memory section, read 'state' file. | ||
210 | |||
211 | % cat /sys/devices/system/memory/memoryXXX/state | ||
212 | |||
213 | |||
214 | If the memory section is online, you'll read "online". | ||
215 | If the memory section is offline, you'll read "offline". | ||
216 | |||
217 | |||
218 | 5.2. How to online memory | ||
219 | ------------ | ||
220 | Even if the memory is hot-added, it is not at ready-to-use state. | ||
221 | For using newly added memory, you have to "online" the memory section. | ||
222 | |||
223 | For onlining, you have to write "online" to the section's state file as: | ||
224 | |||
225 | % echo online > /sys/devices/system/memory/memoryXXX/state | ||
226 | |||
227 | After this, section memoryXXX's state will be 'online' and the amount of | ||
228 | available memory will be increased. | ||
229 | |||
230 | Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA). | ||
231 | This may be changed in future. | ||
232 | |||
233 | |||
234 | |||
235 | ------------------------ | ||
236 | 6. Logical memory remove | ||
237 | ------------------------ | ||
238 | |||
239 | 6.1 Memory offline and ZONE_MOVABLE | ||
240 | ------------ | ||
241 | Memory offlining is more complicated than memory online. Because memory offline | ||
242 | has to make the whole memory section be unused, memory offline can fail if | ||
243 | the section includes memory which cannot be freed. | ||
244 | |||
245 | In general, memory offline can use 2 techniques. | ||
246 | |||
247 | (1) reclaim and free all memory in the section. | ||
248 | (2) migrate all pages in the section. | ||
249 | |||
250 | In the current implementation, Linux's memory offline uses method (2), freeing | ||
251 | all pages in the section by page migration. But not all pages are | ||
252 | migratable. Under current Linux, migratable pages are anonymous pages and | ||
253 | page caches. For offlining a section by migration, the kernel has to guarantee | ||
254 | that the section contains only migratable pages. | ||
255 | |||
256 | Now, a boot option for making a section which consists of migratable pages is | ||
257 | supported. By specifying "kernelcore=" or "movablecore=" boot option, you can | ||
258 | create ZONE_MOVABLE...a zone which is just used for movable pages. | ||
259 | (See also Documentation/kernel-parameters.txt) | ||
260 | |||
261 | Assume the system has "TOTAL" amount of memory at boot time, this boot option | ||
262 | creates ZONE_MOVABLE as following. | ||
263 | |||
264 | 1) When kernelcore=YYYY boot option is used, | ||
265 | Size of memory not for movable pages (not for offline) is YYYY. | ||
266 | Size of memory for movable pages (for offline) is TOTAL-YYYY. | ||
267 | |||
268 | 2) When movablecore=ZZZZ boot option is used, | ||
269 | Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ. | ||
270 | Size of memory for movable pages (for offline) is ZZZZ. | ||
271 | |||
272 | |||
273 | Note) Unfortunately, there is no information to show which section belongs | ||
274 | to ZONE_MOVABLE. This is TBD. | ||
275 | |||
276 | |||
277 | 6.2. How to offline memory | ||
278 | ------------ | ||
279 | You can offline a section by using the same sysfs interface that was used in | ||
280 | memory onlining. | ||
281 | |||
282 | % echo offline > /sys/devices/system/memory/memoryXXX/state | ||
283 | |||
284 | If offline succeeds, the state of the memory section is changed to be "offline". | ||
285 | If it fails, some error code (like -EBUSY) will be returned by the kernel. | ||
286 | Even if a section does not belong to ZONE_MOVABLE, you can try to offline it. | ||
287 | If it doesn't contain 'unmovable' memory, you'll get success. | ||
288 | |||
289 | A section under ZONE_MOVABLE is considered to be able to be offlined easily. | ||
290 | But under some busy state, it may return -EBUSY. Even if a memory section | ||
291 | cannot be offlined due to -EBUSY, you can retry offlining it and may be able to | ||
292 | offline it (or not). | ||
293 | (For example, a page is referred to by some kernel internal call and released | ||
294 | soon.) | ||
295 | |||
296 | Consideration: | ||
297 | Memory hotplug's design direction is to make the possibility of memory offlining | ||
298 | higher and to guarantee unplugging memory under any situation. But it needs | ||
299 | more work. Returning -EBUSY under some situation may be good because the user | ||
300 | can decide to retry more or not by himself. Currently, memory offlining code | ||
301 | does some amount of retry with 120 seconds timeout. | ||
302 | |||
303 | ------------------------- | ||
304 | 7. Physical memory remove | ||
305 | ------------------------- | ||
306 | Need more implementation yet.... | ||
307 | - Notification completion of remove works by OS to firmware. | ||
308 | - Guard from remove if not yet. | ||
309 | |||
310 | -------------- | ||
311 | 8. Future Work | ||
312 | -------------- | ||
313 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like | ||
314 | sysctl or new control file. | ||
315 | - showing memory section and physical device relationship. | ||
316 | - showing memory section and node relationship (maybe good for NUMA) | ||
317 | - showing memory section is under ZONE_MOVABLE or not | ||
318 | - test memory offlining and make it better. | ||
319 | - support HugeTLB page migration and offlining. | ||
320 | - memmap removing at memory offline. | ||
321 | - physical remove memory. | ||
322 | |||
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 16feebb7bdc0..84901e7c0508 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt | |||
@@ -83,7 +83,7 @@ Some implementation details: | |||
83 | CFS uses nanosecond granularity accounting and does not rely on any | 83 | CFS uses nanosecond granularity accounting and does not rely on any |
84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of | 84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of |
85 | 'timeslices' and has no heuristics whatsoever. There is only one | 85 | 'timeslices' and has no heuristics whatsoever. There is only one |
86 | central tunable: | 86 | central tunable (you have to switch on CONFIG_SCHED_DEBUG): |
87 | 87 | ||
88 | /proc/sys/kernel/sched_granularity_ns | 88 | /proc/sys/kernel/sched_granularity_ns |
89 | 89 | ||
diff --git a/Documentation/sched-nice-design.txt b/Documentation/sched-nice-design.txt new file mode 100644 index 000000000000..e2bae5a577e3 --- /dev/null +++ b/Documentation/sched-nice-design.txt | |||
@@ -0,0 +1,108 @@ | |||
1 | This document explains the thinking about the revamped and streamlined | ||
2 | nice-levels implementation in the new Linux scheduler. | ||
3 | |||
4 | Nice levels were always pretty weak under Linux and people continuously | ||
5 | pestered us to make nice +19 tasks use up much less CPU time. | ||
6 | |||
7 | Unfortunately that was not that easy to implement under the old | ||
8 | scheduler, (otherwise we'd have done it long ago) because nice level | ||
9 | support was historically coupled to timeslice length, and timeslice | ||
10 | units were driven by the HZ tick, so the smallest timeslice was 1/HZ. | ||
11 | |||
12 | In the O(1) scheduler (in 2003) we changed negative nice levels to be | ||
13 | much stronger than they were before in 2.4 (and people were happy about | ||
14 | that change), and we also intentionally calibrated the linear timeslice | ||
15 | rule so that nice +19 level would be _exactly_ 1 jiffy. To better | ||
16 | understand it, the timeslice graph went like this (cheesy ASCII art | ||
17 | alert!): | ||
18 | |||
19 | |||
20 | A | ||
21 | \ | [timeslice length] | ||
22 | \ | | ||
23 | \ | | ||
24 | \ | | ||
25 | \ | | ||
26 | \|___100msecs | ||
27 | |^ . _ | ||
28 | | ^ . _ | ||
29 | | ^ . _ | ||
30 | -*----------------------------------*-----> [nice level] | ||
31 | -20 | +19 | ||
32 | | | ||
33 | | | ||
34 | |||
35 | So that if someone wanted to really renice tasks, +19 would give a much | ||
36 | bigger hit than the normal linear rule would do. (The solution of | ||
37 | changing the ABI to extend priorities was discarded early on.) | ||
38 | |||
39 | This approach worked to some degree for some time, but later on with | ||
40 | HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which | ||
41 | we felt to be a bit excessive. Excessive _not_ because it's too small of | ||
42 | a CPU utilization, but because it causes too frequent (once per | ||
43 | millisec) rescheduling. (and would thus trash the cache, etc. Remember, | ||
44 | this was long ago when hardware was weaker and caches were smaller, and | ||
45 | people were running number crunching apps at nice +19.) | ||
46 | |||
47 | So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the | ||
48 | right minimal granularity - and this translates to 5% CPU utilization. | ||
49 | But the fundamental HZ-sensitive property for nice+19 still remained, | ||
50 | and we never got a single complaint about nice +19 being too _weak_ in | ||
51 | terms of CPU utilization, we only got complaints about it (still) being | ||
52 | too _strong_ :-) | ||
53 | |||
54 | To sum it up: we always wanted to make nice levels more consistent, but | ||
55 | within the constraints of HZ and jiffies and their nasty design level | ||
56 | coupling to timeslices and granularity it was not really viable. | ||
57 | |||
58 | The second (less frequent but still periodically occurring) complaint | ||
59 | about Linux's nice level support was its asymmetry around the origin | ||
60 | (which you can see demonstrated in the picture above), or more | ||
61 | accurately: the fact that nice level behavior depended on the _absolute_ | ||
62 | nice level as well, while the nice API itself is fundamentally | ||
63 | "relative": | ||
64 | |||
65 | int nice(int inc); | ||
66 | |||
67 | asmlinkage long sys_nice(int increment) | ||
68 | |||
69 | (the first one is the glibc API, the second one is the syscall API.) | ||
70 | Note that the 'inc' is relative to the current nice level. Tools like | ||
71 | bash's "nice" command mirror this relative API. | ||
72 | |||
73 | With the old scheduler, if you for example started a niced task with +1 | ||
74 | and another task with +2, the CPU split between the two tasks would | ||
75 | depend on the nice level of the parent shell - if it was at nice -10 the | ||
76 | CPU split was different than if it was at +5 or +10. | ||
77 | |||
78 | A third complaint against Linux's nice level support was that negative | ||
79 | nice levels were not 'punchy enough', so lots of people had to resort to | ||
80 | run audio (and other multimedia) apps under RT priorities such as | ||
81 | SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation | ||
82 | proof, and a buggy SCHED_FIFO app can also lock up the system for good. | ||
83 | |||
84 | The new scheduler in v2.6.23 addresses all three types of complaints: | ||
85 | |||
86 | To address the first complaint (of nice levels being not "punchy" | ||
87 | enough), the scheduler was decoupled from 'time slice' and HZ concepts | ||
88 | (and granularity was made a separate concept from nice levels) and thus | ||
89 | it was possible to implement better and more consistent nice +19 | ||
90 | support: with the new scheduler nice +19 tasks get a HZ-independent | ||
91 | 1.5%, instead of the variable 3%-5%-9% range they got in the old | ||
92 | scheduler. | ||
93 | |||
94 | To address the second complaint (of nice levels not being consistent), | ||
95 | the new scheduler makes nice(1) have the same CPU utilization effect on | ||
96 | tasks, regardless of their absolute nice levels. So on the new | ||
97 | scheduler, running a nice +10 and a nice +11 task has the same CPU | ||
98 | utilization "split" between them as running a nice -5 and a nice -4 | ||
99 | task. (one will get 55% of the CPU, the other 45%.) That is why nice | ||
100 | levels were changed to be "multiplicative" (or exponential) - that way | ||
101 | it does not matter which nice level you start out from, the 'relative | ||
102 | result' will always be the same. | ||
103 | |||
104 | The third complaint (of negative nice levels not being "punchy" enough | ||
105 | and forcing audio apps to run under the more dangerous SCHED_FIFO | ||
106 | scheduling policy) is addressed by the new scheduler almost | ||
107 | automatically: stronger negative nice levels are an automatic | ||
108 | side-effect of the recalibrated dynamic range of nice levels. | ||
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index ba328f255417..ef19142896ca 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | Linux Magic System Request Key Hacks | 1 | Linux Magic System Request Key Hacks |
2 | Documentation for sysrq.c | 2 | Documentation for sysrq.c |
3 | Last update: 2007-MAR-14 | 3 | Last update: 2007-AUG-04 |
4 | 4 | ||
5 | * What is the magic SysRq key? | 5 | * What is the magic SysRq key? |
6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
@@ -78,7 +78,7 @@ On all - write a character to /proc/sysrq-trigger. e.g.: | |||
78 | 'g' - Used by kgdb on ppc and sh platforms. | 78 | 'g' - Used by kgdb on ppc and sh platforms. |
79 | 79 | ||
80 | 'h' - Will display help (actually any other key than those listed | 80 | 'h' - Will display help (actually any other key than those listed |
81 | above will display help. but 'h' is easy to remember :-) | 81 | here will display help. but 'h' is easy to remember :-) |
82 | 82 | ||
83 | 'i' - Send a SIGKILL to all processes, except for init. | 83 | 'i' - Send a SIGKILL to all processes, except for init. |
84 | 84 | ||
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index d4f21ffd1404..1af7bd5a2183 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c | |||
@@ -396,7 +396,7 @@ void report(struct slabinfo *s) | |||
396 | if (strcmp(s->name, "*") == 0) | 396 | if (strcmp(s->name, "*") == 0) |
397 | return; | 397 | return; |
398 | 398 | ||
399 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %d\n", | 399 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n", |
400 | s->name, s->aliases, s->order, s->objects); | 400 | s->name, s->aliases, s->order, s->objects); |
401 | if (s->hwcache_align) | 401 | if (s->hwcache_align) |
402 | printf("** Hardware cacheline aligned\n"); | 402 | printf("** Hardware cacheline aligned\n"); |