diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/ABI/testing/sysfs-devices-cache_disable | 18 | ||||
-rw-r--r-- | Documentation/futex-requeue-pi.txt | 131 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 3 | ||||
-rw-r--r-- | Documentation/memory-barriers.txt | 129 | ||||
-rw-r--r-- | Documentation/scheduler/sched-rt-group.txt | 20 | ||||
-rw-r--r-- | Documentation/trace/ftrace.txt | 15 | ||||
-rw-r--r-- | Documentation/x86/boot.txt | 122 | ||||
-rw-r--r-- | Documentation/x86/x86_64/boot-options.txt | 5 | ||||
-rw-r--r-- | Documentation/x86/x86_64/mm.txt | 9 |
9 files changed, 430 insertions, 22 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable new file mode 100644 index 000000000000..175bb4f70512 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-cache_disable | |||
@@ -0,0 +1,18 @@ | |||
1 | What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X | ||
2 | Date: August 2008 | ||
3 | KernelVersion: 2.6.27 | ||
4 | Contact: mark.langsdorf@amd.com | ||
5 | Description: These files exist in every cpu's cache index directories. | ||
6 | There are currently 2 cache_disable_# files in each | ||
7 | directory. Reading from these files on a supported | ||
8 | processor will return that cache disable index value | ||
9 | for that processor and node. Writing to one of these | ||
10 | files will cause the specificed cache index to be disabled. | ||
11 | |||
12 | Currently, only AMD Family 10h Processors support cache index | ||
13 | disable, and only for their L3 caches. See the BIOS and | ||
14 | Kernel Developer's Guide at | ||
15 | http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf | ||
16 | for formatting information and other details on the | ||
17 | cache index disable. | ||
18 | Users: joachim.deguara@amd.com | ||
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt new file mode 100644 index 000000000000..9dc1ff4fd536 --- /dev/null +++ b/Documentation/futex-requeue-pi.txt | |||
@@ -0,0 +1,131 @@ | |||
1 | Futex Requeue PI | ||
2 | ---------------- | ||
3 | |||
4 | Requeueing of tasks from a non-PI futex to a PI futex requires | ||
5 | special handling in order to ensure the underlying rt_mutex is never | ||
6 | left without an owner if it has waiters; doing so would break the PI | ||
7 | boosting logic [see rt-mutex-desgin.txt] For the purposes of | ||
8 | brevity, this action will be referred to as "requeue_pi" throughout | ||
9 | this document. Priority inheritance is abbreviated throughout as | ||
10 | "PI". | ||
11 | |||
12 | Motivation | ||
13 | ---------- | ||
14 | |||
15 | Without requeue_pi, the glibc implementation of | ||
16 | pthread_cond_broadcast() must resort to waking all the tasks waiting | ||
17 | on a pthread_condvar and letting them try to sort out which task | ||
18 | gets to run first in classic thundering-herd formation. An ideal | ||
19 | implementation would wake the highest-priority waiter, and leave the | ||
20 | rest to the natural wakeup inherent in unlocking the mutex | ||
21 | associated with the condvar. | ||
22 | |||
23 | Consider the simplified glibc calls: | ||
24 | |||
25 | /* caller must lock mutex */ | ||
26 | pthread_cond_wait(cond, mutex) | ||
27 | { | ||
28 | lock(cond->__data.__lock); | ||
29 | unlock(mutex); | ||
30 | do { | ||
31 | unlock(cond->__data.__lock); | ||
32 | futex_wait(cond->__data.__futex); | ||
33 | lock(cond->__data.__lock); | ||
34 | } while(...) | ||
35 | unlock(cond->__data.__lock); | ||
36 | lock(mutex); | ||
37 | } | ||
38 | |||
39 | pthread_cond_broadcast(cond) | ||
40 | { | ||
41 | lock(cond->__data.__lock); | ||
42 | unlock(cond->__data.__lock); | ||
43 | futex_requeue(cond->data.__futex, cond->mutex); | ||
44 | } | ||
45 | |||
46 | Once pthread_cond_broadcast() requeues the tasks, the cond->mutex | ||
47 | has waiters. Note that pthread_cond_wait() attempts to lock the | ||
48 | mutex only after it has returned to user space. This will leave the | ||
49 | underlying rt_mutex with waiters, and no owner, breaking the | ||
50 | previously mentioned PI-boosting algorithms. | ||
51 | |||
52 | In order to support PI-aware pthread_condvar's, the kernel needs to | ||
53 | be able to requeue tasks to PI futexes. This support implies that | ||
54 | upon a successful futex_wait system call, the caller would return to | ||
55 | user space already holding the PI futex. The glibc implementation | ||
56 | would be modified as follows: | ||
57 | |||
58 | |||
59 | /* caller must lock mutex */ | ||
60 | pthread_cond_wait_pi(cond, mutex) | ||
61 | { | ||
62 | lock(cond->__data.__lock); | ||
63 | unlock(mutex); | ||
64 | do { | ||
65 | unlock(cond->__data.__lock); | ||
66 | futex_wait_requeue_pi(cond->__data.__futex); | ||
67 | lock(cond->__data.__lock); | ||
68 | } while(...) | ||
69 | unlock(cond->__data.__lock); | ||
70 | /* the kernel acquired the the mutex for us */ | ||
71 | } | ||
72 | |||
73 | pthread_cond_broadcast_pi(cond) | ||
74 | { | ||
75 | lock(cond->__data.__lock); | ||
76 | unlock(cond->__data.__lock); | ||
77 | futex_requeue_pi(cond->data.__futex, cond->mutex); | ||
78 | } | ||
79 | |||
80 | The actual glibc implementation will likely test for PI and make the | ||
81 | necessary changes inside the existing calls rather than creating new | ||
82 | calls for the PI cases. Similar changes are needed for | ||
83 | pthread_cond_timedwait() and pthread_cond_signal(). | ||
84 | |||
85 | Implementation | ||
86 | -------------- | ||
87 | |||
88 | In order to ensure the rt_mutex has an owner if it has waiters, it | ||
89 | is necessary for both the requeue code, as well as the waiting code, | ||
90 | to be able to acquire the rt_mutex before returning to user space. | ||
91 | The requeue code cannot simply wake the waiter and leave it to | ||
92 | acquire the rt_mutex as it would open a race window between the | ||
93 | requeue call returning to user space and the waiter waking and | ||
94 | starting to run. This is especially true in the uncontended case. | ||
95 | |||
96 | The solution involves two new rt_mutex helper routines, | ||
97 | rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which | ||
98 | allow the requeue code to acquire an uncontended rt_mutex on behalf | ||
99 | of the waiter and to enqueue the waiter on a contended rt_mutex. | ||
100 | Two new system calls provide the kernel<->user interface to | ||
101 | requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI. | ||
102 | |||
103 | FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait() | ||
104 | and pthread_cond_timedwait()) to block on the initial futex and wait | ||
105 | to be requeued to a PI-aware futex. The implementation is the | ||
106 | result of a high-speed collision between futex_wait() and | ||
107 | futex_lock_pi(), with some extra logic to check for the additional | ||
108 | wake-up scenarios. | ||
109 | |||
110 | FUTEX_REQUEUE_CMP_PI is called by the waker | ||
111 | (pthread_cond_broadcast() and pthread_cond_signal()) to requeue and | ||
112 | possibly wake the waiting tasks. Internally, this system call is | ||
113 | still handled by futex_requeue (by passing requeue_pi=1). Before | ||
114 | requeueing, futex_requeue() attempts to acquire the requeue target | ||
115 | PI futex on behalf of the top waiter. If it can, this waiter is | ||
116 | woken. futex_requeue() then proceeds to requeue the remaining | ||
117 | nr_wake+nr_requeue tasks to the PI futex, calling | ||
118 | rt_mutex_start_proxy_lock() prior to each requeue to prepare the | ||
119 | task as a waiter on the underlying rt_mutex. It is possible that | ||
120 | the lock can be acquired at this stage as well, if so, the next | ||
121 | waiter is woken to finish the acquisition of the lock. | ||
122 | |||
123 | FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but | ||
124 | their sum is all that really matters. futex_requeue() will wake or | ||
125 | requeue up to nr_wake + nr_requeue tasks. It will wake only as many | ||
126 | tasks as it can acquire the lock for, which in the majority of cases | ||
127 | should be 0 as good programming practice dictates that the caller of | ||
128 | either pthread_cond_broadcast() or pthread_cond_signal() acquire the | ||
129 | mutex prior to making the call. FUTEX_REQUEUE_PI requires that | ||
130 | nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for | ||
131 | signal. | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8f9e17c855a0..af43f45e8358 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1577,6 +1577,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1577 | noinitrd [RAM] Tells the kernel not to load any configured | 1577 | noinitrd [RAM] Tells the kernel not to load any configured |
1578 | initial RAM disk. | 1578 | initial RAM disk. |
1579 | 1579 | ||
1580 | nointremap [X86-64, Intel-IOMMU] Do not enable interrupt | ||
1581 | remapping. | ||
1582 | |||
1580 | nointroute [IA-64] | 1583 | nointroute [IA-64] |
1581 | 1584 | ||
1582 | nojitter [IA64] Disables jitter checking for ITC timers. | 1585 | nojitter [IA64] Disables jitter checking for ITC timers. |
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f5b7127f54ac..7f5809eddee6 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -31,6 +31,7 @@ Contents: | |||
31 | 31 | ||
32 | - Locking functions. | 32 | - Locking functions. |
33 | - Interrupt disabling functions. | 33 | - Interrupt disabling functions. |
34 | - Sleep and wake-up functions. | ||
34 | - Miscellaneous functions. | 35 | - Miscellaneous functions. |
35 | 36 | ||
36 | (*) Inter-CPU locking barrier effects. | 37 | (*) Inter-CPU locking barrier effects. |
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some | |||
1217 | other means. | 1218 | other means. |
1218 | 1219 | ||
1219 | 1220 | ||
1221 | SLEEP AND WAKE-UP FUNCTIONS | ||
1222 | --------------------------- | ||
1223 | |||
1224 | Sleeping and waking on an event flagged in global data can be viewed as an | ||
1225 | interaction between two pieces of data: the task state of the task waiting for | ||
1226 | the event and the global data used to indicate the event. To make sure that | ||
1227 | these appear to happen in the right order, the primitives to begin the process | ||
1228 | of going to sleep, and the primitives to initiate a wake up imply certain | ||
1229 | barriers. | ||
1230 | |||
1231 | Firstly, the sleeper normally follows something like this sequence of events: | ||
1232 | |||
1233 | for (;;) { | ||
1234 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1235 | if (event_indicated) | ||
1236 | break; | ||
1237 | schedule(); | ||
1238 | } | ||
1239 | |||
1240 | A general memory barrier is interpolated automatically by set_current_state() | ||
1241 | after it has altered the task state: | ||
1242 | |||
1243 | CPU 1 | ||
1244 | =============================== | ||
1245 | set_current_state(); | ||
1246 | set_mb(); | ||
1247 | STORE current->state | ||
1248 | <general barrier> | ||
1249 | LOAD event_indicated | ||
1250 | |||
1251 | set_current_state() may be wrapped by: | ||
1252 | |||
1253 | prepare_to_wait(); | ||
1254 | prepare_to_wait_exclusive(); | ||
1255 | |||
1256 | which therefore also imply a general memory barrier after setting the state. | ||
1257 | The whole sequence above is available in various canned forms, all of which | ||
1258 | interpolate the memory barrier in the right place: | ||
1259 | |||
1260 | wait_event(); | ||
1261 | wait_event_interruptible(); | ||
1262 | wait_event_interruptible_exclusive(); | ||
1263 | wait_event_interruptible_timeout(); | ||
1264 | wait_event_killable(); | ||
1265 | wait_event_timeout(); | ||
1266 | wait_on_bit(); | ||
1267 | wait_on_bit_lock(); | ||
1268 | |||
1269 | |||
1270 | Secondly, code that performs a wake up normally follows something like this: | ||
1271 | |||
1272 | event_indicated = 1; | ||
1273 | wake_up(&event_wait_queue); | ||
1274 | |||
1275 | or: | ||
1276 | |||
1277 | event_indicated = 1; | ||
1278 | wake_up_process(event_daemon); | ||
1279 | |||
1280 | A write memory barrier is implied by wake_up() and co. if and only if they wake | ||
1281 | something up. The barrier occurs before the task state is cleared, and so sits | ||
1282 | between the STORE to indicate the event and the STORE to set TASK_RUNNING: | ||
1283 | |||
1284 | CPU 1 CPU 2 | ||
1285 | =============================== =============================== | ||
1286 | set_current_state(); STORE event_indicated | ||
1287 | set_mb(); wake_up(); | ||
1288 | STORE current->state <write barrier> | ||
1289 | <general barrier> STORE current->state | ||
1290 | LOAD event_indicated | ||
1291 | |||
1292 | The available waker functions include: | ||
1293 | |||
1294 | complete(); | ||
1295 | wake_up(); | ||
1296 | wake_up_all(); | ||
1297 | wake_up_bit(); | ||
1298 | wake_up_interruptible(); | ||
1299 | wake_up_interruptible_all(); | ||
1300 | wake_up_interruptible_nr(); | ||
1301 | wake_up_interruptible_poll(); | ||
1302 | wake_up_interruptible_sync(); | ||
1303 | wake_up_interruptible_sync_poll(); | ||
1304 | wake_up_locked(); | ||
1305 | wake_up_locked_poll(); | ||
1306 | wake_up_nr(); | ||
1307 | wake_up_poll(); | ||
1308 | wake_up_process(); | ||
1309 | |||
1310 | |||
1311 | [!] Note that the memory barriers implied by the sleeper and the waker do _not_ | ||
1312 | order multiple stores before the wake-up with respect to loads of those stored | ||
1313 | values after the sleeper has called set_current_state(). For instance, if the | ||
1314 | sleeper does: | ||
1315 | |||
1316 | set_current_state(TASK_INTERRUPTIBLE); | ||
1317 | if (event_indicated) | ||
1318 | break; | ||
1319 | __set_current_state(TASK_RUNNING); | ||
1320 | do_something(my_data); | ||
1321 | |||
1322 | and the waker does: | ||
1323 | |||
1324 | my_data = value; | ||
1325 | event_indicated = 1; | ||
1326 | wake_up(&event_wait_queue); | ||
1327 | |||
1328 | there's no guarantee that the change to event_indicated will be perceived by | ||
1329 | the sleeper as coming after the change to my_data. In such a circumstance, the | ||
1330 | code on both sides must interpolate its own memory barriers between the | ||
1331 | separate data accesses. Thus the above sleeper ought to do: | ||
1332 | |||
1333 | set_current_state(TASK_INTERRUPTIBLE); | ||
1334 | if (event_indicated) { | ||
1335 | smp_rmb(); | ||
1336 | do_something(my_data); | ||
1337 | } | ||
1338 | |||
1339 | and the waker should do: | ||
1340 | |||
1341 | my_data = value; | ||
1342 | smp_wmb(); | ||
1343 | event_indicated = 1; | ||
1344 | wake_up(&event_wait_queue); | ||
1345 | |||
1346 | |||
1220 | MISCELLANEOUS FUNCTIONS | 1347 | MISCELLANEOUS FUNCTIONS |
1221 | ----------------------- | 1348 | ----------------------- |
1222 | 1349 | ||
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED? | |||
1366 | 1493 | ||
1367 | Under normal operation, memory operation reordering is generally not going to | 1494 | Under normal operation, memory operation reordering is generally not going to |
1368 | be a problem as a single-threaded linear piece of code will still appear to | 1495 | be a problem as a single-threaded linear piece of code will still appear to |
1369 | work correctly, even if it's in an SMP kernel. There are, however, three | 1496 | work correctly, even if it's in an SMP kernel. There are, however, four |
1370 | circumstances in which reordering definitely _could_ be a problem: | 1497 | circumstances in which reordering definitely _could_ be a problem: |
1371 | 1498 | ||
1372 | (*) Interprocessor interaction. | 1499 | (*) Interprocessor interaction. |
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 5ba4d3fc625a..1df7f9cdab05 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -4,6 +4,7 @@ | |||
4 | CONTENTS | 4 | CONTENTS |
5 | ======== | 5 | ======== |
6 | 6 | ||
7 | 0. WARNING | ||
7 | 1. Overview | 8 | 1. Overview |
8 | 1.1 The problem | 9 | 1.1 The problem |
9 | 1.2 The solution | 10 | 1.2 The solution |
@@ -14,6 +15,23 @@ CONTENTS | |||
14 | 3. Future plans | 15 | 3. Future plans |
15 | 16 | ||
16 | 17 | ||
18 | 0. WARNING | ||
19 | ========== | ||
20 | |||
21 | Fiddling with these settings can result in an unstable system, the knobs are | ||
22 | root only and assumes root knows what he is doing. | ||
23 | |||
24 | Most notable: | ||
25 | |||
26 | * very small values in sched_rt_period_us can result in an unstable | ||
27 | system when the period is smaller than either the available hrtimer | ||
28 | resolution, or the time it takes to handle the budget refresh itself. | ||
29 | |||
30 | * very small values in sched_rt_runtime_us can result in an unstable | ||
31 | system when the runtime is so small the system has difficulty making | ||
32 | forward progress (NOTE: the migration thread and kstopmachine both | ||
33 | are real-time processes). | ||
34 | |||
17 | 1. Overview | 35 | 1. Overview |
18 | =========== | 36 | =========== |
19 | 37 | ||
@@ -169,7 +187,7 @@ get their allocated time. | |||
169 | 187 | ||
170 | Implementing SCHED_EDF might take a while to complete. Priority Inheritance is | 188 | Implementing SCHED_EDF might take a while to complete. Priority Inheritance is |
171 | the biggest challenge as the current linux PI infrastructure is geared towards | 189 | the biggest challenge as the current linux PI infrastructure is geared towards |
172 | the limited static priority levels 0-139. With deadline scheduling you need to | 190 | the limited static priority levels 0-99. With deadline scheduling you need to |
173 | do deadline inheritance (since priority is inversely proportional to the | 191 | do deadline inheritance (since priority is inversely proportional to the |
174 | deadline delta (deadline - now). | 192 | deadline delta (deadline - now). |
175 | 193 | ||
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e693813..e362f50c496f 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice | |||
518 | values starting at 100 (nice -20). Below is a quick chart to map | 518 | values starting at 100 (nice -20). Below is a quick chart to map |
519 | the kernel priority to user land priorities. | 519 | the kernel priority to user land priorities. |
520 | 520 | ||
521 | Kernel priority: 0 to 99 ==> user RT priority 99 to 0 | 521 | Kernel Space User Space |
522 | Kernel priority: 100 to 139 ==> user nice -20 to 19 | 522 | =============================================================== |
523 | Kernel priority: 140 ==> idle task priority | 523 | 0(high) to 98(low) user RT priority 99(high) to 1(low) |
524 | with SCHED_RR or SCHED_FIFO | ||
525 | --------------------------------------------------------------- | ||
526 | 99 sched_priority is not used in scheduling | ||
527 | decisions(it must be specified as 0) | ||
528 | --------------------------------------------------------------- | ||
529 | 100(high) to 139(low) user nice -20(high) to 19(low) | ||
530 | --------------------------------------------------------------- | ||
531 | 140 idle task priority | ||
532 | --------------------------------------------------------------- | ||
524 | 533 | ||
525 | The task states are: | 534 | The task states are: |
526 | 535 | ||
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index e0203662f9e9..8da3a795083f 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt | |||
@@ -50,6 +50,10 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format | |||
50 | Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical | 50 | Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical |
51 | pointer to single linked list of struct setup_data. | 51 | pointer to single linked list of struct setup_data. |
52 | 52 | ||
53 | Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment | ||
54 | beyond the kernel_alignment added, new init_size and | ||
55 | pref_address fields. Added extended boot loader IDs. | ||
56 | |||
53 | **** MEMORY LAYOUT | 57 | **** MEMORY LAYOUT |
54 | 58 | ||
55 | The traditional memory map for the kernel loader, used for Image or | 59 | The traditional memory map for the kernel loader, used for Image or |
@@ -168,12 +172,13 @@ Offset Proto Name Meaning | |||
168 | 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) | 172 | 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) |
169 | 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only | 173 | 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only |
170 | 0224/2 2.01+ heap_end_ptr Free memory after setup end | 174 | 0224/2 2.01+ heap_end_ptr Free memory after setup end |
171 | 0226/2 N/A pad1 Unused | 175 | 0226/1 2.02+(3 ext_loader_ver Extended boot loader version |
176 | 0227/1 2.02+(3 ext_loader_type Extended boot loader ID | ||
172 | 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line | 177 | 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line |
173 | 022C/4 2.03+ ramdisk_max Highest legal initrd address | 178 | 022C/4 2.03+ ramdisk_max Highest legal initrd address |
174 | 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel | 179 | 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel |
175 | 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not | 180 | 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not |
176 | 0235/1 N/A pad2 Unused | 181 | 0235/1 2.10+ min_alignment Minimum alignment, as a power of two |
177 | 0236/2 N/A pad3 Unused | 182 | 0236/2 N/A pad3 Unused |
178 | 0238/4 2.06+ cmdline_size Maximum size of the kernel command line | 183 | 0238/4 2.06+ cmdline_size Maximum size of the kernel command line |
179 | 023C/4 2.07+ hardware_subarch Hardware subarchitecture | 184 | 023C/4 2.07+ hardware_subarch Hardware subarchitecture |
@@ -182,6 +187,8 @@ Offset Proto Name Meaning | |||
182 | 024C/4 2.08+ payload_length Length of kernel payload | 187 | 024C/4 2.08+ payload_length Length of kernel payload |
183 | 0250/8 2.09+ setup_data 64-bit physical pointer to linked list | 188 | 0250/8 2.09+ setup_data 64-bit physical pointer to linked list |
184 | of struct setup_data | 189 | of struct setup_data |
190 | 0258/8 2.10+ pref_address Preferred loading address | ||
191 | 0260/4 2.10+ init_size Linear memory required during initialization | ||
185 | 192 | ||
186 | (1) For backwards compatibility, if the setup_sects field contains 0, the | 193 | (1) For backwards compatibility, if the setup_sects field contains 0, the |
187 | real value is 4. | 194 | real value is 4. |
@@ -190,6 +197,8 @@ Offset Proto Name Meaning | |||
190 | field are unusable, which means the size of a bzImage kernel | 197 | field are unusable, which means the size of a bzImage kernel |
191 | cannot be determined. | 198 | cannot be determined. |
192 | 199 | ||
200 | (3) Ignored, but safe to set, for boot protocols 2.02-2.09. | ||
201 | |||
193 | If the "HdrS" (0x53726448) magic number is not found at offset 0x202, | 202 | If the "HdrS" (0x53726448) magic number is not found at offset 0x202, |
194 | the boot protocol version is "old". Loading an old kernel, the | 203 | the boot protocol version is "old". Loading an old kernel, the |
195 | following parameters should be assumed: | 204 | following parameters should be assumed: |
@@ -343,18 +352,32 @@ Protocol: 2.00+ | |||
343 | 0xTV here, where T is an identifier for the boot loader and V is | 352 | 0xTV here, where T is an identifier for the boot loader and V is |
344 | a version number. Otherwise, enter 0xFF here. | 353 | a version number. Otherwise, enter 0xFF here. |
345 | 354 | ||
355 | For boot loader IDs above T = 0xD, write T = 0xE to this field and | ||
356 | write the extended ID minus 0x10 to the ext_loader_type field. | ||
357 | Similarly, the ext_loader_ver field can be used to provide more than | ||
358 | four bits for the bootloader version. | ||
359 | |||
360 | For example, for T = 0x15, V = 0x234, write: | ||
361 | |||
362 | type_of_loader <- 0xE4 | ||
363 | ext_loader_type <- 0x05 | ||
364 | ext_loader_ver <- 0x23 | ||
365 | |||
346 | Assigned boot loader ids: | 366 | Assigned boot loader ids: |
347 | 0 LILO (0x00 reserved for pre-2.00 bootloader) | 367 | 0 LILO (0x00 reserved for pre-2.00 bootloader) |
348 | 1 Loadlin | 368 | 1 Loadlin |
349 | 2 bootsect-loader (0x20, all other values reserved) | 369 | 2 bootsect-loader (0x20, all other values reserved) |
350 | 3 SYSLINUX | 370 | 3 Syslinux |
351 | 4 EtherBoot | 371 | 4 Etherboot/gPXE |
352 | 5 ELILO | 372 | 5 ELILO |
353 | 7 GRUB | 373 | 7 GRUB |
354 | 8 U-BOOT | 374 | 8 U-Boot |
355 | 9 Xen | 375 | 9 Xen |
356 | A Gujin | 376 | A Gujin |
357 | B Qemu | 377 | B Qemu |
378 | C Arcturus Networks uCbootloader | ||
379 | E Extended (see ext_loader_type) | ||
380 | F Special (0xFF = undefined) | ||
358 | 381 | ||
359 | Please contact <hpa@zytor.com> if you need a bootloader ID | 382 | Please contact <hpa@zytor.com> if you need a bootloader ID |
360 | value assigned. | 383 | value assigned. |
@@ -453,6 +476,35 @@ Protocol: 2.01+ | |||
453 | Set this field to the offset (from the beginning of the real-mode | 476 | Set this field to the offset (from the beginning of the real-mode |
454 | code) of the end of the setup stack/heap, minus 0x0200. | 477 | code) of the end of the setup stack/heap, minus 0x0200. |
455 | 478 | ||
479 | Field name: ext_loader_ver | ||
480 | Type: write (optional) | ||
481 | Offset/size: 0x226/1 | ||
482 | Protocol: 2.02+ | ||
483 | |||
484 | This field is used as an extension of the version number in the | ||
485 | type_of_loader field. The total version number is considered to be | ||
486 | (type_of_loader & 0x0f) + (ext_loader_ver << 4). | ||
487 | |||
488 | The use of this field is boot loader specific. If not written, it | ||
489 | is zero. | ||
490 | |||
491 | Kernels prior to 2.6.31 did not recognize this field, but it is safe | ||
492 | to write for protocol version 2.02 or higher. | ||
493 | |||
494 | Field name: ext_loader_type | ||
495 | Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0) | ||
496 | Offset/size: 0x227/1 | ||
497 | Protocol: 2.02+ | ||
498 | |||
499 | This field is used as an extension of the type number in | ||
500 | type_of_loader field. If the type in type_of_loader is 0xE, then | ||
501 | the actual type is (ext_loader_type + 0x10). | ||
502 | |||
503 | This field is ignored if the type in type_of_loader is not 0xE. | ||
504 | |||
505 | Kernels prior to 2.6.31 did not recognize this field, but it is safe | ||
506 | to write for protocol version 2.02 or higher. | ||
507 | |||
456 | Field name: cmd_line_ptr | 508 | Field name: cmd_line_ptr |
457 | Type: write (obligatory) | 509 | Type: write (obligatory) |
458 | Offset/size: 0x228/4 | 510 | Offset/size: 0x228/4 |
@@ -482,11 +534,19 @@ Protocol: 2.03+ | |||
482 | 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) | 534 | 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) |
483 | 535 | ||
484 | Field name: kernel_alignment | 536 | Field name: kernel_alignment |
485 | Type: read (reloc) | 537 | Type: read/modify (reloc) |
486 | Offset/size: 0x230/4 | 538 | Offset/size: 0x230/4 |
487 | Protocol: 2.05+ | 539 | Protocol: 2.05+ (read), 2.10+ (modify) |
540 | |||
541 | Alignment unit required by the kernel (if relocatable_kernel is | ||
542 | true.) A relocatable kernel that is loaded at an alignment | ||
543 | incompatible with the value in this field will be realigned during | ||
544 | kernel initialization. | ||
488 | 545 | ||
489 | Alignment unit required by the kernel (if relocatable_kernel is true.) | 546 | Starting with protocol version 2.10, this reflects the kernel |
547 | alignment preferred for optimal performance; it is possible for the | ||
548 | loader to modify this field to permit a lesser alignment. See the | ||
549 | min_alignment and pref_address field below. | ||
490 | 550 | ||
491 | Field name: relocatable_kernel | 551 | Field name: relocatable_kernel |
492 | Type: read (reloc) | 552 | Type: read (reloc) |
@@ -498,6 +558,22 @@ Protocol: 2.05+ | |||
498 | After loading, the boot loader must set the code32_start field to | 558 | After loading, the boot loader must set the code32_start field to |
499 | point to the loaded code, or to a boot loader hook. | 559 | point to the loaded code, or to a boot loader hook. |
500 | 560 | ||
561 | Field name: min_alignment | ||
562 | Type: read (reloc) | ||
563 | Offset/size: 0x235/1 | ||
564 | Protocol: 2.10+ | ||
565 | |||
566 | This field, if nonzero, indicates as a power of two the minimum | ||
567 | alignment required, as opposed to preferred, by the kernel to boot. | ||
568 | If a boot loader makes use of this field, it should update the | ||
569 | kernel_alignment field with the alignment unit desired; typically: | ||
570 | |||
571 | kernel_alignment = 1 << min_alignment | ||
572 | |||
573 | There may be a considerable performance cost with an excessively | ||
574 | misaligned kernel. Therefore, a loader should typically try each | ||
575 | power-of-two alignment from kernel_alignment down to this alignment. | ||
576 | |||
501 | Field name: cmdline_size | 577 | Field name: cmdline_size |
502 | Type: read | 578 | Type: read |
503 | Offset/size: 0x238/4 | 579 | Offset/size: 0x238/4 |
@@ -582,6 +658,36 @@ Protocol: 2.09+ | |||
582 | sure to consider the case where the linked list already contains | 658 | sure to consider the case where the linked list already contains |
583 | entries. | 659 | entries. |
584 | 660 | ||
661 | Field name: pref_address | ||
662 | Type: read (reloc) | ||
663 | Offset/size: 0x258/8 | ||
664 | Protocol: 2.10+ | ||
665 | |||
666 | This field, if nonzero, represents a preferred load address for the | ||
667 | kernel. A relocating bootloader should attempt to load at this | ||
668 | address if possible. | ||
669 | |||
670 | A non-relocatable kernel will unconditionally move itself and to run | ||
671 | at this address. | ||
672 | |||
673 | Field name: init_size | ||
674 | Type: read | ||
675 | Offset/size: 0x25c/4 | ||
676 | |||
677 | This field indicates the amount of linear contiguous memory starting | ||
678 | at the kernel runtime start address that the kernel needs before it | ||
679 | is capable of examining its memory map. This is not the same thing | ||
680 | as the total amount of memory the kernel needs to boot, but it can | ||
681 | be used by a relocating boot loader to help select a safe load | ||
682 | address for the kernel. | ||
683 | |||
684 | The kernel runtime start address is determined by the following algorithm: | ||
685 | |||
686 | if (relocatable_kernel) | ||
687 | runtime_start = align_up(load_address, kernel_alignment) | ||
688 | else | ||
689 | runtime_start = pref_address | ||
690 | |||
585 | 691 | ||
586 | **** THE IMAGE CHECKSUM | 692 | **** THE IMAGE CHECKSUM |
587 | 693 | ||
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a718..2db5893d6c97 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt | |||
@@ -150,11 +150,6 @@ NUMA | |||
150 | Otherwise, the remaining system RAM is allocated to an | 150 | Otherwise, the remaining system RAM is allocated to an |
151 | additional node. | 151 | additional node. |
152 | 152 | ||
153 | numa=hotadd=percent | ||
154 | Only allow hotadd memory to preallocate page structures upto | ||
155 | percent of already available memory. | ||
156 | numa=hotadd=0 will disable hotadd memory. | ||
157 | |||
158 | ACPI | 153 | ACPI |
159 | 154 | ||
160 | acpi=off Don't enable ACPI | 155 | acpi=off Don't enable ACPI |
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 29b52b14d0b4..d6498e3cd713 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt | |||
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables: | |||
6 | 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm | 6 | 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm |
7 | hole caused by [48:63] sign extension | 7 | hole caused by [48:63] sign extension |
8 | ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole | 8 | ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole |
9 | ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory | 9 | ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory |
10 | ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole | 10 | ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole |
11 | ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space | 11 | ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space |
12 | ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) | 12 | ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole |
13 | ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) | ||
13 | ... unused hole ... | 14 | ... unused hole ... |
14 | ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 | 15 | ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 |
15 | ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space | 16 | ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space |