diff options
Diffstat (limited to 'Documentation')
31 files changed, 1809 insertions, 504 deletions
diff --git a/Documentation/Changes b/Documentation/Changes index 73a8617f1861..cb2b141b1c3e 100644 --- a/Documentation/Changes +++ b/Documentation/Changes | |||
@@ -45,6 +45,7 @@ o nfs-utils 1.0.5 # showmount --version | |||
45 | o procps 3.2.0 # ps --version | 45 | o procps 3.2.0 # ps --version |
46 | o oprofile 0.9 # oprofiled --version | 46 | o oprofile 0.9 # oprofiled --version |
47 | o udev 081 # udevinfo -V | 47 | o udev 081 # udevinfo -V |
48 | o grub 0.93 # grub --version | ||
48 | 49 | ||
49 | Kernel compilation | 50 | Kernel compilation |
50 | ================== | 51 | ================== |
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 805db4b2cba6..cc7a8c39fb6f 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt | |||
@@ -26,7 +26,7 @@ Part Ia - Using large dma-coherent buffers | |||
26 | 26 | ||
27 | void * | 27 | void * |
28 | dma_alloc_coherent(struct device *dev, size_t size, | 28 | dma_alloc_coherent(struct device *dev, size_t size, |
29 | dma_addr_t *dma_handle, int flag) | 29 | dma_addr_t *dma_handle, gfp_t flag) |
30 | void * | 30 | void * |
31 | pci_alloc_consistent(struct pci_dev *dev, size_t size, | 31 | pci_alloc_consistent(struct pci_dev *dev, size_t size, |
32 | dma_addr_t *dma_handle) | 32 | dma_addr_t *dma_handle) |
@@ -38,7 +38,7 @@ to make sure to flush the processor's write buffers before telling | |||
38 | devices to read that memory.) | 38 | devices to read that memory.) |
39 | 39 | ||
40 | This routine allocates a region of <size> bytes of consistent memory. | 40 | This routine allocates a region of <size> bytes of consistent memory. |
41 | it also returns a <dma_handle> which may be cast to an unsigned | 41 | It also returns a <dma_handle> which may be cast to an unsigned |
42 | integer the same width as the bus and used as the physical address | 42 | integer the same width as the bus and used as the physical address |
43 | base of the region. | 43 | base of the region. |
44 | 44 | ||
@@ -52,21 +52,21 @@ The simplest way to do that is to use the dma_pool calls (see below). | |||
52 | 52 | ||
53 | The flag parameter (dma_alloc_coherent only) allows the caller to | 53 | The flag parameter (dma_alloc_coherent only) allows the caller to |
54 | specify the GFP_ flags (see kmalloc) for the allocation (the | 54 | specify the GFP_ flags (see kmalloc) for the allocation (the |
55 | implementation may chose to ignore flags that affect the location of | 55 | implementation may choose to ignore flags that affect the location of |
56 | the returned memory, like GFP_DMA). For pci_alloc_consistent, you | 56 | the returned memory, like GFP_DMA). For pci_alloc_consistent, you |
57 | must assume GFP_ATOMIC behaviour. | 57 | must assume GFP_ATOMIC behaviour. |
58 | 58 | ||
59 | void | 59 | void |
60 | dma_free_coherent(struct device *dev, size_t size, void *cpu_addr | 60 | dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, |
61 | dma_addr_t dma_handle) | 61 | dma_addr_t dma_handle) |
62 | void | 62 | void |
63 | pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr | 63 | pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr, |
64 | dma_addr_t dma_handle) | 64 | dma_addr_t dma_handle) |
65 | 65 | ||
66 | Free the region of consistent memory you previously allocated. dev, | 66 | Free the region of consistent memory you previously allocated. dev, |
67 | size and dma_handle must all be the same as those passed into the | 67 | size and dma_handle must all be the same as those passed into the |
68 | consistent allocate. cpu_addr must be the virtual address returned by | 68 | consistent allocate. cpu_addr must be the virtual address returned by |
69 | the consistent allocate | 69 | the consistent allocate. |
70 | 70 | ||
71 | 71 | ||
72 | Part Ib - Using small dma-coherent buffers | 72 | Part Ib - Using small dma-coherent buffers |
@@ -77,9 +77,9 @@ To get this part of the dma_ API, you must #include <linux/dmapool.h> | |||
77 | Many drivers need lots of small dma-coherent memory regions for DMA | 77 | Many drivers need lots of small dma-coherent memory regions for DMA |
78 | descriptors or I/O buffers. Rather than allocating in units of a page | 78 | descriptors or I/O buffers. Rather than allocating in units of a page |
79 | or more using dma_alloc_coherent(), you can use DMA pools. These work | 79 | or more using dma_alloc_coherent(), you can use DMA pools. These work |
80 | much like a struct kmem_cache, except that they use the dma-coherent allocator | 80 | much like a struct kmem_cache, except that they use the dma-coherent allocator, |
81 | not __get_free_pages(). Also, they understand common hardware constraints | 81 | not __get_free_pages(). Also, they understand common hardware constraints |
82 | for alignment, like queue heads needing to be aligned on N byte boundaries. | 82 | for alignment, like queue heads needing to be aligned on N-byte boundaries. |
83 | 83 | ||
84 | 84 | ||
85 | struct dma_pool * | 85 | struct dma_pool * |
@@ -102,15 +102,15 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated | |||
102 | from this pool must not cross 4KByte boundaries. | 102 | from this pool must not cross 4KByte boundaries. |
103 | 103 | ||
104 | 104 | ||
105 | void *dma_pool_alloc(struct dma_pool *pool, int gfp_flags, | 105 | void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, |
106 | dma_addr_t *dma_handle); | 106 | dma_addr_t *dma_handle); |
107 | 107 | ||
108 | void *pci_pool_alloc(struct pci_pool *pool, int gfp_flags, | 108 | void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags, |
109 | dma_addr_t *dma_handle); | 109 | dma_addr_t *dma_handle); |
110 | 110 | ||
111 | This allocates memory from the pool; the returned memory will meet the size | 111 | This allocates memory from the pool; the returned memory will meet the size |
112 | and alignment requirements specified at creation time. Pass GFP_ATOMIC to | 112 | and alignment requirements specified at creation time. Pass GFP_ATOMIC to |
113 | prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks) | 113 | prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks), |
114 | pass GFP_KERNEL to allow blocking. Like dma_alloc_coherent(), this returns | 114 | pass GFP_KERNEL to allow blocking. Like dma_alloc_coherent(), this returns |
115 | two values: an address usable by the cpu, and the dma address usable by the | 115 | two values: an address usable by the cpu, and the dma address usable by the |
116 | pool's device. | 116 | pool's device. |
@@ -123,7 +123,7 @@ pool's device. | |||
123 | dma_addr_t addr); | 123 | dma_addr_t addr); |
124 | 124 | ||
125 | This puts memory back into the pool. The pool is what was passed to | 125 | This puts memory back into the pool. The pool is what was passed to |
126 | the pool allocation routine; the cpu and dma addresses are what | 126 | the pool allocation routine; the cpu (vaddr) and dma addresses are what |
127 | were returned when that routine allocated the memory being freed. | 127 | were returned when that routine allocated the memory being freed. |
128 | 128 | ||
129 | 129 | ||
@@ -209,18 +209,18 @@ Notes: Not all memory regions in a machine can be mapped by this | |||
209 | API. Further, regions that appear to be physically contiguous in | 209 | API. Further, regions that appear to be physically contiguous in |
210 | kernel virtual space may not be contiguous as physical memory. Since | 210 | kernel virtual space may not be contiguous as physical memory. Since |
211 | this API does not provide any scatter/gather capability, it will fail | 211 | this API does not provide any scatter/gather capability, it will fail |
212 | if the user tries to map a non physically contiguous piece of memory. | 212 | if the user tries to map a non-physically contiguous piece of memory. |
213 | For this reason, it is recommended that memory mapped by this API be | 213 | For this reason, it is recommended that memory mapped by this API be |
214 | obtained only from sources which guarantee to be physically contiguous | 214 | obtained only from sources which guarantee it to be physically contiguous |
215 | (like kmalloc). | 215 | (like kmalloc). |
216 | 216 | ||
217 | Further, the physical address of the memory must be within the | 217 | Further, the physical address of the memory must be within the |
218 | dma_mask of the device (the dma_mask represents a bit mask of the | 218 | dma_mask of the device (the dma_mask represents a bit mask of the |
219 | addressable region for the device. i.e. if the physical address of | 219 | addressable region for the device. I.e., if the physical address of |
220 | the memory anded with the dma_mask is still equal to the physical | 220 | the memory anded with the dma_mask is still equal to the physical |
221 | address, then the device can perform DMA to the memory). In order to | 221 | address, then the device can perform DMA to the memory). In order to |
222 | ensure that the memory allocated by kmalloc is within the dma_mask, | 222 | ensure that the memory allocated by kmalloc is within the dma_mask, |
223 | the driver may specify various platform dependent flags to restrict | 223 | the driver may specify various platform-dependent flags to restrict |
224 | the physical memory range of the allocation (e.g. on x86, GFP_DMA | 224 | the physical memory range of the allocation (e.g. on x86, GFP_DMA |
225 | guarantees to be within the first 16MB of available physical memory, | 225 | guarantees to be within the first 16MB of available physical memory, |
226 | as required by ISA devices). | 226 | as required by ISA devices). |
@@ -244,14 +244,14 @@ are guaranteed also to be cache line boundaries). | |||
244 | 244 | ||
245 | DMA_TO_DEVICE synchronisation must be done after the last modification | 245 | DMA_TO_DEVICE synchronisation must be done after the last modification |
246 | of the memory region by the software and before it is handed off to | 246 | of the memory region by the software and before it is handed off to |
247 | the driver. Once this primitive is used. Memory covered by this | 247 | the driver. Once this primitive is used, memory covered by this |
248 | primitive should be treated as read only by the device. If the device | 248 | primitive should be treated as read-only by the device. If the device |
249 | may write to it at any point, it should be DMA_BIDIRECTIONAL (see | 249 | may write to it at any point, it should be DMA_BIDIRECTIONAL (see |
250 | below). | 250 | below). |
251 | 251 | ||
252 | DMA_FROM_DEVICE synchronisation must be done before the driver | 252 | DMA_FROM_DEVICE synchronisation must be done before the driver |
253 | accesses data that may be changed by the device. This memory should | 253 | accesses data that may be changed by the device. This memory should |
254 | be treated as read only by the driver. If the driver needs to write | 254 | be treated as read-only by the driver. If the driver needs to write |
255 | to it at any point, it should be DMA_BIDIRECTIONAL (see below). | 255 | to it at any point, it should be DMA_BIDIRECTIONAL (see below). |
256 | 256 | ||
257 | DMA_BIDIRECTIONAL requires special handling: it means that the driver | 257 | DMA_BIDIRECTIONAL requires special handling: it means that the driver |
@@ -261,7 +261,7 @@ you must always sync bidirectional memory twice: once before the | |||
261 | memory is handed off to the device (to make sure all memory changes | 261 | memory is handed off to the device (to make sure all memory changes |
262 | are flushed from the processor) and once before the data may be | 262 | are flushed from the processor) and once before the data may be |
263 | accessed after being used by the device (to make sure any processor | 263 | accessed after being used by the device (to make sure any processor |
264 | cache lines are updated with data that the device may have changed. | 264 | cache lines are updated with data that the device may have changed). |
265 | 265 | ||
266 | void | 266 | void |
267 | dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, | 267 | dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, |
@@ -302,8 +302,8 @@ pci_dma_mapping_error(dma_addr_t dma_addr) | |||
302 | 302 | ||
303 | In some circumstances dma_map_single and dma_map_page will fail to create | 303 | In some circumstances dma_map_single and dma_map_page will fail to create |
304 | a mapping. A driver can check for these errors by testing the returned | 304 | a mapping. A driver can check for these errors by testing the returned |
305 | dma address with dma_mapping_error(). A non zero return value means the mapping | 305 | dma address with dma_mapping_error(). A non-zero return value means the mapping |
306 | could not be created and the driver should take appropriate action (eg | 306 | could not be created and the driver should take appropriate action (e.g. |
307 | reduce current DMA mapping usage or delay and try again later). | 307 | reduce current DMA mapping usage or delay and try again later). |
308 | 308 | ||
309 | int | 309 | int |
@@ -315,7 +315,7 @@ reduce current DMA mapping usage or delay and try again later). | |||
315 | 315 | ||
316 | Maps a scatter gather list from the block layer. | 316 | Maps a scatter gather list from the block layer. |
317 | 317 | ||
318 | Returns: the number of physical segments mapped (this may be shorted | 318 | Returns: the number of physical segments mapped (this may be shorter |
319 | than <nents> passed in if the block layer determines that some | 319 | than <nents> passed in if the block layer determines that some |
320 | elements of the scatter/gather list are physically adjacent and thus | 320 | elements of the scatter/gather list are physically adjacent and thus |
321 | may be mapped with a single entry). | 321 | may be mapped with a single entry). |
@@ -357,7 +357,7 @@ accessed sg->address and sg->length as shown above. | |||
357 | pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, | 357 | pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, |
358 | int nents, int direction) | 358 | int nents, int direction) |
359 | 359 | ||
360 | unmap the previously mapped scatter/gather list. All the parameters | 360 | Unmap the previously mapped scatter/gather list. All the parameters |
361 | must be the same as those passed in to the scatter/gather mapping | 361 | must be the same as those passed in to the scatter/gather mapping |
362 | API. | 362 | API. |
363 | 363 | ||
@@ -377,7 +377,7 @@ void | |||
377 | pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, | 377 | pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, |
378 | int nelems, int direction) | 378 | int nelems, int direction) |
379 | 379 | ||
380 | synchronise a single contiguous or scatter/gather mapping. All the | 380 | Synchronise a single contiguous or scatter/gather mapping. All the |
381 | parameters must be the same as those passed into the single mapping | 381 | parameters must be the same as those passed into the single mapping |
382 | API. | 382 | API. |
383 | 383 | ||
@@ -406,7 +406,7 @@ API at all. | |||
406 | 406 | ||
407 | void * | 407 | void * |
408 | dma_alloc_noncoherent(struct device *dev, size_t size, | 408 | dma_alloc_noncoherent(struct device *dev, size_t size, |
409 | dma_addr_t *dma_handle, int flag) | 409 | dma_addr_t *dma_handle, gfp_t flag) |
410 | 410 | ||
411 | Identical to dma_alloc_coherent() except that the platform will | 411 | Identical to dma_alloc_coherent() except that the platform will |
412 | choose to return either consistent or non-consistent memory as it sees | 412 | choose to return either consistent or non-consistent memory as it sees |
@@ -426,34 +426,34 @@ void | |||
426 | dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, | 426 | dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, |
427 | dma_addr_t dma_handle) | 427 | dma_addr_t dma_handle) |
428 | 428 | ||
429 | free memory allocated by the nonconsistent API. All parameters must | 429 | Free memory allocated by the nonconsistent API. All parameters must |
430 | be identical to those passed in (and returned by | 430 | be identical to those passed in (and returned by |
431 | dma_alloc_noncoherent()). | 431 | dma_alloc_noncoherent()). |
432 | 432 | ||
433 | int | 433 | int |
434 | dma_is_consistent(struct device *dev, dma_addr_t dma_handle) | 434 | dma_is_consistent(struct device *dev, dma_addr_t dma_handle) |
435 | 435 | ||
436 | returns true if the device dev is performing consistent DMA on the memory | 436 | Returns true if the device dev is performing consistent DMA on the memory |
437 | area pointed to by the dma_handle. | 437 | area pointed to by the dma_handle. |
438 | 438 | ||
439 | int | 439 | int |
440 | dma_get_cache_alignment(void) | 440 | dma_get_cache_alignment(void) |
441 | 441 | ||
442 | returns the processor cache alignment. This is the absolute minimum | 442 | Returns the processor cache alignment. This is the absolute minimum |
443 | alignment *and* width that you must observe when either mapping | 443 | alignment *and* width that you must observe when either mapping |
444 | memory or doing partial flushes. | 444 | memory or doing partial flushes. |
445 | 445 | ||
446 | Notes: This API may return a number *larger* than the actual cache | 446 | Notes: This API may return a number *larger* than the actual cache |
447 | line, but it will guarantee that one or more cache lines fit exactly | 447 | line, but it will guarantee that one or more cache lines fit exactly |
448 | into the width returned by this call. It will also always be a power | 448 | into the width returned by this call. It will also always be a power |
449 | of two for easy alignment | 449 | of two for easy alignment. |
450 | 450 | ||
451 | void | 451 | void |
452 | dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, | 452 | dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, |
453 | unsigned long offset, size_t size, | 453 | unsigned long offset, size_t size, |
454 | enum dma_data_direction direction) | 454 | enum dma_data_direction direction) |
455 | 455 | ||
456 | does a partial sync. starting at offset and continuing for size. You | 456 | Does a partial sync, starting at offset and continuing for size. You |
457 | must be careful to observe the cache alignment and width when doing | 457 | must be careful to observe the cache alignment and width when doing |
458 | anything like this. You must also be extra careful about accessing | 458 | anything like this. You must also be extra careful about accessing |
459 | memory you intend to sync partially. | 459 | memory you intend to sync partially. |
@@ -472,21 +472,20 @@ dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, | |||
472 | dma_addr_t device_addr, size_t size, int | 472 | dma_addr_t device_addr, size_t size, int |
473 | flags) | 473 | flags) |
474 | 474 | ||
475 | |||
476 | Declare region of memory to be handed out by dma_alloc_coherent when | 475 | Declare region of memory to be handed out by dma_alloc_coherent when |
477 | it's asked for coherent memory for this device. | 476 | it's asked for coherent memory for this device. |
478 | 477 | ||
479 | bus_addr is the physical address to which the memory is currently | 478 | bus_addr is the physical address to which the memory is currently |
480 | assigned in the bus responding region (this will be used by the | 479 | assigned in the bus responding region (this will be used by the |
481 | platform to perform the mapping) | 480 | platform to perform the mapping). |
482 | 481 | ||
483 | device_addr is the physical address the device needs to be programmed | 482 | device_addr is the physical address the device needs to be programmed |
484 | with to actually address this memory (this will be handed out as the | 483 | with to actually address this memory (this will be handed out as the |
485 | dma_addr_t in dma_alloc_coherent()) | 484 | dma_addr_t in dma_alloc_coherent()). |
486 | 485 | ||
487 | size is the size of the area (must be multiples of PAGE_SIZE). | 486 | size is the size of the area (must be multiples of PAGE_SIZE). |
488 | 487 | ||
489 | flags can be or'd together and are | 488 | flags can be or'd together and are: |
490 | 489 | ||
491 | DMA_MEMORY_MAP - request that the memory returned from | 490 | DMA_MEMORY_MAP - request that the memory returned from |
492 | dma_alloc_coherent() be directly writable. | 491 | dma_alloc_coherent() be directly writable. |
@@ -494,7 +493,7 @@ dma_alloc_coherent() be directly writable. | |||
494 | DMA_MEMORY_IO - request that the memory returned from | 493 | DMA_MEMORY_IO - request that the memory returned from |
495 | dma_alloc_coherent() be addressable using read/write/memcpy_toio etc. | 494 | dma_alloc_coherent() be addressable using read/write/memcpy_toio etc. |
496 | 495 | ||
497 | One or both of these flags must be present | 496 | One or both of these flags must be present. |
498 | 497 | ||
499 | DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by | 498 | DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by |
500 | dma_alloc_coherent of any child devices of this one (for memory residing | 499 | dma_alloc_coherent of any child devices of this one (for memory residing |
@@ -528,7 +527,7 @@ dma_release_declared_memory(struct device *dev) | |||
528 | Remove the memory region previously declared from the system. This | 527 | Remove the memory region previously declared from the system. This |
529 | API performs *no* in-use checking for this region and will return | 528 | API performs *no* in-use checking for this region and will return |
530 | unconditionally having removed all the required structures. It is the | 529 | unconditionally having removed all the required structures. It is the |
531 | drivers job to ensure that no parts of this memory region are | 530 | driver's job to ensure that no parts of this memory region are |
532 | currently in use. | 531 | currently in use. |
533 | 532 | ||
534 | void * | 533 | void * |
@@ -538,12 +537,10 @@ dma_mark_declared_memory_occupied(struct device *dev, | |||
538 | This is used to occupy specific regions of the declared space | 537 | This is used to occupy specific regions of the declared space |
539 | (dma_alloc_coherent() will hand out the first free region it finds). | 538 | (dma_alloc_coherent() will hand out the first free region it finds). |
540 | 539 | ||
541 | device_addr is the *device* address of the region requested | 540 | device_addr is the *device* address of the region requested. |
542 | 541 | ||
543 | size is the size (and should be a page sized multiple). | 542 | size is the size (and should be a page-sized multiple). |
544 | 543 | ||
545 | The return value will be either a pointer to the processor virtual | 544 | The return value will be either a pointer to the processor virtual |
546 | address of the memory, or an error (via PTR_ERR()) if any part of the | 545 | address of the memory, or an error (via PTR_ERR()) if any part of the |
547 | region is occupied. | 546 | region is occupied. |
548 | |||
549 | |||
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index eb42bf9847cb..b886f52a9aac 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl | |||
@@ -380,7 +380,6 @@ X!Edrivers/base/interface.c | |||
380 | !Edrivers/base/bus.c | 380 | !Edrivers/base/bus.c |
381 | </sect1> | 381 | </sect1> |
382 | <sect1><title>Device Drivers Power Management</title> | 382 | <sect1><title>Device Drivers Power Management</title> |
383 | !Edrivers/base/power/main.c | ||
384 | !Edrivers/base/power/resume.c | 383 | !Edrivers/base/power/resume.c |
385 | !Edrivers/base/power/suspend.c | 384 | !Edrivers/base/power/suspend.c |
386 | </sect1> | 385 | </sect1> |
@@ -398,12 +397,12 @@ X!Edrivers/acpi/pci_bind.c | |||
398 | --> | 397 | --> |
399 | </sect1> | 398 | </sect1> |
400 | <sect1><title>Device drivers PnP support</title> | 399 | <sect1><title>Device drivers PnP support</title> |
401 | !Edrivers/pnp/core.c | 400 | !Idrivers/pnp/core.c |
402 | <!-- No correct structured comments | 401 | <!-- No correct structured comments |
403 | X!Edrivers/pnp/system.c | 402 | X!Edrivers/pnp/system.c |
404 | --> | 403 | --> |
405 | !Edrivers/pnp/card.c | 404 | !Edrivers/pnp/card.c |
406 | !Edrivers/pnp/driver.c | 405 | !Idrivers/pnp/driver.c |
407 | !Edrivers/pnp/manager.c | 406 | !Edrivers/pnp/manager.c |
408 | !Edrivers/pnp/support.c | 407 | !Edrivers/pnp/support.c |
409 | </sect1> | 408 | </sect1> |
@@ -704,14 +703,22 @@ X!Idrivers/video/console/fonts.c | |||
704 | 703 | ||
705 | <chapter id="splice"> | 704 | <chapter id="splice"> |
706 | <title>splice API</title> | 705 | <title>splice API</title> |
707 | <para>) | 706 | <para> |
708 | splice is a method for moving blocks of data around inside the | 707 | splice is a method for moving blocks of data around inside the |
709 | kernel, without continually transferring it between the kernel | 708 | kernel, without continually transferring them between the kernel |
710 | and user space. | 709 | and user space. |
711 | </para> | 710 | </para> |
712 | !Iinclude/linux/splice.h | ||
713 | !Ffs/splice.c | 711 | !Ffs/splice.c |
714 | </chapter> | 712 | </chapter> |
715 | 713 | ||
714 | <chapter id="pipes"> | ||
715 | <title>pipes API</title> | ||
716 | <para> | ||
717 | Pipe interfaces are all for in-kernel (builtin image) use. | ||
718 | They are not exported for use by modules. | ||
719 | </para> | ||
720 | !Iinclude/linux/pipe_fs_i.h | ||
721 | !Ffs/pipe.c | ||
722 | </chapter> | ||
716 | 723 | ||
717 | </book> | 724 | </book> |
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl index e3bb29a8d8dd..c119484258b8 100644 --- a/Documentation/DocBook/uio-howto.tmpl +++ b/Documentation/DocBook/uio-howto.tmpl | |||
@@ -133,10 +133,6 @@ interested in translating it, please email me | |||
133 | <para>updates of your driver can take place without recompiling | 133 | <para>updates of your driver can take place without recompiling |
134 | the kernel.</para> | 134 | the kernel.</para> |
135 | </listitem> | 135 | </listitem> |
136 | <listitem> | ||
137 | <para>if you need to keep some parts of your driver closed source, | ||
138 | you can do so without violating the GPL license on the kernel.</para> | ||
139 | </listitem> | ||
140 | </itemizedlist> | 136 | </itemizedlist> |
141 | 137 | ||
142 | <sect1 id="how_uio_works"> | 138 | <sect1 id="how_uio_works"> |
diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 595a5ea4c690..7b9551fc6fe3 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff | |||
@@ -18,6 +18,7 @@ | |||
18 | *.moc | 18 | *.moc |
19 | *.mod.c | 19 | *.mod.c |
20 | *.o | 20 | *.o |
21 | *.o.* | ||
21 | *.orig | 22 | *.orig |
22 | *.out | 23 | *.out |
23 | 24 | ||
@@ -163,6 +164,8 @@ raid6tables.c | |||
163 | relocs | 164 | relocs |
164 | series | 165 | series |
165 | setup | 166 | setup |
167 | setup.bin | ||
168 | setup.elf | ||
166 | sim710_d.h* | 169 | sim710_d.h* |
167 | sImage | 170 | sImage |
168 | sm_tbl* | 171 | sm_tbl* |
diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt index 2bf6c2321c2d..36bdeff585e2 100644 --- a/Documentation/fb/pvr2fb.txt +++ b/Documentation/fb/pvr2fb.txt | |||
@@ -9,14 +9,13 @@ one found in the Dreamcast. | |||
9 | Advantages: | 9 | Advantages: |
10 | 10 | ||
11 | * It provides a nice large console (128 cols + 48 lines with 1024x768) | 11 | * It provides a nice large console (128 cols + 48 lines with 1024x768) |
12 | without using tiny, unreadable fonts. | 12 | without using tiny, unreadable fonts (NOT on the Dreamcast) |
13 | * You can run XF86_FBDev on top of /dev/fb0 | 13 | * You can run XF86_FBDev on top of /dev/fb0 |
14 | * Most important: boot logo :-) | 14 | * Most important: boot logo :-) |
15 | 15 | ||
16 | Disadvantages: | 16 | Disadvantages: |
17 | 17 | ||
18 | * Driver is currently limited to the Dreamcast PowerVR 2 implementation | 18 | * Driver is largely untested on non-Dreamcast systems. |
19 | at the time of this writing. | ||
20 | 19 | ||
21 | Configuration | 20 | Configuration |
22 | ============= | 21 | ============= |
@@ -29,11 +28,16 @@ Accepted options: | |||
29 | font:X - default font to use. All fonts are supported, including the | 28 | font:X - default font to use. All fonts are supported, including the |
30 | SUN12x22 font which is very nice at high resolutions. | 29 | SUN12x22 font which is very nice at high resolutions. |
31 | 30 | ||
32 | mode:X - default video mode. The following video modes are supported: | ||
33 | 640x240-60, 640x480-60. | ||
34 | 31 | ||
32 | mode:X - default video mode with format [xres]x[yres]-<bpp>@<refresh rate> | ||
33 | The following video modes are supported: | ||
34 | 640x480-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast | ||
35 | defaults to 640x480-16@60. At the time of writing the | ||
36 | 24bpp and 32bpp modes function poorly. Work to fix that is | ||
37 | ongoing. | ||
38 | |||
35 | Note: the 640x240 mode is currently broken, and should not be | 39 | Note: the 640x240 mode is currently broken, and should not be |
36 | used for any reason. It is only mentioned as a reference. | 40 | used for any reason. It is only mentioned here as a reference. |
37 | 41 | ||
38 | inverse - invert colors on screen (for LCD displays) | 42 | inverse - invert colors on screen (for LCD displays) |
39 | 43 | ||
@@ -52,10 +56,10 @@ output:X - output type. This can be any of the following: pal, ntsc, and | |||
52 | X11 | 56 | X11 |
53 | === | 57 | === |
54 | 58 | ||
55 | XF86_FBDev should work, in theory. At the time of this writing it is | 59 | XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet |
56 | totally untested and may or may not even portray the beginnings of | 60 | on any 2.6 series kernel. |
57 | working. If you end up testing this, please let me know! | ||
58 | 61 | ||
59 | -- | 62 | -- |
60 | Paul Mundt <lethal@linuxdc.org> | 63 | Paul Mundt <lethal@linuxdc.org> |
64 | Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk> | ||
61 | 65 | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index c175eedadb5f..a43d2878a4ef 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -211,22 +211,6 @@ Who: Richard Purdie <rpurdie@rpsys.net> | |||
211 | 211 | ||
212 | --------------------------- | 212 | --------------------------- |
213 | 213 | ||
214 | What: read_dev_chars(), read_conf_data{,_lpm}() (s390 common I/O layer) | ||
215 | When: December 2007 | ||
216 | Why: These functions are a leftover from 2.4 times. They have several | ||
217 | problems: | ||
218 | - Duplication of checks that are done in the device driver's | ||
219 | interrupt handler | ||
220 | - common I/O layer can't do device specific error recovery | ||
221 | - device driver can't be notified for conditions happening during | ||
222 | execution of the function | ||
223 | Device drivers should issue the read device characteristics and read | ||
224 | configuration data ccws and do the appropriate error handling | ||
225 | themselves. | ||
226 | Who: Cornelia Huck <cornelia.huck@de.ibm.com> | ||
227 | |||
228 | --------------------------- | ||
229 | |||
230 | What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers | 214 | What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers |
231 | When: September 2007 | 215 | When: September 2007 |
232 | Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific | 216 | Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific |
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt new file mode 100644 index 000000000000..af1628a1061c --- /dev/null +++ b/Documentation/filesystems/hfsplus.txt | |||
@@ -0,0 +1,59 @@ | |||
1 | |||
2 | Macintosh HFSPlus Filesystem for Linux | ||
3 | ====================================== | ||
4 | |||
5 | HFSPlus is a filesystem first introduced in MacOS 8.1. | ||
6 | HFSPlus has several extensions to HFS, including 32-bit allocation | ||
7 | blocks, 255-character unicode filenames, and file sizes of 2^63 bytes. | ||
8 | |||
9 | |||
10 | Mount options | ||
11 | ============= | ||
12 | |||
13 | When mounting an HFSPlus filesystem, the following options are accepted: | ||
14 | |||
15 | creator=cccc, type=cccc | ||
16 | Specifies the creator/type values as shown by the MacOS finder | ||
17 | used for creating new files. Default values: '????'. | ||
18 | |||
19 | uid=n, gid=n | ||
20 | Specifies the user/group that owns all files on the filesystem | ||
21 | that have uninitialized permissions structures. | ||
22 | Default: user/group id of the mounting process. | ||
23 | |||
24 | umask=n | ||
25 | Specifies the umask (in octal) used for files and directories | ||
26 | that have uninitialized permissions structures. | ||
27 | Default: umask of the mounting process. | ||
28 | |||
29 | session=n | ||
30 | Select the CDROM session to mount as HFSPlus filesystem. Defaults to | ||
31 | leaving that decision to the CDROM driver. This option will fail | ||
32 | with anything but a CDROM as the underlying device. | ||
33 | |||
34 | part=n | ||
35 | Select partition number n from the devices. This option only makes | ||
36 | sense for CDROMs because they can't be partitioned under Linux. | ||
37 | For disk devices the generic partition parsing code does this | ||
38 | for us. Defaults to not parsing the partition table at all. | ||
39 | |||
40 | decompose | ||
41 | Decompose file name characters. | ||
42 | |||
43 | nodecompose | ||
44 | Do not decompose file name characters. | ||
45 | |||
46 | force | ||
47 | Used to force write access to volumes that are marked as journalled | ||
48 | or locked. Use at your own risk. | ||
49 | |||
50 | nls=cccc | ||
51 | Encoding to use when presenting file names. | ||
52 | |||
53 | |||
54 | References | ||
55 | ========== | ||
56 | |||
57 | kernel source: <file:fs/hfsplus> | ||
58 | |||
59 | Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html | ||
diff --git a/Documentation/hpet.txt b/Documentation/hpet.txt index b7a3dc38dd52..6ad52d9dad6c 100644 --- a/Documentation/hpet.txt +++ b/Documentation/hpet.txt | |||
@@ -5,7 +5,7 @@ for the 8254 and Real Time Clock (RTC) periodic timer functionality. | |||
5 | Each HPET can have up to 32 timers. It is possible to configure the | 5 | Each HPET can have up to 32 timers. It is possible to configure the |
6 | first two timers as legacy replacements for 8254 and RTC periodic timers. | 6 | first two timers as legacy replacements for 8254 and RTC periodic timers. |
7 | A specification done by Intel and Microsoft can be found at | 7 | A specification done by Intel and Microsoft can be found at |
8 | <http://www.intel.com/hardwaredesign/hpetspec.htm>. | 8 | <http://www.intel.com/technology/architecture/hpetspec.htm>. |
9 | 9 | ||
10 | The driver supports detection of HPET driver allocation and initialization | 10 | The driver supports detection of HPET driver allocation and initialization |
11 | of the HPET before the driver module_init routine is called. This enables | 11 | of the HPET before the driver module_init routine is called. This enables |
diff --git a/Documentation/hwmon/adm1031 b/Documentation/hwmon/adm1031 index 130a38382b98..be92a77da1d5 100644 --- a/Documentation/hwmon/adm1031 +++ b/Documentation/hwmon/adm1031 | |||
@@ -6,13 +6,13 @@ Supported chips: | |||
6 | Prefix: 'adm1030' | 6 | Prefix: 'adm1030' |
7 | Addresses scanned: I2C 0x2c to 0x2e | 7 | Addresses scanned: I2C 0x2c to 0x2e |
8 | Datasheet: Publicly available at the Analog Devices website | 8 | Datasheet: Publicly available at the Analog Devices website |
9 | http://products.analog.com/products/info.asp?product=ADM1030 | 9 | http://www.analog.com/en/prod/0%2C2877%2CADM1030%2C00.html |
10 | 10 | ||
11 | * Analog Devices ADM1031 | 11 | * Analog Devices ADM1031 |
12 | Prefix: 'adm1031' | 12 | Prefix: 'adm1031' |
13 | Addresses scanned: I2C 0x2c to 0x2e | 13 | Addresses scanned: I2C 0x2c to 0x2e |
14 | Datasheet: Publicly available at the Analog Devices website | 14 | Datasheet: Publicly available at the Analog Devices website |
15 | http://products.analog.com/products/info.asp?product=ADM1031 | 15 | http://www.analog.com/en/prod/0%2C2877%2CADM1031%2C00.html |
16 | 16 | ||
17 | Authors: | 17 | Authors: |
18 | Alexandre d'Alton <alex@alexdalton.org> | 18 | Alexandre d'Alton <alex@alexdalton.org> |
diff --git a/Documentation/hwmon/thmc50 b/Documentation/hwmon/thmc50 new file mode 100644 index 000000000000..9639ca93d559 --- /dev/null +++ b/Documentation/hwmon/thmc50 | |||
@@ -0,0 +1,74 @@ | |||
1 | Kernel driver thmc50 | ||
2 | ===================== | ||
3 | |||
4 | Supported chips: | ||
5 | * Analog Devices ADM1022 | ||
6 | Prefix: 'adm1022' | ||
7 | Addresses scanned: I2C 0x2c - 0x2e | ||
8 | Datasheet: http://www.analog.com/en/prod/0,2877,ADM1022,00.html | ||
9 | * Texas Instruments THMC50 | ||
10 | Prefix: 'thmc50' | ||
11 | Addresses scanned: I2C 0x2c - 0x2e | ||
12 | Datasheet: http://focus.ti.com/docs/prod/folders/print/thmc50.html | ||
13 | |||
14 | Author: Krzysztof Helt <krzysztof.h1@wp.pl> | ||
15 | |||
16 | This driver was derived from the 2.4 kernel thmc50.c source file. | ||
17 | |||
18 | Credits: | ||
19 | thmc50.c (2.4 kernel): | ||
20 | Frodo Looijaard <frodol@dds.nl> | ||
21 | Philip Edelbrock <phil@netroedge.com> | ||
22 | |||
23 | Module Parameters | ||
24 | ----------------- | ||
25 | |||
26 | * adm1022_temp3: short array | ||
27 | List of adapter,address pairs to force chips into ADM1022 mode with | ||
28 | second remote temperature. This does not work for original THMC50 chips. | ||
29 | |||
30 | Description | ||
31 | ----------- | ||
32 | |||
33 | The THMC50 implements: an internal temperature sensor, support for an | ||
34 | external diode-type temperature sensor (compatible w/ the diode sensor inside | ||
35 | many processors), and a controllable fan/analog_out DAC. For the temperature | ||
36 | sensors, limits can be set through the appropriate Overtemperature Shutdown | ||
37 | register and Hysteresis register. Each value can be set and read to half-degree | ||
38 | accuracy. An alarm is issued (usually to a connected LM78) when the | ||
39 | temperature gets higher than the Overtemperature Shutdown value; it stays on | ||
40 | until the temperature falls below the Hysteresis value. All temperatures are in | ||
41 | degrees Celsius, and are guaranteed within a range of -55 to +125 degrees. | ||
42 | |||
43 | The THMC50 only updates its values each 1.5 seconds; reading it more often | ||
44 | will do no harm, but will return 'old' values. | ||
45 | |||
46 | The THMC50 is usually used in combination with LM78-like chips, to measure | ||
47 | the temperature of the processor(s). | ||
48 | |||
49 | The ADM1022 works the same as THMC50 but it is faster (5 Hz instead of | ||
50 | 1 Hz for THMC50). It can also be put in a new mode to handle an additional | ||
51 | remote temperature sensor. The driver uses the mode set by the BIOS by default. | ||
52 | |||
53 | In case the BIOS is broken and the mode is set incorrectly, you can force | ||
54 | the mode with additional remote temperature with adm1022_temp3 parameter. | ||
55 | A typical symptom of wrong setting is a fan forced to full speed. | ||
56 | |||
57 | Driver Features | ||
58 | --------------- | ||
59 | |||
60 | The driver provides up to three temperatures: | ||
61 | |||
62 | temp1 -- internal | ||
63 | temp2 -- remote | ||
64 | temp3 -- 2nd remote only for ADM1022 | ||
65 | |||
66 | pwm1 -- fan speed (0 = stop, 255 = full) | ||
67 | pwm1_mode -- always 0 (DC mode) | ||
68 | |||
69 | The value of 0 for pwm1 also forces FAN_OFF signal from the chip, | ||
70 | so it stops fans even if writing the value 0 into the ANALOG_OUT register does not. | ||
71 | |||
72 | The driver was tested on Compaq AP550 with two ADM1022 chips (one works | ||
73 | in the temp3 mode), five temperature readings and two fans. | ||
74 | |||
diff --git a/Documentation/i386/zero-page.txt b/Documentation/i386/zero-page.txt index 75b3680c41eb..6c0817c45683 100644 --- a/Documentation/i386/zero-page.txt +++ b/Documentation/i386/zero-page.txt | |||
@@ -1,3 +1,13 @@ | |||
1 | --------------------------------------------------------------------------- | ||
2 | !!!!!!!!!!!!!!!WARNING!!!!!!!! | ||
3 | The zero page is a kernel internal data structure, not a stable ABI. It might change | ||
4 | without warning and the kernel has no way to detect old versions of it. | ||
5 | If you're writing some external code like a boot loader you should only use | ||
6 | the stable versioned real mode boot protocol described in boot.txt. Otherwise the kernel | ||
7 | might break you at any time. | ||
8 | !!!!!!!!!!!!!WARNING!!!!!!!!!!! | ||
9 | ---------------------------------------------------------------------------- | ||
10 | |||
1 | Summary of boot_params layout (kernel point of view) | 11 | Summary of boot_params layout (kernel point of view) |
2 | ( collected by Hans Lermen and Martin Mares ) | 12 | ( collected by Hans Lermen and Martin Mares ) |
3 | 13 | ||
diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO index b2446a090870..9f08dab1e75b 100644 --- a/Documentation/ja_JP/HOWTO +++ b/Documentation/ja_JP/HOWTO | |||
@@ -1,23 +1,24 @@ | |||
1 | NOTE: | 1 | NOTE: |
2 | This is Japanese translated version of "Documentation/HOWTO". | 2 | This is a version of Documentation/HOWTO translated into Japanese. |
3 | This one is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com> | 3 | This document is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com> |
4 | and JF Project team <www.linux.or.jp/JF>. | 4 | and the JF Project team <www.linux.or.jp/JF>. |
5 | If you find difference with original file or problem in translation, | 5 | If you find any difference between this document and the original file |
6 | please contact maintainer of this file or JF project. | 6 | or a problem with the translation, |
7 | 7 | please contact the maintainer of this file or JF project. | |
8 | Please also note that purpose of this file is easier to read for non | 8 | |
9 | English natives and not to be intended to fork. So, if you have any | 9 | Please also note that the purpose of this file is to be easier to read |
10 | comments or updates of this file, please try to update Original(English) | 10 | for non English (read: Japanese) speakers and is not intended as a |
11 | file at first. | 11 | fork. So if you have any comments or updates for this file, please try |
12 | 12 | to update the original English file first. | |
13 | Last Updated: 2007/06/04 | 13 | |
14 | Last Updated: 2007/07/18 | ||
14 | ================================== | 15 | ================================== |
15 | これは、 | 16 | これは、 |
16 | linux-2.6.21/Documentation/HOWTO | 17 | linux-2.6.22/Documentation/HOWTO |
17 | の和訳です。 | 18 | の和訳です。 |
18 | 19 | ||
19 | 翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ > | 20 | 翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ > |
20 | 翻訳日: 2007/06/04 | 21 | 翻訳日: 2007/07/16 |
21 | 翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com> | 22 | 翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com> |
22 | 校正者: 松倉さん <nbh--mats at nifty dot com> | 23 | 校正者: 松倉さん <nbh--mats at nifty dot com> |
23 | 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp> | 24 | 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp> |
@@ -52,6 +53,7 @@ Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方をå¦ã | |||
52 | ã¾ãŸã€ã“ã®ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ãŒãªãœä»Šã†ã¾ãã¾ã‚ã£ã¦ã„ã‚‹ã®ã‹ã¨ã„ã†ç†ç”±ã®ä¸€éƒ¨ã‚‚ | 53 | ã¾ãŸã€ã“ã®ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ãŒãªãœä»Šã†ã¾ãã¾ã‚ã£ã¦ã„ã‚‹ã®ã‹ã¨ã„ã†ç†ç”±ã®ä¸€éƒ¨ã‚‚ |
53 | 説明ã—よã†ã¨è©¦ã¿ã¦ã„ã¾ã™ã€‚ | 54 | 説明ã—よã†ã¨è©¦ã¿ã¦ã„ã¾ã™ã€‚ |
54 | 55 | ||
56 | |||
55 | カーãƒãƒ«ã¯ å°‘é‡ã®ã‚¢ãƒ¼ã‚テクãƒãƒ£ä¾å˜éƒ¨åˆ†ãŒã‚¢ã‚»ãƒ³ãƒ–リ言語ã§æ›¸ã‹ã‚Œã¦ã„ã‚‹ | 57 | カーãƒãƒ«ã¯ å°‘é‡ã®ã‚¢ãƒ¼ã‚テクãƒãƒ£ä¾å˜éƒ¨åˆ†ãŒã‚¢ã‚»ãƒ³ãƒ–リ言語ã§æ›¸ã‹ã‚Œã¦ã„ã‚‹ |
56 | 以外ã¯å¤§éƒ¨åˆ†ã¯ C 言語ã§æ›¸ã‹ã‚Œã¦ã„ã¾ã™ã€‚C言語をよãç†è§£ã—ã¦ã„ã‚‹ã“ã¨ã¯ã‚«ãƒ¼ | 58 | 以外ã¯å¤§éƒ¨åˆ†ã¯ C 言語ã§æ›¸ã‹ã‚Œã¦ã„ã¾ã™ã€‚C言語をよãç†è§£ã—ã¦ã„ã‚‹ã“ã¨ã¯ã‚«ãƒ¼ |
57 | ãƒãƒ«é–‹ç™ºè€…ã«ã¯å¿…è¦ã§ã™ã€‚アーã‚テクãƒãƒ£å‘ã‘ã®ä½Žãƒ¬ãƒ™ãƒ«éƒ¨åˆ†ã®é–‹ç™ºã‚’ã™ã‚‹ã® | 59 | ãƒãƒ«é–‹ç™ºè€…ã«ã¯å¿…è¦ã§ã™ã€‚アーã‚テクãƒãƒ£å‘ã‘ã®ä½Žãƒ¬ãƒ™ãƒ«éƒ¨åˆ†ã®é–‹ç™ºã‚’ã™ã‚‹ã® |
@@ -141,6 +143,7 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã¯å¹…広ã„範囲ã®ãƒ‰ã‚ュメントをå | |||
141 | ã“れらã®ãƒ«ãƒ¼ãƒ«ã«å¾“ãˆã°ã†ã¾ãã„ãã“ã¨ã‚’ä¿è¨¼ã™ã‚‹ã“ã¨ã§ã¯ã‚ã‚Šã¾ã›ã‚“ | 143 | ã“れらã®ãƒ«ãƒ¼ãƒ«ã«å¾“ãˆã°ã†ã¾ãã„ãã“ã¨ã‚’ä¿è¨¼ã™ã‚‹ã“ã¨ã§ã¯ã‚ã‚Šã¾ã›ã‚“ |
142 | ㌠(ã™ã¹ã¦ã®ãƒ‘ッãƒã¯å†…容ã¨ã‚¹ã‚¿ã‚¤ãƒ«ã«ã¤ã„ã¦ç²¾æŸ»ã‚’å—ã‘ã‚‹ã®ã§)〠| 144 | ㌠(ã™ã¹ã¦ã®ãƒ‘ッãƒã¯å†…容ã¨ã‚¹ã‚¿ã‚¤ãƒ«ã«ã¤ã„ã¦ç²¾æŸ»ã‚’å—ã‘ã‚‹ã®ã§)〠|
143 | ルールã«å¾“ã‚ãªã‘ã‚Œã°é–“é•ã„ãªãã†ã¾ãã„ã‹ãªã„ã§ã—ょã†ã€‚ | 145 | ルールã«å¾“ã‚ãªã‘ã‚Œã°é–“é•ã„ãªãã†ã¾ãã„ã‹ãªã„ã§ã—ょã†ã€‚ |
146 | |||
144 | ã“ã®ä»–ã«ãƒ‘ッãƒã‚’作る方法ã«ã¤ã„ã¦ã®ã‚ˆãã§ããŸè¨˜è¿°ã¯- | 147 | ã“ã®ä»–ã«ãƒ‘ッãƒã‚’作る方法ã«ã¤ã„ã¦ã®ã‚ˆãã§ããŸè¨˜è¿°ã¯- |
145 | 148 | ||
146 | "The Perfect Patch" | 149 | "The Perfect Patch" |
@@ -360,44 +363,42 @@ linux-kernel メーリングリストã§åŽé›†ã•ã‚ŒãŸå¤šæ•°ã®ãƒ‘ッãƒã¨åŒæ | |||
360 | 363 | ||
361 | git ツリー- | 364 | git ツリー- |
362 | - Kbuild ã®é–‹ç™ºãƒ„リーã€Sam Ravnborg <sam@ravnborg.org> | 365 | - Kbuild ã®é–‹ç™ºãƒ„リーã€Sam Ravnborg <sam@ravnborg.org> |
363 | kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git | 366 | git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git |
364 | 367 | ||
365 | - ACPI ã®é–‹ç™ºãƒ„リー〠Len Brown <len.brown@intel.com> | 368 | - ACPI ã®é–‹ç™ºãƒ„リー〠Len Brown <len.brown@intel.com> |
366 | kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git | 369 | git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git |
367 | 370 | ||
368 | - Block ã®é–‹ç™ºãƒ„リーã€Jens Axboe <axboe@suse.de> | 371 | - Block ã®é–‹ç™ºãƒ„リーã€Jens Axboe <axboe@suse.de> |
369 | kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git | 372 | git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git |
370 | 373 | ||
371 | - DRM ã®é–‹ç™ºãƒ„リーã€Dave Airlie <airlied@linux.ie> | 374 | - DRM ã®é–‹ç™ºãƒ„リーã€Dave Airlie <airlied@linux.ie> |
372 | kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git | 375 | git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git |
373 | 376 | ||
374 | - ia64 ã®é–‹ç™ºãƒ„リーã€Tony Luck <tony.luck@intel.com> | 377 | - ia64 ã®é–‹ç™ºãƒ„リーã€Tony Luck <tony.luck@intel.com> |
375 | kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git | 378 | git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git |
376 | |||
377 | - ieee1394 ã®é–‹ç™ºãƒ„リーã€Jody McIntyre <scjody@modernduck.com> | ||
378 | kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git | ||
379 | 379 | ||
380 | - infiniband, Roland Dreier <rolandd@cisco.com> | 380 | - infiniband, Roland Dreier <rolandd@cisco.com> |
381 | kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git | 381 | git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git |
382 | 382 | ||
383 | - libata, Jeff Garzik <jgarzik@pobox.com> | 383 | - libata, Jeff Garzik <jgarzik@pobox.com> |
384 | kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git | 384 | git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git |
385 | 385 | ||
386 | - ãƒãƒƒãƒˆãƒ¯ãƒ¼ã‚¯ãƒ‰ãƒ©ã‚¤ãƒ, Jeff Garzik <jgarzik@pobox.com> | 386 | - ãƒãƒƒãƒˆãƒ¯ãƒ¼ã‚¯ãƒ‰ãƒ©ã‚¤ãƒ, Jeff Garzik <jgarzik@pobox.com> |
387 | kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git | 387 | git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git |
388 | 388 | ||
389 | - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net> | 389 | - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net> |
390 | kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git | 390 | git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git |
391 | 391 | ||
392 | - SCSI, James Bottomley <James.Bottomley@SteelEye.com> | 392 | - SCSI, James Bottomley <James.Bottomley@SteelEye.com> |
393 | kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git | 393 | git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git |
394 | |||
395 | ãã®ä»–ã® git カーãƒãƒ«ãƒ„リー㯠http://kernel.org/git ã«ä¸€è¦§è¡¨ãŒã‚ã‚Šã¾ | ||
396 | ã™ã€‚ | ||
397 | 394 | ||
398 | quilt ツリー- | 395 | quilt ツリー- |
399 | - USB, PCI ドライãƒã‚³ã‚¢ã¨ I2C, Greg Kroah-Hartman <gregkh@suse.de> | 396 | - USB, PCI ドライãƒã‚³ã‚¢ã¨ I2C, Greg Kroah-Hartman <gregkh@suse.de> |
400 | kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ | 397 | kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ |
398 | - x86-64 㨠i386 ã®ä»²é–“ Andi Kleen <ak@suse.de> | ||
399 | |||
400 | ãã®ä»–ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リー㯠http://git.kernel.org/ 㨠MAINTAINERS ファ | ||
401 | イルã«ä¸€è¦§è¡¨ãŒã‚ã‚Šã¾ã™ã€‚ | ||
401 | 402 | ||
402 | ãƒã‚°ãƒ¬ãƒãƒ¼ãƒˆ | 403 | ãƒã‚°ãƒ¬ãƒãƒ¼ãƒˆ |
403 | ------------- | 404 | ------------- |
@@ -508,6 +509,7 @@ MAINTAINERS ファイルã«ãƒªã‚¹ãƒˆãŒã‚ã‚Šã¾ã™ã®ã§å‚ç…§ã—ã¦ãã ã•ã | |||
508 | ã›ã‚“*。å˜ã«è‡ªåˆ†ã®ãƒ‘ッãƒã«å¯¾ã—ã¦æŒ‡æ‘˜ã•ã‚ŒãŸå•é¡Œã‚’å…¨ã¦ä¿®æ£ã—ã¦å†é€ã™ã‚Œã° | 509 | ã›ã‚“*。å˜ã«è‡ªåˆ†ã®ãƒ‘ッãƒã«å¯¾ã—ã¦æŒ‡æ‘˜ã•ã‚ŒãŸå•é¡Œã‚’å…¨ã¦ä¿®æ£ã—ã¦å†é€ã™ã‚Œã° |
509 | ã„ã„ã®ã§ã™ã€‚ | 510 | ã„ã„ã®ã§ã™ã€‚ |
510 | 511 | ||
512 | |||
511 | カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨ä¼æ¥çµ„ç¹”ã®ã¡ãŒã„ | 513 | カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨ä¼æ¥çµ„ç¹”ã®ã¡ãŒã„ |
512 | ----------------------------------------------------------------- | 514 | ----------------------------------------------------------------- |
513 | 515 | ||
@@ -577,6 +579,7 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å– | |||
577 | ã‹ã—ã€500è¡Œã®ãƒ‘ッãƒã¯ã€æ£ã—ã„ã“ã¨ã‚’レビューã™ã‚‹ã®ã«æ•°æ™‚é–“ã‹ã‹ã‚‹ã‹ã‚‚ | 579 | ã‹ã—ã€500è¡Œã®ãƒ‘ッãƒã¯ã€æ£ã—ã„ã“ã¨ã‚’レビューã™ã‚‹ã®ã«æ•°æ™‚é–“ã‹ã‹ã‚‹ã‹ã‚‚ |
578 | ã—ã‚Œã¾ã›ã‚“(時間ã¯ãƒ‘ッãƒã®ã‚µã‚¤ã‚ºãªã©ã«ã‚ˆã‚ŠæŒ‡æ•°é–¢æ•°ã«æ¯”例ã—ã¦ã‹ã‹ã‚Šã¾ | 580 | ã—ã‚Œã¾ã›ã‚“(時間ã¯ãƒ‘ッãƒã®ã‚µã‚¤ã‚ºãªã©ã«ã‚ˆã‚ŠæŒ‡æ•°é–¢æ•°ã«æ¯”例ã—ã¦ã‹ã‹ã‚Šã¾ |
579 | ã™) | 581 | ã™) |
582 | |||
580 | å°ã•ã„パッãƒã¯ä½•ã‹ã‚ã£ãŸã¨ãã«ãƒ‡ãƒãƒƒã‚°ã‚‚ã¨ã¦ã‚‚ç°¡å˜ã«ãªã‚Šã¾ã™ã€‚パッ | 583 | å°ã•ã„パッãƒã¯ä½•ã‹ã‚ã£ãŸã¨ãã«ãƒ‡ãƒãƒƒã‚°ã‚‚ã¨ã¦ã‚‚ç°¡å˜ã«ãªã‚Šã¾ã™ã€‚パッ |
581 | ãƒã‚’1個1個å–り除ãã®ã¯ã€ã¨ã¦ã‚‚大ããªãƒ‘ッãƒã‚’当ã¦ãŸå¾Œã«(ã‹ã¤ã€ä½•ã‹ãŠ | 584 | ãƒã‚’1個1個å–り除ãã®ã¯ã€ã¨ã¦ã‚‚大ããªãƒ‘ッãƒã‚’当ã¦ãŸå¾Œã«(ã‹ã¤ã€ä½•ã‹ãŠ |
582 | ã‹ã—ããªã£ãŸå¾Œã§)解剖ã™ã‚‹ã®ã«æ¯”ã¹ã‚Œã°ã¨ã¦ã‚‚ç°¡å˜ã§ã™ã€‚ | 585 | ã‹ã—ããªã£ãŸå¾Œã§)解剖ã™ã‚‹ã®ã«æ¯”ã¹ã‚Œã°ã¨ã¦ã‚‚ç°¡å˜ã§ã™ã€‚ |
@@ -591,6 +594,7 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å– | |||
591 | ã†ã€‚先生ã¯ç°¡æ½”ãªæœ€é«˜ã®è§£ã‚’ã¿ãŸã„ã®ã§ã™ã€‚良ã„生徒ã¯ã“れを知ã£ã¦ | 594 | ã†ã€‚先生ã¯ç°¡æ½”ãªæœ€é«˜ã®è§£ã‚’ã¿ãŸã„ã®ã§ã™ã€‚良ã„生徒ã¯ã“れを知ã£ã¦ |
592 | ãŠã‚Šã€ãã—ã¦æœ€çµ‚解ã®å‰ã®ä¸é–“作æ¥ã‚’æ出ã™ã‚‹ã“ã¨ã¯æ±ºã—ã¦ãªã„ã®ã§ | 595 | ãŠã‚Šã€ãã—ã¦æœ€çµ‚解ã®å‰ã®ä¸é–“作æ¥ã‚’æ出ã™ã‚‹ã“ã¨ã¯æ±ºã—ã¦ãªã„ã®ã§ |
593 | ã™" | 596 | ã™" |
597 | |||
594 | カーãƒãƒ«é–‹ç™ºã§ã‚‚ã“ã‚Œã¯åŒã˜ã§ã™ã€‚メンテナーé”ã¨ãƒ¬ãƒ“ューアé”ã¯ã€ | 598 | カーãƒãƒ«é–‹ç™ºã§ã‚‚ã“ã‚Œã¯åŒã˜ã§ã™ã€‚メンテナーé”ã¨ãƒ¬ãƒ“ューアé”ã¯ã€ |
595 | å•é¡Œã‚’解決ã™ã‚‹è§£ã®èƒŒå¾Œã«ãªã‚‹æ€è€ƒãƒ—ãƒã‚»ã‚¹ã‚’ã¿ãŸã„ã¨ã¯æ€ã„ã¾ã›ã‚“。 | 599 | å•é¡Œã‚’解決ã™ã‚‹è§£ã®èƒŒå¾Œã«ãªã‚‹æ€è€ƒãƒ—ãƒã‚»ã‚¹ã‚’ã¿ãŸã„ã¨ã¯æ€ã„ã¾ã›ã‚“。 |
596 | 彼らã¯å˜ç´”ã§ã‚ã–ã‚„ã‹ãªè§£æ±ºæ–¹æ³•ã‚’ã¿ãŸã„ã®ã§ã™ã€‚ | 600 | 彼らã¯å˜ç´”ã§ã‚ã–ã‚„ã‹ãªè§£æ±ºæ–¹æ³•ã‚’ã¿ãŸã„ã®ã§ã™ã€‚ |
diff --git a/Documentation/ja_JP/stable_api_nonsense.txt b/Documentation/ja_JP/stable_api_nonsense.txt index b3f2b27f0881..7653b5cbfed2 100644 --- a/Documentation/ja_JP/stable_api_nonsense.txt +++ b/Documentation/ja_JP/stable_api_nonsense.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | NOTE: | 1 | NOTE: |
2 | This is a Japanese translated version of | 2 | This is a version of Documentation/stable_api_nonsense.txt translated into Japanese. |
3 | "Documentation/stable_api_nonsense.txt". | 3 | This document is maintained by IKEDA, Munehiro <m-ikeda@ds.jp.nec.com> |
4 | This one is maintained by | 4 | and the JF Project team <http://www.linux.or.jp/JF/>. |
5 | IKEDA, Munehiro <m-ikeda@ds.jp.nec.com> | 5 | If you find any difference between this document and the original file |
6 | and JF Project team <http://www.linux.or.jp/JF/>. | 6 | or a problem with the translation, |
7 | If you find difference with original file or problem in translation, | ||
8 | please contact the maintainer of this file or JF project. | 7 | please contact the maintainer of this file or JF project. |
9 | 8 | ||
10 | Please also note that purpose of this file is easier to read for non | 9 | Please also note that the purpose of this file is to be easier to read |
11 | English natives and not to be intended to fork. So, if you have any | 10 | for non English (read: Japanese) speakers and is not intended as a |
12 | comments or updates of this file, please try to update | 11 | fork. So if you have any comments or updates of this file, please try |
13 | Original(English) file at first. | 12 | to update the original English file first. |
14 | 13 | ||
14 | Last Updated: 2007/07/18 | ||
15 | ================================== | 15 | ================================== |
16 | ã“ã‚Œã¯ã€ | 16 | ã“ã‚Œã¯ã€ |
17 | linux-2.6.22-rc4/Documentation/stable_api_nonsense.txt ã®å’Œè¨³ | 17 | linux-2.6.22-rc4/Documentation/stable_api_nonsense.txt ã®å’Œè¨³ |
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 536d5bfbdb8d..fe8b0c4892cf 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt | |||
@@ -98,6 +98,15 @@ applicable everywhere (see syntax). | |||
98 | times, the limit is set to the largest selection. | 98 | times, the limit is set to the largest selection. |
99 | Reverse dependencies can only be used with boolean or tristate | 99 | Reverse dependencies can only be used with boolean or tristate |
100 | symbols. | 100 | symbols. |
101 | Note: | ||
102 | select is evil.... select will by brute force set a symbol | ||
103 | equal to 'y' without visiting the dependencies. So abusing | ||
104 | select you are able to select a symbol FOO even if FOO depends | ||
105 | on BAR that is not set. In general use select only for | ||
106 | non-visible symbols (no prompts anywhere) and for symbols with | ||
107 | no dependencies. That will limit the usefulness but on the | ||
108 | other hand avoid the illegal configurations all over. kconfig | ||
109 | should one day warn about such things. | ||
101 | 110 | ||
102 | - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] | 111 | - numerical ranges: "range" <symbol> <symbol> ["if" <expr>] |
103 | This allows to limit the range of possible input values for int | 112 | This allows to limit the range of possible input values for int |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 379931e74334..09c0ec100f61 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -30,6 +30,7 @@ the beginning of each description states the restrictions within which a | |||
30 | parameter is applicable: | 30 | parameter is applicable: |
31 | 31 | ||
32 | ACPI ACPI support is enabled. | 32 | ACPI ACPI support is enabled. |
33 | AGP AGP (Accelerated Graphics Port) is enabled. | ||
33 | ALSA ALSA sound support is enabled. | 34 | ALSA ALSA sound support is enabled. |
34 | APIC APIC support is enabled. | 35 | APIC APIC support is enabled. |
35 | APM Advanced Power Management support is enabled. | 36 | APM Advanced Power Management support is enabled. |
@@ -40,7 +41,6 @@ parameter is applicable: | |||
40 | EIDE EIDE/ATAPI support is enabled. | 41 | EIDE EIDE/ATAPI support is enabled. |
41 | FB The frame buffer device is enabled. | 42 | FB The frame buffer device is enabled. |
42 | HW Appropriate hardware is enabled. | 43 | HW Appropriate hardware is enabled. |
43 | IA-32 IA-32 aka i386 architecture is enabled. | ||
44 | IA-64 IA-64 architecture is enabled. | 44 | IA-64 IA-64 architecture is enabled. |
45 | IOSCHED More than one I/O scheduler is enabled. | 45 | IOSCHED More than one I/O scheduler is enabled. |
46 | IP_PNP IP DHCP, BOOTP, or RARP is enabled. | 46 | IP_PNP IP DHCP, BOOTP, or RARP is enabled. |
@@ -57,14 +57,14 @@ parameter is applicable: | |||
57 | MDA MDA console support is enabled. | 57 | MDA MDA console support is enabled. |
58 | MOUSE Appropriate mouse support is enabled. | 58 | MOUSE Appropriate mouse support is enabled. |
59 | MSI Message Signaled Interrupts (PCI). | 59 | MSI Message Signaled Interrupts (PCI). |
60 | MTD MTD support is enabled. | 60 | MTD MTD (Memory Technology Device) support is enabled. |
61 | NET Appropriate network support is enabled. | 61 | NET Appropriate network support is enabled. |
62 | NUMA NUMA support is enabled. | 62 | NUMA NUMA support is enabled. |
63 | GENERIC_TIME The generic timeofday code is enabled. | 63 | GENERIC_TIME The generic timeofday code is enabled. |
64 | NFS Appropriate NFS support is enabled. | 64 | NFS Appropriate NFS support is enabled. |
65 | OSS OSS sound support is enabled. | 65 | OSS OSS sound support is enabled. |
66 | PV_OPS A paravirtualized kernel | 66 | PV_OPS A paravirtualized kernel is enabled. |
67 | PARIDE The ParIDE subsystem is enabled. | 67 | PARIDE The ParIDE (parallel port IDE) subsystem is enabled. |
68 | PARISC The PA-RISC architecture is enabled. | 68 | PARISC The PA-RISC architecture is enabled. |
69 | PCI PCI bus support is enabled. | 69 | PCI PCI bus support is enabled. |
70 | PCMCIA The PCMCIA subsystem is enabled. | 70 | PCMCIA The PCMCIA subsystem is enabled. |
@@ -91,6 +91,7 @@ parameter is applicable: | |||
91 | VT Virtual terminal support is enabled. | 91 | VT Virtual terminal support is enabled. |
92 | WDT Watchdog support is enabled. | 92 | WDT Watchdog support is enabled. |
93 | XT IBM PC/XT MFM hard disk support is enabled. | 93 | XT IBM PC/XT MFM hard disk support is enabled. |
94 | X86-32 X86-32, aka i386 architecture is enabled. | ||
94 | X86-64 X86-64 architecture is enabled. | 95 | X86-64 X86-64 architecture is enabled. |
95 | More X86-64 boot options can be found in | 96 | More X86-64 boot options can be found in |
96 | Documentation/x86_64/boot-options.txt . | 97 | Documentation/x86_64/boot-options.txt . |
@@ -122,10 +123,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
122 | ./include/asm/setup.h as COMMAND_LINE_SIZE. | 123 | ./include/asm/setup.h as COMMAND_LINE_SIZE. |
123 | 124 | ||
124 | 125 | ||
125 | 53c7xx= [HW,SCSI] Amiga SCSI controllers | ||
126 | See header of drivers/scsi/53c7xx.c. | ||
127 | See also Documentation/scsi/ncr53c7xx.txt. | ||
128 | |||
129 | acpi= [HW,ACPI,X86-64,i386] | 126 | acpi= [HW,ACPI,X86-64,i386] |
130 | Advanced Configuration and Power Interface | 127 | Advanced Configuration and Power Interface |
131 | Format: { force | off | ht | strict | noirq } | 128 | Format: { force | off | ht | strict | noirq } |
@@ -224,11 +221,17 @@ and is between 256 and 4096 characters. It is defined in the file | |||
224 | 221 | ||
225 | acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT | 222 | acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT |
226 | 223 | ||
227 | acpi_pm_good [IA-32,X86-64] | 224 | acpi_pm_good [X86-32,X86-64] |
228 | Override the pmtimer bug detection: force the kernel | 225 | Override the pmtimer bug detection: force the kernel |
229 | to assume that this machine's pmtimer latches its value | 226 | to assume that this machine's pmtimer latches its value |
230 | and always returns good values. | 227 | and always returns good values. |
231 | 228 | ||
229 | agp= [AGP] | ||
230 | { off | try_unsupported } | ||
231 | off: disable AGP support | ||
232 | try_unsupported: try to drive unsupported chipsets | ||
233 | (may crash computer or cause data corruption) | ||
234 | |||
232 | enable_timer_pin_1 [i386,x86-64] | 235 | enable_timer_pin_1 [i386,x86-64] |
233 | Enable PIN 1 of APIC timer | 236 | Enable PIN 1 of APIC timer |
234 | Can be useful to work around chipset bugs | 237 | Can be useful to work around chipset bugs |
@@ -281,7 +284,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
281 | not play well with APC CPU idle - disable it if you have | 284 | not play well with APC CPU idle - disable it if you have |
282 | APC and your system crashes randomly. | 285 | APC and your system crashes randomly. |
283 | 286 | ||
284 | apic= [APIC,i386] Change the output verbosity whilst booting | 287 | apic= [APIC,i386] Advanced Programmable Interrupt Controller |
288 | Change the output verbosity whilst booting | ||
285 | Format: { quiet (default) | verbose | debug } | 289 | Format: { quiet (default) | verbose | debug } |
286 | Change the amount of debugging information output | 290 | Change the amount of debugging information output |
287 | when initialising the APIC and IO-APIC components. | 291 | when initialising the APIC and IO-APIC components. |
@@ -355,7 +359,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
355 | 359 | ||
356 | c101= [NET] Moxa C101 synchronous serial card | 360 | c101= [NET] Moxa C101 synchronous serial card |
357 | 361 | ||
358 | cachesize= [BUGS=IA-32] Override level 2 CPU cache size detection. | 362 | cachesize= [BUGS=X86-32] Override level 2 CPU cache size detection. |
359 | Sometimes CPU hardware bugs make them report the cache | 363 | Sometimes CPU hardware bugs make them report the cache |
360 | size incorrectly. The kernel will attempt work arounds | 364 | size incorrectly. The kernel will attempt work arounds |
361 | to fix known problems, but for some CPUs it is not | 365 | to fix known problems, but for some CPUs it is not |
@@ -374,7 +378,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
374 | Value can be changed at runtime via | 378 | Value can be changed at runtime via |
375 | /selinux/checkreqprot. | 379 | /selinux/checkreqprot. |
376 | 380 | ||
377 | clock= [BUGS=IA-32, HW] gettimeofday clocksource override. | 381 | clock= [BUGS=X86-32, HW] gettimeofday clocksource override. |
378 | [Deprecated] | 382 | [Deprecated] |
379 | Forces specified clocksource (if available) to be used | 383 | Forces specified clocksource (if available) to be used |
380 | when calculating gettimeofday(). If specified | 384 | when calculating gettimeofday(). If specified |
@@ -392,7 +396,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
392 | [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2, | 396 | [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2, |
393 | pxa_timer,timer3,32k_counter,timer0_1 | 397 | pxa_timer,timer3,32k_counter,timer0_1 |
394 | [AVR32] avr32 | 398 | [AVR32] avr32 |
395 | [IA-32] pit,hpet,tsc,vmi-timer; | 399 | [X86-32] pit,hpet,tsc,vmi-timer; |
396 | scx200_hrt on Geode; cyclone on IBM x440 | 400 | scx200_hrt on Geode; cyclone on IBM x440 |
397 | [MIPS] MIPS | 401 | [MIPS] MIPS |
398 | [PARISC] cr16 | 402 | [PARISC] cr16 |
@@ -412,7 +416,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
412 | over the 8254 in addition to over the IO-APIC. The | 416 | over the 8254 in addition to over the IO-APIC. The |
413 | kernel tries to set a sensible default. | 417 | kernel tries to set a sensible default. |
414 | 418 | ||
415 | hpet= [IA-32,HPET] option to disable HPET and use PIT. | 419 | hpet= [X86-32,HPET] option to disable HPET and use PIT. |
416 | Format: disable | 420 | Format: disable |
417 | 421 | ||
418 | com20020= [HW,NET] ARCnet - COM20020 chipset | 422 | com20020= [HW,NET] ARCnet - COM20020 chipset |
@@ -549,7 +553,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
549 | 553 | ||
550 | dtc3181e= [HW,SCSI] | 554 | dtc3181e= [HW,SCSI] |
551 | 555 | ||
552 | earlyprintk= [IA-32,X86-64,SH] | 556 | earlyprintk= [X86-32,X86-64,SH] |
553 | earlyprintk=vga | 557 | earlyprintk=vga |
554 | earlyprintk=serial[,ttySn[,baudrate]] | 558 | earlyprintk=serial[,ttySn[,baudrate]] |
555 | 559 | ||
@@ -587,7 +591,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
587 | eisa_irq_edge= [PARISC,HW] | 591 | eisa_irq_edge= [PARISC,HW] |
588 | See header of drivers/parisc/eisa.c. | 592 | See header of drivers/parisc/eisa.c. |
589 | 593 | ||
590 | elanfreq= [IA-32] | 594 | elanfreq= [X86-32] |
591 | See comment before function elanfreq_setup() in | 595 | See comment before function elanfreq_setup() in |
592 | arch/i386/kernel/cpu/cpufreq/elanfreq.c. | 596 | arch/i386/kernel/cpu/cpufreq/elanfreq.c. |
593 | 597 | ||
@@ -596,7 +600,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
596 | See Documentation/block/as-iosched.txt and | 600 | See Documentation/block/as-iosched.txt and |
597 | Documentation/block/deadline-iosched.txt for details. | 601 | Documentation/block/deadline-iosched.txt for details. |
598 | 602 | ||
599 | elfcorehdr= [IA-32, X86_64] | 603 | elfcorehdr= [X86-32, X86_64] |
600 | Specifies physical address of start of kernel core | 604 | Specifies physical address of start of kernel core |
601 | image elf header. Generally kexec loader will | 605 | image elf header. Generally kexec loader will |
602 | pass this option to capture kernel. | 606 | pass this option to capture kernel. |
@@ -678,7 +682,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
678 | hisax= [HW,ISDN] | 682 | hisax= [HW,ISDN] |
679 | See Documentation/isdn/README.HiSax. | 683 | See Documentation/isdn/README.HiSax. |
680 | 684 | ||
681 | hugepages= [HW,IA-32,IA-64] Maximal number of HugeTLB pages. | 685 | hugepages= [HW,X86-32,IA-64] Maximal number of HugeTLB pages. |
682 | 686 | ||
683 | i8042.direct [HW] Put keyboard port into non-translated mode | 687 | i8042.direct [HW] Put keyboard port into non-translated mode |
684 | i8042.dumbkbd [HW] Pretend that controller can only read data from | 688 | i8042.dumbkbd [HW] Pretend that controller can only read data from |
@@ -770,7 +774,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
770 | See Documentation/nfsroot.txt. | 774 | See Documentation/nfsroot.txt. |
771 | 775 | ||
772 | ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards | 776 | ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards |
773 | See comment before ip2_setup() in drivers/char/ip2.c. | 777 | See comment before ip2_setup() in |
778 | drivers/char/ip2/ip2base.c. | ||
774 | 779 | ||
775 | ips= [HW,SCSI] Adaptec / IBM ServeRAID controller | 780 | ips= [HW,SCSI] Adaptec / IBM ServeRAID controller |
776 | See header of drivers/scsi/ips.c. | 781 | See header of drivers/scsi/ips.c. |
@@ -819,7 +824,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
819 | js= [HW,JOY] Analog joystick | 824 | js= [HW,JOY] Analog joystick |
820 | See Documentation/input/joystick.txt. | 825 | See Documentation/input/joystick.txt. |
821 | 826 | ||
822 | kernelcore=nn[KMG] [KNL,IA-32,IA-64,PPC,X86-64] This parameter | 827 | kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter |
823 | specifies the amount of memory usable by the kernel | 828 | specifies the amount of memory usable by the kernel |
824 | for non-movable allocations. The requested amount is | 829 | for non-movable allocations. The requested amount is |
825 | spread evenly throughout all nodes in the system. The | 830 | spread evenly throughout all nodes in the system. The |
@@ -835,7 +840,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
835 | use the HighMem zone if it exists, and the Normal | 840 | use the HighMem zone if it exists, and the Normal |
836 | zone if it does not. | 841 | zone if it does not. |
837 | 842 | ||
838 | movablecore=nn[KMG] [KNL,IA-32,IA-64,PPC,X86-64] This parameter | 843 | movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter |
839 | is similar to kernelcore except it specifies the | 844 | is similar to kernelcore except it specifies the |
840 | amount of memory used for migratable allocations. | 845 | amount of memory used for migratable allocations. |
841 | If both kernelcore and movablecore are specified, | 846 | If both kernelcore and movablecore are specified, |
@@ -847,28 +852,20 @@ and is between 256 and 4096 characters. It is defined in the file | |||
847 | 852 | ||
848 | keepinitrd [HW,ARM] | 853 | keepinitrd [HW,ARM] |
849 | 854 | ||
850 | kstack=N [IA-32,X86-64] Print N words from the kernel stack | 855 | kstack=N [X86-32,X86-64] Print N words from the kernel stack |
851 | in oops dumps. | 856 | in oops dumps. |
852 | 857 | ||
853 | l2cr= [PPC] | 858 | l2cr= [PPC] |
854 | 859 | ||
855 | lapic [IA-32,APIC] Enable the local APIC even if BIOS | 860 | lapic [X86-32,APIC] Enable the local APIC even if BIOS |
856 | disabled it. | 861 | disabled it. |
857 | 862 | ||
858 | lapic_timer_c2_ok [IA-32,x86-64,APIC] trust the local apic timer in | 863 | lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in |
859 | C2 power state. | 864 | C2 power state. |
860 | 865 | ||
861 | lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip | 866 | lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip |
862 | Format: addr:<io>,irq:<irq> | 867 | Format: addr:<io>,irq:<irq> |
863 | 868 | ||
864 | legacy_serial.force [HW,IA-32,X86-64] | ||
865 | Probe for COM ports at legacy addresses even | ||
866 | if PNPBIOS or ACPI should describe them. This | ||
867 | is for working around firmware defects. | ||
868 | |||
869 | llsc*= [IA64] See function print_params() in | ||
870 | arch/ia64/sn/kernel/llsc4.c. | ||
871 | |||
872 | load_ramdisk= [RAM] List of ramdisks to load from floppy | 869 | load_ramdisk= [RAM] List of ramdisks to load from floppy |
873 | See Documentation/ramdisk.txt. | 870 | See Documentation/ramdisk.txt. |
874 | 871 | ||
@@ -974,11 +971,11 @@ and is between 256 and 4096 characters. It is defined in the file | |||
974 | [SCSI] Maximum number of LUNs received. | 971 | [SCSI] Maximum number of LUNs received. |
975 | Should be between 1 and 16384. | 972 | Should be between 1 and 16384. |
976 | 973 | ||
977 | mca-pentium [BUGS=IA-32] | 974 | mca-pentium [BUGS=X86-32] |
978 | 975 | ||
979 | mcatest= [IA-64] | 976 | mcatest= [IA-64] |
980 | 977 | ||
981 | mce [IA-32] Machine Check Exception | 978 | mce [X86-32] Machine Check Exception |
982 | 979 | ||
983 | md= [HW] RAID subsystems devices and level | 980 | md= [HW] RAID subsystems devices and level |
984 | See Documentation/md.txt. | 981 | See Documentation/md.txt. |
@@ -990,14 +987,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
990 | mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory | 987 | mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory |
991 | Amount of memory to be used when the kernel is not able | 988 | Amount of memory to be used when the kernel is not able |
992 | to see the whole system memory or for test. | 989 | to see the whole system memory or for test. |
993 | [IA-32] Use together with memmap= to avoid physical | 990 | [X86-32] Use together with memmap= to avoid physical |
994 | address space collisions. Without memmap= PCI devices | 991 | address space collisions. Without memmap= PCI devices |
995 | could be placed at addresses belonging to unused RAM. | 992 | could be placed at addresses belonging to unused RAM. |
996 | 993 | ||
997 | mem=nopentium [BUGS=IA-32] Disable usage of 4MB pages for kernel | 994 | mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel |
998 | memory. | 995 | memory. |
999 | 996 | ||
1000 | memmap=exactmap [KNL,IA-32,X86_64] Enable setting of an exact | 997 | memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact |
1001 | E820 memory map, as specified by the user. | 998 | E820 memory map, as specified by the user. |
1002 | Such memmap=exactmap lines can be constructed based on | 999 | Such memmap=exactmap lines can be constructed based on |
1003 | BIOS output or other requirements. See the memmap=nn@ss | 1000 | BIOS output or other requirements. See the memmap=nn@ss |
@@ -1041,7 +1038,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1041 | <name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>] | 1038 | <name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>] |
1042 | 1039 | ||
1043 | mtdparts= [MTD] | 1040 | mtdparts= [MTD] |
1044 | See drivers/mtd/cmdline.c. | 1041 | See drivers/mtd/cmdlinepart.c. |
1045 | 1042 | ||
1046 | mtouchusb.raw_coordinates= | 1043 | mtouchusb.raw_coordinates= |
1047 | [HW] Make the MicroTouch USB driver use raw coordinates | 1044 | [HW] Make the MicroTouch USB driver use raw coordinates |
@@ -1083,9 +1080,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1083 | [NFS] set the maximum lifetime for idmapper cache | 1080 | [NFS] set the maximum lifetime for idmapper cache |
1084 | entries. | 1081 | entries. |
1085 | 1082 | ||
1086 | nmi_watchdog= [KNL,BUGS=IA-32] Debugging features for SMP kernels | 1083 | nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels |
1087 | 1084 | ||
1088 | no387 [BUGS=IA-32] Tells the kernel to use the 387 maths | 1085 | no387 [BUGS=X86-32] Tells the kernel to use the 387 maths |
1089 | emulation library even if a 387 maths coprocessor | 1086 | emulation library even if a 387 maths coprocessor |
1090 | is present. | 1087 | is present. |
1091 | 1088 | ||
@@ -1116,17 +1113,17 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1116 | 1113 | ||
1117 | noexec [IA-64] | 1114 | noexec [IA-64] |
1118 | 1115 | ||
1119 | noexec [IA-32,X86-64] | 1116 | noexec [X86-32,X86-64] |
1120 | noexec=on: enable non-executable mappings (default) | 1117 | noexec=on: enable non-executable mappings (default) |
1121 | noexec=off: disable non-executable mappings | 1118 | noexec=off: disable non-executable mappings |
1122 | 1119 | ||
1123 | nofxsr [BUGS=IA-32] Disables x86 floating point extended | 1120 | nofxsr [BUGS=X86-32] Disables x86 floating point extended |
1124 | register save and restore. The kernel will only save | 1121 | register save and restore. The kernel will only save |
1125 | legacy floating-point registers on task switch. | 1122 | legacy floating-point registers on task switch. |
1126 | 1123 | ||
1127 | nohlt [BUGS=ARM] | 1124 | nohlt [BUGS=ARM] |
1128 | 1125 | ||
1129 | no-hlt [BUGS=IA-32] Tells the kernel that the hlt | 1126 | no-hlt [BUGS=X86-32] Tells the kernel that the hlt |
1130 | instruction doesn't work correctly and not to | 1127 | instruction doesn't work correctly and not to |
1131 | use it. | 1128 | use it. |
1132 | 1129 | ||
@@ -1141,12 +1138,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1141 | Valid arguments: on, off | 1138 | Valid arguments: on, off |
1142 | Default: on | 1139 | Default: on |
1143 | 1140 | ||
1144 | noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing | 1141 | noirqbalance [X86-32,SMP,KNL] Disable kernel irq balancing |
1145 | 1142 | ||
1146 | noirqdebug [IA-32] Disables the code which attempts to detect and | 1143 | noirqdebug [X86-32] Disables the code which attempts to detect and |
1147 | disable unhandled interrupt sources. | 1144 | disable unhandled interrupt sources. |
1148 | 1145 | ||
1149 | no_timer_check [IA-32,X86_64,APIC] Disables the code which tests for | 1146 | no_timer_check [X86-32,X86_64,APIC] Disables the code which tests for |
1150 | broken timer IRQ sources. | 1147 | broken timer IRQ sources. |
1151 | 1148 | ||
1152 | noisapnp [ISAPNP] Disables ISA PnP code. | 1149 | noisapnp [ISAPNP] Disables ISA PnP code. |
@@ -1158,20 +1155,20 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1158 | 1155 | ||
1159 | nojitter [IA64] Disables jitter checking for ITC timers. | 1156 | nojitter [IA64] Disables jitter checking for ITC timers. |
1160 | 1157 | ||
1161 | nolapic [IA-32,APIC] Do not enable or use the local APIC. | 1158 | nolapic [X86-32,APIC] Do not enable or use the local APIC. |
1162 | 1159 | ||
1163 | nolapic_timer [IA-32,APIC] Do not use the local APIC timer. | 1160 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. |
1164 | 1161 | ||
1165 | noltlbs [PPC] Do not use large page/tlb entries for kernel | 1162 | noltlbs [PPC] Do not use large page/tlb entries for kernel |
1166 | lowmem mapping on PPC40x. | 1163 | lowmem mapping on PPC40x. |
1167 | 1164 | ||
1168 | nomca [IA-64] Disable machine check abort handling | 1165 | nomca [IA-64] Disable machine check abort handling |
1169 | 1166 | ||
1170 | nomce [IA-32] Machine Check Exception | 1167 | nomce [X86-32] Machine Check Exception |
1171 | 1168 | ||
1172 | noreplace-paravirt [IA-32,PV_OPS] Don't patch paravirt_ops | 1169 | noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops |
1173 | 1170 | ||
1174 | noreplace-smp [IA-32,SMP] Don't replace SMP instructions | 1171 | noreplace-smp [X86-32,SMP] Don't replace SMP instructions |
1175 | with UP alternatives | 1172 | with UP alternatives |
1176 | 1173 | ||
1177 | noresidual [PPC] Don't use residual data on PReP machines. | 1174 | noresidual [PPC] Don't use residual data on PReP machines. |
@@ -1185,7 +1182,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1185 | 1182 | ||
1186 | nosbagart [IA-64] | 1183 | nosbagart [IA-64] |
1187 | 1184 | ||
1188 | nosep [BUGS=IA-32] Disables x86 SYSENTER/SYSEXIT support. | 1185 | nosep [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support. |
1189 | 1186 | ||
1190 | nosmp [SMP] Tells an SMP kernel to act as a UP kernel. | 1187 | nosmp [SMP] Tells an SMP kernel to act as a UP kernel. |
1191 | 1188 | ||
@@ -1193,7 +1190,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1193 | 1190 | ||
1194 | nosync [HW,M68K] Disables sync negotiation for all devices. | 1191 | nosync [HW,M68K] Disables sync negotiation for all devices. |
1195 | 1192 | ||
1196 | notsc [BUGS=IA-32] Disable Time Stamp Counter | 1193 | notsc [BUGS=X86-32] Disable Time Stamp Counter |
1197 | 1194 | ||
1198 | nousb [USB] Disable the USB subsystem | 1195 | nousb [USB] Disable the USB subsystem |
1199 | 1196 | ||
@@ -1266,28 +1263,28 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1266 | See also Documentation/paride.txt. | 1263 | See also Documentation/paride.txt. |
1267 | 1264 | ||
1268 | pci=option[,option...] [PCI] various PCI subsystem options: | 1265 | pci=option[,option...] [PCI] various PCI subsystem options: |
1269 | off [IA-32] don't probe for the PCI bus | 1266 | off [X86-32] don't probe for the PCI bus |
1270 | bios [IA-32] force use of PCI BIOS, don't access | 1267 | bios [X86-32] force use of PCI BIOS, don't access |
1271 | the hardware directly. Use this if your machine | 1268 | the hardware directly. Use this if your machine |
1272 | has a non-standard PCI host bridge. | 1269 | has a non-standard PCI host bridge. |
1273 | nobios [IA-32] disallow use of PCI BIOS, only direct | 1270 | nobios [X86-32] disallow use of PCI BIOS, only direct |
1274 | hardware access methods are allowed. Use this | 1271 | hardware access methods are allowed. Use this |
1275 | if you experience crashes upon bootup and you | 1272 | if you experience crashes upon bootup and you |
1276 | suspect they are caused by the BIOS. | 1273 | suspect they are caused by the BIOS. |
1277 | conf1 [IA-32] Force use of PCI Configuration | 1274 | conf1 [X86-32] Force use of PCI Configuration |
1278 | Mechanism 1. | 1275 | Mechanism 1. |
1279 | conf2 [IA-32] Force use of PCI Configuration | 1276 | conf2 [X86-32] Force use of PCI Configuration |
1280 | Mechanism 2. | 1277 | Mechanism 2. |
1281 | nommconf [IA-32,X86_64] Disable use of MMCONFIG for PCI | 1278 | nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI |
1282 | Configuration | 1279 | Configuration |
1283 | nomsi [MSI] If the PCI_MSI kernel config parameter is | 1280 | nomsi [MSI] If the PCI_MSI kernel config parameter is |
1284 | enabled, this kernel boot option can be used to | 1281 | enabled, this kernel boot option can be used to |
1285 | disable the use of MSI interrupts system-wide. | 1282 | disable the use of MSI interrupts system-wide. |
1286 | nosort [IA-32] Don't sort PCI devices according to | 1283 | nosort [X86-32] Don't sort PCI devices according to |
1287 | order given by the PCI BIOS. This sorting is | 1284 | order given by the PCI BIOS. This sorting is |
1288 | done to get a device order compatible with | 1285 | done to get a device order compatible with |
1289 | older kernels. | 1286 | older kernels. |
1290 | biosirq [IA-32] Use PCI BIOS calls to get the interrupt | 1287 | biosirq [X86-32] Use PCI BIOS calls to get the interrupt |
1291 | routing table. These calls are known to be buggy | 1288 | routing table. These calls are known to be buggy |
1292 | on several machines and they hang the machine | 1289 | on several machines and they hang the machine |
1293 | when used, but on other computers it's the only | 1290 | when used, but on other computers it's the only |
@@ -1295,32 +1292,32 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1295 | this option if the kernel is unable to allocate | 1292 | this option if the kernel is unable to allocate |
1296 | IRQs or discover secondary PCI buses on your | 1293 | IRQs or discover secondary PCI buses on your |
1297 | motherboard. | 1294 | motherboard. |
1298 | rom [IA-32] Assign address space to expansion ROMs. | 1295 | rom [X86-32] Assign address space to expansion ROMs. |
1299 | Use with caution as certain devices share | 1296 | Use with caution as certain devices share |
1300 | address decoders between ROMs and other | 1297 | address decoders between ROMs and other |
1301 | resources. | 1298 | resources. |
1302 | irqmask=0xMMMM [IA-32] Set a bit mask of IRQs allowed to be | 1299 | irqmask=0xMMMM [X86-32] Set a bit mask of IRQs allowed to be |
1303 | assigned automatically to PCI devices. You can | 1300 | assigned automatically to PCI devices. You can |
1304 | make the kernel exclude IRQs of your ISA cards | 1301 | make the kernel exclude IRQs of your ISA cards |
1305 | this way. | 1302 | this way. |
1306 | pirqaddr=0xAAAAA [IA-32] Specify the physical address | 1303 | pirqaddr=0xAAAAA [X86-32] Specify the physical address |
1307 | of the PIRQ table (normally generated | 1304 | of the PIRQ table (normally generated |
1308 | by the BIOS) if it is outside the | 1305 | by the BIOS) if it is outside the |
1309 | F0000h-100000h range. | 1306 | F0000h-100000h range. |
1310 | lastbus=N [IA-32] Scan all buses thru bus #N. Can be | 1307 | lastbus=N [X86-32] Scan all buses thru bus #N. Can be |
1311 | useful if the kernel is unable to find your | 1308 | useful if the kernel is unable to find your |
1312 | secondary buses and you want to tell it | 1309 | secondary buses and you want to tell it |
1313 | explicitly which ones they are. | 1310 | explicitly which ones they are. |
1314 | assign-busses [IA-32] Always assign all PCI bus | 1311 | assign-busses [X86-32] Always assign all PCI bus |
1315 | numbers ourselves, overriding | 1312 | numbers ourselves, overriding |
1316 | whatever the firmware may have done. | 1313 | whatever the firmware may have done. |
1317 | usepirqmask [IA-32] Honor the possible IRQ mask stored | 1314 | usepirqmask [X86-32] Honor the possible IRQ mask stored |
1318 | in the BIOS $PIR table. This is needed on | 1315 | in the BIOS $PIR table. This is needed on |
1319 | some systems with broken BIOSes, notably | 1316 | some systems with broken BIOSes, notably |
1320 | some HP Pavilion N5400 and Omnibook XE3 | 1317 | some HP Pavilion N5400 and Omnibook XE3 |
1321 | notebooks. This will have no effect if ACPI | 1318 | notebooks. This will have no effect if ACPI |
1322 | IRQ routing is enabled. | 1319 | IRQ routing is enabled. |
1323 | noacpi [IA-32] Do not use ACPI for IRQ routing | 1320 | noacpi [X86-32] Do not use ACPI for IRQ routing |
1324 | or for PCI scanning. | 1321 | or for PCI scanning. |
1325 | routeirq Do IRQ routing for all PCI devices. | 1322 | routeirq Do IRQ routing for all PCI devices. |
1326 | This is normally done in pci_enable_device(), | 1323 | This is normally done in pci_enable_device(), |
@@ -1469,13 +1466,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1469 | Run specified binary instead of /init from the ramdisk, | 1466 | Run specified binary instead of /init from the ramdisk, |
1470 | used for early userspace startup. See initrd. | 1467 | used for early userspace startup. See initrd. |
1471 | 1468 | ||
1472 | reboot= [BUGS=IA-32,BUGS=ARM,BUGS=IA-64] Rebooting mode | 1469 | reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode |
1473 | Format: <reboot_mode>[,<reboot_mode2>[,...]] | 1470 | Format: <reboot_mode>[,<reboot_mode2>[,...]] |
1474 | See arch/*/kernel/reboot.c or arch/*/kernel/process.c | 1471 | See arch/*/kernel/reboot.c or arch/*/kernel/process.c |
1475 | 1472 | ||
1476 | reserve= [KNL,BUGS] Force the kernel to ignore some iomem area | 1473 | reserve= [KNL,BUGS] Force the kernel to ignore some iomem area |
1477 | 1474 | ||
1478 | reservetop= [IA-32] | 1475 | reservetop= [X86-32] |
1479 | Format: nn[KMG] | 1476 | Format: nn[KMG] |
1480 | Reserves a hole at the top of the kernel virtual | 1477 | Reserves a hole at the top of the kernel virtual |
1481 | address space. | 1478 | address space. |
@@ -1566,7 +1563,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1566 | Value can be changed at runtime via | 1563 | Value can be changed at runtime via |
1567 | /selinux/compat_net. | 1564 | /selinux/compat_net. |
1568 | 1565 | ||
1569 | serialnumber [BUGS=IA-32] | 1566 | serialnumber [BUGS=X86-32] |
1570 | 1567 | ||
1571 | sg_def_reserved_size= [SCSI] | 1568 | sg_def_reserved_size= [SCSI] |
1572 | 1569 | ||
@@ -1619,7 +1616,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1619 | smart2= [HW] | 1616 | smart2= [HW] |
1620 | Format: <io1>[,<io2>[,...,<io8>]] | 1617 | Format: <io1>[,<io2>[,...,<io8>]] |
1621 | 1618 | ||
1622 | smp-alt-once [IA-32,SMP] On a hotplug CPU system, only | 1619 | smp-alt-once [X86-32,SMP] On a hotplug CPU system, only |
1623 | attempt to substitute SMP alternatives once at boot. | 1620 | attempt to substitute SMP alternatives once at boot. |
1624 | 1621 | ||
1625 | smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices | 1622 | smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices |
@@ -1884,7 +1881,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1884 | usbhid.mousepoll= | 1881 | usbhid.mousepoll= |
1885 | [USBHID] The interval which mice are to be polled at. | 1882 | [USBHID] The interval which mice are to be polled at. |
1886 | 1883 | ||
1887 | vdso= [IA-32,SH,x86-64] | 1884 | vdso= [X86-32,SH,x86-64] |
1888 | vdso=2: enable compat VDSO (default with COMPAT_VDSO) | 1885 | vdso=2: enable compat VDSO (default with COMPAT_VDSO) |
1889 | vdso=1: enable VDSO (default) | 1886 | vdso=1: enable VDSO (default) |
1890 | vdso=0: disable VDSO mapping | 1887 | vdso=0: disable VDSO mapping |
@@ -1895,7 +1892,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1895 | video= [FB] Frame buffer configuration | 1892 | video= [FB] Frame buffer configuration |
1896 | See Documentation/fb/modedb.txt. | 1893 | See Documentation/fb/modedb.txt. |
1897 | 1894 | ||
1898 | vga= [BOOT,IA-32] Select a particular video mode | 1895 | vga= [BOOT,X86-32] Select a particular video mode |
1899 | See Documentation/i386/boot.txt and | 1896 | See Documentation/i386/boot.txt and |
1900 | Documentation/svga.txt. | 1897 | Documentation/svga.txt. |
1901 | Use vga=ask for menu. | 1898 | Use vga=ask for menu. |
@@ -1927,7 +1924,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1927 | See header of drivers/scsi/wd7000.c. | 1924 | See header of drivers/scsi/wd7000.c. |
1928 | 1925 | ||
1929 | wdt= [WDT] Watchdog | 1926 | wdt= [WDT] Watchdog |
1930 | See Documentation/watchdog/watchdog.txt. | 1927 | See Documentation/watchdog/wdt.txt. |
1931 | 1928 | ||
1932 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. | 1929 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. |
1933 | xd_geo= See header of drivers/block/xd.c. | 1930 | xd_geo= See header of drivers/block/xd.c. |
diff --git a/Documentation/keys.txt b/Documentation/keys.txt index 81d9aa097298..947d57d53453 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt | |||
@@ -859,9 +859,8 @@ payload contents" for more information. | |||
859 | void unregister_key_type(struct key_type *type); | 859 | void unregister_key_type(struct key_type *type); |
860 | 860 | ||
861 | 861 | ||
862 | Under some circumstances, it may be desirable to desirable to deal with a | 862 | Under some circumstances, it may be desirable to deal with a bundle of keys. |
863 | bundle of keys. The facility provides access to the keyring type for managing | 863 | The facility provides access to the keyring type for managing such a bundle: |
864 | such a bundle: | ||
865 | 864 | ||
866 | struct key_type key_type_keyring; | 865 | struct key_type key_type_keyring; |
867 | 866 | ||
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt index e44855513b3d..8ee49ee7c963 100644 --- a/Documentation/kobject.txt +++ b/Documentation/kobject.txt | |||
@@ -27,7 +27,6 @@ in detail, and briefly here: | |||
27 | - kobjects a simple object. | 27 | - kobjects a simple object. |
28 | - kset a set of objects of a certain type. | 28 | - kset a set of objects of a certain type. |
29 | - ktype a set of helpers for objects of a common type. | 29 | - ktype a set of helpers for objects of a common type. |
30 | - subsystem a controlling object for a number of ksets. | ||
31 | 30 | ||
32 | 31 | ||
33 | The kobject infrastructure maintains a close relationship with the | 32 | The kobject infrastructure maintains a close relationship with the |
@@ -54,13 +53,15 @@ embedded in larger data structures and replace fields they duplicate. | |||
54 | 1.2 Definition | 53 | 1.2 Definition |
55 | 54 | ||
56 | struct kobject { | 55 | struct kobject { |
56 | const char * k_name; | ||
57 | char name[KOBJ_NAME_LEN]; | 57 | char name[KOBJ_NAME_LEN]; |
58 | atomic_t refcount; | 58 | struct kref kref; |
59 | struct list_head entry; | 59 | struct list_head entry; |
60 | struct kobject * parent; | 60 | struct kobject * parent; |
61 | struct kset * kset; | 61 | struct kset * kset; |
62 | struct kobj_type * ktype; | 62 | struct kobj_type * ktype; |
63 | struct dentry * dentry; | 63 | struct sysfs_dirent * sd; |
64 | wait_queue_head_t poll; | ||
64 | }; | 65 | }; |
65 | 66 | ||
66 | void kobject_init(struct kobject *); | 67 | void kobject_init(struct kobject *); |
@@ -137,8 +138,7 @@ If a kobject does not have a parent when it is registered, its parent | |||
137 | becomes its dominant kset. | 138 | becomes its dominant kset. |
138 | 139 | ||
139 | If a kobject does not have a parent nor a dominant kset, its directory | 140 | If a kobject does not have a parent nor a dominant kset, its directory |
140 | is created at the top-level of the sysfs partition. This should only | 141 | is created at the top-level of the sysfs partition. |
141 | happen for kobjects that are embedded in a struct subsystem. | ||
142 | 142 | ||
143 | 143 | ||
144 | 144 | ||
@@ -150,10 +150,10 @@ A kset is a set of kobjects that are embedded in the same type. | |||
150 | 150 | ||
151 | 151 | ||
152 | struct kset { | 152 | struct kset { |
153 | struct subsystem * subsys; | ||
154 | struct kobj_type * ktype; | 153 | struct kobj_type * ktype; |
155 | struct list_head list; | 154 | struct list_head list; |
156 | struct kobject kobj; | 155 | struct kobject kobj; |
156 | struct kset_uevent_ops * uevent_ops; | ||
157 | }; | 157 | }; |
158 | 158 | ||
159 | 159 | ||
@@ -169,8 +169,7 @@ struct kobject * kset_find_obj(struct kset *, char *); | |||
169 | 169 | ||
170 | 170 | ||
171 | The type that the kobjects are embedded in is described by the ktype | 171 | The type that the kobjects are embedded in is described by the ktype |
172 | pointer. The subsystem that the kobject belongs to is pointed to by the | 172 | pointer. |
173 | subsys pointer. | ||
174 | 173 | ||
175 | A kset contains a kobject itself, meaning that it may be registered in | 174 | A kset contains a kobject itself, meaning that it may be registered in |
176 | the kobject hierarchy and exported via sysfs. More importantly, the | 175 | the kobject hierarchy and exported via sysfs. More importantly, the |
@@ -209,6 +208,58 @@ the hierarchy. | |||
209 | kset_find_obj() may be used to locate a kobject with a particular | 208 | kset_find_obj() may be used to locate a kobject with a particular |
210 | name. The kobject, if found, is returned. | 209 | name. The kobject, if found, is returned. |
211 | 210 | ||
211 | There are also some helper functions whose names point to the formerly | ||
212 | existing "struct subsystem", whose functions have been taken over by | ||
213 | ksets. | ||
214 | |||
215 | |||
216 | decl_subsys(name,type,uevent_ops) | ||
217 | |||
218 | Declares a kset named '<name>_subsys' of type <type> with | ||
219 | uevent_ops <uevent_ops>. For example, | ||
220 | |||
221 | decl_subsys(devices, &ktype_device, &device_uevent_ops); | ||
222 | |||
223 | is equivalent to doing: | ||
224 | |||
225 | struct kset devices_subsys = { | ||
226 | .kobj = { | ||
227 | .name = "devices", | ||
228 | }, | ||
229 | .ktype = &ktype_device, | ||
230 | .uevent_ops = &device_uevent_ops, | ||
231 | }; | ||
232 | |||
233 | |||
234 | The objects that are registered with a subsystem that use the | ||
235 | subsystem's default list must have their kset ptr set properly. These | ||
236 | objects may have embedded kobjects or ksets. The | ||
237 | following helpers make setting the kset easier: | ||
238 | |||
239 | |||
240 | kobj_set_kset_s(obj,subsys) | ||
241 | |||
242 | - Assumes that obj->kobj exists, and is a struct kobject. | ||
243 | - Sets the kset of that kobject to the kset <subsys>. | ||
244 | |||
245 | |||
246 | kset_set_kset_s(obj,subsys) | ||
247 | |||
248 | - Assumes that obj->kset exists, and is a struct kset. | ||
249 | - Sets the kset of the embedded kobject to the kset <subsys>. | ||
250 | |||
251 | subsys_set_kset(obj,subsys) | ||
252 | |||
253 | - Assumes obj->subsys exists, and is a struct subsystem. | ||
254 | - Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset. | ||
255 | |||
256 | void subsystem_init(struct kset *s); | ||
257 | int subsystem_register(struct kset *s); | ||
258 | void subsystem_unregister(struct kset *s); | ||
259 | struct kset *subsys_get(struct kset *s); | ||
260 | void kset_put(struct kset *s); | ||
261 | |||
262 | These are just wrappers around the respective kset_* functions. | ||
212 | 263 | ||
213 | 2.3 sysfs | 264 | 2.3 sysfs |
214 | 265 | ||
@@ -254,114 +305,3 @@ Instances of struct kobj_type are not registered; only referenced by | |||
254 | the kset. A kobj_type may be referenced by an arbitrary number of | 305 | the kset. A kobj_type may be referenced by an arbitrary number of |
255 | ksets, as there may be disparate sets of identical objects. | 306 | ksets, as there may be disparate sets of identical objects. |
256 | 307 | ||
257 | |||
258 | |||
259 | 4. subsystems | ||
260 | |||
261 | 4.1 Description | ||
262 | |||
263 | A subsystem represents a significant entity of code that maintains an | ||
264 | arbitrary number of sets of objects of various types. Since the number | ||
265 | of ksets and the type of objects they contain are variable, a | ||
266 | generic representation of a subsystem is minimal. | ||
267 | |||
268 | |||
269 | struct subsystem { | ||
270 | struct kset kset; | ||
271 | struct rw_semaphore rwsem; | ||
272 | }; | ||
273 | |||
274 | int subsystem_register(struct subsystem *); | ||
275 | void subsystem_unregister(struct subsystem *); | ||
276 | |||
277 | struct subsystem * subsys_get(struct subsystem * s); | ||
278 | void subsys_put(struct subsystem * s); | ||
279 | |||
280 | |||
281 | A subsystem contains an embedded kset so: | ||
282 | |||
283 | - It can be represented in the object hierarchy via the kset's | ||
284 | embedded kobject. | ||
285 | |||
286 | - It can maintain a default list of objects of one type. | ||
287 | |||
288 | Additional ksets may attach to the subsystem simply by referencing the | ||
289 | subsystem before they are registered. (This one-way reference means | ||
290 | that there is no way to determine the ksets that are attached to the | ||
291 | subsystem.) | ||
292 | |||
293 | All ksets that are attached to a subsystem share the subsystem's R/W | ||
294 | semaphore. | ||
295 | |||
296 | |||
297 | 4.2 subsystem Programming Interface. | ||
298 | |||
299 | The subsystem programming interface is simple and does not offer the | ||
300 | flexibility that the kset and kobject programming interfaces do. They | ||
301 | may be registered and unregistered, as well as reference counted. Each | ||
302 | call forwards the calls to their embedded ksets (which forward the | ||
303 | calls to their embedded kobjects). | ||
304 | |||
305 | |||
306 | 4.3 Helpers | ||
307 | |||
308 | A number of macros are available to make dealing with subsystems and | ||
309 | their embedded objects easier. | ||
310 | |||
311 | |||
312 | decl_subsys(name,type) | ||
313 | |||
314 | Declares a subsystem named '<name>_subsys', with an embedded kset of | ||
315 | type <type>. For example, | ||
316 | |||
317 | decl_subsys(devices,&ktype_devices); | ||
318 | |||
319 | is equivalent to doing: | ||
320 | |||
321 | struct subsystem device_subsys = { | ||
322 | .kset = { | ||
323 | .kobj = { | ||
324 | .name = "devices", | ||
325 | }, | ||
326 | .ktype = &ktype_devices, | ||
327 | } | ||
328 | }; | ||
329 | |||
330 | |||
331 | The objects that are registered with a subsystem that use the | ||
332 | subsystem's default list must have their kset ptr set properly. These | ||
333 | objects may have embedded kobjects, ksets, or other subsystems. The | ||
334 | following helpers make setting the kset easier: | ||
335 | |||
336 | |||
337 | kobj_set_kset_s(obj,subsys) | ||
338 | |||
339 | - Assumes that obj->kobj exists, and is a struct kobject. | ||
340 | - Sets the kset of that kobject to the subsystem's embedded kset. | ||
341 | |||
342 | |||
343 | kset_set_kset_s(obj,subsys) | ||
344 | |||
345 | - Assumes that obj->kset exists, and is a struct kset. | ||
346 | - Sets the kset of the embedded kobject to the subsystem's | ||
347 | embedded kset. | ||
348 | |||
349 | subsys_set_kset(obj,subsys) | ||
350 | |||
351 | - Assumes obj->subsys exists, and is a struct subsystem. | ||
352 | - Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset. | ||
353 | |||
354 | |||
355 | 4.4 sysfs | ||
356 | |||
357 | subsystems are represented in sysfs via their embedded kobjects. They | ||
358 | follow the same rules as previously mentioned with no exceptions. They | ||
359 | typically receive a top-level directory in sysfs, except when their | ||
360 | embedded kobject is part of another kset, or the parent of the | ||
361 | embedded kobject is explicitly set. | ||
362 | |||
363 | Note that the subsystem's embedded kset must be 'attached' to the | ||
364 | subsystem itself in order to use its rwsem. This is done after | ||
365 | kset_add() has been called. (Not before, because kset_add() uses its | ||
366 | subsystem for a default parent if it doesn't already have one). | ||
367 | |||
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index 31e794ef5f98..c0b7a4556390 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile | |||
@@ -13,7 +13,9 @@ LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) | |||
13 | 13 | ||
14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds | 14 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds |
15 | LDLIBS:=-lz | 15 | LDLIBS:=-lz |
16 | 16 | # Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and | |
17 | # not others (eg. FC7). | ||
18 | LDFLAGS+=-static | ||
17 | all: lguest.lds lguest | 19 | all: lguest.lds lguest |
18 | 20 | ||
19 | # The linker script on x86 is so complex the only way of creating one | 21 | # The linker script on x86 is so complex the only way of creating one |
diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract new file mode 100644 index 000000000000..7730bb6e4b94 --- /dev/null +++ b/Documentation/lguest/extract | |||
@@ -0,0 +1,58 @@ | |||
1 | #! /bin/sh | ||
2 | |||
3 | set -e | ||
4 | |||
5 | PREFIX=$1 | ||
6 | shift | ||
7 | |||
8 | trap 'rm -r $TMPDIR' 0 | ||
9 | TMPDIR=`mktemp -d` | ||
10 | |||
11 | exec 3>/dev/null | ||
12 | for f; do | ||
13 | while IFS=" | ||
14 | " read -r LINE; do | ||
15 | case "$LINE" in | ||
16 | *$PREFIX:[0-9]*:\**) | ||
17 | NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` | ||
18 | if [ -f $TMPDIR/$NUM ]; then | ||
19 | echo "$TMPDIR/$NUM already exits prior to $f" | ||
20 | exit 1 | ||
21 | fi | ||
22 | exec 3>>$TMPDIR/$NUM | ||
23 | echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM | ||
24 | /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 | ||
25 | ;; | ||
26 | *$PREFIX:[0-9]*) | ||
27 | NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` | ||
28 | if [ -f $TMPDIR/$NUM ]; then | ||
29 | echo "$TMPDIR/$NUM already exits prior to $f" | ||
30 | exit 1 | ||
31 | fi | ||
32 | exec 3>>$TMPDIR/$NUM | ||
33 | echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM | ||
34 | /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 | ||
35 | ;; | ||
36 | *:\**) | ||
37 | /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 | ||
38 | echo >&3 | ||
39 | exec 3>/dev/null | ||
40 | ;; | ||
41 | *) | ||
42 | /bin/echo "$LINE" >&3 | ||
43 | ;; | ||
44 | esac | ||
45 | done < $f | ||
46 | echo >&3 | ||
47 | exec 3>/dev/null | ||
48 | done | ||
49 | |||
50 | LASTFILE="" | ||
51 | for f in $TMPDIR/*; do | ||
52 | if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then | ||
53 | LASTFILE=$(cat $TMPDIR/.$(basename $f) ) | ||
54 | echo "[ $LASTFILE ]" | ||
55 | fi | ||
56 | cat $f | ||
57 | done | ||
58 | |||
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 62a8133393e1..f7918401a007 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -1,5 +1,10 @@ | |||
1 | /* Simple program to layout "physical" memory for new lguest guest. | 1 | /*P:100 This is the Launcher code, a simple program which lays out the |
2 | * Linked high to avoid likely physical memory. */ | 2 | * "physical" memory for the new Guest by mapping the kernel image and the |
3 | * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. | ||
4 | * | ||
5 | * The only trick: the Makefile links it at a high address so it will be clear | ||
6 | * of the guest memory region. It means that each Guest cannot have more than | ||
7 | * about 2.5G of memory on a normally configured Host. :*/ | ||
3 | #define _LARGEFILE64_SOURCE | 8 | #define _LARGEFILE64_SOURCE |
4 | #define _GNU_SOURCE | 9 | #define _GNU_SOURCE |
5 | #include <stdio.h> | 10 | #include <stdio.h> |
@@ -29,12 +34,20 @@ | |||
29 | #include <termios.h> | 34 | #include <termios.h> |
30 | #include <getopt.h> | 35 | #include <getopt.h> |
31 | #include <zlib.h> | 36 | #include <zlib.h> |
37 | /*L:110 We can ignore the 28 include files we need for this program, but I do | ||
38 | * want to draw attention to the use of kernel-style types. | ||
39 | * | ||
40 | * As Linus said, "C is a Spartan language, and so should your naming be." I | ||
41 | * like these abbreviations and the header we need uses them, so we define them | ||
42 | * here. | ||
43 | */ | ||
32 | typedef unsigned long long u64; | 44 | typedef unsigned long long u64; |
33 | typedef uint32_t u32; | 45 | typedef uint32_t u32; |
34 | typedef uint16_t u16; | 46 | typedef uint16_t u16; |
35 | typedef uint8_t u8; | 47 | typedef uint8_t u8; |
36 | #include "../../include/linux/lguest_launcher.h" | 48 | #include "../../include/linux/lguest_launcher.h" |
37 | #include "../../include/asm-i386/e820.h" | 49 | #include "../../include/asm-i386/e820.h" |
50 | /*:*/ | ||
38 | 51 | ||
39 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | 52 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ |
40 | #define NET_PEERNUM 1 | 53 | #define NET_PEERNUM 1 |
@@ -43,33 +56,52 @@ typedef uint8_t u8; | |||
43 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ | 56 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ |
44 | #endif | 57 | #endif |
45 | 58 | ||
59 | /*L:120 verbose is both a global flag and a macro. The C preprocessor allows | ||
60 | * this, and although I wouldn't recommend it, it works quite nicely here. */ | ||
46 | static bool verbose; | 61 | static bool verbose; |
47 | #define verbose(args...) \ | 62 | #define verbose(args...) \ |
48 | do { if (verbose) printf(args); } while(0) | 63 | do { if (verbose) printf(args); } while(0) |
64 | /*:*/ | ||
65 | |||
66 | /* The pipe to send commands to the waker process */ | ||
49 | static int waker_fd; | 67 | static int waker_fd; |
68 | /* The top of guest physical memory. */ | ||
50 | static u32 top; | 69 | static u32 top; |
51 | 70 | ||
71 | /* This is our list of devices. */ | ||
52 | struct device_list | 72 | struct device_list |
53 | { | 73 | { |
74 | /* Summary information about the devices in our list: ready to pass to | ||
75 | * select() to ask which need servicing.*/ | ||
54 | fd_set infds; | 76 | fd_set infds; |
55 | int max_infd; | 77 | int max_infd; |
56 | 78 | ||
79 | /* The descriptor page for the devices. */ | ||
57 | struct lguest_device_desc *descs; | 80 | struct lguest_device_desc *descs; |
81 | |||
82 | /* A single linked list of devices. */ | ||
58 | struct device *dev; | 83 | struct device *dev; |
84 | /* ... And an end pointer so we can easily append new devices */ | ||
59 | struct device **lastdev; | 85 | struct device **lastdev; |
60 | }; | 86 | }; |
61 | 87 | ||
88 | /* The device structure describes a single device. */ | ||
62 | struct device | 89 | struct device |
63 | { | 90 | { |
91 | /* The linked-list pointer. */ | ||
64 | struct device *next; | 92 | struct device *next; |
93 | /* The descriptor for this device, as mapped into the Guest. */ | ||
65 | struct lguest_device_desc *desc; | 94 | struct lguest_device_desc *desc; |
95 | /* The memory page(s) of this device, if any. Also mapped in Guest. */ | ||
66 | void *mem; | 96 | void *mem; |
67 | 97 | ||
68 | /* Watch this fd if handle_input non-NULL. */ | 98 | /* If handle_input is set, it wants to be called when this file |
99 | * descriptor is ready. */ | ||
69 | int fd; | 100 | int fd; |
70 | bool (*handle_input)(int fd, struct device *me); | 101 | bool (*handle_input)(int fd, struct device *me); |
71 | 102 | ||
72 | /* Watch DMA to this key if handle_input non-NULL. */ | 103 | /* If handle_output is set, it wants to be called when the Guest sends |
104 | * DMA to this key. */ | ||
73 | unsigned long watch_key; | 105 | unsigned long watch_key; |
74 | u32 (*handle_output)(int fd, const struct iovec *iov, | 106 | u32 (*handle_output)(int fd, const struct iovec *iov, |
75 | unsigned int num, struct device *me); | 107 | unsigned int num, struct device *me); |
@@ -78,6 +110,11 @@ struct device | |||
78 | void *priv; | 110 | void *priv; |
79 | }; | 111 | }; |
80 | 112 | ||
113 | /*L:130 | ||
114 | * Loading the Kernel. | ||
115 | * | ||
116 | * We start with a couple of simple helper routines. open_or_die() avoids | ||
117 | * error-checking code cluttering the callers: */ | ||
81 | static int open_or_die(const char *name, int flags) | 118 | static int open_or_die(const char *name, int flags) |
82 | { | 119 | { |
83 | int fd = open(name, flags); | 120 | int fd = open(name, flags); |
@@ -86,26 +123,38 @@ static int open_or_die(const char *name, int flags) | |||
86 | return fd; | 123 | return fd; |
87 | } | 124 | } |
88 | 125 | ||
126 | /* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ | ||
89 | static void *map_zeroed_pages(unsigned long addr, unsigned int num) | 127 | static void *map_zeroed_pages(unsigned long addr, unsigned int num) |
90 | { | 128 | { |
129 | /* We cache the /dev/zero file-descriptor so we only open it once. */ | ||
91 | static int fd = -1; | 130 | static int fd = -1; |
92 | 131 | ||
93 | if (fd == -1) | 132 | if (fd == -1) |
94 | fd = open_or_die("/dev/zero", O_RDONLY); | 133 | fd = open_or_die("/dev/zero", O_RDONLY); |
95 | 134 | ||
135 | /* We use a private mapping (ie. if we write to the page, it will be | ||
136 | * copied), and obviously we insist that it be mapped where we ask. */ | ||
96 | if (mmap((void *)addr, getpagesize() * num, | 137 | if (mmap((void *)addr, getpagesize() * num, |
97 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) | 138 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) |
98 | != (void *)addr) | 139 | != (void *)addr) |
99 | err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); | 140 | err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); |
141 | |||
142 | /* Returning the address is just a courtesy: can simplify callers. */ | ||
100 | return (void *)addr; | 143 | return (void *)addr; |
101 | } | 144 | } |
102 | 145 | ||
103 | /* Find magic string marking entry point, return entry point. */ | 146 | /* To find out where to start we look for the magic Guest string, which marks |
147 | * the code we see in lguest_asm.S. This is a hack which we are currently | ||
148 | * plotting to replace with the normal Linux entry point. */ | ||
104 | static unsigned long entry_point(void *start, void *end, | 149 | static unsigned long entry_point(void *start, void *end, |
105 | unsigned long page_offset) | 150 | unsigned long page_offset) |
106 | { | 151 | { |
107 | void *p; | 152 | void *p; |
108 | 153 | ||
154 | /* The scan gives us the physical starting address. We want the | ||
155 | * virtual address in this case, and fortunately, we already figured | ||
156 | * out the physical-virtual difference and passed it here in | ||
157 | * "page_offset". */ | ||
109 | for (p = start; p < end; p++) | 158 | for (p = start; p < end; p++) |
110 | if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) | 159 | if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) |
111 | return (long)p + strlen("GenuineLguest") + page_offset; | 160 | return (long)p + strlen("GenuineLguest") + page_offset; |
@@ -113,7 +162,17 @@ static unsigned long entry_point(void *start, void *end, | |||
113 | err(1, "Is this image a genuine lguest?"); | 162 | err(1, "Is this image a genuine lguest?"); |
114 | } | 163 | } |
115 | 164 | ||
116 | /* Returns the entry point */ | 165 | /* This routine takes an open vmlinux image, which is in ELF, and maps it into |
166 | * the Guest memory. ELF = Executable and Linkable Format, which is the format used | ||
167 | * by all modern binaries on Linux including the kernel. | ||
168 | * | ||
169 | * The ELF headers give *two* addresses: a physical address, and a virtual | ||
170 | * address. The Guest kernel expects to be placed in memory at the physical | ||
171 | * address, and the page tables set up so it will correspond to that virtual | ||
172 | * address. We return the difference between the virtual and physical | ||
173 | * addresses in the "page_offset" pointer. | ||
174 | * | ||
175 | * We return the starting address. */ | ||
117 | static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | 176 | static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, |
118 | unsigned long *page_offset) | 177 | unsigned long *page_offset) |
119 | { | 178 | { |
@@ -122,40 +181,61 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
122 | unsigned int i; | 181 | unsigned int i; |
123 | unsigned long start = -1UL, end = 0; | 182 | unsigned long start = -1UL, end = 0; |
124 | 183 | ||
125 | /* Sanity checks. */ | 184 | /* Sanity checks on the main ELF header: an x86 executable with a |
185 | * reasonable number of correctly-sized program headers. */ | ||
126 | if (ehdr->e_type != ET_EXEC | 186 | if (ehdr->e_type != ET_EXEC |
127 | || ehdr->e_machine != EM_386 | 187 | || ehdr->e_machine != EM_386 |
128 | || ehdr->e_phentsize != sizeof(Elf32_Phdr) | 188 | || ehdr->e_phentsize != sizeof(Elf32_Phdr) |
129 | || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) | 189 | || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) |
130 | errx(1, "Malformed elf header"); | 190 | errx(1, "Malformed elf header"); |
131 | 191 | ||
192 | /* An ELF executable contains an ELF header and a number of "program" | ||
193 | * headers which indicate which parts ("segments") of the program to | ||
194 | * load where. */ | ||
195 | |||
196 | /* We read in all the program headers at once: */ | ||
132 | if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) | 197 | if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) |
133 | err(1, "Seeking to program headers"); | 198 | err(1, "Seeking to program headers"); |
134 | if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) | 199 | if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) |
135 | err(1, "Reading program headers"); | 200 | err(1, "Reading program headers"); |
136 | 201 | ||
202 | /* We don't know page_offset yet. */ | ||
137 | *page_offset = 0; | 203 | *page_offset = 0; |
138 | /* We map the loadable segments at virtual addresses corresponding | 204 | |
139 | * to their physical addresses (our virtual == guest physical). */ | 205 | /* Try all the headers: there are usually only three. A read-only one, |
206 | * a read-write one, and a "note" section which isn't loadable. */ | ||
140 | for (i = 0; i < ehdr->e_phnum; i++) { | 207 | for (i = 0; i < ehdr->e_phnum; i++) { |
208 | /* If this isn't a loadable segment, we ignore it */ | ||
141 | if (phdr[i].p_type != PT_LOAD) | 209 | if (phdr[i].p_type != PT_LOAD) |
142 | continue; | 210 | continue; |
143 | 211 | ||
144 | verbose("Section %i: size %i addr %p\n", | 212 | verbose("Section %i: size %i addr %p\n", |
145 | i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); | 213 | i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); |
146 | 214 | ||
147 | /* We expect linear address space. */ | 215 | /* We expect a simple linear address space: every segment must |
216 | * have the same difference between virtual (p_vaddr) and | ||
217 | * physical (p_paddr) address. */ | ||
148 | if (!*page_offset) | 218 | if (!*page_offset) |
149 | *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; | 219 | *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; |
150 | else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) | 220 | else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) |
151 | errx(1, "Page offset of section %i different", i); | 221 | errx(1, "Page offset of section %i different", i); |
152 | 222 | ||
223 | /* We track the first and last address we mapped, so we can | ||
224 | * tell entry_point() where to scan. */ | ||
153 | if (phdr[i].p_paddr < start) | 225 | if (phdr[i].p_paddr < start) |
154 | start = phdr[i].p_paddr; | 226 | start = phdr[i].p_paddr; |
155 | if (phdr[i].p_paddr + phdr[i].p_filesz > end) | 227 | if (phdr[i].p_paddr + phdr[i].p_filesz > end) |
156 | end = phdr[i].p_paddr + phdr[i].p_filesz; | 228 | end = phdr[i].p_paddr + phdr[i].p_filesz; |
157 | 229 | ||
158 | /* We map everything private, writable. */ | 230 | /* We map this section of the file at its physical address. We |
231 | * map it read & write even if the header says this segment is | ||
232 | * read-only. The kernel really wants to be writable: it | ||
233 | * patches its own instructions which would normally be | ||
234 | * read-only. | ||
235 | * | ||
236 | * MAP_PRIVATE means that the page won't be copied until a | ||
237 | * write is done to it. This allows us to share much of the | ||
238 | * kernel memory between Guests. */ | ||
159 | addr = mmap((void *)phdr[i].p_paddr, | 239 | addr = mmap((void *)phdr[i].p_paddr, |
160 | phdr[i].p_filesz, | 240 | phdr[i].p_filesz, |
161 | PROT_READ|PROT_WRITE|PROT_EXEC, | 241 | PROT_READ|PROT_WRITE|PROT_EXEC, |
@@ -169,7 +249,31 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
169 | return entry_point((void *)start, (void *)end, *page_offset); | 249 | return entry_point((void *)start, (void *)end, *page_offset); |
170 | } | 250 | } |
171 | 251 | ||
172 | /* This is amazingly reliable. */ | 252 | /*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. |
253 | * | ||
254 | * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects | ||
255 | * to be. We don't know what that option was, but we can figure it out | ||
256 | * approximately by looking at the addresses in the code. I chose the common | ||
257 | * case of reading a memory location into the %eax register: | ||
258 | * | ||
259 | * movl <some-address>, %eax | ||
260 | * | ||
261 | * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example, | ||
262 | * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax. | ||
263 | * | ||
264 | * In this example we can guess that the kernel was compiled with | ||
265 | * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the | ||
266 | * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our | ||
267 | * kernel isn't that bloated yet. | ||
268 | * | ||
269 | * Unfortunately, x86 has variable-length instructions, so finding this | ||
270 | * particular instruction properly involves writing a disassembler. Instead, | ||
271 | * we rely on statistics. We look for "0xA1" and tally the different bytes | ||
272 | * which occur 4 bytes later (the "0xC0" in our example above). When one of | ||
273 | * those bytes appears three times, we can be reasonably confident that it | ||
274 | * forms the start of CONFIG_PAGE_OFFSET. | ||
275 | * | ||
276 | * This is amazingly reliable. */ | ||
173 | static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) | 277 | static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) |
174 | { | 278 | { |
175 | unsigned int i, possibilities[256] = { 0 }; | 279 | unsigned int i, possibilities[256] = { 0 }; |
@@ -182,30 +286,52 @@ static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) | |||
182 | errx(1, "could not determine page offset"); | 286 | errx(1, "could not determine page offset"); |
183 | } | 287 | } |
184 | 288 | ||
289 | /*L:160 Unfortunately the entire ELF image isn't compressed: the segments | ||
290 | * which need loading are extracted and compressed raw. This denies us the | ||
291 | * information we need to make a fully-general loader. */ | ||
185 | static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) | 292 | static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) |
186 | { | 293 | { |
187 | gzFile f; | 294 | gzFile f; |
188 | int ret, len = 0; | 295 | int ret, len = 0; |
296 | /* A bzImage always gets loaded at physical address 1M. This is | ||
297 | * actually configurable as CONFIG_PHYSICAL_START, but as the comment | ||
298 | * there says, "Don't change this unless you know what you are doing". | ||
299 | * Indeed. */ | ||
189 | void *img = (void *)0x100000; | 300 | void *img = (void *)0x100000; |
190 | 301 | ||
302 | /* gzdopen takes our file descriptor (carefully placed at the start of | ||
303 | * the GZIP header we found) and returns a gzFile. */ | ||
191 | f = gzdopen(fd, "rb"); | 304 | f = gzdopen(fd, "rb"); |
305 | /* We read it into memory in 64k chunks until we hit the end. */ | ||
192 | while ((ret = gzread(f, img + len, 65536)) > 0) | 306 | while ((ret = gzread(f, img + len, 65536)) > 0) |
193 | len += ret; | 307 | len += ret; |
194 | if (ret < 0) | 308 | if (ret < 0) |
195 | err(1, "reading image from bzImage"); | 309 | err(1, "reading image from bzImage"); |
196 | 310 | ||
197 | verbose("Unpacked size %i addr %p\n", len, img); | 311 | verbose("Unpacked size %i addr %p\n", len, img); |
312 | |||
313 | /* Without the ELF header, we can't tell virtual-physical gap. This is | ||
314 | * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately, | ||
315 | * I have a clever way of figuring it out from the code itself. */ | ||
198 | *page_offset = intuit_page_offset(img, len); | 316 | *page_offset = intuit_page_offset(img, len); |
199 | 317 | ||
200 | return entry_point(img, img + len, *page_offset); | 318 | return entry_point(img, img + len, *page_offset); |
201 | } | 319 | } |
202 | 320 | ||
321 | /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're | ||
322 | * supposed to jump into it and it will unpack itself. We can't do that | ||
323 | * because the Guest can't run the unpacking code, and adding features to | ||
324 | * lguest kills puppies, so we don't want to. | ||
325 | * | ||
326 | * The bzImage is formed by putting the decompressing code in front of the | ||
327 | * compressed kernel code. So we can simply scan through it looking for the | ||
328 | * first "gzip" header, and start decompressing from there. */ | ||
203 | static unsigned long load_bzimage(int fd, unsigned long *page_offset) | 329 | static unsigned long load_bzimage(int fd, unsigned long *page_offset) |
204 | { | 330 | { |
205 | unsigned char c; | 331 | unsigned char c; |
206 | int state = 0; | 332 | int state = 0; |
207 | 333 | ||
208 | /* Ugly brute force search for gzip header. */ | 334 | /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */ |
209 | while (read(fd, &c, 1) == 1) { | 335 | while (read(fd, &c, 1) == 1) { |
210 | switch (state) { | 336 | switch (state) { |
211 | case 0: | 337 | case 0: |
@@ -222,8 +348,10 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset) | |||
222 | state++; | 348 | state++; |
223 | break; | 349 | break; |
224 | case 9: | 350 | case 9: |
351 | /* Seek back to the start of the gzip header. */ | ||
225 | lseek(fd, -10, SEEK_CUR); | 352 | lseek(fd, -10, SEEK_CUR); |
226 | if (c != 0x03) /* Compressed under UNIX. */ | 353 | /* One final check: "compressed under UNIX". */ |
354 | if (c != 0x03) | ||
227 | state = -1; | 355 | state = -1; |
228 | else | 356 | else |
229 | return unpack_bzimage(fd, page_offset); | 357 | return unpack_bzimage(fd, page_offset); |
@@ -232,25 +360,43 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset) | |||
232 | errx(1, "Could not find kernel in bzImage"); | 360 | errx(1, "Could not find kernel in bzImage"); |
233 | } | 361 | } |
234 | 362 | ||
363 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels | ||
364 | * come wrapped up in the self-decompressing "bzImage" format. With some funky | ||
365 | * coding, we can load those, too. */ | ||
235 | static unsigned long load_kernel(int fd, unsigned long *page_offset) | 366 | static unsigned long load_kernel(int fd, unsigned long *page_offset) |
236 | { | 367 | { |
237 | Elf32_Ehdr hdr; | 368 | Elf32_Ehdr hdr; |
238 | 369 | ||
370 | /* Read in the first few bytes. */ | ||
239 | if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) | 371 | if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) |
240 | err(1, "Reading kernel"); | 372 | err(1, "Reading kernel"); |
241 | 373 | ||
374 | /* If it's an ELF file, it starts with "\177ELF" */ | ||
242 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) | 375 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) |
243 | return map_elf(fd, &hdr, page_offset); | 376 | return map_elf(fd, &hdr, page_offset); |
244 | 377 | ||
378 | /* Otherwise we assume it's a bzImage, and try to unpack it */ | ||
245 | return load_bzimage(fd, page_offset); | 379 | return load_bzimage(fd, page_offset); |
246 | } | 380 | } |
247 | 381 | ||
382 | /* This is a trivial little helper to align pages. Andi Kleen hated it because | ||
383 | * it calls getpagesize() twice: "it's dumb code." | ||
384 | * | ||
385 | * Kernel guys get really het up about optimization, even when it's not | ||
386 | * necessary. I leave this code as a reaction against that. */ | ||
248 | static inline unsigned long page_align(unsigned long addr) | 387 | static inline unsigned long page_align(unsigned long addr) |
249 | { | 388 | { |
389 | /* Add upwards and truncate downwards. */ | ||
250 | return ((addr + getpagesize()-1) & ~(getpagesize()-1)); | 390 | return ((addr + getpagesize()-1) & ~(getpagesize()-1)); |
251 | } | 391 | } |
252 | 392 | ||
253 | /* initrd gets loaded at top of memory: return length. */ | 393 | /*L:180 An "initial ram disk" is a disk image loaded into memory along with |
394 | * the kernel which the kernel can use to boot from without needing any | ||
395 | * drivers. Most distributions now use this as standard: the initrd contains | ||
396 | * the code to load the appropriate driver modules for the current machine. | ||
397 | * | ||
398 | * Importantly, James Morris works for RedHat, and Fedora uses initrds for its | ||
399 | * kernels. He sent me this (and tells me when I break it). */ | ||
254 | static unsigned long load_initrd(const char *name, unsigned long mem) | 400 | static unsigned long load_initrd(const char *name, unsigned long mem) |
255 | { | 401 | { |
256 | int ifd; | 402 | int ifd; |
@@ -259,21 +405,35 @@ static unsigned long load_initrd(const char *name, unsigned long mem) | |||
259 | void *iaddr; | 405 | void *iaddr; |
260 | 406 | ||
261 | ifd = open_or_die(name, O_RDONLY); | 407 | ifd = open_or_die(name, O_RDONLY); |
408 | /* fstat() is needed to get the file size. */ | ||
262 | if (fstat(ifd, &st) < 0) | 409 | if (fstat(ifd, &st) < 0) |
263 | err(1, "fstat() on initrd '%s'", name); | 410 | err(1, "fstat() on initrd '%s'", name); |
264 | 411 | ||
412 | /* The length needs to be rounded up to a page size: mmap needs the | ||
413 | * address to be page aligned. */ | ||
265 | len = page_align(st.st_size); | 414 | len = page_align(st.st_size); |
415 | /* We map the initrd at the top of memory. */ | ||
266 | iaddr = mmap((void *)mem - len, st.st_size, | 416 | iaddr = mmap((void *)mem - len, st.st_size, |
267 | PROT_READ|PROT_EXEC|PROT_WRITE, | 417 | PROT_READ|PROT_EXEC|PROT_WRITE, |
268 | MAP_FIXED|MAP_PRIVATE, ifd, 0); | 418 | MAP_FIXED|MAP_PRIVATE, ifd, 0); |
269 | if (iaddr != (void *)mem - len) | 419 | if (iaddr != (void *)mem - len) |
270 | err(1, "Mmaping initrd '%s' returned %p not %p", | 420 | err(1, "Mmaping initrd '%s' returned %p not %p", |
271 | name, iaddr, (void *)mem - len); | 421 | name, iaddr, (void *)mem - len); |
422 | /* Once a file is mapped, you can close the file descriptor. It's a | ||
423 | * little odd, but quite useful. */ | ||
272 | close(ifd); | 424 | close(ifd); |
273 | verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); | 425 | verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); |
426 | |||
427 | /* We return the initrd size. */ | ||
274 | return len; | 428 | return len; |
275 | } | 429 | } |
276 | 430 | ||
431 | /* Once we know how much memory we have, and the address the Guest kernel | ||
432 | * expects, we can construct simple linear page tables which will get the Guest | ||
433 | * far enough into the boot to create its own. | ||
434 | * | ||
435 | * We lay them out of the way, just below the initrd (which is why we need to | ||
436 | * know its size). */ | ||
277 | static unsigned long setup_pagetables(unsigned long mem, | 437 | static unsigned long setup_pagetables(unsigned long mem, |
278 | unsigned long initrd_size, | 438 | unsigned long initrd_size, |
279 | unsigned long page_offset) | 439 | unsigned long page_offset) |
@@ -282,23 +442,32 @@ static unsigned long setup_pagetables(unsigned long mem, | |||
282 | unsigned int mapped_pages, i, linear_pages; | 442 | unsigned int mapped_pages, i, linear_pages; |
283 | unsigned int ptes_per_page = getpagesize()/sizeof(u32); | 443 | unsigned int ptes_per_page = getpagesize()/sizeof(u32); |
284 | 444 | ||
285 | /* If we can map all of memory above page_offset, we do so. */ | 445 | /* Ideally we map all physical memory starting at page_offset. |
446 | * However, if page_offset is 0xC0000000 we can only map 1G of physical | ||
447 | * (0xC0000000 + 1G overflows). */ | ||
286 | if (mem <= -page_offset) | 448 | if (mem <= -page_offset) |
287 | mapped_pages = mem/getpagesize(); | 449 | mapped_pages = mem/getpagesize(); |
288 | else | 450 | else |
289 | mapped_pages = -page_offset/getpagesize(); | 451 | mapped_pages = -page_offset/getpagesize(); |
290 | 452 | ||
291 | /* Each linear PTE page can map ptes_per_page pages. */ | 453 | /* Each PTE page can map ptes_per_page pages: how many do we need? */ |
292 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; | 454 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; |
293 | 455 | ||
294 | /* We lay out top-level then linear mapping immediately below initrd */ | 456 | /* We put the toplevel page directory page near the top of memory, just below the initrd. */ |
295 | pgdir = (void *)mem - initrd_size - getpagesize(); | 457 | pgdir = (void *)mem - initrd_size - getpagesize(); |
458 | |||
459 | /* Now we use the next linear_pages pages as pte pages */ | ||
296 | linear = (void *)pgdir - linear_pages*getpagesize(); | 460 | linear = (void *)pgdir - linear_pages*getpagesize(); |
297 | 461 | ||
462 | /* Linear mapping is easy: put every page's address into the mapping in | ||
463 | * order. PAGE_PRESENT contains the flags Present, Writable and | ||
464 | * Executable. */ | ||
298 | for (i = 0; i < mapped_pages; i++) | 465 | for (i = 0; i < mapped_pages; i++) |
299 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); | 466 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); |
300 | 467 | ||
301 | /* Now set up pgd so that this memory is at page_offset */ | 468 | /* The top level points to the linear page table pages above. The |
469 | * entry representing page_offset points to the first one, and they | ||
470 | * continue from there. */ | ||
302 | for (i = 0; i < mapped_pages; i += ptes_per_page) { | 471 | for (i = 0; i < mapped_pages; i += ptes_per_page) { |
303 | pgdir[(i + page_offset/getpagesize())/ptes_per_page] | 472 | pgdir[(i + page_offset/getpagesize())/ptes_per_page] |
304 | = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); | 473 | = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); |
@@ -307,9 +476,13 @@ static unsigned long setup_pagetables(unsigned long mem, | |||
307 | verbose("Linear mapping of %u pages in %u pte pages at %p\n", | 476 | verbose("Linear mapping of %u pages in %u pte pages at %p\n", |
308 | mapped_pages, linear_pages, linear); | 477 | mapped_pages, linear_pages, linear); |
309 | 478 | ||
479 | /* We return the top level (guest-physical) address: the kernel needs | ||
480 | * to know where it is. */ | ||
310 | return (unsigned long)pgdir; | 481 | return (unsigned long)pgdir; |
311 | } | 482 | } |
312 | 483 | ||
484 | /* Simple routine to roll all the commandline arguments together with spaces | ||
485 | * between them. */ | ||
313 | static void concat(char *dst, char *args[]) | 486 | static void concat(char *dst, char *args[]) |
314 | { | 487 | { |
315 | unsigned int i, len = 0; | 488 | unsigned int i, len = 0; |
@@ -323,6 +496,10 @@ static void concat(char *dst, char *args[]) | |||
323 | dst[len] = '\0'; | 496 | dst[len] = '\0'; |
324 | } | 497 | } |
325 | 498 | ||
499 | /* This is where we actually tell the kernel to initialize the Guest. We saw | ||
500 | * the arguments it expects when we looked at initialize() in lguest_user.c: | ||
501 | * the top physical page to allow, the top level pagetable, the entry point and | ||
502 | * the page_offset constant for the Guest. */ | ||
326 | static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) | 503 | static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) |
327 | { | 504 | { |
328 | u32 args[] = { LHREQ_INITIALIZE, | 505 | u32 args[] = { LHREQ_INITIALIZE, |
@@ -332,8 +509,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) | |||
332 | fd = open_or_die("/dev/lguest", O_RDWR); | 509 | fd = open_or_die("/dev/lguest", O_RDWR); |
333 | if (write(fd, args, sizeof(args)) < 0) | 510 | if (write(fd, args, sizeof(args)) < 0) |
334 | err(1, "Writing to /dev/lguest"); | 511 | err(1, "Writing to /dev/lguest"); |
512 | |||
513 | /* We return the /dev/lguest file descriptor to control this Guest */ | ||
335 | return fd; | 514 | return fd; |
336 | } | 515 | } |
516 | /*:*/ | ||
337 | 517 | ||
338 | static void set_fd(int fd, struct device_list *devices) | 518 | static void set_fd(int fd, struct device_list *devices) |
339 | { | 519 | { |
@@ -342,61 +522,108 @@ static void set_fd(int fd, struct device_list *devices) | |||
342 | devices->max_infd = fd; | 522 | devices->max_infd = fd; |
343 | } | 523 | } |
344 | 524 | ||
345 | /* When input arrives, we tell the kernel to kick lguest out with -EAGAIN. */ | 525 | /*L:200 |
526 | * The Waker. | ||
527 | * | ||
528 | * With a console and network devices, we can have lots of input which we need | ||
529 | * to process. We could try to tell the kernel what file descriptors to watch, | ||
530 | * but handing a file descriptor mask through to the kernel is fairly icky. | ||
531 | * | ||
532 | * Instead, we fork off a process which watches the file descriptors and writes | ||
533 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host | ||
534 | * loop to stop running the Guest. This causes it to return from the | ||
535 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset | ||
536 | * the LHREQ_BREAK and wake us up again. | ||
537 | * | ||
538 | * This, of course, is merely a different *kind* of icky. | ||
539 | */ | ||
346 | static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) | 540 | static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) |
347 | { | 541 | { |
542 | /* Add the pipe from the Launcher to the fdset in the device_list, so | ||
543 | * we watch it, too. */ | ||
348 | set_fd(pipefd, devices); | 544 | set_fd(pipefd, devices); |
349 | 545 | ||
350 | for (;;) { | 546 | for (;;) { |
351 | fd_set rfds = devices->infds; | 547 | fd_set rfds = devices->infds; |
352 | u32 args[] = { LHREQ_BREAK, 1 }; | 548 | u32 args[] = { LHREQ_BREAK, 1 }; |
353 | 549 | ||
550 | /* Wait until input is ready from one of the devices. */ | ||
354 | select(devices->max_infd+1, &rfds, NULL, NULL, NULL); | 551 | select(devices->max_infd+1, &rfds, NULL, NULL, NULL); |
552 | /* Is it a message from the Launcher? */ | ||
355 | if (FD_ISSET(pipefd, &rfds)) { | 553 | if (FD_ISSET(pipefd, &rfds)) { |
356 | int ignorefd; | 554 | int ignorefd; |
555 | /* If read() returns 0, it means the Launcher has | ||
556 | * exited. We silently follow. */ | ||
357 | if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) | 557 | if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) |
358 | exit(0); | 558 | exit(0); |
559 | /* Otherwise it's telling us there's a problem with one | ||
560 | * of the devices, and we should ignore that file | ||
561 | * descriptor from now on. */ | ||
359 | FD_CLR(ignorefd, &devices->infds); | 562 | FD_CLR(ignorefd, &devices->infds); |
360 | } else | 563 | } else /* Send LHREQ_BREAK command. */ |
361 | write(lguest_fd, args, sizeof(args)); | 564 | write(lguest_fd, args, sizeof(args)); |
362 | } | 565 | } |
363 | } | 566 | } |
364 | 567 | ||
568 | /* This routine just sets up a pipe to the Waker process. */ | ||
365 | static int setup_waker(int lguest_fd, struct device_list *device_list) | 569 | static int setup_waker(int lguest_fd, struct device_list *device_list) |
366 | { | 570 | { |
367 | int pipefd[2], child; | 571 | int pipefd[2], child; |
368 | 572 | ||
573 | /* We create a pipe to talk to the waker, and also so it knows when the | ||
574 | * Launcher dies (and closes pipe). */ | ||
369 | pipe(pipefd); | 575 | pipe(pipefd); |
370 | child = fork(); | 576 | child = fork(); |
371 | if (child == -1) | 577 | if (child == -1) |
372 | err(1, "forking"); | 578 | err(1, "forking"); |
373 | 579 | ||
374 | if (child == 0) { | 580 | if (child == 0) { |
581 | /* Close the "writing" end of our copy of the pipe */ | ||
375 | close(pipefd[1]); | 582 | close(pipefd[1]); |
376 | wake_parent(pipefd[0], lguest_fd, device_list); | 583 | wake_parent(pipefd[0], lguest_fd, device_list); |
377 | } | 584 | } |
585 | /* Close the reading end of our copy of the pipe. */ | ||
378 | close(pipefd[0]); | 586 | close(pipefd[0]); |
379 | 587 | ||
588 | /* Here is the fd used to talk to the waker. */ | ||
380 | return pipefd[1]; | 589 | return pipefd[1]; |
381 | } | 590 | } |
382 | 591 | ||
592 | /*L:210 | ||
593 | * Device Handling. | ||
594 | * | ||
595 | * When the Guest sends DMA to us, it sends us an array of addresses and sizes. | ||
596 | * We need to make sure it's not trying to reach into the Launcher itself, so | ||
597 | * we have a convenient routine which checks it and exits with an error message | ||
598 | * if something funny is going on: | ||
599 | */ | ||
383 | static void *_check_pointer(unsigned long addr, unsigned int size, | 600 | static void *_check_pointer(unsigned long addr, unsigned int size, |
384 | unsigned int line) | 601 | unsigned int line) |
385 | { | 602 | { |
603 | /* We have to separately check addr and addr+size, because size could | ||
604 | * be huge and addr + size might wrap around. */ | ||
386 | if (addr >= top || addr + size >= top) | 605 | if (addr >= top || addr + size >= top) |
387 | errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); | 606 | errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); |
607 | /* We return a pointer for the caller's convenience, now we know it's | ||
608 | * safe to use. */ | ||
388 | return (void *)addr; | 609 | return (void *)addr; |
389 | } | 610 | } |
611 | /* A macro which transparently hands the line number to the real function. */ | ||
390 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) | 612 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) |
391 | 613 | ||
392 | /* Returns pointer to dma->used_len */ | 614 | /* The Guest has given us the address of a "struct lguest_dma". We check it's |
615 | * OK and convert it to an iovec (which is a simple array of ptr/size | ||
616 | * pairs). */ | ||
393 | static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) | 617 | static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) |
394 | { | 618 | { |
395 | unsigned int i; | 619 | unsigned int i; |
396 | struct lguest_dma *udma; | 620 | struct lguest_dma *udma; |
397 | 621 | ||
622 | /* First we make sure that the array memory itself is valid. */ | ||
398 | udma = check_pointer(dma, sizeof(*udma)); | 623 | udma = check_pointer(dma, sizeof(*udma)); |
624 | /* Now we check each element */ | ||
399 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | 625 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { |
626 | /* A zero length ends the array. */ | ||
400 | if (!udma->len[i]) | 627 | if (!udma->len[i]) |
401 | break; | 628 | break; |
402 | 629 | ||
@@ -404,9 +631,15 @@ static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) | |||
404 | iov[i].iov_len = udma->len[i]; | 631 | iov[i].iov_len = udma->len[i]; |
405 | } | 632 | } |
406 | *num = i; | 633 | *num = i; |
634 | |||
635 | /* We return the pointer to where the caller should write the amount of | ||
636 | * the buffer used. */ | ||
407 | return &udma->used_len; | 637 | return &udma->used_len; |
408 | } | 638 | } |
409 | 639 | ||
640 | /* This routine gets a DMA buffer from the Guest for a given key, and converts | ||
641 | * it to an iovec array. It returns the interrupt the Guest wants when we're | ||
642 | * finished, and a pointer to the "used_len" field to fill in. */ | ||
410 | static u32 *get_dma_buffer(int fd, void *key, | 643 | static u32 *get_dma_buffer(int fd, void *key, |
411 | struct iovec iov[], unsigned int *num, u32 *irq) | 644 | struct iovec iov[], unsigned int *num, u32 *irq) |
412 | { | 645 | { |
@@ -414,16 +647,21 @@ static u32 *get_dma_buffer(int fd, void *key, | |||
414 | unsigned long udma; | 647 | unsigned long udma; |
415 | u32 *res; | 648 | u32 *res; |
416 | 649 | ||
650 | /* Ask the kernel for a DMA buffer corresponding to this key. */ | ||
417 | udma = write(fd, buf, sizeof(buf)); | 651 | udma = write(fd, buf, sizeof(buf)); |
652 | /* They haven't registered any, or they're all used? */ | ||
418 | if (udma == (unsigned long)-1) | 653 | if (udma == (unsigned long)-1) |
419 | return NULL; | 654 | return NULL; |
420 | 655 | ||
421 | /* Kernel stashes irq in ->used_len. */ | 656 | /* Convert it into our iovec array */ |
422 | res = dma2iov(udma, iov, num); | 657 | res = dma2iov(udma, iov, num); |
658 | /* The kernel stashes irq in ->used_len to get it out to us. */ | ||
423 | *irq = *res; | 659 | *irq = *res; |
660 | /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */ | ||
424 | return res; | 661 | return res; |
425 | } | 662 | } |
426 | 663 | ||
664 | /* This is a convenient routine to send the Guest an interrupt. */ | ||
427 | static void trigger_irq(int fd, u32 irq) | 665 | static void trigger_irq(int fd, u32 irq) |
428 | { | 666 | { |
429 | u32 buf[] = { LHREQ_IRQ, irq }; | 667 | u32 buf[] = { LHREQ_IRQ, irq }; |
@@ -431,6 +669,10 @@ static void trigger_irq(int fd, u32 irq) | |||
431 | err(1, "Triggering irq %i", irq); | 669 | err(1, "Triggering irq %i", irq); |
432 | } | 670 | } |
433 | 671 | ||
672 | /* This simply sets up an iovec array where we can put data to be discarded. | ||
673 | * This happens when the Guest doesn't want or can't handle the input: we have | ||
674 | * to get rid of it somewhere, and if we bury it in the ceiling space it will | ||
675 | * start to smell after a week. */ | ||
434 | static void discard_iovec(struct iovec *iov, unsigned int *num) | 676 | static void discard_iovec(struct iovec *iov, unsigned int *num) |
435 | { | 677 | { |
436 | static char discard_buf[1024]; | 678 | static char discard_buf[1024]; |
@@ -439,19 +681,24 @@ static void discard_iovec(struct iovec *iov, unsigned int *num) | |||
439 | iov->iov_len = sizeof(discard_buf); | 681 | iov->iov_len = sizeof(discard_buf); |
440 | } | 682 | } |
441 | 683 | ||
684 | /* Here is the input terminal settings we save, and the routine to restore them | ||
685 | * on exit so the user can see what they type next. */ | ||
442 | static struct termios orig_term; | 686 | static struct termios orig_term; |
443 | static void restore_term(void) | 687 | static void restore_term(void) |
444 | { | 688 | { |
445 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); | 689 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); |
446 | } | 690 | } |
447 | 691 | ||
692 | /* We associate some data with the console for our exit hack. */ | ||
448 | struct console_abort | 693 | struct console_abort |
449 | { | 694 | { |
695 | /* How many times have they hit ^C? */ | ||
450 | int count; | 696 | int count; |
697 | /* When did they start? */ | ||
451 | struct timeval start; | 698 | struct timeval start; |
452 | }; | 699 | }; |
453 | 700 | ||
454 | /* We DMA input to buffer bound at start of console page. */ | 701 | /* This is the routine which handles console input (ie. stdin). */ |
455 | static bool handle_console_input(int fd, struct device *dev) | 702 | static bool handle_console_input(int fd, struct device *dev) |
456 | { | 703 | { |
457 | u32 irq = 0, *lenp; | 704 | u32 irq = 0, *lenp; |
@@ -460,24 +707,38 @@ static bool handle_console_input(int fd, struct device *dev) | |||
460 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | 707 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; |
461 | struct console_abort *abort = dev->priv; | 708 | struct console_abort *abort = dev->priv; |
462 | 709 | ||
710 | /* First we get the console buffer from the Guest. The key is dev->mem | ||
711 | * which was set to 0 in setup_console(). */ | ||
463 | lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); | 712 | lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); |
464 | if (!lenp) { | 713 | if (!lenp) { |
714 | /* If it's not ready for input, warn and set up to discard. */ | ||
465 | warn("console: no dma buffer!"); | 715 | warn("console: no dma buffer!"); |
466 | discard_iovec(iov, &num); | 716 | discard_iovec(iov, &num); |
467 | } | 717 | } |
468 | 718 | ||
719 | /* This is why we convert to iovecs: the readv() call uses them, and so | ||
720 | * it reads straight into the Guest's buffer. */ | ||
469 | len = readv(dev->fd, iov, num); | 721 | len = readv(dev->fd, iov, num); |
470 | if (len <= 0) { | 722 | if (len <= 0) { |
723 | /* This implies that the console is closed, is /dev/null, or | ||
724 | * something went terribly wrong. We still go through the rest | ||
725 | * of the logic, though, especially the exit handling below. */ | ||
471 | warnx("Failed to get console input, ignoring console."); | 726 | warnx("Failed to get console input, ignoring console."); |
472 | len = 0; | 727 | len = 0; |
473 | } | 728 | } |
474 | 729 | ||
730 | /* If we read the data into the Guest, fill in the length and send the | ||
731 | * interrupt. */ | ||
475 | if (lenp) { | 732 | if (lenp) { |
476 | *lenp = len; | 733 | *lenp = len; |
477 | trigger_irq(fd, irq); | 734 | trigger_irq(fd, irq); |
478 | } | 735 | } |
479 | 736 | ||
480 | /* Three ^C within one second? Exit. */ | 737 | /* Three ^C within one second? Exit. |
738 | * | ||
739 | * This is such a hack, but works surprisingly well. Each ^C has to be | ||
740 | * in a buffer by itself, so they can't be too fast. But we check that | ||
741 | * we get three within about a second, so they can't be too slow. */ | ||
481 | if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { | 742 | if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { |
482 | if (!abort->count++) | 743 | if (!abort->count++) |
483 | gettimeofday(&abort->start, NULL); | 744 | gettimeofday(&abort->start, NULL); |
@@ -485,43 +746,60 @@ static bool handle_console_input(int fd, struct device *dev) | |||
485 | struct timeval now; | 746 | struct timeval now; |
486 | gettimeofday(&now, NULL); | 747 | gettimeofday(&now, NULL); |
487 | if (now.tv_sec <= abort->start.tv_sec+1) { | 748 | if (now.tv_sec <= abort->start.tv_sec+1) { |
488 | /* Make sure waker is not blocked in BREAK */ | ||
489 | u32 args[] = { LHREQ_BREAK, 0 }; | 749 | u32 args[] = { LHREQ_BREAK, 0 }; |
750 | /* Close the fd so Waker will know it has to | ||
751 | * exit. */ | ||
490 | close(waker_fd); | 752 | close(waker_fd); |
753 | /* Just in case waker is blocked in BREAK, send | ||
754 | * unbreak now. */ | ||
491 | write(fd, args, sizeof(args)); | 755 | write(fd, args, sizeof(args)); |
492 | exit(2); | 756 | exit(2); |
493 | } | 757 | } |
494 | abort->count = 0; | 758 | abort->count = 0; |
495 | } | 759 | } |
496 | } else | 760 | } else |
761 | /* Any other key resets the abort counter. */ | ||
497 | abort->count = 0; | 762 | abort->count = 0; |
498 | 763 | ||
764 | /* Now, if we didn't read anything, put the input terminal back and | ||
765 | * return failure (meaning, don't call us again). */ | ||
499 | if (!len) { | 766 | if (!len) { |
500 | restore_term(); | 767 | restore_term(); |
501 | return false; | 768 | return false; |
502 | } | 769 | } |
770 | /* Everything went OK! */ | ||
503 | return true; | 771 | return true; |
504 | } | 772 | } |
505 | 773 | ||
774 | /* Handling console output is much simpler than input. */ | ||
506 | static u32 handle_console_output(int fd, const struct iovec *iov, | 775 | static u32 handle_console_output(int fd, const struct iovec *iov, |
507 | unsigned num, struct device*dev) | 776 | unsigned num, struct device*dev) |
508 | { | 777 | { |
778 | /* Whatever the Guest sends, write it to standard output. Return the | ||
779 | * number of bytes written. */ | ||
509 | return writev(STDOUT_FILENO, iov, num); | 780 | return writev(STDOUT_FILENO, iov, num); |
510 | } | 781 | } |
511 | 782 | ||
783 | /* Guest->Host network output is also pretty easy. */ | ||
512 | static u32 handle_tun_output(int fd, const struct iovec *iov, | 784 | static u32 handle_tun_output(int fd, const struct iovec *iov, |
513 | unsigned num, struct device *dev) | 785 | unsigned num, struct device *dev) |
514 | { | 786 | { |
515 | /* Now we've seen output, we should warn if we can't get buffers. */ | 787 | /* We put a flag in the "priv" pointer of the network device, and set |
788 | * it as soon as we see output. We'll see why in handle_tun_input() */ | ||
516 | *(bool *)dev->priv = true; | 789 | *(bool *)dev->priv = true; |
790 | /* Whatever packet the Guest sent us, write it out to the tun | ||
791 | * device. */ | ||
517 | return writev(dev->fd, iov, num); | 792 | return writev(dev->fd, iov, num); |
518 | } | 793 | } |
519 | 794 | ||
795 | /* This matches the peer_key() in lguest_net.c. The key for any given slot | ||
796 | * is the address of the network device's page plus 4 * the slot number. */ | ||
520 | static unsigned long peer_offset(unsigned int peernum) | 797 | static unsigned long peer_offset(unsigned int peernum) |
521 | { | 798 | { |
522 | return 4 * peernum; | 799 | return 4 * peernum; |
523 | } | 800 | } |
524 | 801 | ||
802 | /* This is where we handle a packet coming in from the tun device */ | ||
525 | static bool handle_tun_input(int fd, struct device *dev) | 803 | static bool handle_tun_input(int fd, struct device *dev) |
526 | { | 804 | { |
527 | u32 irq = 0, *lenp; | 805 | u32 irq = 0, *lenp; |
@@ -529,17 +807,28 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
529 | unsigned num; | 807 | unsigned num; |
530 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | 808 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; |
531 | 809 | ||
810 | /* First we get a buffer the Guest has bound to its key. */ | ||
532 | lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, | 811 | lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, |
533 | &irq); | 812 | &irq); |
534 | if (!lenp) { | 813 | if (!lenp) { |
814 | /* Now, it's expected that if we try to send a packet too | ||
815 | * early, the Guest won't be ready yet. This is why we set a | ||
816 | * flag when the Guest sends its first packet. If it's sent a | ||
817 | * packet we assume it should be ready to receive them. | ||
818 | * | ||
819 | * Actually, this is what the status bits in the descriptor are | ||
820 | * for: we should *use* them. FIXME! */ | ||
535 | if (*(bool *)dev->priv) | 821 | if (*(bool *)dev->priv) |
536 | warn("network: no dma buffer!"); | 822 | warn("network: no dma buffer!"); |
537 | discard_iovec(iov, &num); | 823 | discard_iovec(iov, &num); |
538 | } | 824 | } |
539 | 825 | ||
826 | /* Read the packet from the device directly into the Guest's buffer. */ | ||
540 | len = readv(dev->fd, iov, num); | 827 | len = readv(dev->fd, iov, num); |
541 | if (len <= 0) | 828 | if (len <= 0) |
542 | err(1, "reading network"); | 829 | err(1, "reading network"); |
830 | |||
831 | /* Write the used_len, and trigger the interrupt for the Guest */ | ||
543 | if (lenp) { | 832 | if (lenp) { |
544 | *lenp = len; | 833 | *lenp = len; |
545 | trigger_irq(fd, irq); | 834 | trigger_irq(fd, irq); |
@@ -547,9 +836,13 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
547 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | 836 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, |
548 | ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], | 837 | ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], |
549 | lenp ? "sent" : "discarded"); | 838 | lenp ? "sent" : "discarded"); |
839 | /* All good. */ | ||
550 | return true; | 840 | return true; |
551 | } | 841 | } |
552 | 842 | ||
843 | /* The last device handling routine is block output: the Guest has sent a DMA | ||
844 | * to the block device. It will have placed the command it wants in the | ||
845 | * "struct lguest_block_page". */ | ||
553 | static u32 handle_block_output(int fd, const struct iovec *iov, | 846 | static u32 handle_block_output(int fd, const struct iovec *iov, |
554 | unsigned num, struct device *dev) | 847 | unsigned num, struct device *dev) |
555 | { | 848 | { |
@@ -559,36 +852,64 @@ static u32 handle_block_output(int fd, const struct iovec *iov, | |||
559 | struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; | 852 | struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; |
560 | off64_t device_len, off = (off64_t)p->sector * 512; | 853 | off64_t device_len, off = (off64_t)p->sector * 512; |
561 | 854 | ||
855 | /* First we extract the device length from the dev->priv pointer. */ | ||
562 | device_len = *(off64_t *)dev->priv; | 856 | device_len = *(off64_t *)dev->priv; |
563 | 857 | ||
858 | /* We first check that the read or write is within the length of the | ||
859 | * block file. */ | ||
564 | if (off >= device_len) | 860 | if (off >= device_len) |
565 | err(1, "Bad offset %llu vs %llu", off, device_len); | 861 | err(1, "Bad offset %llu vs %llu", off, device_len); |
862 | /* Move to the right location in the block file. This shouldn't fail, | ||
863 | * but best to check. */ | ||
566 | if (lseek64(dev->fd, off, SEEK_SET) != off) | 864 | if (lseek64(dev->fd, off, SEEK_SET) != off) |
567 | err(1, "Bad seek to sector %i", p->sector); | 865 | err(1, "Bad seek to sector %i", p->sector); |
568 | 866 | ||
569 | verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); | 867 | verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); |
570 | 868 | ||
869 | /* They were supposed to bind a reply buffer at key equal to the start | ||
870 | * of the block device memory. We need this to tell them when the | ||
871 | * request is finished. */ | ||
571 | lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); | 872 | lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); |
572 | if (!lenp) | 873 | if (!lenp) |
573 | err(1, "Block request didn't give us a dma buffer"); | 874 | err(1, "Block request didn't give us a dma buffer"); |
574 | 875 | ||
575 | if (p->type) { | 876 | if (p->type) { |
877 | /* A write request. The DMA they sent contained the data, so | ||
878 | * write it out. */ | ||
576 | len = writev(dev->fd, iov, num); | 879 | len = writev(dev->fd, iov, num); |
880 | /* Grr... Now we know how long the "struct lguest_dma" they | ||
881 | * sent was, we make sure they didn't try to write over the end | ||
882 | * of the block file (possibly extending it). */ | ||
577 | if (off + len > device_len) { | 883 | if (off + len > device_len) { |
884 | /* Trim it back to the correct length */ | ||
578 | ftruncate(dev->fd, device_len); | 885 | ftruncate(dev->fd, device_len); |
886 | /* Die, bad Guest, die. */ | ||
579 | errx(1, "Write past end %llu+%u", off, len); | 887 | errx(1, "Write past end %llu+%u", off, len); |
580 | } | 888 | } |
889 | /* The reply length is 0: we just send back an empty DMA to | ||
890 | * interrupt them and tell them the write is finished. */ | ||
581 | *lenp = 0; | 891 | *lenp = 0; |
582 | } else { | 892 | } else { |
893 | /* A read request. They sent an empty DMA to start the | ||
894 | * request, and we put the read contents into the reply | ||
895 | * buffer. */ | ||
583 | len = readv(dev->fd, reply, reply_num); | 896 | len = readv(dev->fd, reply, reply_num); |
584 | *lenp = len; | 897 | *lenp = len; |
585 | } | 898 | } |
586 | 899 | ||
900 | /* The result is 1 (done), 2 if there was an error (short read or | ||
901 | * write). */ | ||
587 | p->result = 1 + (p->bytes != len); | 902 | p->result = 1 + (p->bytes != len); |
903 | /* Now tell them we've used their reply buffer. */ | ||
588 | trigger_irq(fd, irq); | 904 | trigger_irq(fd, irq); |
905 | |||
906 | /* We're supposed to return the number of bytes of the output buffer we | ||
907 | * used. But the block device uses the "result" field instead, so we | ||
908 | * don't bother. */ | ||
589 | return 0; | 909 | return 0; |
590 | } | 910 | } |
591 | 911 | ||
912 | /* This is the generic routine we call when the Guest sends some DMA out. */ | ||
592 | static void handle_output(int fd, unsigned long dma, unsigned long key, | 913 | static void handle_output(int fd, unsigned long dma, unsigned long key, |
593 | struct device_list *devices) | 914 | struct device_list *devices) |
594 | { | 915 | { |
@@ -597,30 +918,53 @@ static void handle_output(int fd, unsigned long dma, unsigned long key, | |||
597 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | 918 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; |
598 | unsigned num = 0; | 919 | unsigned num = 0; |
599 | 920 | ||
921 | /* Convert the "struct lguest_dma" they're sending to a "struct | ||
922 | * iovec". */ | ||
600 | lenp = dma2iov(dma, iov, &num); | 923 | lenp = dma2iov(dma, iov, &num); |
924 | |||
925 | /* Check each device: if they expect output to this key, tell them to | ||
926 | * handle it. */ | ||
601 | for (i = devices->dev; i; i = i->next) { | 927 | for (i = devices->dev; i; i = i->next) { |
602 | if (i->handle_output && key == i->watch_key) { | 928 | if (i->handle_output && key == i->watch_key) { |
929 | /* We write the result straight into the used_len field | ||
930 | * for them. */ | ||
603 | *lenp = i->handle_output(fd, iov, num, i); | 931 | *lenp = i->handle_output(fd, iov, num, i); |
604 | return; | 932 | return; |
605 | } | 933 | } |
606 | } | 934 | } |
935 | |||
936 | /* This can happen: the kernel sends any SEND_DMA which doesn't match | ||
937 | * another Guest to us. It could be that another Guest just left a | ||
938 | * network, for example. But it's unusual. */ | ||
607 | warnx("Pending dma %p, key %p", (void *)dma, (void *)key); | 939 | warnx("Pending dma %p, key %p", (void *)dma, (void *)key); |
608 | } | 940 | } |
609 | 941 | ||
942 | /* This is called when the waker wakes us up: check for incoming file | ||
943 | * descriptors. */ | ||
610 | static void handle_input(int fd, struct device_list *devices) | 944 | static void handle_input(int fd, struct device_list *devices) |
611 | { | 945 | { |
946 | /* select() wants a zeroed timeval to mean "don't wait". */ | ||
612 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; | 947 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; |
613 | 948 | ||
614 | for (;;) { | 949 | for (;;) { |
615 | struct device *i; | 950 | struct device *i; |
616 | fd_set fds = devices->infds; | 951 | fd_set fds = devices->infds; |
617 | 952 | ||
953 | /* If nothing is ready, we're done. */ | ||
618 | if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) | 954 | if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) |
619 | break; | 955 | break; |
620 | 956 | ||
957 | /* Otherwise, call the device(s) which have readable | ||
958 | * file descriptors and a method of handling them. */ | ||
621 | for (i = devices->dev; i; i = i->next) { | 959 | for (i = devices->dev; i; i = i->next) { |
622 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | 960 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { |
961 | /* If handle_input() returns false, it means we | ||
962 | * should no longer service it. | ||
963 | * handle_console_input() does this. */ | ||
623 | if (!i->handle_input(fd, i)) { | 964 | if (!i->handle_input(fd, i)) { |
965 | /* Clear it from the set of input file | ||
966 | * descriptors kept at the head of the | ||
967 | * device list. */ | ||
624 | FD_CLR(i->fd, &devices->infds); | 968 | FD_CLR(i->fd, &devices->infds); |
625 | /* Tell waker to ignore it too... */ | 969 | /* Tell waker to ignore it too... */ |
626 | write(waker_fd, &i->fd, sizeof(i->fd)); | 970 | write(waker_fd, &i->fd, sizeof(i->fd)); |
@@ -630,6 +974,15 @@ static void handle_input(int fd, struct device_list *devices) | |||
630 | } | 974 | } |
631 | } | 975 | } |
632 | 976 | ||
977 | /*L:190 | ||
978 | * Device Setup | ||
979 | * | ||
980 | * All devices need a descriptor so the Guest knows it exists, and a "struct | ||
981 | * device" so the Launcher can keep track of it. We have common helper | ||
982 | * routines to allocate them. | ||
983 | * | ||
984 | * This routine allocates a new "struct lguest_device_desc" from descriptor | ||
985 | * table in the devices array just above the Guest's normal memory. */ | ||
633 | static struct lguest_device_desc * | 986 | static struct lguest_device_desc * |
634 | new_dev_desc(struct lguest_device_desc *descs, | 987 | new_dev_desc(struct lguest_device_desc *descs, |
635 | u16 type, u16 features, u16 num_pages) | 988 | u16 type, u16 features, u16 num_pages) |
@@ -641,6 +994,8 @@ new_dev_desc(struct lguest_device_desc *descs, | |||
641 | descs[i].type = type; | 994 | descs[i].type = type; |
642 | descs[i].features = features; | 995 | descs[i].features = features; |
643 | descs[i].num_pages = num_pages; | 996 | descs[i].num_pages = num_pages; |
997 | /* If they said the device needs memory, we allocate | ||
998 | * that now, bumping up the top of Guest memory. */ | ||
644 | if (num_pages) { | 999 | if (num_pages) { |
645 | map_zeroed_pages(top, num_pages); | 1000 | map_zeroed_pages(top, num_pages); |
646 | descs[i].pfn = top/getpagesize(); | 1001 | descs[i].pfn = top/getpagesize(); |
@@ -652,6 +1007,9 @@ new_dev_desc(struct lguest_device_desc *descs, | |||
652 | errx(1, "too many devices"); | 1007 | errx(1, "too many devices"); |
653 | } | 1008 | } |
654 | 1009 | ||
1010 | /* This monster routine does all the creation and setup of a new device, | ||
1011 | * including calling new_dev_desc() to allocate the descriptor and device | ||
1012 | * memory. */ | ||
655 | static struct device *new_device(struct device_list *devices, | 1013 | static struct device *new_device(struct device_list *devices, |
656 | u16 type, u16 num_pages, u16 features, | 1014 | u16 type, u16 num_pages, u16 features, |
657 | int fd, | 1015 | int fd, |
@@ -664,12 +1022,18 @@ static struct device *new_device(struct device_list *devices, | |||
664 | { | 1022 | { |
665 | struct device *dev = malloc(sizeof(*dev)); | 1023 | struct device *dev = malloc(sizeof(*dev)); |
666 | 1024 | ||
667 | /* Append to device list. */ | 1025 | /* Append to device list. Prepending to a single-linked list is |
1026 | * easier, but the user expects the devices to be arranged on the bus | ||
1027 | * in command-line order. The first network device on the command line | ||
1028 | * is eth0, the first block device /dev/lgba, etc. */ | ||
668 | *devices->lastdev = dev; | 1029 | *devices->lastdev = dev; |
669 | dev->next = NULL; | 1030 | dev->next = NULL; |
670 | devices->lastdev = &dev->next; | 1031 | devices->lastdev = &dev->next; |
671 | 1032 | ||
1033 | /* Now we populate the fields one at a time. */ | ||
672 | dev->fd = fd; | 1034 | dev->fd = fd; |
1035 | /* If we have an input handler for this file descriptor, then we add it | ||
1036 | * to the device_list's fdset and maxfd. */ | ||
673 | if (handle_input) | 1037 | if (handle_input) |
674 | set_fd(dev->fd, devices); | 1038 | set_fd(dev->fd, devices); |
675 | dev->desc = new_dev_desc(devices->descs, type, features, num_pages); | 1039 | dev->desc = new_dev_desc(devices->descs, type, features, num_pages); |
@@ -680,27 +1044,37 @@ static struct device *new_device(struct device_list *devices, | |||
680 | return dev; | 1044 | return dev; |
681 | } | 1045 | } |
682 | 1046 | ||
1047 | /* Our first setup routine is the console. It's a fairly simple device, but | ||
1048 | * UNIX tty handling makes it uglier than it could be. */ | ||
683 | static void setup_console(struct device_list *devices) | 1049 | static void setup_console(struct device_list *devices) |
684 | { | 1050 | { |
685 | struct device *dev; | 1051 | struct device *dev; |
686 | 1052 | ||
1053 | /* If we can save the initial standard input settings... */ | ||
687 | if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { | 1054 | if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { |
688 | struct termios term = orig_term; | 1055 | struct termios term = orig_term; |
1056 | /* Then we turn off echo, line buffering and ^C etc. We want a | ||
1057 | * raw input stream to the Guest. */ | ||
689 | term.c_lflag &= ~(ISIG|ICANON|ECHO); | 1058 | term.c_lflag &= ~(ISIG|ICANON|ECHO); |
690 | tcsetattr(STDIN_FILENO, TCSANOW, &term); | 1059 | tcsetattr(STDIN_FILENO, TCSANOW, &term); |
1060 | /* If we exit gracefully, the original settings will be | ||
1061 | * restored so the user can see what they're typing. */ | ||
691 | atexit(restore_term); | 1062 | atexit(restore_term); |
692 | } | 1063 | } |
693 | 1064 | ||
694 | /* We don't currently require a page for the console. */ | 1065 | /* We don't currently require any memory for the console, so we ask for |
1066 | * 0 pages. */ | ||
695 | dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, | 1067 | dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, |
696 | STDIN_FILENO, handle_console_input, | 1068 | STDIN_FILENO, handle_console_input, |
697 | LGUEST_CONSOLE_DMA_KEY, handle_console_output); | 1069 | LGUEST_CONSOLE_DMA_KEY, handle_console_output); |
1070 | /* We store the console state in dev->priv, and initialize it. */ | ||
698 | dev->priv = malloc(sizeof(struct console_abort)); | 1071 | dev->priv = malloc(sizeof(struct console_abort)); |
699 | ((struct console_abort *)dev->priv)->count = 0; | 1072 | ((struct console_abort *)dev->priv)->count = 0; |
700 | verbose("device %p: console\n", | 1073 | verbose("device %p: console\n", |
701 | (void *)(dev->desc->pfn * getpagesize())); | 1074 | (void *)(dev->desc->pfn * getpagesize())); |
702 | } | 1075 | } |
703 | 1076 | ||
1077 | /* Setting up a block file is also fairly straightforward. */ | ||
704 | static void setup_block_file(const char *filename, struct device_list *devices) | 1078 | static void setup_block_file(const char *filename, struct device_list *devices) |
705 | { | 1079 | { |
706 | int fd; | 1080 | int fd; |
@@ -708,20 +1082,47 @@ static void setup_block_file(const char *filename, struct device_list *devices) | |||
708 | off64_t *device_len; | 1082 | off64_t *device_len; |
709 | struct lguest_block_page *p; | 1083 | struct lguest_block_page *p; |
710 | 1084 | ||
1085 | /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We | ||
1086 | * open with O_DIRECT because otherwise our benchmarks go much too | ||
1087 | * fast. */ | ||
711 | fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); | 1088 | fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); |
1089 | |||
1090 | /* We want one page, and have no input handler (the block file never | ||
1091 | * has anything interesting to say to us). Our timing will be quite | ||
1092 | * random, so it should be a reasonable randomness source. */ | ||
712 | dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, | 1093 | dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, |
713 | LGUEST_DEVICE_F_RANDOMNESS, | 1094 | LGUEST_DEVICE_F_RANDOMNESS, |
714 | fd, NULL, 0, handle_block_output); | 1095 | fd, NULL, 0, handle_block_output); |
1096 | |||
1097 | /* We store the device size in the private area */ | ||
715 | device_len = dev->priv = malloc(sizeof(*device_len)); | 1098 | device_len = dev->priv = malloc(sizeof(*device_len)); |
1099 | /* This is the safe way of establishing the size of our device: it | ||
1100 | * might be a normal file or an actual block device like /dev/hdb. */ | ||
716 | *device_len = lseek64(fd, 0, SEEK_END); | 1101 | *device_len = lseek64(fd, 0, SEEK_END); |
717 | p = dev->mem; | ||
718 | 1102 | ||
1103 | /* The device memory is a "struct lguest_block_page". It's zeroed | ||
1104 | * already, we just need to put in the device size. Block devices | ||
1105 | * think in sectors (ie. 512 byte chunks), so we translate here. */ | ||
1106 | p = dev->mem; | ||
719 | p->num_sectors = *device_len/512; | 1107 | p->num_sectors = *device_len/512; |
720 | verbose("device %p: block %i sectors\n", | 1108 | verbose("device %p: block %i sectors\n", |
721 | (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); | 1109 | (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); |
722 | } | 1110 | } |
723 | 1111 | ||
724 | /* We use fnctl locks to reserve network slots (autocleanup!) */ | 1112 | /* |
1113 | * Network Devices. | ||
1114 | * | ||
1115 | * Setting up network devices is quite a pain, because we have three types. | ||
1116 | * First, we have the inter-Guest network. This is a file which is mapped into | ||
1117 | * the address space of the Guests who are on the network. Because it is a | ||
1118 | * shared mapping, the same page underlies all the devices, and they can send | ||
1119 | * DMA to each other. | ||
1120 | * | ||
1121 | * Remember from our network driver, the Guest is told what slot in the page it | ||
1122 | * is to use. We use exclusive fcntl locks to reserve a slot. If another | ||
1123 | * Guest is using a slot, the lock will fail and we try another. Because fcntl | ||
1124 | * locks are cleaned up automatically when we die, this cleverly means that our | ||
1125 | * reservation on the slot will vanish if we crash. */ | ||
725 | static unsigned int find_slot(int netfd, const char *filename) | 1126 | static unsigned int find_slot(int netfd, const char *filename) |
726 | { | 1127 | { |
727 | struct flock fl; | 1128 | struct flock fl; |
@@ -729,26 +1130,33 @@ static unsigned int find_slot(int netfd, const char *filename) | |||
729 | fl.l_type = F_WRLCK; | 1130 | fl.l_type = F_WRLCK; |
730 | fl.l_whence = SEEK_SET; | 1131 | fl.l_whence = SEEK_SET; |
731 | fl.l_len = 1; | 1132 | fl.l_len = 1; |
1133 | /* Try a 1 byte lock in each possible position number */ | ||
732 | for (fl.l_start = 0; | 1134 | for (fl.l_start = 0; |
733 | fl.l_start < getpagesize()/sizeof(struct lguest_net); | 1135 | fl.l_start < getpagesize()/sizeof(struct lguest_net); |
734 | fl.l_start++) { | 1136 | fl.l_start++) { |
1137 | /* If we succeed, return the slot number. */ | ||
735 | if (fcntl(netfd, F_SETLK, &fl) == 0) | 1138 | if (fcntl(netfd, F_SETLK, &fl) == 0) |
736 | return fl.l_start; | 1139 | return fl.l_start; |
737 | } | 1140 | } |
738 | errx(1, "No free slots in network file %s", filename); | 1141 | errx(1, "No free slots in network file %s", filename); |
739 | } | 1142 | } |
740 | 1143 | ||
1144 | /* This function sets up the network file */ | ||
741 | static void setup_net_file(const char *filename, | 1145 | static void setup_net_file(const char *filename, |
742 | struct device_list *devices) | 1146 | struct device_list *devices) |
743 | { | 1147 | { |
744 | int netfd; | 1148 | int netfd; |
745 | struct device *dev; | 1149 | struct device *dev; |
746 | 1150 | ||
1151 | /* We don't use open_or_die() here: for friendliness we create the file | ||
1152 | * if it doesn't already exist. */ | ||
747 | netfd = open(filename, O_RDWR, 0); | 1153 | netfd = open(filename, O_RDWR, 0); |
748 | if (netfd < 0) { | 1154 | if (netfd < 0) { |
749 | if (errno == ENOENT) { | 1155 | if (errno == ENOENT) { |
750 | netfd = open(filename, O_RDWR|O_CREAT, 0600); | 1156 | netfd = open(filename, O_RDWR|O_CREAT, 0600); |
751 | if (netfd >= 0) { | 1157 | if (netfd >= 0) { |
1158 | /* If we succeeded, initialize the file with a | ||
1159 | * blank page. */ | ||
752 | char page[getpagesize()]; | 1160 | char page[getpagesize()]; |
753 | memset(page, 0, sizeof(page)); | 1161 | memset(page, 0, sizeof(page)); |
754 | write(netfd, page, sizeof(page)); | 1162 | write(netfd, page, sizeof(page)); |
@@ -758,11 +1166,15 @@ static void setup_net_file(const char *filename, | |||
758 | err(1, "cannot open net file '%s'", filename); | 1166 | err(1, "cannot open net file '%s'", filename); |
759 | } | 1167 | } |
760 | 1168 | ||
1169 | /* We need 1 page, and the features indicate the slot to use and that | ||
1170 | * no checksum is needed. We never touch this device again; it's | ||
1171 | * between the Guests on the network, so we don't register input or | ||
1172 | * output handlers. */ | ||
761 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, | 1173 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, |
762 | find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, | 1174 | find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, |
763 | -1, NULL, 0, NULL); | 1175 | -1, NULL, 0, NULL); |
764 | 1176 | ||
765 | /* We overwrite the /dev/zero mapping with the actual file. */ | 1177 | /* Map the shared file. */ |
766 | if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, | 1178 | if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, |
767 | MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) | 1179 | MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) |
768 | err(1, "could not mmap '%s'", filename); | 1180 | err(1, "could not mmap '%s'", filename); |
@@ -770,6 +1182,7 @@ static void setup_net_file(const char *filename, | |||
770 | (void *)(dev->desc->pfn * getpagesize()), filename, | 1182 | (void *)(dev->desc->pfn * getpagesize()), filename, |
771 | dev->desc->features & ~LGUEST_NET_F_NOCSUM); | 1183 | dev->desc->features & ~LGUEST_NET_F_NOCSUM); |
772 | } | 1184 | } |
1185 | /*:*/ | ||
773 | 1186 | ||
774 | static u32 str2ip(const char *ipaddr) | 1187 | static u32 str2ip(const char *ipaddr) |
775 | { | 1188 | { |
@@ -779,7 +1192,11 @@ static u32 str2ip(const char *ipaddr) | |||
779 | return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; | 1192 | return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; |
780 | } | 1193 | } |
781 | 1194 | ||
782 | /* adapted from libbridge */ | 1195 | /* This code is "adapted" from libbridge: it attaches the Host end of the |
1196 | * network device to the bridge device specified by the command line. | ||
1197 | * | ||
1198 | * This is yet another James Morris contribution (I'm an IP-level guy, so I | ||
1199 | * dislike bridging), and I just try not to break it. */ | ||
783 | static void add_to_bridge(int fd, const char *if_name, const char *br_name) | 1200 | static void add_to_bridge(int fd, const char *if_name, const char *br_name) |
784 | { | 1201 | { |
785 | int ifidx; | 1202 | int ifidx; |
@@ -798,12 +1215,16 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) | |||
798 | err(1, "can't add %s to bridge %s", if_name, br_name); | 1215 | err(1, "can't add %s to bridge %s", if_name, br_name); |
799 | } | 1216 | } |
800 | 1217 | ||
1218 | /* This sets up the Host end of the network device with an IP address, brings | ||
1219 | * it up so packets will flow, then copies the MAC address into the hwaddr | ||
1220 | * pointer (in practice, the Host's slot in the network device's memory). */ | ||
801 | static void configure_device(int fd, const char *devname, u32 ipaddr, | 1221 | static void configure_device(int fd, const char *devname, u32 ipaddr, |
802 | unsigned char hwaddr[6]) | 1222 | unsigned char hwaddr[6]) |
803 | { | 1223 | { |
804 | struct ifreq ifr; | 1224 | struct ifreq ifr; |
805 | struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; | 1225 | struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; |
806 | 1226 | ||
1227 | /* Don't read these incantations. Just cut & paste them like I did! */ | ||
807 | memset(&ifr, 0, sizeof(ifr)); | 1228 | memset(&ifr, 0, sizeof(ifr)); |
808 | strcpy(ifr.ifr_name, devname); | 1229 | strcpy(ifr.ifr_name, devname); |
809 | sin->sin_family = AF_INET; | 1230 | sin->sin_family = AF_INET; |
@@ -814,12 +1235,19 @@ static void configure_device(int fd, const char *devname, u32 ipaddr, | |||
814 | if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) | 1235 | if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) |
815 | err(1, "Bringing interface %s up", devname); | 1236 | err(1, "Bringing interface %s up", devname); |
816 | 1237 | ||
1238 | /* SIOC stands for Socket I/O Control. G means Get (vs S for Set | ||
1239 | * above). IF means Interface, and HWADDR is hardware address. | ||
1240 | * Simple! */ | ||
817 | if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) | 1241 | if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) |
818 | err(1, "getting hw address for %s", devname); | 1242 | err(1, "getting hw address for %s", devname); |
819 | |||
820 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); | 1243 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); |
821 | } | 1244 | } |
822 | 1245 | ||
1246 | /*L:195 The other kind of network is a Host<->Guest network. This can either | ||
1247 | * use bridging or routing, but the principle is the same: it uses the "tun" | ||
1248 | * device to inject packets into the Host as if they came in from a normal | ||
1249 | * network card. We just shunt packets between the Guest and the tun | ||
1250 | * device. */ | ||
823 | static void setup_tun_net(const char *arg, struct device_list *devices) | 1251 | static void setup_tun_net(const char *arg, struct device_list *devices) |
824 | { | 1252 | { |
825 | struct device *dev; | 1253 | struct device *dev; |
@@ -828,36 +1256,56 @@ static void setup_tun_net(const char *arg, struct device_list *devices) | |||
828 | u32 ip; | 1256 | u32 ip; |
829 | const char *br_name = NULL; | 1257 | const char *br_name = NULL; |
830 | 1258 | ||
1259 | /* We open the /dev/net/tun device and tell it we want a tap device. A | ||
1260 | * tap device is like a tun device, only somehow different. To tell | ||
1261 | * the truth, I completely blundered my way through this code, but it | ||
1262 | * works now! */ | ||
831 | netfd = open_or_die("/dev/net/tun", O_RDWR); | 1263 | netfd = open_or_die("/dev/net/tun", O_RDWR); |
832 | memset(&ifr, 0, sizeof(ifr)); | 1264 | memset(&ifr, 0, sizeof(ifr)); |
833 | ifr.ifr_flags = IFF_TAP | IFF_NO_PI; | 1265 | ifr.ifr_flags = IFF_TAP | IFF_NO_PI; |
834 | strcpy(ifr.ifr_name, "tap%d"); | 1266 | strcpy(ifr.ifr_name, "tap%d"); |
835 | if (ioctl(netfd, TUNSETIFF, &ifr) != 0) | 1267 | if (ioctl(netfd, TUNSETIFF, &ifr) != 0) |
836 | err(1, "configuring /dev/net/tun"); | 1268 | err(1, "configuring /dev/net/tun"); |
1269 | /* We don't need checksums calculated for packets coming in this | ||
1270 | * device: trust us! */ | ||
837 | ioctl(netfd, TUNSETNOCSUM, 1); | 1271 | ioctl(netfd, TUNSETNOCSUM, 1); |
838 | 1272 | ||
839 | /* You will be peer 1: we should create enough jitter to randomize */ | 1273 | /* We create the net device with 1 page, using the features field of |
1274 | * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and | ||
1275 | * that the device has fairly random timing. We do *not* specify | ||
1276 | * LGUEST_NET_F_NOCSUM: these packets can reach the real world. | ||
1277 | * | ||
1278 | * We will put our MAC address in slot 0 for the Guest to see, so | ||
1279 | * it will send packets to us using the key "peer_offset(0)": */ | ||
840 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, | 1280 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, |
841 | NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, | 1281 | NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, |
842 | handle_tun_input, peer_offset(0), handle_tun_output); | 1282 | handle_tun_input, peer_offset(0), handle_tun_output); |
1283 | |||
1284 | /* We keep a flag which says whether we've seen packets come out from | ||
1285 | * this network device. */ | ||
843 | dev->priv = malloc(sizeof(bool)); | 1286 | dev->priv = malloc(sizeof(bool)); |
844 | *(bool *)dev->priv = false; | 1287 | *(bool *)dev->priv = false; |
845 | 1288 | ||
1289 | /* We need a socket to perform the magic network ioctls to bring up the | ||
1290 | * tap interface, connect to the bridge etc. Any socket will do! */ | ||
846 | ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); | 1291 | ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); |
847 | if (ipfd < 0) | 1292 | if (ipfd < 0) |
848 | err(1, "opening IP socket"); | 1293 | err(1, "opening IP socket"); |
849 | 1294 | ||
1295 | /* If the command line was --tunnet=bridge:<name> do bridging. */ | ||
850 | if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { | 1296 | if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { |
851 | ip = INADDR_ANY; | 1297 | ip = INADDR_ANY; |
852 | br_name = arg + strlen(BRIDGE_PFX); | 1298 | br_name = arg + strlen(BRIDGE_PFX); |
853 | add_to_bridge(ipfd, ifr.ifr_name, br_name); | 1299 | add_to_bridge(ipfd, ifr.ifr_name, br_name); |
854 | } else | 1300 | } else /* It is an IP address to set up the device with */ |
855 | ip = str2ip(arg); | 1301 | ip = str2ip(arg); |
856 | 1302 | ||
857 | /* We are peer 0, ie. first slot. */ | 1303 | /* We are peer 0, ie. first slot, so we hand dev->mem to this routine |
1304 | * to write the MAC address at the start of the device memory. */ | ||
858 | configure_device(ipfd, ifr.ifr_name, ip, dev->mem); | 1305 | configure_device(ipfd, ifr.ifr_name, ip, dev->mem); |
859 | 1306 | ||
860 | /* Set "promisc" bit: we want every single packet. */ | 1307 | /* Set "promisc" bit: we want every single packet if we're going to |
1308 | * bridge to other machines (and otherwise it doesn't matter). */ | ||
861 | *((u8 *)dev->mem) |= 0x1; | 1309 | *((u8 *)dev->mem) |= 0x1; |
862 | 1310 | ||
863 | close(ipfd); | 1311 | close(ipfd); |
@@ -868,7 +1316,10 @@ static void setup_tun_net(const char *arg, struct device_list *devices) | |||
868 | if (br_name) | 1316 | if (br_name) |
869 | verbose("attached to bridge: %s\n", br_name); | 1317 | verbose("attached to bridge: %s\n", br_name); |
870 | } | 1318 | } |
1319 | /* That's the end of device setup. */ | ||
871 | 1320 | ||
1321 | /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves | ||
1322 | * its input and output, and finally, lays it to rest. */ | ||
872 | static void __attribute__((noreturn)) | 1323 | static void __attribute__((noreturn)) |
873 | run_guest(int lguest_fd, struct device_list *device_list) | 1324 | run_guest(int lguest_fd, struct device_list *device_list) |
874 | { | 1325 | { |
@@ -880,20 +1331,37 @@ run_guest(int lguest_fd, struct device_list *device_list) | |||
880 | /* We read from the /dev/lguest device to run the Guest. */ | 1331 | /* We read from the /dev/lguest device to run the Guest. */ |
881 | readval = read(lguest_fd, arr, sizeof(arr)); | 1332 | readval = read(lguest_fd, arr, sizeof(arr)); |
882 | 1333 | ||
1334 | /* The read can only really return sizeof(arr) (the Guest did a | ||
1335 | * SEND_DMA to us), or an error. */ | ||
1336 | |||
1337 | /* For a successful read, arr[0] is the address of the "struct | ||
1338 | * lguest_dma", and arr[1] is the key the Guest sent to. */ | ||
883 | if (readval == sizeof(arr)) { | 1339 | if (readval == sizeof(arr)) { |
884 | handle_output(lguest_fd, arr[0], arr[1], device_list); | 1340 | handle_output(lguest_fd, arr[0], arr[1], device_list); |
885 | continue; | 1341 | continue; |
1342 | /* ENOENT means the Guest died. Reading tells us why. */ | ||
886 | } else if (errno == ENOENT) { | 1343 | } else if (errno == ENOENT) { |
887 | char reason[1024] = { 0 }; | 1344 | char reason[1024] = { 0 }; |
888 | read(lguest_fd, reason, sizeof(reason)-1); | 1345 | read(lguest_fd, reason, sizeof(reason)-1); |
889 | errx(1, "%s", reason); | 1346 | errx(1, "%s", reason); |
1347 | /* EAGAIN means the waker wanted us to look at some input. | ||
1348 | * Anything else means a bug or incompatible change. */ | ||
890 | } else if (errno != EAGAIN) | 1349 | } else if (errno != EAGAIN) |
891 | err(1, "Running guest failed"); | 1350 | err(1, "Running guest failed"); |
1351 | |||
1352 | /* Service input, then unset the BREAK which releases | ||
1353 | * the Waker. */ | ||
892 | handle_input(lguest_fd, device_list); | 1354 | handle_input(lguest_fd, device_list); |
893 | if (write(lguest_fd, args, sizeof(args)) < 0) | 1355 | if (write(lguest_fd, args, sizeof(args)) < 0) |
894 | err(1, "Resetting break"); | 1356 | err(1, "Resetting break"); |
895 | } | 1357 | } |
896 | } | 1358 | } |
1359 | /* | ||
1360 | * This is the end of the Launcher. | ||
1361 | * | ||
1362 | * But wait! We've seen I/O from the Launcher, and we've seen I/O from the | ||
1363 | * Drivers. If we were to see the Host kernel I/O code, our understanding | ||
1364 | * would be complete... :*/ | ||
897 | 1365 | ||
898 | static struct option opts[] = { | 1366 | static struct option opts[] = { |
899 | { "verbose", 0, NULL, 'v' }, | 1367 | { "verbose", 0, NULL, 'v' }, |
@@ -911,20 +1379,49 @@ static void usage(void) | |||
911 | "<mem-in-mb> vmlinux [args...]"); | 1379 | "<mem-in-mb> vmlinux [args...]"); |
912 | } | 1380 | } |
913 | 1381 | ||
1382 | /*L:100 The Launcher code itself takes us out into userspace, that scary place | ||
1383 | * where pointers run wild and free! Unfortunately, like most userspace | ||
1384 | * programs, it's quite boring (which is why everyone likes to hack on the | ||
1385 | * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it | ||
1386 | * will get you through this section. Or, maybe not. | ||
1387 | * | ||
1388 | * The Launcher binary sits up high, usually starting at address 0xB8000000. | ||
1389 | * Everything below this is the "physical" memory for the Guest. For example, | ||
1390 | * if the Guest were to write a "1" at physical address 0, we would see a "1" | ||
1391 | * in the Launcher at "(int *)0". Guest physical == Launcher virtual. | ||
1392 | * | ||
1393 | * This can be tough to get your head around, but usually it just means that we | ||
1394 | * don't need to do any conversion when the Guest gives us its "physical" | ||
1395 | * addresses. | ||
1396 | */ | ||
914 | int main(int argc, char *argv[]) | 1397 | int main(int argc, char *argv[]) |
915 | { | 1398 | { |
1399 | /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size | ||
1400 | * of the (optional) initrd. */ | ||
916 | unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; | 1401 | unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; |
1402 | /* A temporary and the /dev/lguest file descriptor. */ | ||
917 | int i, c, lguest_fd; | 1403 | int i, c, lguest_fd; |
1404 | /* The list of Guest devices, based on command line arguments. */ | ||
918 | struct device_list device_list; | 1405 | struct device_list device_list; |
1406 | /* The boot information for the Guest: at guest-physical address 0. */ | ||
919 | void *boot = (void *)0; | 1407 | void *boot = (void *)0; |
1408 | /* If they specify an initrd file to load. */ | ||
920 | const char *initrd_name = NULL; | 1409 | const char *initrd_name = NULL; |
921 | 1410 | ||
1411 | /* First we initialize the device list. Since console and network | ||
1412 | * devices receive input from a file descriptor, we keep an fdset | ||
1413 | * (infds) and the maximum fd number (max_infd) with the head of the | ||
1414 | * list. We also keep a pointer to the last device, for easy appending | ||
1415 | * to the list. */ | ||
922 | device_list.max_infd = -1; | 1416 | device_list.max_infd = -1; |
923 | device_list.dev = NULL; | 1417 | device_list.dev = NULL; |
924 | device_list.lastdev = &device_list.dev; | 1418 | device_list.lastdev = &device_list.dev; |
925 | FD_ZERO(&device_list.infds); | 1419 | FD_ZERO(&device_list.infds); |
926 | 1420 | ||
927 | /* We need to know how much memory so we can allocate devices. */ | 1421 | /* We need to know how much memory so we can set up the device |
1422 | * descriptor and memory pages for the devices as we parse the command | ||
1423 | * line. So we quickly look through the arguments to find the amount | ||
1424 | * of memory now. */ | ||
928 | for (i = 1; i < argc; i++) { | 1425 | for (i = 1; i < argc; i++) { |
929 | if (argv[i][0] != '-') { | 1426 | if (argv[i][0] != '-') { |
930 | mem = top = atoi(argv[i]) * 1024 * 1024; | 1427 | mem = top = atoi(argv[i]) * 1024 * 1024; |
@@ -933,6 +1430,8 @@ int main(int argc, char *argv[]) | |||
933 | break; | 1430 | break; |
934 | } | 1431 | } |
935 | } | 1432 | } |
1433 | |||
1434 | /* The options are fairly straight-forward */ | ||
936 | while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { | 1435 | while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { |
937 | switch (c) { | 1436 | switch (c) { |
938 | case 'v': | 1437 | case 'v': |
@@ -955,42 +1454,71 @@ int main(int argc, char *argv[]) | |||
955 | usage(); | 1454 | usage(); |
956 | } | 1455 | } |
957 | } | 1456 | } |
1457 | /* After the other arguments we expect memory and kernel image name, | ||
1458 | * followed by command line arguments for the kernel. */ | ||
958 | if (optind + 2 > argc) | 1459 | if (optind + 2 > argc) |
959 | usage(); | 1460 | usage(); |
960 | 1461 | ||
961 | /* We need a console device */ | 1462 | /* We always have a console device */ |
962 | setup_console(&device_list); | 1463 | setup_console(&device_list); |
963 | 1464 | ||
964 | /* First we map /dev/zero over all of guest-physical memory. */ | 1465 | /* We start by mapping anonymous pages over all of guest-physical |
1466 | * memory range. This fills it with 0, and ensures that the Guest | ||
1467 | * won't be killed when it tries to access it. */ | ||
965 | map_zeroed_pages(0, mem / getpagesize()); | 1468 | map_zeroed_pages(0, mem / getpagesize()); |
966 | 1469 | ||
967 | /* Now we load the kernel */ | 1470 | /* Now we load the kernel */ |
968 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), | 1471 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), |
969 | &page_offset); | 1472 | &page_offset); |
970 | 1473 | ||
971 | /* Map the initrd image if requested */ | 1474 | /* Map the initrd image if requested (at top of physical memory) */ |
972 | if (initrd_name) { | 1475 | if (initrd_name) { |
973 | initrd_size = load_initrd(initrd_name, mem); | 1476 | initrd_size = load_initrd(initrd_name, mem); |
1477 | /* These are the locations in the Linux boot header where the | ||
1478 | * start and size of the initrd are expected to be found. */ | ||
974 | *(unsigned long *)(boot+0x218) = mem - initrd_size; | 1479 | *(unsigned long *)(boot+0x218) = mem - initrd_size; |
975 | *(unsigned long *)(boot+0x21c) = initrd_size; | 1480 | *(unsigned long *)(boot+0x21c) = initrd_size; |
1481 | /* The bootloader type 0xFF means "unknown"; that's OK. */ | ||
976 | *(unsigned char *)(boot+0x210) = 0xFF; | 1482 | *(unsigned char *)(boot+0x210) = 0xFF; |
977 | } | 1483 | } |
978 | 1484 | ||
979 | /* Set up the initial linar pagetables. */ | 1485 | /* Set up the initial linear pagetables, starting below the initrd. */ |
980 | pgdir = setup_pagetables(mem, initrd_size, page_offset); | 1486 | pgdir = setup_pagetables(mem, initrd_size, page_offset); |
981 | 1487 | ||
982 | /* E820 memory map: ours is a simple, single region. */ | 1488 | /* The Linux boot header contains an "E820" memory map: ours is a |
1489 | * simple, single region. */ | ||
983 | *(char*)(boot+E820NR) = 1; | 1490 | *(char*)(boot+E820NR) = 1; |
984 | *((struct e820entry *)(boot+E820MAP)) | 1491 | *((struct e820entry *)(boot+E820MAP)) |
985 | = ((struct e820entry) { 0, mem, E820_RAM }); | 1492 | = ((struct e820entry) { 0, mem, E820_RAM }); |
986 | /* Command line pointer and command line (at 4096) */ | 1493 | /* The boot header contains a command line pointer: we put the command |
1494 | * line after the boot header (at address 4096) */ | ||
987 | *(void **)(boot + 0x228) = boot + 4096; | 1495 | *(void **)(boot + 0x228) = boot + 4096; |
988 | concat(boot + 4096, argv+optind+2); | 1496 | concat(boot + 4096, argv+optind+2); |
989 | /* Paravirt type: 1 == lguest */ | 1497 | |
1498 | /* The guest type value of "1" tells the Guest it's under lguest. */ | ||
990 | *(int *)(boot + 0x23c) = 1; | 1499 | *(int *)(boot + 0x23c) = 1; |
991 | 1500 | ||
1501 | /* We tell the kernel to initialize the Guest: this returns the open | ||
1502 | * /dev/lguest file descriptor. */ | ||
992 | lguest_fd = tell_kernel(pgdir, start, page_offset); | 1503 | lguest_fd = tell_kernel(pgdir, start, page_offset); |
1504 | |||
1505 | /* We fork off a child process, which wakes the Launcher whenever one | ||
1506 | * of the input file descriptors needs attention. Otherwise we would | ||
1507 | * run the Guest until it tries to output something. */ | ||
993 | waker_fd = setup_waker(lguest_fd, &device_list); | 1508 | waker_fd = setup_waker(lguest_fd, &device_list); |
994 | 1509 | ||
1510 | /* Finally, run the Guest. This doesn't return. */ | ||
995 | run_guest(lguest_fd, &device_list); | 1511 | run_guest(lguest_fd, &device_list); |
996 | } | 1512 | } |
1513 | /*:*/ | ||
1514 | |||
1515 | /*M:999 | ||
1516 | * Mastery is done: you now know everything I do. | ||
1517 | * | ||
1518 | * But surely you have seen code, features and bugs in your wanderings which | ||
1519 | * you now yearn to attack? That is the real game, and I look forward to you | ||
1520 | * patching and forking lguest into the Your-Name-Here-visor. | ||
1521 | * | ||
1522 | * Farewell, and good coding! | ||
1523 | * Rusty Russell. | ||
1524 | */ | ||
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt new file mode 100644 index 000000000000..5fbcc22c98e9 --- /dev/null +++ b/Documentation/memory-hotplug.txt | |||
@@ -0,0 +1,322 @@ | |||
1 | ============== | ||
2 | Memory Hotplug | ||
3 | ============== | ||
4 | |||
5 | Last Updated: Jul 28 2007 | ||
6 | |||
7 | This document is about memory hotplug including how-to-use and current status. | ||
8 | Because Memory Hotplug is still under development, contents of this text will | ||
9 | be changed often. | ||
10 | |||
11 | 1. Introduction | ||
12 | 1.1 purpose of memory hotplug | ||
13 | 1.2. Phases of memory hotplug | ||
14 | 1.3. Unit of Memory online/offline operation | ||
15 | 2. Kernel Configuration | ||
16 | 3. sysfs files for memory hotplug | ||
17 | 4. Physical memory hot-add phase | ||
18 | 4.1 Hardware(Firmware) Support | ||
19 | 4.2 Notify memory hot-add event by hand | ||
20 | 5. Logical Memory hot-add phase | ||
21 | 5.1. State of memory | ||
22 | 5.2. How to online memory | ||
23 | 6. Logical memory remove | ||
24 | 6.1 Memory offline and ZONE_MOVABLE | ||
25 | 6.2. How to offline memory | ||
26 | 7. Physical memory remove | ||
27 | 8. Future Work List | ||
28 | |||
29 | Note(1): x86_64 has a special implementation for memory hotplug. | ||
30 | This text does not describe it. | ||
31 | Note(2): This text assumes that sysfs is mounted at /sys. | ||
32 | |||
33 | |||
34 | --------------- | ||
35 | 1. Introduction | ||
36 | --------------- | ||
37 | |||
38 | 1.1 purpose of memory hotplug | ||
39 | ------------ | ||
40 | Memory Hotplug allows users to increase/decrease the amount of memory. | ||
41 | Generally, there are two purposes. | ||
42 | |||
43 | (A) For changing the amount of memory. | ||
44 | This is to allow a feature like capacity on demand. | ||
45 | (B) For installing/removing DIMMs or NUMA-nodes physically. | ||
46 | This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc. | ||
47 | |||
48 | (A) is required by highly virtualized environments and (B) is required by | ||
49 | hardware which supports memory power management. | ||
50 | |||
51 | Linux memory hotplug is designed for both purposes. | ||
52 | |||
53 | |||
54 | 1.2. Phases of memory hotplug | ||
55 | --------------- | ||
56 | There are 2 phases in Memory Hotplug. | ||
57 | 1) Physical Memory Hotplug phase | ||
58 | 2) Logical Memory Hotplug phase. | ||
59 | |||
60 | The First phase is to communicate with hardware/firmware and make/erase | ||
61 | environment for hotplugged memory. Basically, this phase is necessary | ||
62 | for the purpose (B), but this is a good phase for communication between | ||
63 | highly virtualized environments too. | ||
64 | |||
65 | When memory is hotplugged, the kernel recognizes new memory, makes new memory | ||
66 | management tables, and makes sysfs files for new memory's operation. | ||
67 | |||
68 | If firmware supports notification of connection of new memory to OS, | ||
69 | this phase is triggered automatically. ACPI can notify this event. If not, | ||
70 | "probe" operation by system administration is used instead. | ||
71 | (see Section 4.). | ||
72 | |||
73 | Logical Memory Hotplug phase is to change memory state into | ||
74 | available/unavailable for users. Amount of memory from user's view is | ||
75 | changed by this phase. The kernel makes all memory in it as free pages | ||
76 | when a memory range is available. | ||
77 | |||
78 | In this document, this phase is described as online/offline. | ||
79 | |||
80 | Logical Memory Hotplug phase is triggered by write of sysfs file by system | ||
81 | administrator. For the hot-add case, it must be executed after Physical Hotplug | ||
82 | phase by hand. | ||
83 | (However, if you write udev's hotplug scripts for memory hotplug, these | ||
84 | phases can be executed in a seamless way.) | ||
85 | |||
86 | |||
87 | 1.3. Unit of Memory online/offline operation | ||
88 | ------------ | ||
89 | Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory | ||
90 | into chunks of the same size. The chunk is called a "section". The size of | ||
91 | a section is architecture dependent. For example, power uses 16MiB, ia64 uses | ||
92 | 1GiB. The unit of online/offline operation is "one section". (see Section 3.) | ||
93 | |||
94 | To determine the size of sections, please read this file: | ||
95 | |||
96 | /sys/devices/system/memory/block_size_bytes | ||
97 | |||
98 | This file shows the size of sections in bytes. | ||
99 | |||
100 | ----------------------- | ||
101 | 2. Kernel Configuration | ||
102 | ----------------------- | ||
103 | To use memory hotplug feature, kernel must be compiled with following | ||
104 | config options. | ||
105 | |||
106 | - For all memory hotplug | ||
107 | Memory model -> Sparse Memory (CONFIG_SPARSEMEM) | ||
108 | Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG) | ||
109 | |||
110 | - To enable memory removal, the following are also necessary | ||
111 | Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE) | ||
112 | Page Migration (CONFIG_MIGRATION) | ||
113 | |||
114 | - For ACPI memory hotplug, the following are also necessary | ||
115 | Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY) | ||
116 | This option can be kernel module. | ||
117 | |||
118 | - As a related configuration, if your box has a feature of NUMA-node hotplug | ||
119 | via ACPI, then this option is necessary too. | ||
120 | ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu) | ||
121 | (CONFIG_ACPI_CONTAINER). | ||
122 | This option can be kernel module too. | ||
123 | |||
124 | -------------------------------- | ||
125 | 3 sysfs files for memory hotplug | ||
126 | -------------------------------- | ||
127 | All sections have their device information under /sys/devices/system/memory as | ||
128 | |||
129 | /sys/devices/system/memory/memoryXXX | ||
130 | (XXX is section id.) | ||
131 | |||
132 | Now, XXX is defined as start_address_of_section / section_size. | ||
133 | |||
134 | For example, assume 1GiB section size. A device for a memory starting at | ||
135 | 0x100000000 is /sys/devices/system/memory/memory4 | ||
136 | (0x100000000 / 1GiB = 4) | ||
137 | This device covers address range [0x100000000 ... 0x140000000) | ||
138 | |||
139 | Under each section, you can see 3 files. | ||
140 | |||
141 | /sys/devices/system/memory/memoryXXX/phys_index | ||
142 | /sys/devices/system/memory/memoryXXX/phys_device | ||
143 | /sys/devices/system/memory/memoryXXX/state | ||
144 | |||
145 | 'phys_index' : read-only and contains section id, same as XXX. | ||
146 | 'state' : read-write | ||
147 | at read: contains online/offline state of memory. | ||
148 | at write: user can specify "online", "offline" command | ||
149 | 'phys_device': read-only: designed to show the name of physical memory device. | ||
150 | This is not well implemented now. | ||
151 | |||
152 | NOTE: | ||
153 | These directories/files appear after physical memory hotplug phase. | ||
154 | |||
155 | |||
156 | -------------------------------- | ||
157 | 4. Physical memory hot-add phase | ||
158 | -------------------------------- | ||
159 | |||
160 | 4.1 Hardware(Firmware) Support | ||
161 | ------------ | ||
162 | On x86_64/ia64 platform, memory hotplug by ACPI is supported. | ||
163 | |||
164 | In general, the firmware (ACPI) which supports memory hotplug defines | ||
165 | memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80, | ||
166 | Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev | ||
167 | script. This will be done automatically. | ||
168 | |||
169 | But scripts for memory hotplug are not contained in generic udev package(now). | ||
170 | You may have to write it by yourself or online/offline memory by hand. | ||
171 | Please see "How to online memory", "How to offline memory" in this text. | ||
172 | |||
173 | If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004", | ||
174 | "PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler | ||
175 | calls hotplug code for all of objects which are defined in it. | ||
176 | If memory device is found, memory hotplug code will be called. | ||
177 | |||
178 | |||
179 | 4.2 Notify memory hot-add event by hand | ||
180 | ------------ | ||
181 | In some environments, especially virtualized environment, firmware will not | ||
182 | notify memory hotplug event to the kernel. For such environment, "probe" | ||
183 | interface is supported. This interface depends on CONFIG_ARCH_MEMORY_PROBE. | ||
184 | |||
185 | Now, CONFIG_ARCH_MEMORY_PROBE is supported only by powerpc but it does not | ||
186 | contain highly architecture-specific code. Please add config if you need "probe" | ||
187 | interface. | ||
188 | |||
189 | Probe interface is located at | ||
190 | /sys/devices/system/memory/probe | ||
191 | |||
192 | You can tell the physical address of new memory to the kernel by | ||
193 | |||
194 | % echo start_address_of_new_memory > /sys/devices/system/memory/probe | ||
195 | |||
196 | Then, [start_address_of_new_memory, start_address_of_new_memory + section_size) | ||
197 | memory range is hot-added. In this case, hotplug script is not called (in | ||
198 | current implementation). You'll have to online memory by yourself. | ||
199 | Please see "How to online memory" in this text. | ||
200 | |||
201 | |||
202 | |||
203 | ------------------------------ | ||
204 | 5. Logical Memory hot-add phase | ||
205 | ------------------------------ | ||
206 | |||
207 | 5.1. State of memory | ||
208 | ------------ | ||
209 | To see (online/offline) state of memory section, read 'state' file. | ||
210 | |||
211 | % cat /sys/devices/system/memory/memoryXXX/state | ||
212 | |||
213 | |||
214 | If the memory section is online, you'll read "online". | ||
215 | If the memory section is offline, you'll read "offline". | ||
216 | |||
217 | |||
218 | 5.2. How to online memory | ||
219 | ------------ | ||
220 | Even if the memory is hot-added, it is not at ready-to-use state. | ||
221 | For using newly added memory, you have to "online" the memory section. | ||
222 | |||
223 | For onlining, you have to write "online" to the section's state file as: | ||
224 | |||
225 | % echo online > /sys/devices/system/memory/memoryXXX/state | ||
226 | |||
227 | After this, section memoryXXX's state will be 'online' and the amount of | ||
228 | available memory will be increased. | ||
229 | |||
230 | Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA). | ||
231 | This may be changed in future. | ||
232 | |||
233 | |||
234 | |||
235 | ------------------------ | ||
236 | 6. Logical memory remove | ||
237 | ------------------------ | ||
238 | |||
239 | 6.1 Memory offline and ZONE_MOVABLE | ||
240 | ------------ | ||
241 | Memory offlining is more complicated than memory online. Because memory offline | ||
242 | has to make the whole memory section be unused, memory offline can fail if | ||
243 | the section includes memory which cannot be freed. | ||
244 | |||
245 | In general, memory offline can use 2 techniques. | ||
246 | |||
247 | (1) reclaim and free all memory in the section. | ||
248 | (2) migrate all pages in the section. | ||
249 | |||
250 | In the current implementation, Linux's memory offline uses method (2), freeing | ||
251 | all pages in the section by page migration. But not all pages are | ||
252 | migratable. Under current Linux, migratable pages are anonymous pages and | ||
253 | page caches. For offlining a section by migration, the kernel has to guarantee | ||
254 | that the section contains only migratable pages. | ||
255 | |||
256 | Now, a boot option for making a section which consists of migratable pages is | ||
257 | supported. By specifying "kernelcore=" or "movablecore=" boot option, you can | ||
258 | create ZONE_MOVABLE...a zone which is just used for movable pages. | ||
259 | (See also Documentation/kernel-parameters.txt) | ||
260 | |||
261 | Assume the system has "TOTAL" amount of memory at boot time, this boot option | ||
262 | creates ZONE_MOVABLE as following. | ||
263 | |||
264 | 1) When kernelcore=YYYY boot option is used, | ||
265 | Size of memory not for movable pages (not for offline) is YYYY. | ||
266 | Size of memory for movable pages (for offline) is TOTAL-YYYY. | ||
267 | |||
268 | 2) When movablecore=ZZZZ boot option is used, | ||
269 | Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ. | ||
270 | Size of memory for movable pages (for offline) is ZZZZ. | ||
271 | |||
272 | |||
273 | Note) Unfortunately, there is no information to show which section belongs | ||
274 | to ZONE_MOVABLE. This is TBD. | ||
275 | |||
276 | |||
277 | 6.2. How to offline memory | ||
278 | ------------ | ||
279 | You can offline a section by using the same sysfs interface that was used in | ||
280 | memory onlining. | ||
281 | |||
282 | % echo offline > /sys/devices/system/memory/memoryXXX/state | ||
283 | |||
284 | If offline succeeds, the state of the memory section is changed to be "offline". | ||
285 | If it fails, some error code (like -EBUSY) will be returned by the kernel. | ||
286 | Even if a section does not belong to ZONE_MOVABLE, you can try to offline it. | ||
287 | If it doesn't contain 'unmovable' memory, you'll get success. | ||
288 | |||
289 | A section under ZONE_MOVABLE is considered to be able to be offlined easily. | ||
290 | But under some busy state, it may return -EBUSY. Even if a memory section | ||
291 | cannot be offlined due to -EBUSY, you can retry offlining it and may be able to | ||
292 | offline it (or not). | ||
293 | (For example, a page is referred to by some kernel internal call and released | ||
294 | soon.) | ||
295 | |||
296 | Consideration: | ||
297 | Memory hotplug's design direction is to make the possibility of memory offlining | ||
298 | higher and to guarantee unplugging memory under any situation. But it needs | ||
299 | more work. Returning -EBUSY under some situation may be good because the user | ||
300 | can decide to retry more or not by himself. Currently, memory offlining code | ||
301 | does some amount of retry with 120 seconds timeout. | ||
302 | |||
303 | ------------------------- | ||
304 | 7. Physical memory remove | ||
305 | ------------------------- | ||
306 | Need more implementation yet.... | ||
307 | - Notification completion of remove works by OS to firmware. | ||
308 | - Guard from remove if not yet. | ||
309 | |||
310 | -------------- | ||
311 | 8. Future Work | ||
312 | -------------- | ||
313 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like | ||
314 | sysctl or new control file. | ||
315 | - showing memory section and physical device relationship. | ||
316 | - showing memory section and node relationship (maybe good for NUMA) | ||
317 | - showing memory section is under ZONE_MOVABLE or not | ||
318 | - test and make it better memory offlining. | ||
319 | - support HugeTLB page migration and offlining. | ||
320 | - memmap removing at memory offline. | ||
321 | - physical remove memory. | ||
322 | |||
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 16feebb7bdc0..84901e7c0508 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt | |||
@@ -83,7 +83,7 @@ Some implementation details: | |||
83 | CFS uses nanosecond granularity accounting and does not rely on any | 83 | CFS uses nanosecond granularity accounting and does not rely on any |
84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of | 84 | jiffies or other HZ detail. Thus the CFS scheduler has no notion of |
85 | 'timeslices' and has no heuristics whatsoever. There is only one | 85 | 'timeslices' and has no heuristics whatsoever. There is only one |
86 | central tunable: | 86 | central tunable (you have to switch on CONFIG_SCHED_DEBUG): |
87 | 87 | ||
88 | /proc/sys/kernel/sched_granularity_ns | 88 | /proc/sys/kernel/sched_granularity_ns |
89 | 89 | ||
diff --git a/Documentation/sched-nice-design.txt b/Documentation/sched-nice-design.txt new file mode 100644 index 000000000000..e2bae5a577e3 --- /dev/null +++ b/Documentation/sched-nice-design.txt | |||
@@ -0,0 +1,108 @@ | |||
1 | This document explains the thinking about the revamped and streamlined | ||
2 | nice-levels implementation in the new Linux scheduler. | ||
3 | |||
4 | Nice levels were always pretty weak under Linux and people continuously | ||
5 | pestered us to make nice +19 tasks use up much less CPU time. | ||
6 | |||
7 | Unfortunately that was not that easy to implement under the old | ||
8 | scheduler, (otherwise we'd have done it long ago) because nice level | ||
9 | support was historically coupled to timeslice length, and timeslice | ||
10 | units were driven by the HZ tick, so the smallest timeslice was 1/HZ. | ||
11 | |||
12 | In the O(1) scheduler (in 2003) we changed negative nice levels to be | ||
13 | much stronger than they were before in 2.4 (and people were happy about | ||
14 | that change), and we also intentionally calibrated the linear timeslice | ||
15 | rule so that nice +19 level would be _exactly_ 1 jiffy. To better | ||
16 | understand it, the timeslice graph went like this (cheesy ASCII art | ||
17 | alert!): | ||
18 | |||
19 | |||
20 | A | ||
21 | \ | [timeslice length] | ||
22 | \ | | ||
23 | \ | | ||
24 | \ | | ||
25 | \ | | ||
26 | \|___100msecs | ||
27 | |^ . _ | ||
28 | | ^ . _ | ||
29 | | ^ . _ | ||
30 | -*----------------------------------*-----> [nice level] | ||
31 | -20 | +19 | ||
32 | | | ||
33 | | | ||
34 | |||
35 | So that if someone wanted to really renice tasks, +19 would give a much | ||
36 | bigger hit than the normal linear rule would do. (The solution of | ||
37 | changing the ABI to extend priorities was discarded early on.) | ||
38 | |||
39 | This approach worked to some degree for some time, but later on with | ||
40 | HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which | ||
41 | we felt to be a bit excessive. Excessive _not_ because it's too small of | ||
42 | a CPU utilization, but because it causes too frequent (once per | ||
43 | millisec) rescheduling. (and would thus trash the cache, etc. Remember, | ||
44 | this was long ago when hardware was weaker and caches were smaller, and | ||
45 | people were running number crunching apps at nice +19.) | ||
46 | |||
47 | So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the | ||
48 | right minimal granularity - and this translates to 5% CPU utilization. | ||
49 | But the fundamental HZ-sensitive property for nice+19 still remained, | ||
50 | and we never got a single complaint about nice +19 being too _weak_ in | ||
51 | terms of CPU utilization, we only got complaints about it (still) being | ||
52 | too _strong_ :-) | ||
53 | |||
54 | To sum it up: we always wanted to make nice levels more consistent, but | ||
55 | within the constraints of HZ and jiffies and their nasty design level | ||
56 | coupling to timeslices and granularity it was not really viable. | ||
57 | |||
58 | The second (less frequent but still periodically occurring) complaint | ||
59 | about Linux's nice level support was its asymmetry around the origin | ||
60 | (which you can see demonstrated in the picture above), or more | ||
61 | accurately: the fact that nice level behavior depended on the _absolute_ | ||
62 | nice level as well, while the nice API itself is fundamentally | ||
63 | "relative": | ||
64 | |||
65 | int nice(int inc); | ||
66 | |||
67 | asmlinkage long sys_nice(int increment) | ||
68 | |||
69 | (the first one is the glibc API, the second one is the syscall API.) | ||
70 | Note that the 'inc' is relative to the current nice level. Tools like | ||
71 | bash's "nice" command mirror this relative API. | ||
72 | |||
73 | With the old scheduler, if you for example started a niced task with +1 | ||
74 | and another task with +2, the CPU split between the two tasks would | ||
75 | depend on the nice level of the parent shell - if it was at nice -10 the | ||
76 | CPU split was different than if it was at +5 or +10. | ||
77 | |||
78 | A third complaint against Linux's nice level support was that negative | ||
79 | nice levels were not 'punchy enough', so lots of people had to resort to | ||
80 | run audio (and other multimedia) apps under RT priorities such as | ||
81 | SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation | ||
82 | proof, and a buggy SCHED_FIFO app can also lock up the system for good. | ||
83 | |||
84 | The new scheduler in v2.6.23 addresses all three types of complaints: | ||
85 | |||
86 | To address the first complaint (of nice levels being not "punchy" | ||
87 | enough), the scheduler was decoupled from 'time slice' and HZ concepts | ||
88 | (and granularity was made a separate concept from nice levels) and thus | ||
89 | it was possible to implement better and more consistent nice +19 | ||
90 | support: with the new scheduler nice +19 tasks get a HZ-independent | ||
91 | 1.5%, instead of the variable 3%-5%-9% range they got in the old | ||
92 | scheduler. | ||
93 | |||
94 | To address the second complaint (of nice levels not being consistent), | ||
95 | the new scheduler makes nice(1) have the same CPU utilization effect on | ||
96 | tasks, regardless of their absolute nice levels. So on the new | ||
97 | scheduler, running a nice +10 and a nice +11 task has the same CPU | ||
98 | utilization "split" between them as running a nice -5 and a nice -4 | ||
99 | task. (one will get 55% of the CPU, the other 45%.) That is why nice | ||
100 | levels were changed to be "multiplicative" (or exponential) - that way | ||
101 | it does not matter which nice level you start out from, the 'relative | ||
102 | result' will always be the same. | ||
103 | |||
104 | The third complaint (of negative nice levels not being "punchy" enough | ||
105 | and forcing audio apps to run under the more dangerous SCHED_FIFO | ||
106 | scheduling policy) is addressed by the new scheduler almost | ||
107 | automatically: stronger negative nice levels are an automatic | ||
108 | side-effect of the recalibrated dynamic range of nice levels. | ||
diff --git a/Documentation/sched-stats.txt b/Documentation/sched-stats.txt index 6f72021aae51..442e14d35dea 100644 --- a/Documentation/sched-stats.txt +++ b/Documentation/sched-stats.txt | |||
@@ -1,10 +1,11 @@ | |||
1 | Version 10 of schedstats includes support for sched_domains, which | 1 | Version 14 of schedstats includes support for sched_domains, which hit the |
2 | hit the mainline kernel in 2.6.7. Some counters make more sense to be | 2 | mainline kernel in 2.6.20 although it is identical to the stats from version |
3 | per-runqueue; other to be per-domain. Note that domains (and their associated | 3 | 12 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel |
4 | information) will only be pertinent and available on machines utilizing | 4 | release). Some counters make more sense to be per-runqueue; other to be |
5 | CONFIG_SMP. | 5 | per-domain. Note that domains (and their associated information) will only |
6 | 6 | be pertinent and available on machines utilizing CONFIG_SMP. | |
7 | In version 10 of schedstat, there is at least one level of domain | 7 | |
8 | In version 14 of schedstat, there is at least one level of domain | ||
8 | statistics for each cpu listed, and there may well be more than one | 9 | statistics for each cpu listed, and there may well be more than one |
9 | domain. Domains have no particular names in this implementation, but | 10 | domain. Domains have no particular names in this implementation, but |
10 | the highest numbered one typically arbitrates balancing across all the | 11 | the highest numbered one typically arbitrates balancing across all the |
@@ -27,7 +28,7 @@ to write their own scripts, the fields are described here. | |||
27 | 28 | ||
28 | CPU statistics | 29 | CPU statistics |
29 | -------------- | 30 | -------------- |
30 | cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | 31 | cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12 |
31 | 32 | ||
32 | NOTE: In the sched_yield() statistics, the active queue is considered empty | 33 | NOTE: In the sched_yield() statistics, the active queue is considered empty |
33 | if it has only one process in it, since obviously the process calling | 34 | if it has only one process in it, since obviously the process calling |
@@ -39,48 +40,20 @@ First four fields are sched_yield() statistics: | |||
39 | 3) # of times just the expired queue was empty | 40 | 3) # of times just the expired queue was empty |
40 | 4) # of times sched_yield() was called | 41 | 4) # of times sched_yield() was called |
41 | 42 | ||
42 | Next four are schedule() statistics: | 43 | Next three are schedule() statistics: |
43 | 5) # of times the active queue had at least one other process on it | 44 | 5) # of times we switched to the expired queue and reused it |
44 | 6) # of times we switched to the expired queue and reused it | 45 | 6) # of times schedule() was called |
45 | 7) # of times schedule() was called | 46 | 7) # of times schedule() left the processor idle |
46 | 8) # of times schedule() left the processor idle | ||
47 | |||
48 | Next four are active_load_balance() statistics: | ||
49 | 9) # of times active_load_balance() was called | ||
50 | 10) # of times active_load_balance() caused this cpu to gain a task | ||
51 | 11) # of times active_load_balance() caused this cpu to lose a task | ||
52 | 12) # of times active_load_balance() tried to move a task and failed | ||
53 | |||
54 | Next three are try_to_wake_up() statistics: | ||
55 | 13) # of times try_to_wake_up() was called | ||
56 | 14) # of times try_to_wake_up() successfully moved the awakening task | ||
57 | 15) # of times try_to_wake_up() attempted to move the awakening task | ||
58 | |||
59 | Next two are wake_up_new_task() statistics: | ||
60 | 16) # of times wake_up_new_task() was called | ||
61 | 17) # of times wake_up_new_task() successfully moved the new task | ||
62 | |||
63 | Next one is a sched_migrate_task() statistic: | ||
64 | 18) # of times sched_migrate_task() was called | ||
65 | 47 | ||
66 | Next one is a sched_balance_exec() statistic: | 48 | Next two are try_to_wake_up() statistics: |
67 | 19) # of times sched_balance_exec() was called | 49 | 8) # of times try_to_wake_up() was called |
50 | 9) # of times try_to_wake_up() was called to wake up the local cpu | ||
68 | 51 | ||
69 | Next three are statistics describing scheduling latency: | 52 | Next three are statistics describing scheduling latency: |
70 | 20) sum of all time spent running by tasks on this processor (in ms) | 53 | 10) sum of all time spent running by tasks on this processor (in jiffies) |
71 | 21) sum of all time spent waiting to run by tasks on this processor (in ms) | 54 | 11) sum of all time spent waiting to run by tasks on this processor (in |
72 | 22) # of tasks (not necessarily unique) given to the processor | 55 | jiffies) |
73 | 56 | 12) # of timeslices run on this cpu | |
74 | The last six are statistics dealing with pull_task(): | ||
75 | 23) # of times pull_task() moved a task to this cpu when newly idle | ||
76 | 24) # of times pull_task() stole a task from this cpu when another cpu | ||
77 | was newly idle | ||
78 | 25) # of times pull_task() moved a task to this cpu when idle | ||
79 | 26) # of times pull_task() stole a task from this cpu when another cpu | ||
80 | was idle | ||
81 | 27) # of times pull_task() moved a task to this cpu when busy | ||
82 | 28) # of times pull_task() stole a task from this cpu when another cpu | ||
83 | was busy | ||
84 | 57 | ||
85 | 58 | ||
86 | Domain statistics | 59 | Domain statistics |
@@ -89,65 +62,95 @@ One of these is produced per domain for each cpu described. (Note that if | |||
89 | CONFIG_SMP is not defined, *no* domains are utilized and these lines | 62 | CONFIG_SMP is not defined, *no* domains are utilized and these lines |
90 | will not appear in the output.) | 63 | will not appear in the output.) |
91 | 64 | ||
92 | domain<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | 65 | domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
93 | 66 | ||
94 | The first field is a bit mask indicating what cpus this domain operates over. | 67 | The first field is a bit mask indicating what cpus this domain operates over. |
95 | 68 | ||
96 | The next fifteen are a variety of load_balance() statistics: | 69 | The next 24 are a variety of load_balance() statistics grouped into types |
97 | 70 | of idleness (idle, busy, and newly idle): | |
98 | 1) # of times in this domain load_balance() was called when the cpu | 71 | |
99 | was idle | 72 | 1) # of times in this domain load_balance() was called when the |
100 | 2) # of times in this domain load_balance() was called when the cpu | 73 | cpu was idle |
101 | was busy | 74 | 2) # of times in this domain load_balance() checked but found |
102 | 3) # of times in this domain load_balance() was called when the cpu | 75 | the load did not require balancing when the cpu was idle |
103 | was just becoming idle | 76 | 3) # of times in this domain load_balance() tried to move one or |
104 | 4) # of times in this domain load_balance() tried to move one or more | 77 | more tasks and failed, when the cpu was idle |
105 | tasks and failed, when the cpu was idle | 78 | 4) sum of imbalances discovered (if any) with each call to |
106 | 5) # of times in this domain load_balance() tried to move one or more | 79 | load_balance() in this domain when the cpu was idle |
107 | tasks and failed, when the cpu was busy | 80 | 5) # of times in this domain pull_task() was called when the cpu |
108 | 6) # of times in this domain load_balance() tried to move one or more | 81 | was idle |
109 | tasks and failed, when the cpu was just becoming idle | 82 | 6) # of times in this domain pull_task() was called even though |
110 | 7) sum of imbalances discovered (if any) with each call to | 83 | the target task was cache-hot when idle |
111 | load_balance() in this domain when the cpu was idle | 84 | 7) # of times in this domain load_balance() was called but did |
112 | 8) sum of imbalances discovered (if any) with each call to | 85 | not find a busier queue while the cpu was idle |
113 | load_balance() in this domain when the cpu was busy | 86 | 8) # of times in this domain a busier queue was found while the |
114 | 9) sum of imbalances discovered (if any) with each call to | 87 | cpu was idle but no busier group was found |
115 | load_balance() in this domain when the cpu was just becoming idle | 88 | |
116 | 10) # of times in this domain load_balance() was called but did not find | 89 | 9) # of times in this domain load_balance() was called when the |
117 | a busier queue while the cpu was idle | 90 | cpu was busy |
118 | 11) # of times in this domain load_balance() was called but did not find | 91 | 10) # of times in this domain load_balance() checked but found the |
119 | a busier queue while the cpu was busy | 92 | load did not require balancing when busy |
120 | 12) # of times in this domain load_balance() was called but did not find | 93 | 11) # of times in this domain load_balance() tried to move one or |
121 | a busier queue while the cpu was just becoming idle | 94 | more tasks and failed, when the cpu was busy |
122 | 13) # of times in this domain a busier queue was found while the cpu was | 95 | 12) sum of imbalances discovered (if any) with each call to |
123 | idle but no busier group was found | 96 | load_balance() in this domain when the cpu was busy |
124 | 14) # of times in this domain a busier queue was found while the cpu was | 97 | 13) # of times in this domain pull_task() was called when busy |
125 | busy but no busier group was found | 98 | 14) # of times in this domain pull_task() was called even though the |
126 | 15) # of times in this domain a busier queue was found while the cpu was | 99 | target task was cache-hot when busy |
127 | just becoming idle but no busier group was found | 100 | 15) # of times in this domain load_balance() was called but did not |
128 | 101 | find a busier queue while the cpu was busy | |
129 | Next two are sched_balance_exec() statistics: | 102 | 16) # of times in this domain a busier queue was found while the cpu |
130 | 17) # of times in this domain sched_balance_exec() successfully pushed | 103 | was busy but no busier group was found |
131 | a task to a new cpu | 104 | |
132 | 18) # of times in this domain sched_balance_exec() tried but failed to | 105 | 17) # of times in this domain load_balance() was called when the |
133 | push a task to a new cpu | 106 | cpu was just becoming idle |
134 | 107 | 18) # of times in this domain load_balance() checked but found the | |
135 | Next two are try_to_wake_up() statistics: | 108 | load did not require balancing when the cpu was just becoming idle |
136 | 19) # of times in this domain try_to_wake_up() tried to move a task based | 109 | 19) # of times in this domain load_balance() tried to move one or more |
137 | on affinity and cache warmth | 110 | tasks and failed, when the cpu was just becoming idle |
138 | 20) # of times in this domain try_to_wake_up() tried to move a task based | 111 | 20) sum of imbalances discovered (if any) with each call to |
139 | on load balancing | 112 | load_balance() in this domain when the cpu was just becoming idle |
140 | 113 | 21) # of times in this domain pull_task() was called when newly idle | |
114 | 22) # of times in this domain pull_task() was called even though the | ||
115 | target task was cache-hot when just becoming idle | ||
116 | 23) # of times in this domain load_balance() was called but did not | ||
117 | find a busier queue while the cpu was just becoming idle | ||
118 | 24) # of times in this domain a busier queue was found while the cpu | ||
119 | was just becoming idle but no busier group was found | ||
120 | |||
121 | Next three are active_load_balance() statistics: | ||
122 | 25) # of times active_load_balance() was called | ||
123 | 26) # of times active_load_balance() tried to move a task and failed | ||
124 | 27) # of times active_load_balance() successfully moved a task | ||
125 | |||
126 | Next three are sched_balance_exec() statistics: | ||
127 | 28) sbe_cnt is not used | ||
128 | 29) sbe_balanced is not used | ||
129 | 30) sbe_pushed is not used | ||
130 | |||
131 | Next three are sched_balance_fork() statistics: | ||
132 | 31) sbf_cnt is not used | ||
133 | 32) sbf_balanced is not used | ||
134 | 33) sbf_pushed is not used | ||
135 | |||
136 | Next three are try_to_wake_up() statistics: | ||
137 | 34) # of times in this domain try_to_wake_up() awoke a task that | ||
138 | last ran on a different cpu in this domain | ||
139 | 35) # of times in this domain try_to_wake_up() moved a task to the | ||
140 | waking cpu because it was cache-cold on its own cpu anyway | ||
141 | 36) # of times in this domain try_to_wake_up() started passive balancing | ||
141 | 142 | ||
142 | /proc/<pid>/schedstat | 143 | /proc/<pid>/schedstat |
143 | ---------------- | 144 | ---------------- |
144 | schedstats also adds a new /proc/<pid>/schedstat file to include some of | 145 | schedstats also adds a new /proc/<pid>/schedstat file to include some of |
145 | the same information on a per-process level. There are three fields in | 146 | the same information on a per-process level. There are three fields in |
146 | this file correlating to fields 20, 21, and 22 in the CPU fields, but | 147 | this file correlating for that process to: |
147 | they only apply for that process. | 148 | 1) time spent on the cpu |
149 | 2) time spent waiting on a runqueue | ||
150 | 3) # of timeslices run on this cpu | ||
148 | 151 | ||
149 | A program could be easily written to make use of these extra fields to | 152 | A program could be easily written to make use of these extra fields to |
150 | report on how well a particular process or set of processes is faring | 153 | report on how well a particular process or set of processes is faring |
151 | under the scheduler's policies. A simple version of such a program is | 154 | under the scheduler's policies. A simple version of such a program is |
152 | available at | 155 | available at |
153 | http://eaglet.rain.com/rick/linux/schedstat/v10/latency.c | 156 | http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c |
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c new file mode 100644 index 000000000000..218e86215297 --- /dev/null +++ b/Documentation/spi/spidev_test.c | |||
@@ -0,0 +1,202 @@ | |||
1 | /* | ||
2 | * SPI testing utility (using spidev driver) | ||
3 | * | ||
4 | * Copyright (c) 2007 MontaVista Software, Inc. | ||
5 | * Copyright (c) 2007 Anton Vorontsov <avorontsov@ru.mvista.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License. | ||
10 | * | ||
11 | * Cross-compile with cross-gcc -I/path/to/cross-kernel/include | ||
12 | */ | ||
13 | |||
14 | #include <stdint.h> | ||
15 | #include <unistd.h> | ||
16 | #include <stdio.h> | ||
17 | #include <stdlib.h> | ||
18 | #include <getopt.h> | ||
19 | #include <fcntl.h> | ||
20 | #include <sys/ioctl.h> | ||
21 | #include <linux/types.h> | ||
22 | #include <linux/spi/spidev.h> | ||
23 | |||
24 | #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) | ||
25 | |||
/*
 * Fatal-error helper: print the message `s` together with the text for
 * the current errno value (via perror), then terminate the process with
 * abort().  Does not return.
 */
static void pabort(const char *s)
{
	perror(s);
	abort();
}
31 | |||
/*
 * Transfer settings.  The values below are the defaults; parse_opts()
 * overrides them from the command line.
 */
static const char *device = "/dev/spidev1.1";	/* spidev node to open */
static uint8_t mode;				/* SPI_* mode flags, OR-ed together */
static uint8_t bits = 8;			/* bits per word */
static uint32_t speed = 500000;			/* max speed in Hz */
static uint16_t delay;				/* delay in usec */
37 | |||
38 | static void transfer(int fd) | ||
39 | { | ||
40 | int ret; | ||
41 | uint8_t tx[] = { | ||
42 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
43 | 0x40, 0x00, 0x00, 0x00, 0x00, 0x95, | ||
44 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
45 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
46 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||
47 | 0xDE, 0xAD, 0xBE, 0xEF, 0xBA, 0xAD, | ||
48 | 0xF0, 0x0D, | ||
49 | }; | ||
50 | uint8_t rx[ARRAY_SIZE(tx)] = {0, }; | ||
51 | struct spi_ioc_transfer tr = { | ||
52 | .tx_buf = (unsigned long)tx, | ||
53 | .rx_buf = (unsigned long)rx, | ||
54 | .len = ARRAY_SIZE(tx), | ||
55 | .delay_usecs = delay, | ||
56 | .speed_hz = speed, | ||
57 | .bits_per_word = bits, | ||
58 | }; | ||
59 | |||
60 | ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); | ||
61 | if (ret == 1) | ||
62 | pabort("can't send spi message"); | ||
63 | |||
64 | for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { | ||
65 | if (!(ret % 6)) | ||
66 | puts(""); | ||
67 | printf("%.2X ", rx[ret]); | ||
68 | } | ||
69 | puts(""); | ||
70 | } | ||
71 | |||
/*
 * Print the command synopsis for `prog` followed by the list of
 * supported options to stdout, then exit with status 1.
 */
void print_usage(char *prog)
{
	static const char option_help[] =
		" -D --device device to use (default /dev/spidev1.1)\n"
		" -s --speed max speed (Hz)\n"
		" -d --delay delay (usec)\n"
		" -b --bpw bits per word \n"
		" -l --loop loopback\n"
		" -H --cpha clock phase\n"
		" -O --cpol clock polarity\n"
		" -L --lsb least significant bit first\n"
		" -C --cs-high chip select active high\n"
		" -3 --3wire SI/SO signals shared\n";

	printf("Usage: %s [-DsbdlHOLC3]\n", prog);
	puts(option_help);
	exit(1);
}
87 | |||
88 | void parse_opts(int argc, char *argv[]) | ||
89 | { | ||
90 | while (1) { | ||
91 | static struct option lopts[] = { | ||
92 | { "device", 1, 0, 'D' }, | ||
93 | { "speed", 1, 0, 's' }, | ||
94 | { "delay", 1, 0, 'd' }, | ||
95 | { "bpw", 1, 0, 'b' }, | ||
96 | { "loop", 0, 0, 'l' }, | ||
97 | { "cpha", 0, 0, 'H' }, | ||
98 | { "cpol", 0, 0, 'O' }, | ||
99 | { "lsb", 0, 0, 'L' }, | ||
100 | { "cs-high", 0, 0, 'C' }, | ||
101 | { "3wire", 0, 0, '3' }, | ||
102 | { NULL, 0, 0, 0 }, | ||
103 | }; | ||
104 | int c; | ||
105 | |||
106 | c = getopt_long(argc, argv, "D:s:d:b:lHOLC3", lopts, NULL); | ||
107 | |||
108 | if (c == -1) | ||
109 | break; | ||
110 | |||
111 | switch (c) { | ||
112 | case 'D': | ||
113 | device = optarg; | ||
114 | break; | ||
115 | case 's': | ||
116 | speed = atoi(optarg); | ||
117 | break; | ||
118 | case 'd': | ||
119 | delay = atoi(optarg); | ||
120 | break; | ||
121 | case 'b': | ||
122 | bits = atoi(optarg); | ||
123 | break; | ||
124 | case 'l': | ||
125 | mode |= SPI_LOOP; | ||
126 | break; | ||
127 | case 'H': | ||
128 | mode |= SPI_CPHA; | ||
129 | break; | ||
130 | case 'O': | ||
131 | mode |= SPI_CPOL; | ||
132 | break; | ||
133 | case 'L': | ||
134 | mode |= SPI_LSB_FIRST; | ||
135 | break; | ||
136 | case 'C': | ||
137 | mode |= SPI_CS_HIGH; | ||
138 | break; | ||
139 | case '3': | ||
140 | mode |= SPI_3WIRE; | ||
141 | break; | ||
142 | default: | ||
143 | print_usage(argv[0]); | ||
144 | break; | ||
145 | } | ||
146 | } | ||
147 | } | ||
148 | |||
149 | int main(int argc, char *argv[]) | ||
150 | { | ||
151 | int ret = 0; | ||
152 | int fd; | ||
153 | |||
154 | parse_opts(argc, argv); | ||
155 | |||
156 | fd = open(device, O_RDWR); | ||
157 | if (fd < 0) | ||
158 | pabort("can't open device"); | ||
159 | |||
160 | /* | ||
161 | * spi mode | ||
162 | */ | ||
163 | ret = ioctl(fd, SPI_IOC_WR_MODE, &mode); | ||
164 | if (ret == -1) | ||
165 | pabort("can't set spi mode"); | ||
166 | |||
167 | ret = ioctl(fd, SPI_IOC_RD_MODE, &mode); | ||
168 | if (ret == -1) | ||
169 | pabort("can't get spi mode"); | ||
170 | |||
171 | /* | ||
172 | * bits per word | ||
173 | */ | ||
174 | ret = ioctl(fd, SPI_IOC_WR_BITS_PER_WORD, &bits); | ||
175 | if (ret == -1) | ||
176 | pabort("can't set bits per word"); | ||
177 | |||
178 | ret = ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits); | ||
179 | if (ret == -1) | ||
180 | pabort("can't get bits per word"); | ||
181 | |||
182 | /* | ||
183 | * max speed hz | ||
184 | */ | ||
185 | ret = ioctl(fd, SPI_IOC_WR_MAX_SPEED_HZ, &speed); | ||
186 | if (ret == -1) | ||
187 | pabort("can't set max speed hz"); | ||
188 | |||
189 | ret = ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed); | ||
190 | if (ret == -1) | ||
191 | pabort("can't get max speed hz"); | ||
192 | |||
193 | printf("spi mode: %d\n", mode); | ||
194 | printf("bits per word: %d\n", bits); | ||
195 | printf("max speed: %d Hz (%d KHz)\n", speed, speed/1000); | ||
196 | |||
197 | transfer(fd); | ||
198 | |||
199 | close(fd); | ||
200 | |||
201 | return ret; | ||
202 | } | ||
diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt index a2afca3b2bab..847b342b7b20 100644 --- a/Documentation/stable_api_nonsense.txt +++ b/Documentation/stable_api_nonsense.txt | |||
@@ -10,7 +10,7 @@ kernel to userspace interfaces. The kernel to userspace interface is | |||
10 | the one that application programs use, the syscall interface. That | 10 | the one that application programs use, the syscall interface. That |
11 | interface is _very_ stable over time, and will not break. I have old | 11 | interface is _very_ stable over time, and will not break. I have old |
12 | programs that were built on a pre 0.9something kernel that still work | 12 | programs that were built on a pre 0.9something kernel that still work |
13 | just fine on the latest 2.6 kernel release. This interface is the one | 13 | just fine on the latest 2.6 kernel release. That interface is the one |
14 | that users and application programmers can count on being stable. | 14 | that users and application programmers can count on being stable. |
15 | 15 | ||
16 | 16 | ||
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt index 42861bb0bc9b..80ef562160bb 100644 --- a/Documentation/sysfs-rules.txt +++ b/Documentation/sysfs-rules.txt | |||
@@ -1,19 +1,18 @@ | |||
1 | Rules on how to access information in the Linux kernel sysfs | 1 | Rules on how to access information in the Linux kernel sysfs |
2 | 2 | ||
3 | The kernel exported sysfs exports internal kernel implementation-details | 3 | The kernel-exported sysfs exports internal kernel implementation details |
4 | and depends on internal kernel structures and layout. It is agreed upon | 4 | and depends on internal kernel structures and layout. It is agreed upon |
5 | by the kernel developers that the Linux kernel does not provide a stable | 5 | by the kernel developers that the Linux kernel does not provide a stable |
6 | internal API. As sysfs is a direct export of kernel internal | 6 | internal API. As sysfs is a direct export of kernel internal |
7 | structures, the sysfs interface can not provide a stable interface eighter, | 7 | structures, the sysfs interface cannot provide a stable interface either; |
8 | it may always change along with internal kernel changes. | 8 | it may always change along with internal kernel changes. |
9 | 9 | ||
10 | To minimize the risk of breaking users of sysfs, which are in most cases | 10 | To minimize the risk of breaking users of sysfs, which are in most cases |
11 | low-level userspace applications, with a new kernel release, the users | 11 | low-level userspace applications, with a new kernel release, the users |
12 | of sysfs must follow some rules to use an as abstract-as-possible way to | 12 | of sysfs must follow some rules to use an as-abstract-as-possible way to |
13 | access this filesystem. The current udev and HAL programs already | 13 | access this filesystem. The current udev and HAL programs already |
14 | implement this and users are encouraged to plug, if possible, into the | 14 | implement this and users are encouraged to plug, if possible, into the |
15 | abstractions these programs provide instead of accessing sysfs | 15 | abstractions these programs provide instead of accessing sysfs directly. |
16 | directly. | ||
17 | 16 | ||
18 | But if you really do want or need to access sysfs directly, please follow | 17 | But if you really do want or need to access sysfs directly, please follow |
19 | the following rules and then your programs should work with future | 18 | the following rules and then your programs should work with future |
@@ -25,22 +24,22 @@ versions of the sysfs interface. | |||
25 | implementation details in its own API. Therefore it is not better than | 24 | implementation details in its own API. Therefore it is not better than |
26 | reading directories and opening the files yourself. | 25 | reading directories and opening the files yourself. |
27 | Also, it is not actively maintained, in the sense of reflecting the | 26 | Also, it is not actively maintained, in the sense of reflecting the |
28 | current kernel-development. The goal of providing a stable interface | 27 | current kernel development. The goal of providing a stable interface |
29 | to sysfs has failed, it causes more problems, than it solves. It | 28 | to sysfs has failed; it causes more problems than it solves. It |
30 | violates many of the rules in this document. | 29 | violates many of the rules in this document. |
31 | 30 | ||
32 | - sysfs is always at /sys | 31 | - sysfs is always at /sys |
33 | Parsing /proc/mounts is a waste of time. Other mount points are a | 32 | Parsing /proc/mounts is a waste of time. Other mount points are a |
34 | system configuration bug you should not try to solve. For test cases, | 33 | system configuration bug you should not try to solve. For test cases, |
35 | possibly support a SYSFS_PATH environment variable to overwrite the | 34 | possibly support a SYSFS_PATH environment variable to overwrite the |
36 | applications behavior, but never try to search for sysfs. Never try | 35 | application's behavior, but never try to search for sysfs. Never try |
37 | to mount it, if you are not an early boot script. | 36 | to mount it, if you are not an early boot script. |
38 | 37 | ||
39 | - devices are only "devices" | 38 | - devices are only "devices" |
40 | There is no such thing like class-, bus-, physical devices, | 39 | There is no such thing like class-, bus-, physical devices, |
41 | interfaces, and such that you can rely on in userspace. Everything is | 40 | interfaces, and such that you can rely on in userspace. Everything is |
42 | just simply a "device". Class-, bus-, physical, ... types are just | 41 | just simply a "device". Class-, bus-, physical, ... types are just |
43 | kernel implementation details, which should not be expected by | 42 | kernel implementation details which should not be expected by |
44 | applications that look for devices in sysfs. | 43 | applications that look for devices in sysfs. |
45 | 44 | ||
46 | The properties of a device are: | 45 | The properties of a device are: |
@@ -48,11 +47,11 @@ versions of the sysfs interface. | |||
48 | - identical to the DEVPATH value in the event sent from the kernel | 47 | - identical to the DEVPATH value in the event sent from the kernel |
49 | at device creation and removal | 48 | at device creation and removal |
50 | - the unique key to the device at that point in time | 49 | - the unique key to the device at that point in time |
51 | - the kernels path to the device-directory without the leading | 50 | - the kernel's path to the device directory without the leading |
52 | /sys, and always starting with a slash | 51 | /sys, and always starting with a slash |
53 | - all elements of a devpath must be real directories. Symlinks | 52 | - all elements of a devpath must be real directories. Symlinks |
54 | pointing to /sys/devices must always be resolved to their real | 53 | pointing to /sys/devices must always be resolved to their real |
55 | target, and the target path must be used to access the device. | 54 | target and the target path must be used to access the device. |
56 | That way the devpath to the device matches the devpath of the | 55 | That way the devpath to the device matches the devpath of the |
57 | kernel used at event time. | 56 | kernel used at event time. |
58 | - using or exposing symlink values as elements in a devpath string | 57 | - using or exposing symlink values as elements in a devpath string |
@@ -73,17 +72,17 @@ versions of the sysfs interface. | |||
73 | link | 72 | link |
74 | - it is retrieved by reading the "driver"-link and using only the | 73 | - it is retrieved by reading the "driver"-link and using only the |
75 | last element of the target path | 74 | last element of the target path |
76 | - devices which do not have "driver"-link, just do not have a | 75 | - devices which do not have "driver"-link just do not have a |
77 | driver; copying the driver value in a child device context, is a | 76 | driver; copying the driver value in a child device context is a |
78 | bug in the application | 77 | bug in the application |
79 | 78 | ||
80 | o attributes | 79 | o attributes |
81 | - the files in the device directory or files below a subdirectories | 80 | - the files in the device directory or files below subdirectories |
82 | of the same device directory | 81 | of the same device directory |
83 | - accessing attributes reached by a symlink pointing to another device, | 82 | - accessing attributes reached by a symlink pointing to another device, |
84 | like the "device"-link, is a bug in the application | 83 | like the "device"-link, is a bug in the application |
85 | 84 | ||
86 | Everything else is just a kernel driver-core implementation detail, | 85 | Everything else is just a kernel driver-core implementation detail |
87 | that should not be assumed to be stable across kernel releases. | 86 | that should not be assumed to be stable across kernel releases. |
88 | 87 | ||
89 | - Properties of parent devices never belong into a child device. | 88 | - Properties of parent devices never belong into a child device. |
@@ -91,25 +90,25 @@ versions of the sysfs interface. | |||
91 | context properties. If the device 'eth0' or 'sda' does not have a | 90 | context properties. If the device 'eth0' or 'sda' does not have a |
92 | "driver"-link, then this device does not have a driver. Its value is empty. | 91 | "driver"-link, then this device does not have a driver. Its value is empty. |
93 | Never copy any property of the parent-device into a child-device. Parent | 92 | Never copy any property of the parent-device into a child-device. Parent |
94 | device-properties may change dynamically without any notice to the | 93 | device properties may change dynamically without any notice to the |
95 | child device. | 94 | child device. |
96 | 95 | ||
97 | - Hierarchy in a single device-tree | 96 | - Hierarchy in a single device tree |
98 | There is only one valid place in sysfs where hierarchy can be examined | 97 | There is only one valid place in sysfs where hierarchy can be examined |
99 | and this is below: /sys/devices. | 98 | and this is below: /sys/devices. |
100 | It is planned, that all device directories will end up in the tree | 99 | It is planned that all device directories will end up in the tree |
101 | below this directory. | 100 | below this directory. |
102 | 101 | ||
103 | - Classification by subsystem | 102 | - Classification by subsystem |
104 | There are currently three places for classification of devices: | 103 | There are currently three places for classification of devices: |
105 | /sys/block, /sys/class and /sys/bus. It is planned that these will | 104 | /sys/block, /sys/class and /sys/bus. It is planned that these will |
106 | not contain any device-directories themselves, but only flat lists of | 105 | not contain any device directories themselves, but only flat lists of |
107 | symlinks pointing to the unified /sys/devices tree. | 106 | symlinks pointing to the unified /sys/devices tree. |
108 | All three places have completely different rules on how to access | 107 | All three places have completely different rules on how to access |
109 | device information. It is planned to merge all three | 108 | device information. It is planned to merge all three |
110 | classification-directories into one place at /sys/subsystem, | 109 | classification directories into one place at /sys/subsystem, |
111 | following the layout of the bus-directories. All buses and | 110 | following the layout of the bus directories. All buses and |
112 | classes, including the converted block-subsystem, will show up | 111 | classes, including the converted block subsystem, will show up |
113 | there. | 112 | there. |
114 | The devices belonging to a subsystem will create a symlink in the | 113 | The devices belonging to a subsystem will create a symlink in the |
115 | "devices" directory at /sys/subsystem/<name>/devices. | 114 | "devices" directory at /sys/subsystem/<name>/devices. |
@@ -121,38 +120,38 @@ versions of the sysfs interface. | |||
121 | subsystem name. | 120 | subsystem name. |
122 | 121 | ||
123 | Assuming /sys/class/<subsystem> and /sys/bus/<subsystem>, or | 122 | Assuming /sys/class/<subsystem> and /sys/bus/<subsystem>, or |
124 | /sys/block and /sys/class/block are not interchangeable, is a bug in | 123 | /sys/block and /sys/class/block are not interchangeable is a bug in |
125 | the application. | 124 | the application. |
126 | 125 | ||
127 | - Block | 126 | - Block |
128 | The converted block-subsystem at /sys/class/block, or | 127 | The converted block subsystem at /sys/class/block or |
129 | /sys/subsystem/block will contain the links for disks and partitions | 128 | /sys/subsystem/block will contain the links for disks and partitions |
130 | at the same level, never in a hierarchy. Assuming the block-subsytem to | 129 | at the same level, never in a hierarchy. Assuming the block subsystem to |
131 | contain only disks and not partition-devices in the same flat list is | 130 | contain only disks and not partition devices in the same flat list is |
132 | a bug in the application. | 131 | a bug in the application. |
133 | 132 | ||
134 | - "device"-link and <subsystem>:<kernel name>-links | 133 | - "device"-link and <subsystem>:<kernel name>-links |
135 | Never depend on the "device"-link. The "device"-link is a workaround | 134 | Never depend on the "device"-link. The "device"-link is a workaround |
136 | for the old layout, where class-devices are not created in | 135 | for the old layout, where class devices are not created in |
137 | /sys/devices/ like the bus-devices. If the link-resolving of a | 136 | /sys/devices/ like the bus devices. If the link-resolving of a |
138 | device-directory does not end in /sys/devices/, you can use the | 137 | device directory does not end in /sys/devices/, you can use the |
139 | "device"-link to find the parent devices in /sys/devices/. That is the | 138 | "device"-link to find the parent devices in /sys/devices/. That is the |
140 | single valid use of the "device"-link, it must never appear in any | 139 | single valid use of the "device"-link; it must never appear in any |
141 | path as an element. Assuming the existence of the "device"-link for | 140 | path as an element. Assuming the existence of the "device"-link for |
142 | a device in /sys/devices/ is a bug in the application. | 141 | a device in /sys/devices/ is a bug in the application. |
143 | Accessing /sys/class/net/eth0/device is a bug in the application. | 142 | Accessing /sys/class/net/eth0/device is a bug in the application. |
144 | 143 | ||
145 | Never depend on the class-specific links back to the /sys/class | 144 | Never depend on the class-specific links back to the /sys/class |
146 | directory. These links are also a workaround for the design mistake | 145 | directory. These links are also a workaround for the design mistake |
147 | that class-devices are not created in /sys/devices. If a device | 146 | that class devices are not created in /sys/devices. If a device |
148 | directory does not contain directories for child devices, these links | 147 | directory does not contain directories for child devices, these links |
149 | may be used to find the child devices in /sys/class. That is the single | 148 | may be used to find the child devices in /sys/class. That is the single |
150 | valid use of these links, they must never appear in any path as an | 149 | valid use of these links; they must never appear in any path as an |
151 | element. Assuming the existence of these links for devices which are | 150 | element. Assuming the existence of these links for devices which are |
152 | real child device directories in the /sys/devices tree, is a bug in | 151 | real child device directories in the /sys/devices tree is a bug in |
153 | the application. | 152 | the application. |
154 | 153 | ||
155 | It is planned to remove all these links when when all class-device | 154 | It is planned to remove all these links when all class device |
156 | directories live in /sys/devices. | 155 | directories live in /sys/devices. |
157 | 156 | ||
158 | - Position of devices along device chain can change. | 157 | - Position of devices along device chain can change. |
@@ -161,6 +160,5 @@ versions of the sysfs interface. | |||
161 | the chain. You must always request the parent device you are looking for | 160 | the chain. You must always request the parent device you are looking for |
162 | by its subsystem value. You need to walk up the chain until you find | 161 | by its subsystem value. You need to walk up the chain until you find |
163 | the device that matches the expected subsystem. Depending on a specific | 162 | the device that matches the expected subsystem. Depending on a specific |
164 | position of a parent device, or exposing relative paths, using "../" to | 163 | position of a parent device or exposing relative paths using "../" to |
165 | access the chain of parents, is a bug in the application. | 164 | access the chain of parents is a bug in the application. |
166 | |||
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index ba328f255417..ef19142896ca 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | Linux Magic System Request Key Hacks | 1 | Linux Magic System Request Key Hacks |
2 | Documentation for sysrq.c | 2 | Documentation for sysrq.c |
3 | Last update: 2007-MAR-14 | 3 | Last update: 2007-AUG-04 |
4 | 4 | ||
5 | * What is the magic SysRq key? | 5 | * What is the magic SysRq key? |
6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 6 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
@@ -78,7 +78,7 @@ On all - write a character to /proc/sysrq-trigger. e.g.: | |||
78 | 'g' - Used by kgdb on ppc and sh platforms. | 78 | 'g' - Used by kgdb on ppc and sh platforms. |
79 | 79 | ||
80 | 'h' - Will display help (actually any other key than those listed | 80 | 'h' - Will display help (actually any other key than those listed |
81 | above will display help, but 'h' is easy to remember :-) | 81 | here will display help, but 'h' is easy to remember :-) |
82 | 82 | ||
83 | 'i' - Send a SIGKILL to all processes, except for init. | 83 | 'i' - Send a SIGKILL to all processes, except for init. |
84 | 84 | ||
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt index 6711fbcf4080..eb2f5986e1eb 100644 --- a/Documentation/thinkpad-acpi.txt +++ b/Documentation/thinkpad-acpi.txt | |||
@@ -105,10 +105,10 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver | |||
105 | as a driver attribute (see below). | 105 | as a driver attribute (see below). |
106 | 106 | ||
107 | Sysfs driver attributes are on the driver's sysfs attribute space, | 107 | Sysfs driver attributes are on the driver's sysfs attribute space, |
108 | for 2.6.20 this is /sys/bus/platform/drivers/thinkpad-acpi/. | 108 | for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/. |
109 | 109 | ||
110 | Sysfs device attributes are on the device's sysfs attribute space, | 110 | Sysfs device attributes are on the device's sysfs attribute space, |
111 | for 2.6.20 this is /sys/devices/platform/thinkpad-acpi/. | 111 | for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/. |
112 | 112 | ||
113 | Driver version | 113 | Driver version |
114 | -------------- | 114 | -------------- |
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index d4f21ffd1404..1af7bd5a2183 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c | |||
@@ -396,7 +396,7 @@ void report(struct slabinfo *s) | |||
396 | if (strcmp(s->name, "*") == 0) | 396 | if (strcmp(s->name, "*") == 0) |
397 | return; | 397 | return; |
398 | 398 | ||
399 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %d\n", | 399 | printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n", |
400 | s->name, s->aliases, s->order, s->objects); | 400 | s->name, s->aliases, s->order, s->objects); |
401 | if (s->hwcache_align) | 401 | if (s->hwcache_align) |
402 | printf("** Hardware cacheline aligned\n"); | 402 | printf("** Hardware cacheline aligned\n"); |