aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DMA-API.txt79
-rw-r--r--Documentation/DocBook/kernel-api.tmpl19
-rw-r--r--Documentation/block/barrier.txt6
-rw-r--r--Documentation/block/biodoc.txt10
-rw-r--r--Documentation/block/request.txt2
-rw-r--r--Documentation/dontdiff3
-rw-r--r--Documentation/feature-removal-schedule.txt16
-rw-r--r--Documentation/filesystems/hfsplus.txt59
-rw-r--r--Documentation/gpio.txt4
-rw-r--r--Documentation/hpet.txt2
-rw-r--r--Documentation/hwmon/adm10314
-rw-r--r--Documentation/hwmon/thmc5074
-rw-r--r--Documentation/iostats.txt2
-rw-r--r--Documentation/ja_JP/HOWTO66
-rw-r--r--Documentation/ja_JP/stable_api_nonsense.txt20
-rw-r--r--Documentation/kernel-parameters.txt145
-rw-r--r--Documentation/keys.txt5
-rw-r--r--Documentation/kobject.txt178
-rw-r--r--Documentation/lguest/Makefile3
-rw-r--r--Documentation/lguest/extract58
-rw-r--r--Documentation/lguest/lguest.c702
-rw-r--r--Documentation/sched-stats.txt195
-rw-r--r--Documentation/spi/spidev_test.c202
-rw-r--r--Documentation/stable_api_nonsense.txt2
-rw-r--r--Documentation/sysfs-rules.txt72
25 files changed, 1382 insertions, 546 deletions
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 805db4b2cba6..cc7a8c39fb6f 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -26,7 +26,7 @@ Part Ia - Using large dma-coherent buffers
26 26
27void * 27void *
28dma_alloc_coherent(struct device *dev, size_t size, 28dma_alloc_coherent(struct device *dev, size_t size,
29 dma_addr_t *dma_handle, int flag) 29 dma_addr_t *dma_handle, gfp_t flag)
30void * 30void *
31pci_alloc_consistent(struct pci_dev *dev, size_t size, 31pci_alloc_consistent(struct pci_dev *dev, size_t size,
32 dma_addr_t *dma_handle) 32 dma_addr_t *dma_handle)
@@ -38,7 +38,7 @@ to make sure to flush the processor's write buffers before telling
38devices to read that memory.) 38devices to read that memory.)
39 39
40This routine allocates a region of <size> bytes of consistent memory. 40This routine allocates a region of <size> bytes of consistent memory.
41it also returns a <dma_handle> which may be cast to an unsigned 41It also returns a <dma_handle> which may be cast to an unsigned
42integer the same width as the bus and used as the physical address 42integer the same width as the bus and used as the physical address
43base of the region. 43base of the region.
44 44
@@ -52,21 +52,21 @@ The simplest way to do that is to use the dma_pool calls (see below).
52 52
53The flag parameter (dma_alloc_coherent only) allows the caller to 53The flag parameter (dma_alloc_coherent only) allows the caller to
54specify the GFP_ flags (see kmalloc) for the allocation (the 54specify the GFP_ flags (see kmalloc) for the allocation (the
55implementation may chose to ignore flags that affect the location of 55implementation may choose to ignore flags that affect the location of
56the returned memory, like GFP_DMA). For pci_alloc_consistent, you 56the returned memory, like GFP_DMA). For pci_alloc_consistent, you
57must assume GFP_ATOMIC behaviour. 57must assume GFP_ATOMIC behaviour.
58 58
59void 59void
60dma_free_coherent(struct device *dev, size_t size, void *cpu_addr 60dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
61 dma_addr_t dma_handle) 61 dma_addr_t dma_handle)
62void 62void
63pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr 63pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
64 dma_addr_t dma_handle) 64 dma_addr_t dma_handle)
65 65
66Free the region of consistent memory you previously allocated. dev, 66Free the region of consistent memory you previously allocated. dev,
67size and dma_handle must all be the same as those passed into the 67size and dma_handle must all be the same as those passed into the
68consistent allocate. cpu_addr must be the virtual address returned by 68consistent allocate. cpu_addr must be the virtual address returned by
69the consistent allocate 69the consistent allocate.
70 70
71 71
72Part Ib - Using small dma-coherent buffers 72Part Ib - Using small dma-coherent buffers
@@ -77,9 +77,9 @@ To get this part of the dma_ API, you must #include <linux/dmapool.h>
77Many drivers need lots of small dma-coherent memory regions for DMA 77Many drivers need lots of small dma-coherent memory regions for DMA
78descriptors or I/O buffers. Rather than allocating in units of a page 78descriptors or I/O buffers. Rather than allocating in units of a page
79or more using dma_alloc_coherent(), you can use DMA pools. These work 79or more using dma_alloc_coherent(), you can use DMA pools. These work
80much like a struct kmem_cache, except that they use the dma-coherent allocator 80much like a struct kmem_cache, except that they use the dma-coherent allocator,
81not __get_free_pages(). Also, they understand common hardware constraints 81not __get_free_pages(). Also, they understand common hardware constraints
82for alignment, like queue heads needing to be aligned on N byte boundaries. 82for alignment, like queue heads needing to be aligned on N-byte boundaries.
83 83
84 84
85 struct dma_pool * 85 struct dma_pool *
@@ -102,15 +102,15 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
102from this pool must not cross 4KByte boundaries. 102from this pool must not cross 4KByte boundaries.
103 103
104 104
105 void *dma_pool_alloc(struct dma_pool *pool, int gfp_flags, 105 void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
106 dma_addr_t *dma_handle); 106 dma_addr_t *dma_handle);
107 107
108 void *pci_pool_alloc(struct pci_pool *pool, int gfp_flags, 108 void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
109 dma_addr_t *dma_handle); 109 dma_addr_t *dma_handle);
110 110
111This allocates memory from the pool; the returned memory will meet the size 111This allocates memory from the pool; the returned memory will meet the size
112and alignment requirements specified at creation time. Pass GFP_ATOMIC to 112and alignment requirements specified at creation time. Pass GFP_ATOMIC to
113prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks) 113prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
114pass GFP_KERNEL to allow blocking. Like dma_alloc_coherent(), this returns 114pass GFP_KERNEL to allow blocking. Like dma_alloc_coherent(), this returns
115two values: an address usable by the cpu, and the dma address usable by the 115two values: an address usable by the cpu, and the dma address usable by the
116pool's device. 116pool's device.
@@ -123,7 +123,7 @@ pool's device.
123 dma_addr_t addr); 123 dma_addr_t addr);
124 124
125This puts memory back into the pool. The pool is what was passed to 125This puts memory back into the pool. The pool is what was passed to
126the pool allocation routine; the cpu and dma addresses are what 126the pool allocation routine; the cpu (vaddr) and dma addresses are what
127were returned when that routine allocated the memory being freed. 127were returned when that routine allocated the memory being freed.
128 128
129 129
@@ -209,18 +209,18 @@ Notes: Not all memory regions in a machine can be mapped by this
209API. Further, regions that appear to be physically contiguous in 209API. Further, regions that appear to be physically contiguous in
210kernel virtual space may not be contiguous as physical memory. Since 210kernel virtual space may not be contiguous as physical memory. Since
211this API does not provide any scatter/gather capability, it will fail 211this API does not provide any scatter/gather capability, it will fail
212if the user tries to map a non physically contiguous piece of memory. 212if the user tries to map a non-physically contiguous piece of memory.
213For this reason, it is recommended that memory mapped by this API be 213For this reason, it is recommended that memory mapped by this API be
214obtained only from sources which guarantee to be physically contiguous 214obtained only from sources which guarantee it to be physically contiguous
215(like kmalloc). 215(like kmalloc).
216 216
217Further, the physical address of the memory must be within the 217Further, the physical address of the memory must be within the
218dma_mask of the device (the dma_mask represents a bit mask of the 218dma_mask of the device (the dma_mask represents a bit mask of the
219addressable region for the device. i.e. if the physical address of 219addressable region for the device. I.e., if the physical address of
220the memory anded with the dma_mask is still equal to the physical 220the memory anded with the dma_mask is still equal to the physical
221address, then the device can perform DMA to the memory). In order to 221address, then the device can perform DMA to the memory). In order to
222ensure that the memory allocated by kmalloc is within the dma_mask, 222ensure that the memory allocated by kmalloc is within the dma_mask,
223the driver may specify various platform dependent flags to restrict 223the driver may specify various platform-dependent flags to restrict
224the physical memory range of the allocation (e.g. on x86, GFP_DMA 224the physical memory range of the allocation (e.g. on x86, GFP_DMA
225guarantees to be within the first 16Mb of available physical memory, 225guarantees to be within the first 16Mb of available physical memory,
226as required by ISA devices). 226as required by ISA devices).
@@ -244,14 +244,14 @@ are guaranteed also to be cache line boundaries).
244 244
245DMA_TO_DEVICE synchronisation must be done after the last modification 245DMA_TO_DEVICE synchronisation must be done after the last modification
246of the memory region by the software and before it is handed off to 246of the memory region by the software and before it is handed off to
247the driver. Once this primitive is used. Memory covered by this 247the driver. Once this primitive is used, memory covered by this
248primitive should be treated as read only by the device. If the device 248primitive should be treated as read-only by the device. If the device
249may write to it at any point, it should be DMA_BIDIRECTIONAL (see 249may write to it at any point, it should be DMA_BIDIRECTIONAL (see
250below). 250below).
251 251
252DMA_FROM_DEVICE synchronisation must be done before the driver 252DMA_FROM_DEVICE synchronisation must be done before the driver
253accesses data that may be changed by the device. This memory should 253accesses data that may be changed by the device. This memory should
254be treated as read only by the driver. If the driver needs to write 254be treated as read-only by the driver. If the driver needs to write
255to it at any point, it should be DMA_BIDIRECTIONAL (see below). 255to it at any point, it should be DMA_BIDIRECTIONAL (see below).
256 256
257DMA_BIDIRECTIONAL requires special handling: it means that the driver 257DMA_BIDIRECTIONAL requires special handling: it means that the driver
@@ -261,7 +261,7 @@ you must always sync bidirectional memory twice: once before the
261memory is handed off to the device (to make sure all memory changes 261memory is handed off to the device (to make sure all memory changes
262are flushed from the processor) and once before the data may be 262are flushed from the processor) and once before the data may be
263accessed after being used by the device (to make sure any processor 263accessed after being used by the device (to make sure any processor
264cache lines are updated with data that the device may have changed. 264cache lines are updated with data that the device may have changed).
265 265
266void 266void
267dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, 267dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
@@ -302,8 +302,8 @@ pci_dma_mapping_error(dma_addr_t dma_addr)
302 302
303In some circumstances dma_map_single and dma_map_page will fail to create 303In some circumstances dma_map_single and dma_map_page will fail to create
304a mapping. A driver can check for these errors by testing the returned 304a mapping. A driver can check for these errors by testing the returned
305dma address with dma_mapping_error(). A non zero return value means the mapping 305dma address with dma_mapping_error(). A non-zero return value means the mapping
306could not be created and the driver should take appropriate action (eg 306could not be created and the driver should take appropriate action (e.g.
307reduce current DMA mapping usage or delay and try again later). 307reduce current DMA mapping usage or delay and try again later).
308 308
309 int 309 int
@@ -315,7 +315,7 @@ reduce current DMA mapping usage or delay and try again later).
315 315
316Maps a scatter gather list from the block layer. 316Maps a scatter gather list from the block layer.
317 317
318Returns: the number of physical segments mapped (this may be shorted 318Returns: the number of physical segments mapped (this may be shorter
319than <nents> passed in if the block layer determines that some 319than <nents> passed in if the block layer determines that some
320elements of the scatter/gather list are physically adjacent and thus 320elements of the scatter/gather list are physically adjacent and thus
321may be mapped with a single entry). 321may be mapped with a single entry).
@@ -357,7 +357,7 @@ accessed sg->address and sg->length as shown above.
357 pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, 357 pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
358 int nents, int direction) 358 int nents, int direction)
359 359
360unmap the previously mapped scatter/gather list. All the parameters 360Unmap the previously mapped scatter/gather list. All the parameters
361must be the same as those passed in to the scatter/gather mapping 361must be the same as those passed in to the scatter/gather mapping
362API. 362API.
363 363
@@ -377,7 +377,7 @@ void
377pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, 377pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg,
378 int nelems, int direction) 378 int nelems, int direction)
379 379
380synchronise a single contiguous or scatter/gather mapping. All the 380Synchronise a single contiguous or scatter/gather mapping. All the
381parameters must be the same as those passed into the single mapping 381parameters must be the same as those passed into the single mapping
382API. 382API.
383 383
@@ -406,7 +406,7 @@ API at all.
406 406
407void * 407void *
408dma_alloc_noncoherent(struct device *dev, size_t size, 408dma_alloc_noncoherent(struct device *dev, size_t size,
409 dma_addr_t *dma_handle, int flag) 409 dma_addr_t *dma_handle, gfp_t flag)
410 410
411Identical to dma_alloc_coherent() except that the platform will 411Identical to dma_alloc_coherent() except that the platform will
412choose to return either consistent or non-consistent memory as it sees 412choose to return either consistent or non-consistent memory as it sees
@@ -426,34 +426,34 @@ void
426dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, 426dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
427 dma_addr_t dma_handle) 427 dma_addr_t dma_handle)
428 428
429free memory allocated by the nonconsistent API. All parameters must 429Free memory allocated by the nonconsistent API. All parameters must
430be identical to those passed in (and returned by 430be identical to those passed in (and returned by
431dma_alloc_noncoherent()). 431dma_alloc_noncoherent()).
432 432
433int 433int
434dma_is_consistent(struct device *dev, dma_addr_t dma_handle) 434dma_is_consistent(struct device *dev, dma_addr_t dma_handle)
435 435
436returns true if the device dev is performing consistent DMA on the memory 436Returns true if the device dev is performing consistent DMA on the memory
437area pointed to by the dma_handle. 437area pointed to by the dma_handle.
438 438
439int 439int
440dma_get_cache_alignment(void) 440dma_get_cache_alignment(void)
441 441
442returns the processor cache alignment. This is the absolute minimum 442Returns the processor cache alignment. This is the absolute minimum
443alignment *and* width that you must observe when either mapping 443alignment *and* width that you must observe when either mapping
444memory or doing partial flushes. 444memory or doing partial flushes.
445 445
446Notes: This API may return a number *larger* than the actual cache 446Notes: This API may return a number *larger* than the actual cache
447line, but it will guarantee that one or more cache lines fit exactly 447line, but it will guarantee that one or more cache lines fit exactly
448into the width returned by this call. It will also always be a power 448into the width returned by this call. It will also always be a power
449of two for easy alignment 449of two for easy alignment.
450 450
451void 451void
452dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, 452dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
453 unsigned long offset, size_t size, 453 unsigned long offset, size_t size,
454 enum dma_data_direction direction) 454 enum dma_data_direction direction)
455 455
456does a partial sync. starting at offset and continuing for size. You 456Does a partial sync, starting at offset and continuing for size. You
457must be careful to observe the cache alignment and width when doing 457must be careful to observe the cache alignment and width when doing
458anything like this. You must also be extra careful about accessing 458anything like this. You must also be extra careful about accessing
459memory you intend to sync partially. 459memory you intend to sync partially.
@@ -472,21 +472,20 @@ dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
472 dma_addr_t device_addr, size_t size, int 472 dma_addr_t device_addr, size_t size, int
473 flags) 473 flags)
474 474
475
476Declare region of memory to be handed out by dma_alloc_coherent when 475Declare region of memory to be handed out by dma_alloc_coherent when
477it's asked for coherent memory for this device. 476it's asked for coherent memory for this device.
478 477
479bus_addr is the physical address to which the memory is currently 478bus_addr is the physical address to which the memory is currently
480assigned in the bus responding region (this will be used by the 479assigned in the bus responding region (this will be used by the
481platform to perform the mapping) 480platform to perform the mapping).
482 481
483device_addr is the physical address the device needs to be programmed 482device_addr is the physical address the device needs to be programmed
484with actually to address this memory (this will be handed out as the 483with actually to address this memory (this will be handed out as the
485dma_addr_t in dma_alloc_coherent()) 484dma_addr_t in dma_alloc_coherent()).
486 485
487size is the size of the area (must be multiples of PAGE_SIZE). 486size is the size of the area (must be multiples of PAGE_SIZE).
488 487
489flags can be or'd together and are 488flags can be or'd together and are:
490 489
491DMA_MEMORY_MAP - request that the memory returned from 490DMA_MEMORY_MAP - request that the memory returned from
492dma_alloc_coherent() be directly writable. 491dma_alloc_coherent() be directly writable.
@@ -494,7 +493,7 @@ dma_alloc_coherent() be directly writable.
494DMA_MEMORY_IO - request that the memory returned from 493DMA_MEMORY_IO - request that the memory returned from
495dma_alloc_coherent() be addressable using read/write/memcpy_toio etc. 494dma_alloc_coherent() be addressable using read/write/memcpy_toio etc.
496 495
497One or both of these flags must be present 496One or both of these flags must be present.
498 497
499DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by 498DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
500dma_alloc_coherent of any child devices of this one (for memory residing 499dma_alloc_coherent of any child devices of this one (for memory residing
@@ -528,7 +527,7 @@ dma_release_declared_memory(struct device *dev)
528Remove the memory region previously declared from the system. This 527Remove the memory region previously declared from the system. This
529API performs *no* in-use checking for this region and will return 528API performs *no* in-use checking for this region and will return
530unconditionally having removed all the required structures. It is the 529unconditionally having removed all the required structures. It is the
531drivers job to ensure that no parts of this memory region are 530driver's job to ensure that no parts of this memory region are
532currently in use. 531currently in use.
533 532
534void * 533void *
@@ -538,12 +537,10 @@ dma_mark_declared_memory_occupied(struct device *dev,
538This is used to occupy specific regions of the declared space 537This is used to occupy specific regions of the declared space
539(dma_alloc_coherent() will hand out the first free region it finds). 538(dma_alloc_coherent() will hand out the first free region it finds).
540 539
541device_addr is the *device* address of the region requested 540device_addr is the *device* address of the region requested.
542 541
543size is the size (and should be a page sized multiple). 542size is the size (and should be a page-sized multiple).
544 543
545The return value will be either a pointer to the processor virtual 544The return value will be either a pointer to the processor virtual
546address of the memory, or an error (via PTR_ERR()) if any part of the 545address of the memory, or an error (via PTR_ERR()) if any part of the
547region is occupied. 546region is occupied.
548
549
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index eb42bf9847cb..b886f52a9aac 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -380,7 +380,6 @@ X!Edrivers/base/interface.c
380!Edrivers/base/bus.c 380!Edrivers/base/bus.c
381 </sect1> 381 </sect1>
382 <sect1><title>Device Drivers Power Management</title> 382 <sect1><title>Device Drivers Power Management</title>
383!Edrivers/base/power/main.c
384!Edrivers/base/power/resume.c 383!Edrivers/base/power/resume.c
385!Edrivers/base/power/suspend.c 384!Edrivers/base/power/suspend.c
386 </sect1> 385 </sect1>
@@ -398,12 +397,12 @@ X!Edrivers/acpi/pci_bind.c
398--> 397-->
399 </sect1> 398 </sect1>
400 <sect1><title>Device drivers PnP support</title> 399 <sect1><title>Device drivers PnP support</title>
401!Edrivers/pnp/core.c 400!Idrivers/pnp/core.c
402<!-- No correct structured comments 401<!-- No correct structured comments
403X!Edrivers/pnp/system.c 402X!Edrivers/pnp/system.c
404 --> 403 -->
405!Edrivers/pnp/card.c 404!Edrivers/pnp/card.c
406!Edrivers/pnp/driver.c 405!Idrivers/pnp/driver.c
407!Edrivers/pnp/manager.c 406!Edrivers/pnp/manager.c
408!Edrivers/pnp/support.c 407!Edrivers/pnp/support.c
409 </sect1> 408 </sect1>
@@ -704,14 +703,22 @@ X!Idrivers/video/console/fonts.c
704 703
705 <chapter id="splice"> 704 <chapter id="splice">
706 <title>splice API</title> 705 <title>splice API</title>
707 <para>) 706 <para>
708 splice is a method for moving blocks of data around inside the 707 splice is a method for moving blocks of data around inside the
709 kernel, without continually transferring it between the kernel 708 kernel, without continually transferring them between the kernel
710 and user space. 709 and user space.
711 </para> 710 </para>
712!Iinclude/linux/splice.h
713!Ffs/splice.c 711!Ffs/splice.c
714 </chapter> 712 </chapter>
715 713
714 <chapter id="pipes">
715 <title>pipes API</title>
716 <para>
717 Pipe interfaces are all for in-kernel (builtin image) use.
718 They are not exported for use by modules.
719 </para>
720!Iinclude/linux/pipe_fs_i.h
721!Ffs/pipe.c
722 </chapter>
716 723
717</book> 724</book>
diff --git a/Documentation/block/barrier.txt b/Documentation/block/barrier.txt
index 7d279f2f5bb2..2c2f24f634e4 100644
--- a/Documentation/block/barrier.txt
+++ b/Documentation/block/barrier.txt
@@ -79,9 +79,9 @@ and how to prepare flush requests. Note that the term 'ordered' is
79used to indicate the whole sequence of performing barrier requests 79used to indicate the whole sequence of performing barrier requests
80including draining and flushing. 80including draining and flushing.
81 81
82typedef void (prepare_flush_fn)(request_queue_t *q, struct request *rq); 82typedef void (prepare_flush_fn)(struct request_queue *q, struct request *rq);
83 83
84int blk_queue_ordered(request_queue_t *q, unsigned ordered, 84int blk_queue_ordered(struct request_queue *q, unsigned ordered,
85 prepare_flush_fn *prepare_flush_fn); 85 prepare_flush_fn *prepare_flush_fn);
86 86
87@q : the queue in question 87@q : the queue in question
@@ -92,7 +92,7 @@ int blk_queue_ordered(request_queue_t *q, unsigned ordered,
92For example, SCSI disk driver's prepare_flush_fn looks like the 92For example, SCSI disk driver's prepare_flush_fn looks like the
93following. 93following.
94 94
95static void sd_prepare_flush(request_queue_t *q, struct request *rq) 95static void sd_prepare_flush(struct request_queue *q, struct request *rq)
96{ 96{
97 memset(rq->cmd, 0, sizeof(rq->cmd)); 97 memset(rq->cmd, 0, sizeof(rq->cmd));
98 rq->cmd_type = REQ_TYPE_BLOCK_PC; 98 rq->cmd_type = REQ_TYPE_BLOCK_PC;
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 3adaace328a6..8af392fc6ef0 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -740,12 +740,12 @@ Block now offers some simple generic functionality to help support command
740queueing (typically known as tagged command queueing), ie manage more than 740queueing (typically known as tagged command queueing), ie manage more than
741one outstanding command on a queue at any given time. 741one outstanding command on a queue at any given time.
742 742
743 blk_queue_init_tags(request_queue_t *q, int depth) 743 blk_queue_init_tags(struct request_queue *q, int depth)
744 744
745 Initialize internal command tagging structures for a maximum 745 Initialize internal command tagging structures for a maximum
746 depth of 'depth'. 746 depth of 'depth'.
747 747
748 blk_queue_free_tags(request_queue_t *q) 748 blk_queue_free_tags(struct request_queue *q)
749 749
750 Teardown tag info associated with the queue. This will be done 750 Teardown tag info associated with the queue. This will be done
751 automatically by block if blk_queue_cleanup() is called on a queue 751 automatically by block if blk_queue_cleanup() is called on a queue
@@ -754,7 +754,7 @@ one outstanding command on a queue at any given time.
754The above are initialization and exit management, the main helpers during 754The above are initialization and exit management, the main helpers during
755normal operations are: 755normal operations are:
756 756
757 blk_queue_start_tag(request_queue_t *q, struct request *rq) 757 blk_queue_start_tag(struct request_queue *q, struct request *rq)
758 758
759 Start tagged operation for this request. A free tag number between 759 Start tagged operation for this request. A free tag number between
760 0 and 'depth' is assigned to the request (rq->tag holds this number), 760 0 and 'depth' is assigned to the request (rq->tag holds this number),
@@ -762,7 +762,7 @@ normal operations are:
762 for this queue is already achieved (or if the tag wasn't started for 762 for this queue is already achieved (or if the tag wasn't started for
763 some other reason), 1 is returned. Otherwise 0 is returned. 763 some other reason), 1 is returned. Otherwise 0 is returned.
764 764
765 blk_queue_end_tag(request_queue_t *q, struct request *rq) 765 blk_queue_end_tag(struct request_queue *q, struct request *rq)
766 766
767 End tagged operation on this request. 'rq' is removed from the internal 767 End tagged operation on this request. 'rq' is removed from the internal
768 book keeping structures. 768 book keeping structures.
@@ -781,7 +781,7 @@ queue. For instance, on IDE any tagged request error needs to clear both
781the hardware and software block queue and enable the driver to sanely restart 781the hardware and software block queue and enable the driver to sanely restart
782all the outstanding requests. There's a third helper to do that: 782all the outstanding requests. There's a third helper to do that:
783 783
784 blk_queue_invalidate_tags(request_queue_t *q) 784 blk_queue_invalidate_tags(struct request_queue *q)
785 785
786 Clear the internal block tag queue and re-add all the pending requests 786 Clear the internal block tag queue and re-add all the pending requests
787 to the request queue. The driver will receive them again on the 787 to the request queue. The driver will receive them again on the
diff --git a/Documentation/block/request.txt b/Documentation/block/request.txt
index 75924e2a6975..fff58acb40a3 100644
--- a/Documentation/block/request.txt
+++ b/Documentation/block/request.txt
@@ -83,6 +83,6 @@ struct bio *bio DBI First bio in request
83 83
84struct bio *biotail DBI Last bio in request 84struct bio *biotail DBI Last bio in request
85 85
86request_queue_t *q DB Request queue this request belongs to 86struct request_queue *q DB Request queue this request belongs to
87 87
88struct request_list *rl B Request list this request came from 88struct request_list *rl B Request list this request came from
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 595a5ea4c690..7b9551fc6fe3 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -18,6 +18,7 @@
18*.moc 18*.moc
19*.mod.c 19*.mod.c
20*.o 20*.o
21*.o.*
21*.orig 22*.orig
22*.out 23*.out
23*.pdf 24*.pdf
@@ -163,6 +164,8 @@ raid6tables.c
163relocs 164relocs
164series 165series
165setup 166setup
167setup.bin
168setup.elf
166sim710_d.h* 169sim710_d.h*
167sImage 170sImage
168sm_tbl* 171sm_tbl*
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index c175eedadb5f..a43d2878a4ef 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -211,22 +211,6 @@ Who: Richard Purdie <rpurdie@rpsys.net>
211 211
212--------------------------- 212---------------------------
213 213
214What: read_dev_chars(), read_conf_data{,_lpm}() (s390 common I/O layer)
215When: December 2007
216Why: These functions are a leftover from 2.4 times. They have several
217 problems:
218 - Duplication of checks that are done in the device driver's
219 interrupt handler
220 - common I/O layer can't do device specific error recovery
221 - device driver can't be notified for conditions happening during
222 execution of the function
223 Device drivers should issue the read device characteristics and read
224 configuration data ccws and do the appropriate error handling
225 themselves.
226Who: Cornelia Huck <cornelia.huck@de.ibm.com>
227
228---------------------------
229
230What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers 214What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers
231When: September 2007 215When: September 2007
232Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific 216Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt
new file mode 100644
index 000000000000..af1628a1061c
--- /dev/null
+++ b/Documentation/filesystems/hfsplus.txt
@@ -0,0 +1,59 @@
1
2Macintosh HFSPlus Filesystem for Linux
3======================================
4
5HFSPlus is a filesystem first introduced in MacOS 8.1.
6HFSPlus has several extensions to HFS, including 32-bit allocation
7blocks, 255-character unicode filenames, and file sizes of 2^63 bytes.
8
9
10Mount options
11=============
12
13When mounting an HFSPlus filesystem, the following options are accepted:
14
15 creator=cccc, type=cccc
16 Specifies the creator/type values as shown by the MacOS finder
17 used for creating new files. Default values: '????'.
18
19 uid=n, gid=n
20 Specifies the user/group that owns all files on the filesystem
21 that have uninitialized permissions structures.
22 Default: user/group id of the mounting process.
23
24 umask=n
25 Specifies the umask (in octal) used for files and directories
26 that have uninitialized permissions structures.
27 Default: umask of the mounting process.
28
29 session=n
30 Select the CDROM session to mount as HFSPlus filesystem. Defaults to
31 leaving that decision to the CDROM driver. This option will fail
32 with anything but a CDROM as the underlying device.
33
34 part=n
35 Select partition number n from the devices. This option only makes
36 sense for CDROMs because they can't be partitioned under Linux.
37 For disk devices the generic partition parsing code does this
38 for us. Defaults to not parsing the partition table at all.
39
40 decompose
41 Decompose file name characters.
42
43 nodecompose
44 Do not decompose file name characters.
45
46 force
47 Used to force write access to volumes that are marked as journalled
48 or locked. Use at your own risk.
49
50 nls=cccc
51 Encoding to use when presenting file names.
52
53
54References
55==========
56
57kernel source: <file:fs/hfsplus>
58
59Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 218a8650f48d..6bc2ba215df9 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -148,7 +148,7 @@ pin ... that won't always match the specified output value, because of
148issues including wire-OR and output latencies. 148issues including wire-OR and output latencies.
149 149
150The get/set calls have no error returns because "invalid GPIO" should have 150The get/set calls have no error returns because "invalid GPIO" should have
151been reported earlier in gpio_set_direction(). However, note that not all 151been reported earlier from gpio_direction_*(). However, note that not all
152platforms can read the value of output pins; those that can't should always 152platforms can read the value of output pins; those that can't should always
153return zero. Also, using these calls for GPIOs that can't safely be accessed 153return zero. Also, using these calls for GPIOs that can't safely be accessed
154without sleeping (see below) is an error. 154without sleeping (see below) is an error.
@@ -239,7 +239,7 @@ map between them using calls like:
239Those return either the corresponding number in the other namespace, or 239Those return either the corresponding number in the other namespace, or
240else a negative errno code if the mapping can't be done. (For example, 240else a negative errno code if the mapping can't be done. (For example,
241some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO 241some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO
242number that hasn't been marked as an input using gpio_set_direction(), or 242number that wasn't set up as an input using gpio_direction_input(), or
243to use an IRQ number that didn't originally come from gpio_to_irq(). 243to use an IRQ number that didn't originally come from gpio_to_irq().
244 244
245These two mapping calls are expected to cost on the order of a single 245These two mapping calls are expected to cost on the order of a single
diff --git a/Documentation/hpet.txt b/Documentation/hpet.txt
index b7a3dc38dd52..6ad52d9dad6c 100644
--- a/Documentation/hpet.txt
+++ b/Documentation/hpet.txt
@@ -5,7 +5,7 @@ for the 8254 and Real Time Clock (RTC) periodic timer functionality.
5Each HPET can have up to 32 timers. It is possible to configure the 5Each HPET can have up to 32 timers. It is possible to configure the
6first two timers as legacy replacements for 8254 and RTC periodic timers. 6first two timers as legacy replacements for 8254 and RTC periodic timers.
7A specification done by Intel and Microsoft can be found at 7A specification done by Intel and Microsoft can be found at
8<http://www.intel.com/hardwaredesign/hpetspec.htm>. 8<http://www.intel.com/technology/architecture/hpetspec.htm>.
9 9
10The driver supports detection of HPET driver allocation and initialization 10The driver supports detection of HPET driver allocation and initialization
11of the HPET before the driver module_init routine is called. This enables 11of the HPET before the driver module_init routine is called. This enables
diff --git a/Documentation/hwmon/adm1031 b/Documentation/hwmon/adm1031
index 130a38382b98..be92a77da1d5 100644
--- a/Documentation/hwmon/adm1031
+++ b/Documentation/hwmon/adm1031
@@ -6,13 +6,13 @@ Supported chips:
6 Prefix: 'adm1030' 6 Prefix: 'adm1030'
7 Addresses scanned: I2C 0x2c to 0x2e 7 Addresses scanned: I2C 0x2c to 0x2e
8 Datasheet: Publicly available at the Analog Devices website 8 Datasheet: Publicly available at the Analog Devices website
9 http://products.analog.com/products/info.asp?product=ADM1030 9 http://www.analog.com/en/prod/0%2C2877%2CADM1030%2C00.html
10 10
11 * Analog Devices ADM1031 11 * Analog Devices ADM1031
12 Prefix: 'adm1031' 12 Prefix: 'adm1031'
13 Addresses scanned: I2C 0x2c to 0x2e 13 Addresses scanned: I2C 0x2c to 0x2e
14 Datasheet: Publicly available at the Analog Devices website 14 Datasheet: Publicly available at the Analog Devices website
15 http://products.analog.com/products/info.asp?product=ADM1031 15 http://www.analog.com/en/prod/0%2C2877%2CADM1031%2C00.html
16 16
17Authors: 17Authors:
18 Alexandre d'Alton <alex@alexdalton.org> 18 Alexandre d'Alton <alex@alexdalton.org>
diff --git a/Documentation/hwmon/thmc50 b/Documentation/hwmon/thmc50
new file mode 100644
index 000000000000..9639ca93d559
--- /dev/null
+++ b/Documentation/hwmon/thmc50
@@ -0,0 +1,74 @@
1Kernel driver thmc50
2=====================
3
4Supported chips:
5 * Analog Devices ADM1022
6 Prefix: 'adm1022'
7 Addresses scanned: I2C 0x2c - 0x2e
8 Datasheet: http://www.analog.com/en/prod/0,2877,ADM1022,00.html
9 * Texas Instruments THMC50
10 Prefix: 'thmc50'
11 Addresses scanned: I2C 0x2c - 0x2e
12 Datasheet: http://focus.ti.com/docs/prod/folders/print/thmc50.html
13
14Author: Krzysztof Helt <krzysztof.h1@wp.pl>
15
16This driver was derived from the 2.4 kernel thmc50.c source file.
17
18Credits:
19 thmc50.c (2.4 kernel):
20 Frodo Looijaard <frodol@dds.nl>
21 Philip Edelbrock <phil@netroedge.com>
22
23Module Parameters
24-----------------
25
26* adm1022_temp3: short array
27 List of adapter,address pairs to force chips into ADM1022 mode with
28 second remote temperature. This does not work for original THMC50 chips.
29
30Description
31-----------
32
33The THMC50 implements: an internal temperature sensor, support for an
34external diode-type temperature sensor (compatible w/ the diode sensor inside
35many processors), and a controllable fan/analog_out DAC. For the temperature
36sensors, limits can be set through the appropriate Overtemperature Shutdown
37register and Hysteresis register. Each value can be set and read to half-degree
38accuracy. An alarm is issued (usually to a connected LM78) when the
39temperature gets higher than the Overtemperature Shutdown value; it stays on
40until the temperature falls below the Hysteresis value. All temperatures are in
41degrees Celsius, and are guaranteed within a range of -55 to +125 degrees.
42
43The THMC50 only updates its values every 1.5 seconds; reading it more often
44will do no harm, but will return 'old' values.
45
46The THMC50 is usually used in combination with LM78-like chips, to measure
47the temperature of the processor(s).
48
49The ADM1022 works the same as THMC50 but it is faster (5 Hz instead of
501 Hz for THMC50). It can also be put in a new mode to handle an additional
51remote temperature sensor. The driver uses the mode set by the BIOS by default.
52
53In case the BIOS is broken and the mode is set incorrectly, you can force
54the mode with additional remote temperature with the adm1022_temp3 parameter.
55A typical symptom of wrong setting is a fan forced to full speed.
56
57Driver Features
58---------------
59
60The driver provides up to three temperatures:
61
62temp1 -- internal
63temp2 -- remote
64temp3 -- 2nd remote only for ADM1022
65
66pwm1 -- fan speed (0 = stop, 255 = full)
67pwm1_mode -- always 0 (DC mode)
68
69The value of 0 for pwm1 also forces FAN_OFF signal from the chip,
70so it stops fans even if the value 0 into the ANALOG_OUT register does not.
71
72The driver was tested on Compaq AP550 with two ADM1022 chips (one works
73in the temp3 mode), five temperature readings and two fans.
74
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index 09a1bafe2528..b963c3b4afa5 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -79,7 +79,7 @@ Field 8 -- # of milliseconds spent writing
79 measured from __make_request() to end_that_request_last()). 79 measured from __make_request() to end_that_request_last()).
80Field 9 -- # of I/Os currently in progress 80Field 9 -- # of I/Os currently in progress
81 The only field that should go to zero. Incremented as requests are 81 The only field that should go to zero. Incremented as requests are
82 given to appropriate request_queue_t and decremented as they finish. 82 given to appropriate struct request_queue and decremented as they finish.
83Field 10 -- # of milliseconds spent doing I/Os 83Field 10 -- # of milliseconds spent doing I/Os
84 This field increases so long as field 9 is nonzero. 84 This field increases so long as field 9 is nonzero.
85Field 11 -- weighted # of milliseconds spent doing I/Os 85Field 11 -- weighted # of milliseconds spent doing I/Os
diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO
index b2446a090870..9f08dab1e75b 100644
--- a/Documentation/ja_JP/HOWTO
+++ b/Documentation/ja_JP/HOWTO
@@ -1,23 +1,24 @@
1NOTE: 1NOTE:
2This is Japanese translated version of "Documentation/HOWTO". 2This is a version of Documentation/HOWTO translated into Japanese.
3This one is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com> 3This document is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
4and JF Project team <www.linux.or.jp/JF>. 4and the JF Project team <www.linux.or.jp/JF>.
5If you find difference with original file or problem in translation, 5If you find any difference between this document and the original file
6please contact maintainer of this file or JF project. 6or a problem with the translation,
7 7please contact the maintainer of this file or JF project.
8Please also note that purpose of this file is easier to read for non 8
9English natives and not to be intended to fork. So, if you have any 9Please also note that the purpose of this file is to be easier to read
10comments or updates of this file, please try to update Original(English) 10for non English (read: Japanese) speakers and is not intended as a
11file at first. 11fork. So if you have any comments or updates for this file, please try
12 12to update the original English file first.
13Last Updated: 2007/06/04 13
14Last Updated: 2007/07/18
14================================== 15==================================
15これは、 16これは、
16linux-2.6.21/Documentation/HOWTO 17linux-2.6.22/Documentation/HOWTO
17の和訳です。 18の和訳です。
18 19
19翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ > 20翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
20翻訳日: 2007/06/04 21翻訳日: 2007/07/16
21翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com> 22翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com>
22校正者: 松倉さん <nbh--mats at nifty dot com> 23校正者: 松倉さん <nbh--mats at nifty dot com>
23 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp> 24 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp>
@@ -52,6 +53,7 @@ Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方を学ã
52ã¾ãŸã€ã“ã®ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ãŒãªãœä»Šã†ã¾ãã¾ã‚ã£ã¦ã„ã‚‹ã®ã‹ã¨ã„ã†ç†ç”±ã®ä¸€éƒ¨ã‚‚ 53ã¾ãŸã€ã“ã®ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ãŒãªãœä»Šã†ã¾ãã¾ã‚ã£ã¦ã„ã‚‹ã®ã‹ã¨ã„ã†ç†ç”±ã®ä¸€éƒ¨ã‚‚
53説明ã—よã†ã¨è©¦ã¿ã¦ã„ã¾ã™ã€‚ 54説明ã—よã†ã¨è©¦ã¿ã¦ã„ã¾ã™ã€‚
54 55
56
55カーãƒãƒ«ã¯ å°‘é‡ã®ã‚¢ãƒ¼ã‚­ãƒ†ã‚¯ãƒãƒ£ä¾å­˜éƒ¨åˆ†ãŒã‚¢ã‚»ãƒ³ãƒ–リ言語ã§æ›¸ã‹ã‚Œã¦ã„ã‚‹ 57カーãƒãƒ«ã¯ å°‘é‡ã®ã‚¢ãƒ¼ã‚­ãƒ†ã‚¯ãƒãƒ£ä¾å­˜éƒ¨åˆ†ãŒã‚¢ã‚»ãƒ³ãƒ–リ言語ã§æ›¸ã‹ã‚Œã¦ã„ã‚‹
56以外ã¯å¤§éƒ¨åˆ†ã¯ C 言語ã§æ›¸ã‹ã‚Œã¦ã„ã¾ã™ã€‚C言語をよãç†è§£ã—ã¦ã„ã‚‹ã“ã¨ã¯ã‚«ãƒ¼ 58以外ã¯å¤§éƒ¨åˆ†ã¯ C 言語ã§æ›¸ã‹ã‚Œã¦ã„ã¾ã™ã€‚C言語をよãç†è§£ã—ã¦ã„ã‚‹ã“ã¨ã¯ã‚«ãƒ¼
57ãƒãƒ«é–‹ç™ºè€…ã«ã¯å¿…è¦ã§ã™ã€‚アーキテクãƒãƒ£å‘ã‘ã®ä½Žãƒ¬ãƒ™ãƒ«éƒ¨åˆ†ã®é–‹ç™ºã‚’ã™ã‚‹ã® 59ãƒãƒ«é–‹ç™ºè€…ã«ã¯å¿…è¦ã§ã™ã€‚アーキテクãƒãƒ£å‘ã‘ã®ä½Žãƒ¬ãƒ™ãƒ«éƒ¨åˆ†ã®é–‹ç™ºã‚’ã™ã‚‹ã®
@@ -141,6 +143,7 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã¯å¹…広ã„範囲ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’å
141 ã“れらã®ãƒ«ãƒ¼ãƒ«ã«å¾“ãˆã°ã†ã¾ãã„ãã“ã¨ã‚’ä¿è¨¼ã™ã‚‹ã“ã¨ã§ã¯ã‚ã‚Šã¾ã›ã‚“ 143 ã“れらã®ãƒ«ãƒ¼ãƒ«ã«å¾“ãˆã°ã†ã¾ãã„ãã“ã¨ã‚’ä¿è¨¼ã™ã‚‹ã“ã¨ã§ã¯ã‚ã‚Šã¾ã›ã‚“
142 ㌠(ã™ã¹ã¦ã®ãƒ‘ッãƒã¯å†…容ã¨ã‚¹ã‚¿ã‚¤ãƒ«ã«ã¤ã„ã¦ç²¾æŸ»ã‚’å—ã‘ã‚‹ã®ã§)〠144 ㌠(ã™ã¹ã¦ã®ãƒ‘ッãƒã¯å†…容ã¨ã‚¹ã‚¿ã‚¤ãƒ«ã«ã¤ã„ã¦ç²¾æŸ»ã‚’å—ã‘ã‚‹ã®ã§)ã€
143 ルールã«å¾“ã‚ãªã‘ã‚Œã°é–“é•ã„ãªãã†ã¾ãã„ã‹ãªã„ã§ã—ょã†ã€‚ 145 ルールã«å¾“ã‚ãªã‘ã‚Œã°é–“é•ã„ãªãã†ã¾ãã„ã‹ãªã„ã§ã—ょã†ã€‚
146
144 ã“ã®ä»–ã«ãƒ‘ッãƒã‚’作る方法ã«ã¤ã„ã¦ã®ã‚ˆãã§ããŸè¨˜è¿°ã¯- 147 ã“ã®ä»–ã«ãƒ‘ッãƒã‚’作る方法ã«ã¤ã„ã¦ã®ã‚ˆãã§ããŸè¨˜è¿°ã¯-
145 148
146 "The Perfect Patch" 149 "The Perfect Patch"
@@ -360,44 +363,42 @@ linux-kernel メーリングリストã§åŽé›†ã•ã‚ŒãŸå¤šæ•°ã®ãƒ‘ッãƒã¨åŒæ
360 363
361 git ツリー- 364 git ツリー-
362 - Kbuild ã®é–‹ç™ºãƒ„リーã€Sam Ravnborg <sam@ravnborg.org> 365 - Kbuild ã®é–‹ç™ºãƒ„リーã€Sam Ravnborg <sam@ravnborg.org>
363 kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git 366 git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
364 367
365 - ACPI ã®é–‹ç™ºãƒ„リー〠Len Brown <len.brown@intel.com> 368 - ACPI ã®é–‹ç™ºãƒ„リー〠Len Brown <len.brown@intel.com>
366 kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git 369 git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
367 370
368 - Block ã®é–‹ç™ºãƒ„リーã€Jens Axboe <axboe@suse.de> 371 - Block ã®é–‹ç™ºãƒ„リーã€Jens Axboe <axboe@suse.de>
369 kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git 372 git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
370 373
371 - DRM ã®é–‹ç™ºãƒ„リーã€Dave Airlie <airlied@linux.ie> 374 - DRM ã®é–‹ç™ºãƒ„リーã€Dave Airlie <airlied@linux.ie>
372 kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git 375 git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
373 376
374 - ia64 ã®é–‹ç™ºãƒ„リーã€Tony Luck <tony.luck@intel.com> 377 - ia64 ã®é–‹ç™ºãƒ„リーã€Tony Luck <tony.luck@intel.com>
375 kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git 378 git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
376
377 - ieee1394 ã®é–‹ç™ºãƒ„リーã€Jody McIntyre <scjody@modernduck.com>
378 kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git
379 379
380 - infiniband, Roland Dreier <rolandd@cisco.com> 380 - infiniband, Roland Dreier <rolandd@cisco.com>
381 kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git 381 git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
382 382
383 - libata, Jeff Garzik <jgarzik@pobox.com> 383 - libata, Jeff Garzik <jgarzik@pobox.com>
384 kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git 384 git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
385 385
386 - ãƒãƒƒãƒˆãƒ¯ãƒ¼ã‚¯ãƒ‰ãƒ©ã‚¤ãƒ, Jeff Garzik <jgarzik@pobox.com> 386 - ãƒãƒƒãƒˆãƒ¯ãƒ¼ã‚¯ãƒ‰ãƒ©ã‚¤ãƒ, Jeff Garzik <jgarzik@pobox.com>
387 kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git 387 git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
388 388
389 - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net> 389 - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
390 kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git 390 git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
391 391
392 - SCSI, James Bottomley <James.Bottomley@SteelEye.com> 392 - SCSI, James Bottomley <James.Bottomley@SteelEye.com>
393 kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git 393 git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
394
395 ãã®ä»–ã® git カーãƒãƒ«ãƒ„リー㯠http://kernel.org/git ã«ä¸€è¦§è¡¨ãŒã‚ã‚Šã¾
396 ã™ã€‚
397 394
398 quilt ツリー- 395 quilt ツリー-
399 - USB, PCI ドライãƒã‚³ã‚¢ã¨ I2C, Greg Kroah-Hartman <gregkh@suse.de> 396 - USB, PCI ドライãƒã‚³ã‚¢ã¨ I2C, Greg Kroah-Hartman <gregkh@suse.de>
400 kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ 397 kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
398 - x86-64 㨠i386 ã®ä»²é–“ Andi Kleen <ak@suse.de>
399
400 ãã®ä»–ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リー㯠http://git.kernel.org/ 㨠MAINTAINERS ファ
401 イルã«ä¸€è¦§è¡¨ãŒã‚ã‚Šã¾ã™ã€‚
401 402
402ãƒã‚°ãƒ¬ãƒãƒ¼ãƒˆ 403ãƒã‚°ãƒ¬ãƒãƒ¼ãƒˆ
403------------- 404-------------
@@ -508,6 +509,7 @@ MAINTAINERS ファイルã«ãƒªã‚¹ãƒˆãŒã‚ã‚Šã¾ã™ã®ã§å‚ç…§ã—ã¦ãã ã•ã
508ã›ã‚“*。å˜ã«è‡ªåˆ†ã®ãƒ‘ッãƒã«å¯¾ã—ã¦æŒ‡æ‘˜ã•ã‚ŒãŸå•é¡Œã‚’å…¨ã¦ä¿®æ­£ã—ã¦å†é€ã™ã‚Œã° 509ã›ã‚“*。å˜ã«è‡ªåˆ†ã®ãƒ‘ッãƒã«å¯¾ã—ã¦æŒ‡æ‘˜ã•ã‚ŒãŸå•é¡Œã‚’å…¨ã¦ä¿®æ­£ã—ã¦å†é€ã™ã‚Œã°
509ã„ã„ã®ã§ã™ã€‚ 510ã„ã„ã®ã§ã™ã€‚
510 511
512
511カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨ä¼æ¥­çµ„ç¹”ã®ã¡ãŒã„ 513カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨ä¼æ¥­çµ„ç¹”ã®ã¡ãŒã„
512----------------------------------------------------------------- 514-----------------------------------------------------------------
513 515
@@ -577,6 +579,7 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å–
577 ã‹ã—ã€500è¡Œã®ãƒ‘ッãƒã¯ã€æ­£ã—ã„ã“ã¨ã‚’レビューã™ã‚‹ã®ã«æ•°æ™‚é–“ã‹ã‹ã‚‹ã‹ã‚‚ 579 ã‹ã—ã€500è¡Œã®ãƒ‘ッãƒã¯ã€æ­£ã—ã„ã“ã¨ã‚’レビューã™ã‚‹ã®ã«æ•°æ™‚é–“ã‹ã‹ã‚‹ã‹ã‚‚
578 ã—ã‚Œã¾ã›ã‚“(時間ã¯ãƒ‘ッãƒã®ã‚µã‚¤ã‚ºãªã©ã«ã‚ˆã‚ŠæŒ‡æ•°é–¢æ•°ã«æ¯”例ã—ã¦ã‹ã‹ã‚Šã¾ 580 ã—ã‚Œã¾ã›ã‚“(時間ã¯ãƒ‘ッãƒã®ã‚µã‚¤ã‚ºãªã©ã«ã‚ˆã‚ŠæŒ‡æ•°é–¢æ•°ã«æ¯”例ã—ã¦ã‹ã‹ã‚Šã¾
579 ã™) 581 ã™)
582
580 å°ã•ã„パッãƒã¯ä½•ã‹ã‚ã£ãŸã¨ãã«ãƒ‡ãƒãƒƒã‚°ã‚‚ã¨ã¦ã‚‚ç°¡å˜ã«ãªã‚Šã¾ã™ã€‚パッ 583 å°ã•ã„パッãƒã¯ä½•ã‹ã‚ã£ãŸã¨ãã«ãƒ‡ãƒãƒƒã‚°ã‚‚ã¨ã¦ã‚‚ç°¡å˜ã«ãªã‚Šã¾ã™ã€‚パッ
581 ãƒã‚’1個1個å–り除ãã®ã¯ã€ã¨ã¦ã‚‚大ããªãƒ‘ッãƒã‚’当ã¦ãŸå¾Œã«(ã‹ã¤ã€ä½•ã‹ãŠ 584 ãƒã‚’1個1個å–り除ãã®ã¯ã€ã¨ã¦ã‚‚大ããªãƒ‘ッãƒã‚’当ã¦ãŸå¾Œã«(ã‹ã¤ã€ä½•ã‹ãŠ
582 ã‹ã—ããªã£ãŸå¾Œã§)解剖ã™ã‚‹ã®ã«æ¯”ã¹ã‚Œã°ã¨ã¦ã‚‚ç°¡å˜ã§ã™ã€‚ 585 ã‹ã—ããªã£ãŸå¾Œã§)解剖ã™ã‚‹ã®ã«æ¯”ã¹ã‚Œã°ã¨ã¦ã‚‚ç°¡å˜ã§ã™ã€‚
@@ -591,6 +594,7 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å–
591 ã†ã€‚先生ã¯ç°¡æ½”ãªæœ€é«˜ã®è§£ã‚’ã¿ãŸã„ã®ã§ã™ã€‚良ã„生徒ã¯ã“れを知ã£ã¦ 594 ã†ã€‚先生ã¯ç°¡æ½”ãªæœ€é«˜ã®è§£ã‚’ã¿ãŸã„ã®ã§ã™ã€‚良ã„生徒ã¯ã“れを知ã£ã¦
592 ãŠã‚Šã€ãã—ã¦æœ€çµ‚解ã®å‰ã®ä¸­é–“作業をæ出ã™ã‚‹ã“ã¨ã¯æ±ºã—ã¦ãªã„ã®ã§ 595 ãŠã‚Šã€ãã—ã¦æœ€çµ‚解ã®å‰ã®ä¸­é–“作業をæ出ã™ã‚‹ã“ã¨ã¯æ±ºã—ã¦ãªã„ã®ã§
593 ã™" 596 ã™"
597
594 カーãƒãƒ«é–‹ç™ºã§ã‚‚ã“ã‚Œã¯åŒã˜ã§ã™ã€‚メンテナーé”ã¨ãƒ¬ãƒ“ューアé”ã¯ã€ 598 カーãƒãƒ«é–‹ç™ºã§ã‚‚ã“ã‚Œã¯åŒã˜ã§ã™ã€‚メンテナーé”ã¨ãƒ¬ãƒ“ューアé”ã¯ã€
595 å•é¡Œã‚’解決ã™ã‚‹è§£ã®èƒŒå¾Œã«ãªã‚‹æ€è€ƒãƒ—ロセスをã¿ãŸã„ã¨ã¯æ€ã„ã¾ã›ã‚“。 599 å•é¡Œã‚’解決ã™ã‚‹è§£ã®èƒŒå¾Œã«ãªã‚‹æ€è€ƒãƒ—ロセスをã¿ãŸã„ã¨ã¯æ€ã„ã¾ã›ã‚“。
596 彼らã¯å˜ç´”ã§ã‚ã–ã‚„ã‹ãªè§£æ±ºæ–¹æ³•ã‚’ã¿ãŸã„ã®ã§ã™ã€‚ 600 彼らã¯å˜ç´”ã§ã‚ã–ã‚„ã‹ãªè§£æ±ºæ–¹æ³•ã‚’ã¿ãŸã„ã®ã§ã™ã€‚
diff --git a/Documentation/ja_JP/stable_api_nonsense.txt b/Documentation/ja_JP/stable_api_nonsense.txt
index b3f2b27f0881..7653b5cbfed2 100644
--- a/Documentation/ja_JP/stable_api_nonsense.txt
+++ b/Documentation/ja_JP/stable_api_nonsense.txt
@@ -1,17 +1,17 @@
1NOTE: 1NOTE:
2This is a Japanese translated version of 2This is a version of Documentation/stable_api_nonsense.txt translated into Japanese.
3"Documentation/stable_api_nonsense.txt". 3This document is maintained by IKEDA, Munehiro <m-ikeda@ds.jp.nec.com>
4This one is maintained by 4and the JF Project team <http://www.linux.or.jp/JF/>.
5IKEDA, Munehiro <m-ikeda@ds.jp.nec.com> 5If you find any difference between this document and the original file
6and JF Project team <http://www.linux.or.jp/JF/>. 6or a problem with the translation,
7If you find difference with original file or problem in translation,
8please contact the maintainer of this file or JF project. 7please contact the maintainer of this file or JF project.
9 8
10Please also note that purpose of this file is easier to read for non 9Please also note that the purpose of this file is to be easier to read
11English natives and not to be intended to fork. So, if you have any 10for non English (read: Japanese) speakers and is not intended as a
12comments or updates of this file, please try to update 11fork. So if you have any comments or updates of this file, please try
13Original(English) file at first. 12to update the original English file first.
14 13
14Last Updated: 2007/07/18
15================================== 15==================================
16これは、 16これは、
17linux-2.6.22-rc4/Documentation/stable_api_nonsense.txt の和訳 17linux-2.6.22-rc4/Documentation/stable_api_nonsense.txt の和訳
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fb80e9ffea68..efdb42fd3fb8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -30,6 +30,7 @@ the beginning of each description states the restrictions within which a
30parameter is applicable: 30parameter is applicable:
31 31
32 ACPI ACPI support is enabled. 32 ACPI ACPI support is enabled.
33 AGP AGP (Accelerated Graphics Port) is enabled.
33 ALSA ALSA sound support is enabled. 34 ALSA ALSA sound support is enabled.
34 APIC APIC support is enabled. 35 APIC APIC support is enabled.
35 APM Advanced Power Management support is enabled. 36 APM Advanced Power Management support is enabled.
@@ -40,7 +41,6 @@ parameter is applicable:
40 EIDE EIDE/ATAPI support is enabled. 41 EIDE EIDE/ATAPI support is enabled.
41 FB The frame buffer device is enabled. 42 FB The frame buffer device is enabled.
42 HW Appropriate hardware is enabled. 43 HW Appropriate hardware is enabled.
43 IA-32 IA-32 aka i386 architecture is enabled.
44 IA-64 IA-64 architecture is enabled. 44 IA-64 IA-64 architecture is enabled.
45 IOSCHED More than one I/O scheduler is enabled. 45 IOSCHED More than one I/O scheduler is enabled.
46 IP_PNP IP DHCP, BOOTP, or RARP is enabled. 46 IP_PNP IP DHCP, BOOTP, or RARP is enabled.
@@ -57,14 +57,14 @@ parameter is applicable:
57 MDA MDA console support is enabled. 57 MDA MDA console support is enabled.
58 MOUSE Appropriate mouse support is enabled. 58 MOUSE Appropriate mouse support is enabled.
59 MSI Message Signaled Interrupts (PCI). 59 MSI Message Signaled Interrupts (PCI).
60 MTD MTD support is enabled. 60 MTD MTD (Memory Technology Device) support is enabled.
61 NET Appropriate network support is enabled. 61 NET Appropriate network support is enabled.
62 NUMA NUMA support is enabled. 62 NUMA NUMA support is enabled.
63 GENERIC_TIME The generic timeofday code is enabled. 63 GENERIC_TIME The generic timeofday code is enabled.
64 NFS Appropriate NFS support is enabled. 64 NFS Appropriate NFS support is enabled.
65 OSS OSS sound support is enabled. 65 OSS OSS sound support is enabled.
66 PV_OPS A paravirtualized kernel 66 PV_OPS A paravirtualized kernel is enabled.
67 PARIDE The ParIDE subsystem is enabled. 67 PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
68 PARISC The PA-RISC architecture is enabled. 68 PARISC The PA-RISC architecture is enabled.
69 PCI PCI bus support is enabled. 69 PCI PCI bus support is enabled.
70 PCMCIA The PCMCIA subsystem is enabled. 70 PCMCIA The PCMCIA subsystem is enabled.
@@ -91,6 +91,7 @@ parameter is applicable:
91 VT Virtual terminal support is enabled. 91 VT Virtual terminal support is enabled.
92 WDT Watchdog support is enabled. 92 WDT Watchdog support is enabled.
93 XT IBM PC/XT MFM hard disk support is enabled. 93 XT IBM PC/XT MFM hard disk support is enabled.
94 X86-32 X86-32, aka i386 architecture is enabled.
94 X86-64 X86-64 architecture is enabled. 95 X86-64 X86-64 architecture is enabled.
95 More X86-64 boot options can be found in 96 More X86-64 boot options can be found in
96 Documentation/x86_64/boot-options.txt . 97 Documentation/x86_64/boot-options.txt .
@@ -122,10 +123,6 @@ and is between 256 and 4096 characters. It is defined in the file
122./include/asm/setup.h as COMMAND_LINE_SIZE. 123./include/asm/setup.h as COMMAND_LINE_SIZE.
123 124
124 125
125 53c7xx= [HW,SCSI] Amiga SCSI controllers
126 See header of drivers/scsi/53c7xx.c.
127 See also Documentation/scsi/ncr53c7xx.txt.
128
129 acpi= [HW,ACPI,X86-64,i386] 126 acpi= [HW,ACPI,X86-64,i386]
130 Advanced Configuration and Power Interface 127 Advanced Configuration and Power Interface
131 Format: { force | off | ht | strict | noirq } 128 Format: { force | off | ht | strict | noirq }
@@ -222,11 +219,17 @@ and is between 256 and 4096 characters. It is defined in the file
222 219
223 acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT 220 acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT
224 221
225 acpi_pm_good [IA-32,X86-64] 222 acpi_pm_good [X86-32,X86-64]
226 Override the pmtimer bug detection: force the kernel 223 Override the pmtimer bug detection: force the kernel
227 to assume that this machine's pmtimer latches its value 224 to assume that this machine's pmtimer latches its value
228 and always returns good values. 225 and always returns good values.
229 226
227 agp= [AGP]
228 { off | try_unsupported }
229 off: disable AGP support
230 try_unsupported: try to drive unsupported chipsets
231 (may crash computer or cause data corruption)
232
230 enable_timer_pin_1 [i386,x86-64] 233 enable_timer_pin_1 [i386,x86-64]
231 Enable PIN 1 of APIC timer 234 Enable PIN 1 of APIC timer
232 Can be useful to work around chipset bugs 235 Can be useful to work around chipset bugs
@@ -279,7 +282,8 @@ and is between 256 and 4096 characters. It is defined in the file
279 not play well with APC CPU idle - disable it if you have 282 not play well with APC CPU idle - disable it if you have
280 APC and your system crashes randomly. 283 APC and your system crashes randomly.
281 284
282 apic= [APIC,i386] Change the output verbosity whilst booting 285 apic= [APIC,i386] Advanced Programmable Interrupt Controller
286 Change the output verbosity whilst booting
283 Format: { quiet (default) | verbose | debug } 287 Format: { quiet (default) | verbose | debug }
284 Change the amount of debugging information output 288 Change the amount of debugging information output
285 when initialising the APIC and IO-APIC components. 289 when initialising the APIC and IO-APIC components.
@@ -353,7 +357,7 @@ and is between 256 and 4096 characters. It is defined in the file
353 357
354 c101= [NET] Moxa C101 synchronous serial card 358 c101= [NET] Moxa C101 synchronous serial card
355 359
356 cachesize= [BUGS=IA-32] Override level 2 CPU cache size detection. 360 cachesize= [BUGS=X86-32] Override level 2 CPU cache size detection.
357 Sometimes CPU hardware bugs make them report the cache 361 Sometimes CPU hardware bugs make them report the cache
358 size incorrectly. The kernel will attempt work arounds 362 size incorrectly. The kernel will attempt work arounds
359 to fix known problems, but for some CPUs it is not 363 to fix known problems, but for some CPUs it is not
@@ -372,7 +376,7 @@ and is between 256 and 4096 characters. It is defined in the file
372 Value can be changed at runtime via 376 Value can be changed at runtime via
373 /selinux/checkreqprot. 377 /selinux/checkreqprot.
374 378
375 clock= [BUGS=IA-32, HW] gettimeofday clocksource override. 379 clock= [BUGS=X86-32, HW] gettimeofday clocksource override.
376 [Deprecated] 380 [Deprecated]
377 Forces specified clocksource (if available) to be used 381 Forces specified clocksource (if available) to be used
378 when calculating gettimeofday(). If specified 382 when calculating gettimeofday(). If specified
@@ -390,7 +394,7 @@ and is between 256 and 4096 characters. It is defined in the file
390 [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2, 394 [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
391 pxa_timer,timer3,32k_counter,timer0_1 395 pxa_timer,timer3,32k_counter,timer0_1
392 [AVR32] avr32 396 [AVR32] avr32
393 [IA-32] pit,hpet,tsc,vmi-timer; 397 [X86-32] pit,hpet,tsc,vmi-timer;
394 scx200_hrt on Geode; cyclone on IBM x440 398 scx200_hrt on Geode; cyclone on IBM x440
395 [MIPS] MIPS 399 [MIPS] MIPS
396 [PARISC] cr16 400 [PARISC] cr16
@@ -410,7 +414,7 @@ and is between 256 and 4096 characters. It is defined in the file
410 over the 8254 in addition to over the IO-APIC. The 414 over the 8254 in addition to over the IO-APIC. The
411 kernel tries to set a sensible default. 415 kernel tries to set a sensible default.
412 416
413 hpet= [IA-32,HPET] option to disable HPET and use PIT. 417 hpet= [X86-32,HPET] option to disable HPET and use PIT.
414 Format: disable 418 Format: disable
415 419
416 com20020= [HW,NET] ARCnet - COM20020 chipset 420 com20020= [HW,NET] ARCnet - COM20020 chipset
@@ -547,7 +551,7 @@ and is between 256 and 4096 characters. It is defined in the file
547 551
548 dtc3181e= [HW,SCSI] 552 dtc3181e= [HW,SCSI]
549 553
550 earlyprintk= [IA-32,X86-64,SH] 554 earlyprintk= [X86-32,X86-64,SH]
551 earlyprintk=vga 555 earlyprintk=vga
552 earlyprintk=serial[,ttySn[,baudrate]] 556 earlyprintk=serial[,ttySn[,baudrate]]
553 557
@@ -585,7 +589,7 @@ and is between 256 and 4096 characters. It is defined in the file
585 eisa_irq_edge= [PARISC,HW] 589 eisa_irq_edge= [PARISC,HW]
586 See header of drivers/parisc/eisa.c. 590 See header of drivers/parisc/eisa.c.
587 591
588 elanfreq= [IA-32] 592 elanfreq= [X86-32]
589 See comment before function elanfreq_setup() in 593 See comment before function elanfreq_setup() in
590 arch/i386/kernel/cpu/cpufreq/elanfreq.c. 594 arch/i386/kernel/cpu/cpufreq/elanfreq.c.
591 595
@@ -594,7 +598,7 @@ and is between 256 and 4096 characters. It is defined in the file
594 See Documentation/block/as-iosched.txt and 598 See Documentation/block/as-iosched.txt and
595 Documentation/block/deadline-iosched.txt for details. 599 Documentation/block/deadline-iosched.txt for details.
596 600
597 elfcorehdr= [IA-32, X86_64] 601 elfcorehdr= [X86-32, X86_64]
598 Specifies physical address of start of kernel core 602 Specifies physical address of start of kernel core
599 image elf header. Generally kexec loader will 603 image elf header. Generally kexec loader will
600 pass this option to capture kernel. 604 pass this option to capture kernel.
@@ -676,7 +680,7 @@ and is between 256 and 4096 characters. It is defined in the file
676 hisax= [HW,ISDN] 680 hisax= [HW,ISDN]
677 See Documentation/isdn/README.HiSax. 681 See Documentation/isdn/README.HiSax.
678 682
679 hugepages= [HW,IA-32,IA-64] Maximal number of HugeTLB pages. 683 hugepages= [HW,X86-32,IA-64] Maximal number of HugeTLB pages.
680 684
681 i8042.direct [HW] Put keyboard port into non-translated mode 685 i8042.direct [HW] Put keyboard port into non-translated mode
682 i8042.dumbkbd [HW] Pretend that controller can only read data from 686 i8042.dumbkbd [HW] Pretend that controller can only read data from
@@ -768,7 +772,8 @@ and is between 256 and 4096 characters. It is defined in the file
768 See Documentation/nfsroot.txt. 772 See Documentation/nfsroot.txt.
769 773
770 ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards 774 ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
771 See comment before ip2_setup() in drivers/char/ip2.c. 775 See comment before ip2_setup() in
776 drivers/char/ip2/ip2base.c.
772 777
773 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller 778 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller
774 See header of drivers/scsi/ips.c. 779 See header of drivers/scsi/ips.c.
@@ -817,7 +822,7 @@ and is between 256 and 4096 characters. It is defined in the file
817 js= [HW,JOY] Analog joystick 822 js= [HW,JOY] Analog joystick
818 See Documentation/input/joystick.txt. 823 See Documentation/input/joystick.txt.
819 824
820 kernelcore=nn[KMG] [KNL,IA-32,IA-64,PPC,X86-64] This parameter 825 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
821 specifies the amount of memory usable by the kernel 826 specifies the amount of memory usable by the kernel
822 for non-movable allocations. The requested amount is 827 for non-movable allocations. The requested amount is
823 spread evenly throughout all nodes in the system. The 828 spread evenly throughout all nodes in the system. The
@@ -833,7 +838,7 @@ and is between 256 and 4096 characters. It is defined in the file
833 use the HighMem zone if it exists, and the Normal 838 use the HighMem zone if it exists, and the Normal
834 zone if it does not. 839 zone if it does not.
835 840
836 movablecore=nn[KMG] [KNL,IA-32,IA-64,PPC,X86-64] This parameter 841 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
837 is similar to kernelcore except it specifies the 842 is similar to kernelcore except it specifies the
838 amount of memory used for migratable allocations. 843 amount of memory used for migratable allocations.
839 If both kernelcore and movablecore are specified, 844 If both kernelcore and movablecore are specified,
@@ -845,28 +850,20 @@ and is between 256 and 4096 characters. It is defined in the file
845 850
846 keepinitrd [HW,ARM] 851 keepinitrd [HW,ARM]
847 852
848 kstack=N [IA-32,X86-64] Print N words from the kernel stack 853 kstack=N [X86-32,X86-64] Print N words from the kernel stack
849 in oops dumps. 854 in oops dumps.
850 855
851 l2cr= [PPC] 856 l2cr= [PPC]
852 857
853 lapic [IA-32,APIC] Enable the local APIC even if BIOS 858 lapic [X86-32,APIC] Enable the local APIC even if BIOS
854 disabled it. 859 disabled it.
855 860
856 lapic_timer_c2_ok [IA-32,x86-64,APIC] trust the local apic timer in 861 lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in
857 C2 power state. 862 C2 power state.
858 863
859 lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip 864 lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip
860 Format: addr:<io>,irq:<irq> 865 Format: addr:<io>,irq:<irq>
861 866
862 legacy_serial.force [HW,IA-32,X86-64]
863 Probe for COM ports at legacy addresses even
864 if PNPBIOS or ACPI should describe them. This
865 is for working around firmware defects.
866
867 llsc*= [IA64] See function print_params() in
868 arch/ia64/sn/kernel/llsc4.c.
869
870 load_ramdisk= [RAM] List of ramdisks to load from floppy 867 load_ramdisk= [RAM] List of ramdisks to load from floppy
871 See Documentation/ramdisk.txt. 868 See Documentation/ramdisk.txt.
872 869
@@ -972,11 +969,11 @@ and is between 256 and 4096 characters. It is defined in the file
972 [SCSI] Maximum number of LUNs received. 969 [SCSI] Maximum number of LUNs received.
973 Should be between 1 and 16384. 970 Should be between 1 and 16384.
974 971
975 mca-pentium [BUGS=IA-32] 972 mca-pentium [BUGS=X86-32]
976 973
977 mcatest= [IA-64] 974 mcatest= [IA-64]
978 975
979 mce [IA-32] Machine Check Exception 976 mce [X86-32] Machine Check Exception
980 977
981 md= [HW] RAID subsystems devices and level 978 md= [HW] RAID subsystems devices and level
982 See Documentation/md.txt. 979 See Documentation/md.txt.
@@ -988,14 +985,14 @@ and is between 256 and 4096 characters. It is defined in the file
988 mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory 985 mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
989 Amount of memory to be used when the kernel is not able 986 Amount of memory to be used when the kernel is not able
990 to see the whole system memory or for test. 987 to see the whole system memory or for test.
991 [IA-32] Use together with memmap= to avoid physical 988 [X86-32] Use together with memmap= to avoid physical
992 address space collisions. Without memmap= PCI devices 989 address space collisions. Without memmap= PCI devices
993 could be placed at addresses belonging to unused RAM. 990 could be placed at addresses belonging to unused RAM.
994 991
995 mem=nopentium [BUGS=IA-32] Disable usage of 4MB pages for kernel 992 mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
996 memory. 993 memory.
997 994
998 memmap=exactmap [KNL,IA-32,X86_64] Enable setting of an exact 995 memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact
999 E820 memory map, as specified by the user. 996 E820 memory map, as specified by the user.
1000 Such memmap=exactmap lines can be constructed based on 997 Such memmap=exactmap lines can be constructed based on
1001 BIOS output or other requirements. See the memmap=nn@ss 998 BIOS output or other requirements. See the memmap=nn@ss
@@ -1039,7 +1036,7 @@ and is between 256 and 4096 characters. It is defined in the file
1039 <name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>] 1036 <name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
1040 1037
1041 mtdparts= [MTD] 1038 mtdparts= [MTD]
1042 See drivers/mtd/cmdline.c. 1039 See drivers/mtd/cmdlinepart.c.
1043 1040
1044 mtouchusb.raw_coordinates= 1041 mtouchusb.raw_coordinates=
1045 [HW] Make the MicroTouch USB driver use raw coordinates 1042 [HW] Make the MicroTouch USB driver use raw coordinates
@@ -1081,9 +1078,9 @@ and is between 256 and 4096 characters. It is defined in the file
1081 [NFS] set the maximum lifetime for idmapper cache 1078 [NFS] set the maximum lifetime for idmapper cache
1082 entries. 1079 entries.
1083 1080
1084 nmi_watchdog= [KNL,BUGS=IA-32] Debugging features for SMP kernels 1081 nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels
1085 1082
1086 no387 [BUGS=IA-32] Tells the kernel to use the 387 maths 1083 no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
1087 emulation library even if a 387 maths coprocessor 1084 emulation library even if a 387 maths coprocessor
1088 is present. 1085 is present.
1089 1086
@@ -1114,17 +1111,17 @@ and is between 256 and 4096 characters. It is defined in the file
1114 1111
1115 noexec [IA-64] 1112 noexec [IA-64]
1116 1113
1117 noexec [IA-32,X86-64] 1114 noexec [X86-32,X86-64]
1118 noexec=on: enable non-executable mappings (default) 1115 noexec=on: enable non-executable mappings (default)
1119 noexec=off: disable non-executable mappings 1116 noexec=off: disable non-executable mappings
1120 1117
1121 nofxsr [BUGS=IA-32] Disables x86 floating point extended 1118 nofxsr [BUGS=X86-32] Disables x86 floating point extended
1122 register save and restore. The kernel will only save 1119 register save and restore. The kernel will only save
1123 legacy floating-point registers on task switch. 1120 legacy floating-point registers on task switch.
1124 1121
1125 nohlt [BUGS=ARM] 1122 nohlt [BUGS=ARM]
1126 1123
1127 no-hlt [BUGS=IA-32] Tells the kernel that the hlt 1124 no-hlt [BUGS=X86-32] Tells the kernel that the hlt
1128 instruction doesn't work correctly and not to 1125 instruction doesn't work correctly and not to
1129 use it. 1126 use it.
1130 1127
@@ -1139,12 +1136,12 @@ and is between 256 and 4096 characters. It is defined in the file
1139 Valid arguments: on, off 1136 Valid arguments: on, off
1140 Default: on 1137 Default: on
1141 1138
1142 noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing 1139 noirqbalance [X86-32,SMP,KNL] Disable kernel irq balancing
1143 1140
1144 noirqdebug [IA-32] Disables the code which attempts to detect and 1141 noirqdebug [X86-32] Disables the code which attempts to detect and
1145 disable unhandled interrupt sources. 1142 disable unhandled interrupt sources.
1146 1143
1147 no_timer_check [IA-32,X86_64,APIC] Disables the code which tests for 1144 no_timer_check [X86-32,X86_64,APIC] Disables the code which tests for
1148 broken timer IRQ sources. 1145 broken timer IRQ sources.
1149 1146
1150 noisapnp [ISAPNP] Disables ISA PnP code. 1147 noisapnp [ISAPNP] Disables ISA PnP code.
@@ -1156,20 +1153,20 @@ and is between 256 and 4096 characters. It is defined in the file
1156 1153
1157 nojitter [IA64] Disables jitter checking for ITC timers. 1154 nojitter [IA64] Disables jitter checking for ITC timers.
1158 1155
1159 nolapic [IA-32,APIC] Do not enable or use the local APIC. 1156 nolapic [X86-32,APIC] Do not enable or use the local APIC.
1160 1157
1161 nolapic_timer [IA-32,APIC] Do not use the local APIC timer. 1158 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
1162 1159
1163 noltlbs [PPC] Do not use large page/tlb entries for kernel 1160 noltlbs [PPC] Do not use large page/tlb entries for kernel
1164 lowmem mapping on PPC40x. 1161 lowmem mapping on PPC40x.
1165 1162
1166 nomca [IA-64] Disable machine check abort handling 1163 nomca [IA-64] Disable machine check abort handling
1167 1164
1168 nomce [IA-32] Machine Check Exception 1165 nomce [X86-32] Machine Check Exception
1169 1166
1170 noreplace-paravirt [IA-32,PV_OPS] Don't patch paravirt_ops 1167 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
1171 1168
1172 noreplace-smp [IA-32,SMP] Don't replace SMP instructions 1169 noreplace-smp [X86-32,SMP] Don't replace SMP instructions
1173 with UP alternatives 1170 with UP alternatives
1174 1171
1175 noresidual [PPC] Don't use residual data on PReP machines. 1172 noresidual [PPC] Don't use residual data on PReP machines.
@@ -1183,7 +1180,7 @@ and is between 256 and 4096 characters. It is defined in the file
1183 1180
1184 nosbagart [IA-64] 1181 nosbagart [IA-64]
1185 1182
1186 nosep [BUGS=IA-32] Disables x86 SYSENTER/SYSEXIT support. 1183 nosep [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support.
1187 1184
1188 nosmp [SMP] Tells an SMP kernel to act as a UP kernel. 1185 nosmp [SMP] Tells an SMP kernel to act as a UP kernel.
1189 1186
@@ -1191,7 +1188,7 @@ and is between 256 and 4096 characters. It is defined in the file
1191 1188
1192 nosync [HW,M68K] Disables sync negotiation for all devices. 1189 nosync [HW,M68K] Disables sync negotiation for all devices.
1193 1190
1194 notsc [BUGS=IA-32] Disable Time Stamp Counter 1191 notsc [BUGS=X86-32] Disable Time Stamp Counter
1195 1192
1196 nousb [USB] Disable the USB subsystem 1193 nousb [USB] Disable the USB subsystem
1197 1194
@@ -1264,28 +1261,28 @@ and is between 256 and 4096 characters. It is defined in the file
1264 See also Documentation/paride.txt. 1261 See also Documentation/paride.txt.
1265 1262
1266 pci=option[,option...] [PCI] various PCI subsystem options: 1263 pci=option[,option...] [PCI] various PCI subsystem options:
1267 off [IA-32] don't probe for the PCI bus 1264 off [X86-32] don't probe for the PCI bus
1268 bios [IA-32] force use of PCI BIOS, don't access 1265 bios [X86-32] force use of PCI BIOS, don't access
1269 the hardware directly. Use this if your machine 1266 the hardware directly. Use this if your machine
1270 has a non-standard PCI host bridge. 1267 has a non-standard PCI host bridge.
1271 nobios [IA-32] disallow use of PCI BIOS, only direct 1268 nobios [X86-32] disallow use of PCI BIOS, only direct
1272 hardware access methods are allowed. Use this 1269 hardware access methods are allowed. Use this
1273 if you experience crashes upon bootup and you 1270 if you experience crashes upon bootup and you
1274 suspect they are caused by the BIOS. 1271 suspect they are caused by the BIOS.
1275 conf1 [IA-32] Force use of PCI Configuration 1272 conf1 [X86-32] Force use of PCI Configuration
1276 Mechanism 1. 1273 Mechanism 1.
1277 conf2 [IA-32] Force use of PCI Configuration 1274 conf2 [X86-32] Force use of PCI Configuration
1278 Mechanism 2. 1275 Mechanism 2.
1279 nommconf [IA-32,X86_64] Disable use of MMCONFIG for PCI 1276 nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI
1280 Configuration 1277 Configuration
1281 nomsi [MSI] If the PCI_MSI kernel config parameter is 1278 nomsi [MSI] If the PCI_MSI kernel config parameter is
1282 enabled, this kernel boot option can be used to 1279 enabled, this kernel boot option can be used to
1283 disable the use of MSI interrupts system-wide. 1280 disable the use of MSI interrupts system-wide.
1284 nosort [IA-32] Don't sort PCI devices according to 1281 nosort [X86-32] Don't sort PCI devices according to
1285 order given by the PCI BIOS. This sorting is 1282 order given by the PCI BIOS. This sorting is
1286 done to get a device order compatible with 1283 done to get a device order compatible with
1287 older kernels. 1284 older kernels.
1288 biosirq [IA-32] Use PCI BIOS calls to get the interrupt 1285 biosirq [X86-32] Use PCI BIOS calls to get the interrupt
1289 routing table. These calls are known to be buggy 1286 routing table. These calls are known to be buggy
1290 on several machines and they hang the machine 1287 on several machines and they hang the machine
1291 when used, but on other computers it's the only 1288 when used, but on other computers it's the only
@@ -1293,32 +1290,32 @@ and is between 256 and 4096 characters. It is defined in the file
1293 this option if the kernel is unable to allocate 1290 this option if the kernel is unable to allocate
1294 IRQs or discover secondary PCI buses on your 1291 IRQs or discover secondary PCI buses on your
1295 motherboard. 1292 motherboard.
1296 rom [IA-32] Assign address space to expansion ROMs. 1293 rom [X86-32] Assign address space to expansion ROMs.
1297 Use with caution as certain devices share 1294 Use with caution as certain devices share
1298 address decoders between ROMs and other 1295 address decoders between ROMs and other
1299 resources. 1296 resources.
1300 irqmask=0xMMMM [IA-32] Set a bit mask of IRQs allowed to be 1297 irqmask=0xMMMM [X86-32] Set a bit mask of IRQs allowed to be
1301 assigned automatically to PCI devices. You can 1298 assigned automatically to PCI devices. You can
1302 make the kernel exclude IRQs of your ISA cards 1299 make the kernel exclude IRQs of your ISA cards
1303 this way. 1300 this way.
1304 pirqaddr=0xAAAAA [IA-32] Specify the physical address 1301 pirqaddr=0xAAAAA [X86-32] Specify the physical address
1305 of the PIRQ table (normally generated 1302 of the PIRQ table (normally generated
1306 by the BIOS) if it is outside the 1303 by the BIOS) if it is outside the
1307 F0000h-100000h range. 1304 F0000h-100000h range.
1308 lastbus=N [IA-32] Scan all buses thru bus #N. Can be 1305 lastbus=N [X86-32] Scan all buses thru bus #N. Can be
1309 useful if the kernel is unable to find your 1306 useful if the kernel is unable to find your
1310 secondary buses and you want to tell it 1307 secondary buses and you want to tell it
1311 explicitly which ones they are. 1308 explicitly which ones they are.
1312 assign-busses [IA-32] Always assign all PCI bus 1309 assign-busses [X86-32] Always assign all PCI bus
1313 numbers ourselves, overriding 1310 numbers ourselves, overriding
1314 whatever the firmware may have done. 1311 whatever the firmware may have done.
1315 usepirqmask [IA-32] Honor the possible IRQ mask stored 1312 usepirqmask [X86-32] Honor the possible IRQ mask stored
1316 in the BIOS $PIR table. This is needed on 1313 in the BIOS $PIR table. This is needed on
1317 some systems with broken BIOSes, notably 1314 some systems with broken BIOSes, notably
1318 some HP Pavilion N5400 and Omnibook XE3 1315 some HP Pavilion N5400 and Omnibook XE3
1319 notebooks. This will have no effect if ACPI 1316 notebooks. This will have no effect if ACPI
1320 IRQ routing is enabled. 1317 IRQ routing is enabled.
1321 noacpi [IA-32] Do not use ACPI for IRQ routing 1318 noacpi [X86-32] Do not use ACPI for IRQ routing
1322 or for PCI scanning. 1319 or for PCI scanning.
1323 routeirq Do IRQ routing for all PCI devices. 1320 routeirq Do IRQ routing for all PCI devices.
1324 This is normally done in pci_enable_device(), 1321 This is normally done in pci_enable_device(),
@@ -1467,13 +1464,13 @@ and is between 256 and 4096 characters. It is defined in the file
1467 Run specified binary instead of /init from the ramdisk, 1464 Run specified binary instead of /init from the ramdisk,
1468 used for early userspace startup. See initrd. 1465 used for early userspace startup. See initrd.
1469 1466
1470 reboot= [BUGS=IA-32,BUGS=ARM,BUGS=IA-64] Rebooting mode 1467 reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
1471 Format: <reboot_mode>[,<reboot_mode2>[,...]] 1468 Format: <reboot_mode>[,<reboot_mode2>[,...]]
1472 See arch/*/kernel/reboot.c or arch/*/kernel/process.c 1469 See arch/*/kernel/reboot.c or arch/*/kernel/process.c
1473 1470
1474 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area 1471 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
1475 1472
1476 reservetop= [IA-32] 1473 reservetop= [X86-32]
1477 Format: nn[KMG] 1474 Format: nn[KMG]
1478 Reserves a hole at the top of the kernel virtual 1475 Reserves a hole at the top of the kernel virtual
1479 address space. 1476 address space.
@@ -1564,7 +1561,7 @@ and is between 256 and 4096 characters. It is defined in the file
1564 Value can be changed at runtime via 1561 Value can be changed at runtime via
1565 /selinux/compat_net. 1562 /selinux/compat_net.
1566 1563
1567 serialnumber [BUGS=IA-32] 1564 serialnumber [BUGS=X86-32]
1568 1565
1569 sg_def_reserved_size= [SCSI] 1566 sg_def_reserved_size= [SCSI]
1570 1567
@@ -1617,7 +1614,7 @@ and is between 256 and 4096 characters. It is defined in the file
1617 smart2= [HW] 1614 smart2= [HW]
1618 Format: <io1>[,<io2>[,...,<io8>]] 1615 Format: <io1>[,<io2>[,...,<io8>]]
1619 1616
1620 smp-alt-once [IA-32,SMP] On a hotplug CPU system, only 1617 smp-alt-once [X86-32,SMP] On a hotplug CPU system, only
1621 attempt to substitute SMP alternatives once at boot. 1618 attempt to substitute SMP alternatives once at boot.
1622 1619
1623 smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices 1620 smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
@@ -1882,7 +1879,7 @@ and is between 256 and 4096 characters. It is defined in the file
1882 usbhid.mousepoll= 1879 usbhid.mousepoll=
1883 [USBHID] The interval which mice are to be polled at. 1880 [USBHID] The interval which mice are to be polled at.
1884 1881
1885 vdso= [IA-32,SH,x86-64] 1882 vdso= [X86-32,SH,x86-64]
1886 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 1883 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
1887 vdso=1: enable VDSO (default) 1884 vdso=1: enable VDSO (default)
1888 vdso=0: disable VDSO mapping 1885 vdso=0: disable VDSO mapping
@@ -1893,7 +1890,7 @@ and is between 256 and 4096 characters. It is defined in the file
1893 video= [FB] Frame buffer configuration 1890 video= [FB] Frame buffer configuration
1894 See Documentation/fb/modedb.txt. 1891 See Documentation/fb/modedb.txt.
1895 1892
1896 vga= [BOOT,IA-32] Select a particular video mode 1893 vga= [BOOT,X86-32] Select a particular video mode
1897 See Documentation/i386/boot.txt and 1894 See Documentation/i386/boot.txt and
1898 Documentation/svga.txt. 1895 Documentation/svga.txt.
1899 Use vga=ask for menu. 1896 Use vga=ask for menu.
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 81d9aa097298..947d57d53453 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -859,9 +859,8 @@ payload contents" for more information.
859 void unregister_key_type(struct key_type *type); 859 void unregister_key_type(struct key_type *type);
860 860
861 861
862Under some circumstances, it may be desirable to desirable to deal with a 862Under some circumstances, it may be desirable to deal with a bundle of keys.
863bundle of keys. The facility provides access to the keyring type for managing 863The facility provides access to the keyring type for managing such a bundle:
864such a bundle:
865 864
866 struct key_type key_type_keyring; 865 struct key_type key_type_keyring;
867 866
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index e44855513b3d..8ee49ee7c963 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -27,7 +27,6 @@ in detail, and briefly here:
27- kobjects a simple object. 27- kobjects a simple object.
28- kset a set of objects of a certain type. 28- kset a set of objects of a certain type.
29- ktype a set of helpers for objects of a common type. 29- ktype a set of helpers for objects of a common type.
30- subsystem a controlling object for a number of ksets.
31 30
32 31
33The kobject infrastructure maintains a close relationship with the 32The kobject infrastructure maintains a close relationship with the
@@ -54,13 +53,15 @@ embedded in larger data structures and replace fields they duplicate.
541.2 Definition 531.2 Definition
55 54
56struct kobject { 55struct kobject {
56 const char * k_name;
57 char name[KOBJ_NAME_LEN]; 57 char name[KOBJ_NAME_LEN];
58 atomic_t refcount; 58 struct kref kref;
59 struct list_head entry; 59 struct list_head entry;
60 struct kobject * parent; 60 struct kobject * parent;
61 struct kset * kset; 61 struct kset * kset;
62 struct kobj_type * ktype; 62 struct kobj_type * ktype;
63 struct dentry * dentry; 63 struct sysfs_dirent * sd;
64 wait_queue_head_t poll;
64}; 65};
65 66
66void kobject_init(struct kobject *); 67void kobject_init(struct kobject *);
@@ -137,8 +138,7 @@ If a kobject does not have a parent when it is registered, its parent
137becomes its dominant kset. 138becomes its dominant kset.
138 139
139If a kobject does not have a parent nor a dominant kset, its directory 140If a kobject does not have a parent nor a dominant kset, its directory
140is created at the top-level of the sysfs partition. This should only 141is created at the top-level of the sysfs partition.
141happen for kobjects that are embedded in a struct subsystem.
142 142
143 143
144 144
@@ -150,10 +150,10 @@ A kset is a set of kobjects that are embedded in the same type.
150 150
151 151
152struct kset { 152struct kset {
153 struct subsystem * subsys;
154 struct kobj_type * ktype; 153 struct kobj_type * ktype;
155 struct list_head list; 154 struct list_head list;
156 struct kobject kobj; 155 struct kobject kobj;
156 struct kset_uevent_ops * uevent_ops;
157}; 157};
158 158
159 159
@@ -169,8 +169,7 @@ struct kobject * kset_find_obj(struct kset *, char *);
169 169
170 170
171The type that the kobjects are embedded in is described by the ktype 171The type that the kobjects are embedded in is described by the ktype
172pointer. The subsystem that the kobject belongs to is pointed to by the 172pointer.
173subsys pointer.
174 173
175A kset contains a kobject itself, meaning that it may be registered in 174A kset contains a kobject itself, meaning that it may be registered in
176the kobject hierarchy and exported via sysfs. More importantly, the 175the kobject hierarchy and exported via sysfs. More importantly, the
@@ -209,6 +208,58 @@ the hierarchy.
209kset_find_obj() may be used to locate a kobject with a particular 208kset_find_obj() may be used to locate a kobject with a particular
210name. The kobject, if found, is returned. 209name. The kobject, if found, is returned.
211 210
211There are also some helper functions whose names point to the formerly
212existing "struct subsystem", whose functions have been taken over by
213ksets.
214
215
216decl_subsys(name,type,uevent_ops)
217
218Declares a kset named '<name>_subsys' of type <type> with
219uevent_ops <uevent_ops>. For example,
220
221decl_subsys(devices, &ktype_device, &device_uevent_ops);
222
223is equivalent to doing:
224
225struct kset devices_subsys = {
226 .kobj = {
227 .name = "devices",
228 },
229 .ktype = &ktype_device,
230 .uevent_ops = &device_uevent_ops,
231};
232
233
234The objects that are registered with a subsystem that use the
235subsystem's default list must have their kset ptr set properly. These
236objects may have embedded kobjects or ksets. The
237following helpers make setting the kset easier:
238
239
240kobj_set_kset_s(obj,subsys)
241
242- Assumes that obj->kobj exists, and is a struct kobject.
243- Sets the kset of that kobject to the kset <subsys>.
244
245
246kset_set_kset_s(obj,subsys)
247
248- Assumes that obj->kset exists, and is a struct kset.
249- Sets the kset of the embedded kobject to the kset <subsys>.
250
251subsys_set_kset(obj,subsys)
252
253- Assumes obj->subsys exists, and is a struct subsystem.
254- Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset.
255
256void subsystem_init(struct kset *s);
257int subsystem_register(struct kset *s);
258void subsystem_unregister(struct kset *s);
259struct kset *subsys_get(struct kset *s);
260void kset_put(struct kset *s);
261
262These are just wrappers around the respective kset_* functions.
212 263
2132.3 sysfs 2642.3 sysfs
214 265
@@ -254,114 +305,3 @@ Instances of struct kobj_type are not registered; only referenced by
254the kset. A kobj_type may be referenced by an arbitrary number of 305the kset. A kobj_type may be referenced by an arbitrary number of
255ksets, as there may be disparate sets of identical objects. 306ksets, as there may be disparate sets of identical objects.
256 307
257
258
2594. subsystems
260
2614.1 Description
262
263A subsystem represents a significant entity of code that maintains an
264arbitrary number of sets of objects of various types. Since the number
265of ksets and the type of objects they contain are variable, a
266generic representation of a subsystem is minimal.
267
268
269struct subsystem {
270 struct kset kset;
271 struct rw_semaphore rwsem;
272};
273
274int subsystem_register(struct subsystem *);
275void subsystem_unregister(struct subsystem *);
276
277struct subsystem * subsys_get(struct subsystem * s);
278void subsys_put(struct subsystem * s);
279
280
281A subsystem contains an embedded kset so:
282
283- It can be represented in the object hierarchy via the kset's
284 embedded kobject.
285
286- It can maintain a default list of objects of one type.
287
288Additional ksets may attach to the subsystem simply by referencing the
289subsystem before they are registered. (This one-way reference means
290that there is no way to determine the ksets that are attached to the
291subsystem.)
292
293All ksets that are attached to a subsystem share the subsystem's R/W
294semaphore.
295
296
2974.2 subsystem Programming Interface.
298
299The subsystem programming interface is simple and does not offer the
300flexibility that the kset and kobject programming interfaces do. They
301may be registered and unregistered, as well as reference counted. Each
302call forwards the calls to their embedded ksets (which forward the
303calls to their embedded kobjects).
304
305
3064.3 Helpers
307
308A number of macros are available to make dealing with subsystems and
309their embedded objects easier.
310
311
312decl_subsys(name,type)
313
314Declares a subsystem named '<name>_subsys', with an embedded kset of
315type <type>. For example,
316
317decl_subsys(devices,&ktype_devices);
318
319is equivalent to doing:
320
321struct subsystem device_subsys = {
322 .kset = {
323 .kobj = {
324 .name = "devices",
325 },
326 .ktype = &ktype_devices,
327 }
328};
329
330
331The objects that are registered with a subsystem that use the
332subsystem's default list must have their kset ptr set properly. These
333objects may have embedded kobjects, ksets, or other subsystems. The
334following helpers make setting the kset easier:
335
336
337kobj_set_kset_s(obj,subsys)
338
339- Assumes that obj->kobj exists, and is a struct kobject.
340- Sets the kset of that kobject to the subsystem's embedded kset.
341
342
343kset_set_kset_s(obj,subsys)
344
345- Assumes that obj->kset exists, and is a struct kset.
346- Sets the kset of the embedded kobject to the subsystem's
347 embedded kset.
348
349subsys_set_kset(obj,subsys)
350
351- Assumes obj->subsys exists, and is a struct subsystem.
352- Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset.
353
354
3554.4 sysfs
356
357subsystems are represented in sysfs via their embedded kobjects. They
358follow the same rules as previously mentioned with no exceptions. They
359typically receive a top-level directory in sysfs, except when their
360embedded kobject is part of another kset, or the parent of the
361embedded kobject is explicitly set.
362
363Note that the subsystem's embedded kset must be 'attached' to the
364subsystem itself in order to use its rwsem. This is done after
365kset_add() has been called. (Not before, because kset_add() uses its
366subsystem for a default parent if it doesn't already have one).
367
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index b9b9427376e9..31e794ef5f98 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -11,8 +11,7 @@ endif
11include $(KBUILD_OUTPUT)/.config 11include $(KBUILD_OUTPUT)/.config
12LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) 12LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
13 13
14CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \ 14CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
15 -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
16LDLIBS:=-lz 15LDLIBS:=-lz
17 16
18all: lguest.lds lguest 17all: lguest.lds lguest
diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract
new file mode 100644
index 000000000000..7730bb6e4b94
--- /dev/null
+++ b/Documentation/lguest/extract
@@ -0,0 +1,58 @@
1#! /bin/sh
2
3set -e
4
5PREFIX=$1
6shift
7
8trap 'rm -r $TMPDIR' 0
9TMPDIR=`mktemp -d`
10
11exec 3>/dev/null
12for f; do
13 while IFS="
14" read -r LINE; do
15 case "$LINE" in
16 *$PREFIX:[0-9]*:\**)
17 NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
18 if [ -f $TMPDIR/$NUM ]; then
19 echo "$TMPDIR/$NUM already exits prior to $f"
20 exit 1
21 fi
22 exec 3>>$TMPDIR/$NUM
23 echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
24 /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
25 ;;
26 *$PREFIX:[0-9]*)
27 NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
28 if [ -f $TMPDIR/$NUM ]; then
29 echo "$TMPDIR/$NUM already exits prior to $f"
30 exit 1
31 fi
32 exec 3>>$TMPDIR/$NUM
33 echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
34 /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
35 ;;
36 *:\**)
37 /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
38 echo >&3
39 exec 3>/dev/null
40 ;;
41 *)
42 /bin/echo "$LINE" >&3
43 ;;
44 esac
45 done < $f
46 echo >&3
47 exec 3>/dev/null
48done
49
50LASTFILE=""
51for f in $TMPDIR/*; do
52 if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
53 LASTFILE=$(cat $TMPDIR/.$(basename $f) )
54 echo "[ $LASTFILE ]"
55 fi
56 cat $f
57done
58
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 1432b502a2d9..f7918401a007 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,5 +1,10 @@
1/* Simple program to layout "physical" memory for new lguest guest. 1/*P:100 This is the Launcher code, a simple program which lays out the
2 * Linked high to avoid likely physical memory. */ 2 * "physical" memory for the new Guest by mapping the kernel image and the
3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
4 *
5 * The only trick: the Makefile links it at a high address so it will be clear
6 * of the guest memory region. It means that each Guest cannot have more than
7 * about 2.5G of memory on a normally configured Host. :*/
3#define _LARGEFILE64_SOURCE 8#define _LARGEFILE64_SOURCE
4#define _GNU_SOURCE 9#define _GNU_SOURCE
5#include <stdio.h> 10#include <stdio.h>
@@ -29,12 +34,20 @@
29#include <termios.h> 34#include <termios.h>
30#include <getopt.h> 35#include <getopt.h>
31#include <zlib.h> 36#include <zlib.h>
37/*L:110 We can ignore the 28 include files we need for this program, but I do
38 * want to draw attention to the use of kernel-style types.
39 *
40 * As Linus said, "C is a Spartan language, and so should your naming be." I
41 * like these abbreviations and the header we need uses them, so we define them
42 * here.
43 */
32typedef unsigned long long u64; 44typedef unsigned long long u64;
33typedef uint32_t u32; 45typedef uint32_t u32;
34typedef uint16_t u16; 46typedef uint16_t u16;
35typedef uint8_t u8; 47typedef uint8_t u8;
36#include "../../include/linux/lguest_launcher.h" 48#include "../../include/linux/lguest_launcher.h"
37#include "../../include/asm-i386/e820.h" 49#include "../../include/asm-i386/e820.h"
50/*:*/
38 51
39#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 52#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
40#define NET_PEERNUM 1 53#define NET_PEERNUM 1
@@ -43,31 +56,52 @@ typedef uint8_t u8;
43#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 56#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
44#endif 57#endif
45 58
59/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
60 * this, and although I wouldn't recommend it, it works quite nicely here. */
46static bool verbose; 61static bool verbose;
47#define verbose(args...) \ 62#define verbose(args...) \
48 do { if (verbose) printf(args); } while(0) 63 do { if (verbose) printf(args); } while(0)
64/*:*/
65
66/* The pipe to send commands to the waker process */
49static int waker_fd; 67static int waker_fd;
68/* The top of guest physical memory. */
69static u32 top;
50 70
71/* This is our list of devices. */
51struct device_list 72struct device_list
52{ 73{
74 /* Summary information about the devices in our list: ready to pass to
75 * select() to ask which need servicing.*/
53 fd_set infds; 76 fd_set infds;
54 int max_infd; 77 int max_infd;
55 78
79 /* The descriptor page for the devices. */
80 struct lguest_device_desc *descs;
81
82 /* A single linked list of devices. */
56 struct device *dev; 83 struct device *dev;
84 /* ... And an end pointer so we can easily append new devices */
57 struct device **lastdev; 85 struct device **lastdev;
58}; 86};
59 87
88/* The device structure describes a single device. */
60struct device 89struct device
61{ 90{
91 /* The linked-list pointer. */
62 struct device *next; 92 struct device *next;
93 /* The descriptor for this device, as mapped into the Guest. */
63 struct lguest_device_desc *desc; 94 struct lguest_device_desc *desc;
95 /* The memory page(s) of this device, if any. Also mapped in Guest. */
64 void *mem; 96 void *mem;
65 97
66 /* Watch this fd if handle_input non-NULL. */ 98 /* If handle_input is set, it wants to be called when this file
99 * descriptor is ready. */
67 int fd; 100 int fd;
68 bool (*handle_input)(int fd, struct device *me); 101 bool (*handle_input)(int fd, struct device *me);
69 102
70 /* Watch DMA to this key if handle_input non-NULL. */ 103 /* If handle_output is set, it wants to be called when the Guest sends
104 * DMA to this key. */
71 unsigned long watch_key; 105 unsigned long watch_key;
72 u32 (*handle_output)(int fd, const struct iovec *iov, 106 u32 (*handle_output)(int fd, const struct iovec *iov,
73 unsigned int num, struct device *me); 107 unsigned int num, struct device *me);
@@ -76,6 +110,11 @@ struct device
76 void *priv; 110 void *priv;
77}; 111};
78 112
113/*L:130
114 * Loading the Kernel.
115 *
116 * We start with a couple of simple helper routines. open_or_die() avoids
117 * error-checking code cluttering the callers: */
79static int open_or_die(const char *name, int flags) 118static int open_or_die(const char *name, int flags)
80{ 119{
81 int fd = open(name, flags); 120 int fd = open(name, flags);
@@ -84,26 +123,38 @@ static int open_or_die(const char *name, int flags)
84 return fd; 123 return fd;
85} 124}
86 125
126/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
87static void *map_zeroed_pages(unsigned long addr, unsigned int num) 127static void *map_zeroed_pages(unsigned long addr, unsigned int num)
88{ 128{
129 /* We cache the /dev/zero file-descriptor so we only open it once. */
89 static int fd = -1; 130 static int fd = -1;
90 131
91 if (fd == -1) 132 if (fd == -1)
92 fd = open_or_die("/dev/zero", O_RDONLY); 133 fd = open_or_die("/dev/zero", O_RDONLY);
93 134
135 /* We use a private mapping (ie. if we write to the page, it will be
136 * copied), and obviously we insist that it be mapped where we ask. */
94 if (mmap((void *)addr, getpagesize() * num, 137 if (mmap((void *)addr, getpagesize() * num,
95 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) 138 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
96 != (void *)addr) 139 != (void *)addr)
97 err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); 140 err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
141
142 /* Returning the address is just a courtesy: can simplify callers. */
98 return (void *)addr; 143 return (void *)addr;
99} 144}
100 145
101/* Find magic string marking entry point, return entry point. */ 146/* To find out where to start we look for the magic Guest string, which marks
147 * the code we see in lguest_asm.S. This is a hack which we are currently
148 * plotting to replace with the normal Linux entry point. */
102static unsigned long entry_point(void *start, void *end, 149static unsigned long entry_point(void *start, void *end,
103 unsigned long page_offset) 150 unsigned long page_offset)
104{ 151{
105 void *p; 152 void *p;
106 153
154 /* The scan gives us the physical starting address. We want the
155 * virtual address in this case, and fortunately, we already figured
156 * out the physical-virtual difference and passed it here in
157 * "page_offset". */
107 for (p = start; p < end; p++) 158 for (p = start; p < end; p++)
108 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) 159 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
109 return (long)p + strlen("GenuineLguest") + page_offset; 160 return (long)p + strlen("GenuineLguest") + page_offset;
@@ -111,7 +162,17 @@ static unsigned long entry_point(void *start, void *end,
111 err(1, "Is this image a genuine lguest?"); 162 err(1, "Is this image a genuine lguest?");
112} 163}
113 164
114/* Returns the entry point */ 165/* This routine takes an open vmlinux image, which is in ELF, and maps it into
166 * the Guest memory. ELF = Executable and Linkable Format, which is the format used
167 * by all modern binaries on Linux including the kernel.
168 *
169 * The ELF headers give *two* addresses: a physical address, and a virtual
170 * address. The Guest kernel expects to be placed in memory at the physical
171 * address, and the page tables set up so it will correspond to that virtual
172 * address. We return the difference between the virtual and physical
173 * addresses in the "page_offset" pointer.
174 *
175 * We return the starting address. */
115static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 176static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
116 unsigned long *page_offset) 177 unsigned long *page_offset)
117{ 178{
@@ -120,40 +181,61 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
120 unsigned int i; 181 unsigned int i;
121 unsigned long start = -1UL, end = 0; 182 unsigned long start = -1UL, end = 0;
122 183
123 /* Sanity checks. */ 184 /* Sanity checks on the main ELF header: an x86 executable with a
185 * reasonable number of correctly-sized program headers. */
124 if (ehdr->e_type != ET_EXEC 186 if (ehdr->e_type != ET_EXEC
125 || ehdr->e_machine != EM_386 187 || ehdr->e_machine != EM_386
126 || ehdr->e_phentsize != sizeof(Elf32_Phdr) 188 || ehdr->e_phentsize != sizeof(Elf32_Phdr)
127 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 189 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
128 errx(1, "Malformed elf header"); 190 errx(1, "Malformed elf header");
129 191
192 /* An ELF executable contains an ELF header and a number of "program"
193 * headers which indicate which parts ("segments") of the program to
194 * load where. */
195
196 /* We read in all the program headers at once: */
130 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) 197 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
131 err(1, "Seeking to program headers"); 198 err(1, "Seeking to program headers");
132 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 199 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
133 err(1, "Reading program headers"); 200 err(1, "Reading program headers");
134 201
202 /* We don't know page_offset yet. */
135 *page_offset = 0; 203 *page_offset = 0;
136 /* We map the loadable segments at virtual addresses corresponding 204
137 * to their physical addresses (our virtual == guest physical). */ 205 /* Try all the headers: there are usually only three. A read-only one,
206 * a read-write one, and a "note" section which isn't loadable. */
138 for (i = 0; i < ehdr->e_phnum; i++) { 207 for (i = 0; i < ehdr->e_phnum; i++) {
208 /* If this isn't a loadable segment, we ignore it */
139 if (phdr[i].p_type != PT_LOAD) 209 if (phdr[i].p_type != PT_LOAD)
140 continue; 210 continue;
141 211
142 verbose("Section %i: size %i addr %p\n", 212 verbose("Section %i: size %i addr %p\n",
143 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 213 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
144 214
145 /* We expect linear address space. */ 215 /* We expect a simple linear address space: every segment must
216 * have the same difference between virtual (p_vaddr) and
217 * physical (p_paddr) address. */
146 if (!*page_offset) 218 if (!*page_offset)
147 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; 219 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
148 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) 220 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
149 errx(1, "Page offset of section %i different", i); 221 errx(1, "Page offset of section %i different", i);
150 222
223 /* We track the first and last address we mapped, so we can
224 * tell entry_point() where to scan. */
151 if (phdr[i].p_paddr < start) 225 if (phdr[i].p_paddr < start)
152 start = phdr[i].p_paddr; 226 start = phdr[i].p_paddr;
153 if (phdr[i].p_paddr + phdr[i].p_filesz > end) 227 if (phdr[i].p_paddr + phdr[i].p_filesz > end)
154 end = phdr[i].p_paddr + phdr[i].p_filesz; 228 end = phdr[i].p_paddr + phdr[i].p_filesz;
155 229
156 /* We map everything private, writable. */ 230 /* We map this section of the file at its physical address. We
231 * map it read & write even if the header says this segment is
232 * read-only. The kernel really wants to be writable: it
233 * patches its own instructions which would normally be
234 * read-only.
235 *
236 * MAP_PRIVATE means that the page won't be copied until a
237 * write is done to it. This allows us to share much of the
238 * kernel memory between Guests. */
157 addr = mmap((void *)phdr[i].p_paddr, 239 addr = mmap((void *)phdr[i].p_paddr,
158 phdr[i].p_filesz, 240 phdr[i].p_filesz,
159 PROT_READ|PROT_WRITE|PROT_EXEC, 241 PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -167,7 +249,31 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
167 return entry_point((void *)start, (void *)end, *page_offset); 249 return entry_point((void *)start, (void *)end, *page_offset);
168} 250}
169 251
170/* This is amazingly reliable. */ 252/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
253 *
254 * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
255 * to be. We don't know what that option was, but we can figure it out
256 * approximately by looking at the addresses in the code. I chose the common
257 * case of reading a memory location into the %eax register:
258 *
259 * movl <some-address>, %eax
260 *
261 * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
262 * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
263 *
264 * In this example we can guess that the kernel was compiled with
265 * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
266 * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
267 * kernel isn't that bloated yet.
268 *
269 * Unfortunately, x86 has variable-length instructions, so finding this
270 * particular instruction properly involves writing a disassembler. Instead,
271 * we rely on statistics. We look for "0xA1" and tally the different bytes
272 * which occur 4 bytes later (the "0xC0" in our example above). When one of
273 * those bytes appears three times, we can be reasonably confident that it
274 * forms the start of CONFIG_PAGE_OFFSET.
275 *
276 * This is amazingly reliable. */
171static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) 277static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
172{ 278{
173 unsigned int i, possibilities[256] = { 0 }; 279 unsigned int i, possibilities[256] = { 0 };
@@ -180,30 +286,52 @@ static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
180 errx(1, "could not determine page offset"); 286 errx(1, "could not determine page offset");
181} 287}
182 288
289/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
290 * which need loading are extracted and compressed raw. This denies us the
291 * information we need to make a fully-general loader. */
183static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) 292static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
184{ 293{
185 gzFile f; 294 gzFile f;
186 int ret, len = 0; 295 int ret, len = 0;
296 /* A bzImage always gets loaded at physical address 1M. This is
297 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
298 * there says, "Don't change this unless you know what you are doing".
299 * Indeed. */
187 void *img = (void *)0x100000; 300 void *img = (void *)0x100000;
188 301
302 /* gzdopen takes our file descriptor (carefully placed at the start of
303 * the GZIP header we found) and returns a gzFile. */
189 f = gzdopen(fd, "rb"); 304 f = gzdopen(fd, "rb");
305 /* We read it into memory in 64k chunks until we hit the end. */
190 while ((ret = gzread(f, img + len, 65536)) > 0) 306 while ((ret = gzread(f, img + len, 65536)) > 0)
191 len += ret; 307 len += ret;
192 if (ret < 0) 308 if (ret < 0)
193 err(1, "reading image from bzImage"); 309 err(1, "reading image from bzImage");
194 310
195 verbose("Unpacked size %i addr %p\n", len, img); 311 verbose("Unpacked size %i addr %p\n", len, img);
312
313 /* Without the ELF header, we can't tell the virtual-physical gap. This is
314 * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
315 * I have a clever way of figuring it out from the code itself. */
196 *page_offset = intuit_page_offset(img, len); 316 *page_offset = intuit_page_offset(img, len);
197 317
198 return entry_point(img, img + len, *page_offset); 318 return entry_point(img, img + len, *page_offset);
199} 319}
200 320
321/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
322 * supposed to jump into it and it will unpack itself. We can't do that
323 * because the Guest can't run the unpacking code, and adding features to
324 * lguest kills puppies, so we don't want to.
325 *
326 * The bzImage is formed by putting the decompressing code in front of the
327 * compressed kernel code. So we can simply scan through it looking for the
328 * first "gzip" header, and start decompressing from there. */
201static unsigned long load_bzimage(int fd, unsigned long *page_offset) 329static unsigned long load_bzimage(int fd, unsigned long *page_offset)
202{ 330{
203 unsigned char c; 331 unsigned char c;
204 int state = 0; 332 int state = 0;
205 333
206 /* Ugly brute force search for gzip header. */ 334 /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
207 while (read(fd, &c, 1) == 1) { 335 while (read(fd, &c, 1) == 1) {
208 switch (state) { 336 switch (state) {
209 case 0: 337 case 0:
@@ -220,8 +348,10 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
220 state++; 348 state++;
221 break; 349 break;
222 case 9: 350 case 9:
351 /* Seek back to the start of the gzip header. */
223 lseek(fd, -10, SEEK_CUR); 352 lseek(fd, -10, SEEK_CUR);
224 if (c != 0x03) /* Compressed under UNIX. */ 353 /* One final check: "compressed under UNIX". */
354 if (c != 0x03)
225 state = -1; 355 state = -1;
226 else 356 else
227 return unpack_bzimage(fd, page_offset); 357 return unpack_bzimage(fd, page_offset);
@@ -230,25 +360,43 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
230 errx(1, "Could not find kernel in bzImage"); 360 errx(1, "Could not find kernel in bzImage");
231} 361}
232 362
363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
364 * come wrapped up in the self-decompressing "bzImage" format. With some funky
365 * coding, we can load those, too. */
233static unsigned long load_kernel(int fd, unsigned long *page_offset) 366static unsigned long load_kernel(int fd, unsigned long *page_offset)
234{ 367{
235 Elf32_Ehdr hdr; 368 Elf32_Ehdr hdr;
236 369
370 /* Read in the first few bytes. */
237 if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 371 if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
238 err(1, "Reading kernel"); 372 err(1, "Reading kernel");
239 373
374 /* If it's an ELF file, it starts with "\177ELF" */
240 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
241 return map_elf(fd, &hdr, page_offset); 376 return map_elf(fd, &hdr, page_offset);
242 377
378 /* Otherwise we assume it's a bzImage, and try to unpack it */
243 return load_bzimage(fd, page_offset); 379 return load_bzimage(fd, page_offset);
244} 380}
245 381
382/* This is a trivial little helper to align pages. Andi Kleen hated it because
383 * it calls getpagesize() twice: "it's dumb code."
384 *
385 * Kernel guys get really het up about optimization, even when it's not
386 * necessary. I leave this code as a reaction against that. */
246static inline unsigned long page_align(unsigned long addr) 387static inline unsigned long page_align(unsigned long addr)
247{ 388{
389 /* Add upwards and truncate downwards. */
248 return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 390 return ((addr + getpagesize()-1) & ~(getpagesize()-1));
249} 391}
250 392
251/* initrd gets loaded at top of memory: return length. */ 393/*L:180 An "initial ram disk" is a disk image loaded into memory along with
394 * the kernel which the kernel can use to boot from without needing any
395 * drivers. Most distributions now use this as standard: the initrd contains
396 * the code to load the appropriate driver modules for the current machine.
397 *
398 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
399 * kernels. He sent me this (and tells me when I break it). */
252static unsigned long load_initrd(const char *name, unsigned long mem) 400static unsigned long load_initrd(const char *name, unsigned long mem)
253{ 401{
254 int ifd; 402 int ifd;
@@ -257,21 +405,35 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
257 void *iaddr; 405 void *iaddr;
258 406
259 ifd = open_or_die(name, O_RDONLY); 407 ifd = open_or_die(name, O_RDONLY);
408 /* fstat() is needed to get the file size. */
260 if (fstat(ifd, &st) < 0) 409 if (fstat(ifd, &st) < 0)
261 err(1, "fstat() on initrd '%s'", name); 410 err(1, "fstat() on initrd '%s'", name);
262 411
412 /* The length needs to be rounded up to a page size: mmap needs the
413 * address to be page aligned. */
263 len = page_align(st.st_size); 414 len = page_align(st.st_size);
415 /* We map the initrd at the top of memory. */
264 iaddr = mmap((void *)mem - len, st.st_size, 416 iaddr = mmap((void *)mem - len, st.st_size,
265 PROT_READ|PROT_EXEC|PROT_WRITE, 417 PROT_READ|PROT_EXEC|PROT_WRITE,
266 MAP_FIXED|MAP_PRIVATE, ifd, 0); 418 MAP_FIXED|MAP_PRIVATE, ifd, 0);
267 if (iaddr != (void *)mem - len) 419 if (iaddr != (void *)mem - len)
268 err(1, "Mmaping initrd '%s' returned %p not %p", 420 err(1, "Mmaping initrd '%s' returned %p not %p",
269 name, iaddr, (void *)mem - len); 421 name, iaddr, (void *)mem - len);
422 /* Once a file is mapped, you can close the file descriptor. It's a
423 * little odd, but quite useful. */
270 close(ifd); 424 close(ifd);
271 verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); 425 verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
426
427 /* We return the initrd size. */
272 return len; 428 return len;
273} 429}
274 430
431/* Once we know how much memory we have, and the address the Guest kernel
432 * expects, we can construct simple linear page tables which will get the Guest
433 * far enough into the boot to create its own.
434 *
435 * We lay them out of the way, just below the initrd (which is why we need to
436 * know its size). */
275static unsigned long setup_pagetables(unsigned long mem, 437static unsigned long setup_pagetables(unsigned long mem,
276 unsigned long initrd_size, 438 unsigned long initrd_size,
277 unsigned long page_offset) 439 unsigned long page_offset)
@@ -280,23 +442,32 @@ static unsigned long setup_pagetables(unsigned long mem,
280 unsigned int mapped_pages, i, linear_pages; 442 unsigned int mapped_pages, i, linear_pages;
281 unsigned int ptes_per_page = getpagesize()/sizeof(u32); 443 unsigned int ptes_per_page = getpagesize()/sizeof(u32);
282 444
283 /* If we can map all of memory above page_offset, we do so. */ 445 /* Ideally we map all physical memory starting at page_offset.
446 * However, if page_offset is 0xC0000000 we can only map 1G of physical
447 * (0xC0000000 + 1G overflows). */
284 if (mem <= -page_offset) 448 if (mem <= -page_offset)
285 mapped_pages = mem/getpagesize(); 449 mapped_pages = mem/getpagesize();
286 else 450 else
287 mapped_pages = -page_offset/getpagesize(); 451 mapped_pages = -page_offset/getpagesize();
288 452
289 /* Each linear PTE page can map ptes_per_page pages. */ 453 /* Each PTE page can map ptes_per_page pages: how many do we need? */
290 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 454 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
291 455
292 /* We lay out top-level then linear mapping immediately below initrd */ 456 /* We put the toplevel page directory page at the top of memory. */
293 pgdir = (void *)mem - initrd_size - getpagesize(); 457 pgdir = (void *)mem - initrd_size - getpagesize();
458
459 /* Now we use the next linear_pages pages as pte pages */
294 linear = (void *)pgdir - linear_pages*getpagesize(); 460 linear = (void *)pgdir - linear_pages*getpagesize();
295 461
462 /* Linear mapping is easy: put every page's address into the mapping in
463 * order. PAGE_PRESENT contains the flags Present, Writable and
464 * Executable. */
296 for (i = 0; i < mapped_pages; i++) 465 for (i = 0; i < mapped_pages; i++)
297 linear[i] = ((i * getpagesize()) | PAGE_PRESENT); 466 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
298 467
299 /* Now set up pgd so that this memory is at page_offset */ 468 /* The top level points to the linear page table pages above. The
469 * entry representing page_offset points to the first one, and they
470 * continue from there. */
300 for (i = 0; i < mapped_pages; i += ptes_per_page) { 471 for (i = 0; i < mapped_pages; i += ptes_per_page) {
301 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 472 pgdir[(i + page_offset/getpagesize())/ptes_per_page]
302 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); 473 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
@@ -305,9 +476,13 @@ static unsigned long setup_pagetables(unsigned long mem,
305 verbose("Linear mapping of %u pages in %u pte pages at %p\n", 476 verbose("Linear mapping of %u pages in %u pte pages at %p\n",
306 mapped_pages, linear_pages, linear); 477 mapped_pages, linear_pages, linear);
307 478
479 /* We return the top level (guest-physical) address: the kernel needs
480 * to know where it is. */
308 return (unsigned long)pgdir; 481 return (unsigned long)pgdir;
309} 482}
310 483
484/* Simple routine to roll all the commandline arguments together with spaces
485 * between them. */
311static void concat(char *dst, char *args[]) 486static void concat(char *dst, char *args[])
312{ 487{
313 unsigned int i, len = 0; 488 unsigned int i, len = 0;
@@ -321,18 +496,24 @@ static void concat(char *dst, char *args[])
321 dst[len] = '\0'; 496 dst[len] = '\0';
322} 497}
323 498
499/* This is where we actually tell the kernel to initialize the Guest. We saw
500 * the arguments it expects when we looked at initialize() in lguest_user.c:
501 * the top physical page to allow, the top level pagetable, the entry point and
502 * the page_offset constant for the Guest. */
324static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) 503static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
325{ 504{
326 u32 args[] = { LHREQ_INITIALIZE, 505 u32 args[] = { LHREQ_INITIALIZE,
327 LGUEST_GUEST_TOP/getpagesize(), /* Just below us */ 506 top/getpagesize(), pgdir, start, page_offset };
328 pgdir, start, page_offset };
329 int fd; 507 int fd;
330 508
331 fd = open_or_die("/dev/lguest", O_RDWR); 509 fd = open_or_die("/dev/lguest", O_RDWR);
332 if (write(fd, args, sizeof(args)) < 0) 510 if (write(fd, args, sizeof(args)) < 0)
333 err(1, "Writing to /dev/lguest"); 511 err(1, "Writing to /dev/lguest");
512
513 /* We return the /dev/lguest file descriptor to control this Guest */
334 return fd; 514 return fd;
335} 515}
516/*:*/
336 517
337static void set_fd(int fd, struct device_list *devices) 518static void set_fd(int fd, struct device_list *devices)
338{ 519{
@@ -341,61 +522,108 @@ static void set_fd(int fd, struct device_list *devices)
341 devices->max_infd = fd; 522 devices->max_infd = fd;
342} 523}
343 524
344/* When input arrives, we tell the kernel to kick lguest out with -EAGAIN. */ 525/*L:200
526 * The Waker.
527 *
528 * With a console and network devices, we can have lots of input which we need
529 * to process. We could try to tell the kernel what file descriptors to watch,
530 * but handing a file descriptor mask through to the kernel is fairly icky.
531 *
532 * Instead, we fork off a process which watches the file descriptors and writes
533 * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host
534 * loop to stop running the Guest. This causes it to return from the
535 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
536 * the LHREQ_BREAK and wake us up again.
537 *
538 * This, of course, is merely a different *kind* of icky.
539 */
345static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) 540static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
346{ 541{
542 /* Add the pipe from the Launcher to the fdset in the device_list, so
543 * we watch it, too. */
347 set_fd(pipefd, devices); 544 set_fd(pipefd, devices);
348 545
349 for (;;) { 546 for (;;) {
350 fd_set rfds = devices->infds; 547 fd_set rfds = devices->infds;
351 u32 args[] = { LHREQ_BREAK, 1 }; 548 u32 args[] = { LHREQ_BREAK, 1 };
352 549
550 /* Wait until input is ready from one of the devices. */
353 select(devices->max_infd+1, &rfds, NULL, NULL, NULL); 551 select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
552 /* Is it a message from the Launcher? */
354 if (FD_ISSET(pipefd, &rfds)) { 553 if (FD_ISSET(pipefd, &rfds)) {
355 int ignorefd; 554 int ignorefd;
555 /* If read() returns 0, it means the Launcher has
556 * exited. We silently follow. */
356 if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) 557 if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
357 exit(0); 558 exit(0);
559 /* Otherwise it's telling us there's a problem with one
560 * of the devices, and we should ignore that file
561 * descriptor from now on. */
358 FD_CLR(ignorefd, &devices->infds); 562 FD_CLR(ignorefd, &devices->infds);
359 } else 563 } else /* Send LHREQ_BREAK command. */
360 write(lguest_fd, args, sizeof(args)); 564 write(lguest_fd, args, sizeof(args));
361 } 565 }
362} 566}
363 567
568/* This routine just sets up a pipe to the Waker process. */
364static int setup_waker(int lguest_fd, struct device_list *device_list) 569static int setup_waker(int lguest_fd, struct device_list *device_list)
365{ 570{
366 int pipefd[2], child; 571 int pipefd[2], child;
367 572
573 /* We create a pipe to talk to the waker, and also so it knows when the
574 * Launcher dies (and closes pipe). */
368 pipe(pipefd); 575 pipe(pipefd);
369 child = fork(); 576 child = fork();
370 if (child == -1) 577 if (child == -1)
371 err(1, "forking"); 578 err(1, "forking");
372 579
373 if (child == 0) { 580 if (child == 0) {
581 /* Close the "writing" end of our copy of the pipe */
374 close(pipefd[1]); 582 close(pipefd[1]);
375 wake_parent(pipefd[0], lguest_fd, device_list); 583 wake_parent(pipefd[0], lguest_fd, device_list);
376 } 584 }
585 /* Close the reading end of our copy of the pipe. */
377 close(pipefd[0]); 586 close(pipefd[0]);
378 587
588 /* Here is the fd used to talk to the waker. */
379 return pipefd[1]; 589 return pipefd[1];
380} 590}
381 591
592/*L:210
593 * Device Handling.
594 *
595 * When the Guest sends DMA to us, it sends us an array of addresses and sizes.
596 * We need to make sure it's not trying to reach into the Launcher itself, so
597 * we have a convenient routine which check it and exits with an error message
598 * if something funny is going on:
599 */
382static void *_check_pointer(unsigned long addr, unsigned int size, 600static void *_check_pointer(unsigned long addr, unsigned int size,
383 unsigned int line) 601 unsigned int line)
384{ 602{
385 if (addr >= LGUEST_GUEST_TOP || addr + size >= LGUEST_GUEST_TOP) 603 /* We have to separately check addr and addr+size, because size could
604 * be huge and addr + size might wrap around. */
605 if (addr >= top || addr + size >= top)
386 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); 606 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
607 /* We return a pointer for the caller's convenience, now we know it's
608 * safe to use. */
387 return (void *)addr; 609 return (void *)addr;
388} 610}
611/* A macro which transparently hands the line number to the real function. */
389#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 612#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
390 613
391/* Returns pointer to dma->used_len */ 614/* The Guest has given us the address of a "struct lguest_dma". We check it's
615 * OK and convert it to an iovec (which is a simple array of ptr/size
616 * pairs). */
392static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) 617static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
393{ 618{
394 unsigned int i; 619 unsigned int i;
395 struct lguest_dma *udma; 620 struct lguest_dma *udma;
396 621
622 /* First we make sure that the array memory itself is valid. */
397 udma = check_pointer(dma, sizeof(*udma)); 623 udma = check_pointer(dma, sizeof(*udma));
624 /* Now we check each element */
398 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { 625 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
626 /* A zero length ends the array. */
399 if (!udma->len[i]) 627 if (!udma->len[i])
400 break; 628 break;
401 629
@@ -403,9 +631,15 @@ static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
403 iov[i].iov_len = udma->len[i]; 631 iov[i].iov_len = udma->len[i];
404 } 632 }
405 *num = i; 633 *num = i;
634
635 /* We return the pointer to where the caller should write the amount of
636 * the buffer used. */
406 return &udma->used_len; 637 return &udma->used_len;
407} 638}
408 639
640/* This routine gets a DMA buffer from the Guest for a given key, and converts
641 * it to an iovec array. It returns the interrupt the Guest wants when we're
642 * finished, and a pointer to the "used_len" field to fill in. */
409static u32 *get_dma_buffer(int fd, void *key, 643static u32 *get_dma_buffer(int fd, void *key,
410 struct iovec iov[], unsigned int *num, u32 *irq) 644 struct iovec iov[], unsigned int *num, u32 *irq)
411{ 645{
@@ -413,16 +647,21 @@ static u32 *get_dma_buffer(int fd, void *key,
413 unsigned long udma; 647 unsigned long udma;
414 u32 *res; 648 u32 *res;
415 649
650 /* Ask the kernel for a DMA buffer corresponding to this key. */
416 udma = write(fd, buf, sizeof(buf)); 651 udma = write(fd, buf, sizeof(buf));
652 /* They haven't registered any, or they're all used? */
417 if (udma == (unsigned long)-1) 653 if (udma == (unsigned long)-1)
418 return NULL; 654 return NULL;
419 655
420 /* Kernel stashes irq in ->used_len. */ 656 /* Convert it into our iovec array */
421 res = dma2iov(udma, iov, num); 657 res = dma2iov(udma, iov, num);
658 /* The kernel stashes irq in ->used_len to get it out to us. */
422 *irq = *res; 659 *irq = *res;
660 /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
423 return res; 661 return res;
424} 662}
425 663
664/* This is a convenient routine to send the Guest an interrupt. */
426static void trigger_irq(int fd, u32 irq) 665static void trigger_irq(int fd, u32 irq)
427{ 666{
428 u32 buf[] = { LHREQ_IRQ, irq }; 667 u32 buf[] = { LHREQ_IRQ, irq };
@@ -430,6 +669,10 @@ static void trigger_irq(int fd, u32 irq)
430 err(1, "Triggering irq %i", irq); 669 err(1, "Triggering irq %i", irq);
431} 670}
432 671
672/* This simply sets up an iovec array where we can put data to be discarded.
673 * This happens when the Guest doesn't want or can't handle the input: we have
674 * to get rid of it somewhere, and if we bury it in the ceiling space it will
675 * start to smell after a week. */
433static void discard_iovec(struct iovec *iov, unsigned int *num) 676static void discard_iovec(struct iovec *iov, unsigned int *num)
434{ 677{
435 static char discard_buf[1024]; 678 static char discard_buf[1024];
@@ -438,19 +681,24 @@ static void discard_iovec(struct iovec *iov, unsigned int *num)
438 iov->iov_len = sizeof(discard_buf); 681 iov->iov_len = sizeof(discard_buf);
439} 682}
440 683
684/* Here is the input terminal setting we save, and the routine to restore it
685 * on exit so the user can see what they type next. */
441static struct termios orig_term; 686static struct termios orig_term;
442static void restore_term(void) 687static void restore_term(void)
443{ 688{
444 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); 689 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
445} 690}
446 691
692/* We associate some data with the console for our exit hack. */
447struct console_abort 693struct console_abort
448{ 694{
695 /* How many times have they hit ^C? */
449 int count; 696 int count;
697 /* When did they start? */
450 struct timeval start; 698 struct timeval start;
451}; 699};
452 700
453/* We DMA input to buffer bound at start of console page. */ 701/* This is the routine which handles console input (ie. stdin). */
454static bool handle_console_input(int fd, struct device *dev) 702static bool handle_console_input(int fd, struct device *dev)
455{ 703{
456 u32 irq = 0, *lenp; 704 u32 irq = 0, *lenp;
@@ -459,24 +707,38 @@ static bool handle_console_input(int fd, struct device *dev)
459 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 707 struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
460 struct console_abort *abort = dev->priv; 708 struct console_abort *abort = dev->priv;
461 709
710 /* First we get the console buffer from the Guest. The key is dev->mem
711 * which was set to 0 in setup_console(). */
462 lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); 712 lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
463 if (!lenp) { 713 if (!lenp) {
714 /* If it's not ready for input, warn and set up to discard. */
464 warn("console: no dma buffer!"); 715 warn("console: no dma buffer!");
465 discard_iovec(iov, &num); 716 discard_iovec(iov, &num);
466 } 717 }
467 718
719 /* This is why we convert to iovecs: the readv() call uses them, and so
720 * it reads straight into the Guest's buffer. */
468 len = readv(dev->fd, iov, num); 721 len = readv(dev->fd, iov, num);
469 if (len <= 0) { 722 if (len <= 0) {
723 /* This implies that the console is closed, is /dev/null, or
724 * something went terribly wrong. We still go through the rest
725 * of the logic, though, especially the exit handling below. */
470 warnx("Failed to get console input, ignoring console."); 726 warnx("Failed to get console input, ignoring console.");
471 len = 0; 727 len = 0;
472 } 728 }
473 729
730 /* If we read the data into the Guest, fill in the length and send the
731 * interrupt. */
474 if (lenp) { 732 if (lenp) {
475 *lenp = len; 733 *lenp = len;
476 trigger_irq(fd, irq); 734 trigger_irq(fd, irq);
477 } 735 }
478 736
479 /* Three ^C within one second? Exit. */ 737 /* Three ^C within one second? Exit.
738 *
739 * This is such a hack, but works surprisingly well. Each ^C has to be
740 * in a buffer by itself, so they can't be too fast. But we check that
741 * we get three within about a second, so they can't be too slow. */
480 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { 742 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
481 if (!abort->count++) 743 if (!abort->count++)
482 gettimeofday(&abort->start, NULL); 744 gettimeofday(&abort->start, NULL);
@@ -484,43 +746,60 @@ static bool handle_console_input(int fd, struct device *dev)
484 struct timeval now; 746 struct timeval now;
485 gettimeofday(&now, NULL); 747 gettimeofday(&now, NULL);
486 if (now.tv_sec <= abort->start.tv_sec+1) { 748 if (now.tv_sec <= abort->start.tv_sec+1) {
487 /* Make sure waker is not blocked in BREAK */
488 u32 args[] = { LHREQ_BREAK, 0 }; 749 u32 args[] = { LHREQ_BREAK, 0 };
750 /* Close the fd so Waker will know it has to
751 * exit. */
489 close(waker_fd); 752 close(waker_fd);
753 /* Just in case waker is blocked in BREAK, send
754 * unbreak now. */
490 write(fd, args, sizeof(args)); 755 write(fd, args, sizeof(args));
491 exit(2); 756 exit(2);
492 } 757 }
493 abort->count = 0; 758 abort->count = 0;
494 } 759 }
495 } else 760 } else
761 /* Any other key resets the abort counter. */
496 abort->count = 0; 762 abort->count = 0;
497 763
764 /* Now, if we didn't read anything, put the input terminal back and
765 * return failure (meaning, don't call us again). */
498 if (!len) { 766 if (!len) {
499 restore_term(); 767 restore_term();
500 return false; 768 return false;
501 } 769 }
770 /* Everything went OK! */
502 return true; 771 return true;
503} 772}
504 773
774/* Handling console output is much simpler than input. */
505static u32 handle_console_output(int fd, const struct iovec *iov, 775static u32 handle_console_output(int fd, const struct iovec *iov,
506 unsigned num, struct device*dev) 776 unsigned num, struct device*dev)
507{ 777{
778 /* Whatever the Guest sends, write it to standard output. Return the
779 * number of bytes written. */
508 return writev(STDOUT_FILENO, iov, num); 780 return writev(STDOUT_FILENO, iov, num);
509} 781}
510 782
783/* Guest->Host network output is also pretty easy. */
511static u32 handle_tun_output(int fd, const struct iovec *iov, 784static u32 handle_tun_output(int fd, const struct iovec *iov,
512 unsigned num, struct device *dev) 785 unsigned num, struct device *dev)
513{ 786{
514 /* Now we've seen output, we should warn if we can't get buffers. */ 787 /* We put a flag in the "priv" pointer of the network device, and set
788 * it as soon as we see output. We'll see why in handle_tun_input() */
515 *(bool *)dev->priv = true; 789 *(bool *)dev->priv = true;
790 /* Whatever packet the Guest sent us, write it out to the tun
791 * device. */
516 return writev(dev->fd, iov, num); 792 return writev(dev->fd, iov, num);
517} 793}
518 794
795/* This matches the peer_key() in lguest_net.c. The key for any given slot
796 * is the address of the network device's page plus 4 * the slot number. */
519static unsigned long peer_offset(unsigned int peernum) 797static unsigned long peer_offset(unsigned int peernum)
520{ 798{
521 return 4 * peernum; 799 return 4 * peernum;
522} 800}
523 801
802/* This is where we handle a packet coming in from the tun device */
524static bool handle_tun_input(int fd, struct device *dev) 803static bool handle_tun_input(int fd, struct device *dev)
525{ 804{
526 u32 irq = 0, *lenp; 805 u32 irq = 0, *lenp;
@@ -528,17 +807,28 @@ static bool handle_tun_input(int fd, struct device *dev)
528 unsigned num; 807 unsigned num;
529 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 808 struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
530 809
810 /* First we get a buffer the Guest has bound to its key. */
531 lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, 811 lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
532 &irq); 812 &irq);
533 if (!lenp) { 813 if (!lenp) {
814 /* Now, it's expected that if we try to send a packet too
815 * early, the Guest won't be ready yet. This is why we set a
816 * flag when the Guest sends its first packet. If it's sent a
817 * packet we assume it should be ready to receive them.
818 *
819 * Actually, this is what the status bits in the descriptor are
820 * for: we should *use* them. FIXME! */
534 if (*(bool *)dev->priv) 821 if (*(bool *)dev->priv)
535 warn("network: no dma buffer!"); 822 warn("network: no dma buffer!");
536 discard_iovec(iov, &num); 823 discard_iovec(iov, &num);
537 } 824 }
538 825
826 /* Read the packet from the device directly into the Guest's buffer. */
539 len = readv(dev->fd, iov, num); 827 len = readv(dev->fd, iov, num);
540 if (len <= 0) 828 if (len <= 0)
541 err(1, "reading network"); 829 err(1, "reading network");
830
831 /* Write the used_len, and trigger the interrupt for the Guest */
542 if (lenp) { 832 if (lenp) {
543 *lenp = len; 833 *lenp = len;
544 trigger_irq(fd, irq); 834 trigger_irq(fd, irq);
@@ -546,9 +836,13 @@ static bool handle_tun_input(int fd, struct device *dev)
546 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 836 verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
547 ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], 837 ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
548 lenp ? "sent" : "discarded"); 838 lenp ? "sent" : "discarded");
839 /* All good. */
549 return true; 840 return true;
550} 841}
551 842
843/* The last device handling routine is block output: the Guest has sent a DMA
844 * to the block device. It will have placed the command it wants in the
845 * "struct lguest_block_page". */
552static u32 handle_block_output(int fd, const struct iovec *iov, 846static u32 handle_block_output(int fd, const struct iovec *iov,
553 unsigned num, struct device *dev) 847 unsigned num, struct device *dev)
554{ 848{
@@ -558,36 +852,64 @@ static u32 handle_block_output(int fd, const struct iovec *iov,
558 struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; 852 struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
559 off64_t device_len, off = (off64_t)p->sector * 512; 853 off64_t device_len, off = (off64_t)p->sector * 512;
560 854
855 /* First we extract the device length from the dev->priv pointer. */
561 device_len = *(off64_t *)dev->priv; 856 device_len = *(off64_t *)dev->priv;
562 857
858 /* We first check that the read or write is within the length of the
859 * block file. */
563 if (off >= device_len) 860 if (off >= device_len)
564 err(1, "Bad offset %llu vs %llu", off, device_len); 861 err(1, "Bad offset %llu vs %llu", off, device_len);
862 /* Move to the right location in the block file. This shouldn't fail,
863 * but best to check. */
565 if (lseek64(dev->fd, off, SEEK_SET) != off) 864 if (lseek64(dev->fd, off, SEEK_SET) != off)
566 err(1, "Bad seek to sector %i", p->sector); 865 err(1, "Bad seek to sector %i", p->sector);
567 866
568 verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); 867 verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
569 868
869 /* They were supposed to bind a reply buffer at key equal to the start
870 * of the block device memory. We need this to tell them when the
871 * request is finished. */
570 lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); 872 lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
571 if (!lenp) 873 if (!lenp)
572 err(1, "Block request didn't give us a dma buffer"); 874 err(1, "Block request didn't give us a dma buffer");
573 875
574 if (p->type) { 876 if (p->type) {
877 /* A write request. The DMA they sent contained the data, so
878 * write it out. */
575 len = writev(dev->fd, iov, num); 879 len = writev(dev->fd, iov, num);
880 /* Grr... Now we know how long the "struct lguest_dma" they
881 * sent was, we make sure they didn't try to write over the end
882 * of the block file (possibly extending it). */
576 if (off + len > device_len) { 883 if (off + len > device_len) {
884 /* Trim it back to the correct length */
577 ftruncate(dev->fd, device_len); 885 ftruncate(dev->fd, device_len);
886 /* Die, bad Guest, die. */
578 errx(1, "Write past end %llu+%u", off, len); 887 errx(1, "Write past end %llu+%u", off, len);
579 } 888 }
889 /* The reply length is 0: we just send back an empty DMA to
890 * interrupt them and tell them the write is finished. */
580 *lenp = 0; 891 *lenp = 0;
581 } else { 892 } else {
893 /* A read request. They sent an empty DMA to start the
894 * request, and we put the read contents into the reply
895 * buffer. */
582 len = readv(dev->fd, reply, reply_num); 896 len = readv(dev->fd, reply, reply_num);
583 *lenp = len; 897 *lenp = len;
584 } 898 }
585 899
900 /* The result is 1 (done), 2 if there was an error (short read or
901 * write). */
586 p->result = 1 + (p->bytes != len); 902 p->result = 1 + (p->bytes != len);
903 /* Now tell them we've used their reply buffer. */
587 trigger_irq(fd, irq); 904 trigger_irq(fd, irq);
905
906 /* We're supposed to return the number of bytes of the output buffer we
907 * used. But the block device uses the "result" field instead, so we
908 * don't bother. */
588 return 0; 909 return 0;
589} 910}
590 911
912/* This is the generic routine we call when the Guest sends some DMA out. */
591static void handle_output(int fd, unsigned long dma, unsigned long key, 913static void handle_output(int fd, unsigned long dma, unsigned long key,
592 struct device_list *devices) 914 struct device_list *devices)
593{ 915{
@@ -596,30 +918,53 @@ static void handle_output(int fd, unsigned long dma, unsigned long key,
596 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 918 struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
597 unsigned num = 0; 919 unsigned num = 0;
598 920
921 /* Convert the "struct lguest_dma" they're sending to a "struct
922 * iovec". */
599 lenp = dma2iov(dma, iov, &num); 923 lenp = dma2iov(dma, iov, &num);
924
925 /* Check each device: if they expect output to this key, tell them to
926 * handle it. */
600 for (i = devices->dev; i; i = i->next) { 927 for (i = devices->dev; i; i = i->next) {
601 if (i->handle_output && key == i->watch_key) { 928 if (i->handle_output && key == i->watch_key) {
929 /* We write the result straight into the used_len field
930 * for them. */
602 *lenp = i->handle_output(fd, iov, num, i); 931 *lenp = i->handle_output(fd, iov, num, i);
603 return; 932 return;
604 } 933 }
605 } 934 }
935
936 /* This can happen: the kernel sends any SEND_DMA which doesn't match
937 * another Guest to us. It could be that another Guest just left a
938 * network, for example. But it's unusual. */
606 warnx("Pending dma %p, key %p", (void *)dma, (void *)key); 939 warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
607} 940}
608 941
942/* This is called when the waker wakes us up: check for incoming file
943 * descriptors. */
609static void handle_input(int fd, struct device_list *devices) 944static void handle_input(int fd, struct device_list *devices)
610{ 945{
946 /* select() wants a zeroed timeval to mean "don't wait". */
611 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; 947 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
612 948
613 for (;;) { 949 for (;;) {
614 struct device *i; 950 struct device *i;
615 fd_set fds = devices->infds; 951 fd_set fds = devices->infds;
616 952
953 /* If nothing is ready, we're done. */
617 if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) 954 if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
618 break; 955 break;
619 956
957 /* Otherwise, call the device(s) which have readable
958 * file descriptors and a method of handling them. */
620 for (i = devices->dev; i; i = i->next) { 959 for (i = devices->dev; i; i = i->next) {
621 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 960 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
961 /* If handle_input() returns false, it means we
962 * should no longer service it.
963 * handle_console_input() does this. */
622 if (!i->handle_input(fd, i)) { 964 if (!i->handle_input(fd, i)) {
965 /* Clear it from the set of input file
966 * descriptors kept at the head of the
967 * device list. */
623 FD_CLR(i->fd, &devices->infds); 968 FD_CLR(i->fd, &devices->infds);
624 /* Tell waker to ignore it too... */ 969 /* Tell waker to ignore it too... */
625 write(waker_fd, &i->fd, sizeof(i->fd)); 970 write(waker_fd, &i->fd, sizeof(i->fd));
@@ -629,26 +974,42 @@ static void handle_input(int fd, struct device_list *devices)
629 } 974 }
630} 975}
631 976
632static struct lguest_device_desc *new_dev_desc(u16 type, u16 features, 977/*L:190
633 u16 num_pages) 978 * Device Setup
979 *
980 * All devices need a descriptor so the Guest knows it exists, and a "struct
981 * device" so the Launcher can keep track of it. We have common helper
982 * routines to allocate them.
983 *
984 * This routine allocates a new "struct lguest_device_desc" from descriptor
985 * table in the devices array just above the Guest's normal memory. */
986static struct lguest_device_desc *
987new_dev_desc(struct lguest_device_desc *descs,
988 u16 type, u16 features, u16 num_pages)
634{ 989{
635 static unsigned long top = LGUEST_GUEST_TOP; 990 unsigned int i;
636 struct lguest_device_desc *desc;
637 991
638 desc = malloc(sizeof(*desc)); 992 for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
639 desc->type = type; 993 if (!descs[i].type) {
640 desc->num_pages = num_pages; 994 descs[i].type = type;
641 desc->features = features; 995 descs[i].features = features;
642 desc->status = 0; 996 descs[i].num_pages = num_pages;
643 if (num_pages) { 997 /* If they said the device needs memory, we allocate
644 top -= num_pages*getpagesize(); 998 * that now, bumping up the top of Guest memory. */
645 map_zeroed_pages(top, num_pages); 999 if (num_pages) {
646 desc->pfn = top / getpagesize(); 1000 map_zeroed_pages(top, num_pages);
647 } else 1001 descs[i].pfn = top/getpagesize();
648 desc->pfn = 0; 1002 top += num_pages*getpagesize();
649 return desc; 1003 }
1004 return &descs[i];
1005 }
1006 }
1007 errx(1, "too many devices");
650} 1008}
651 1009
1010/* This monster routine does all the creation and setup of a new device,
1011 * including calling new_dev_desc() to allocate the descriptor and device
1012 * memory. */
652static struct device *new_device(struct device_list *devices, 1013static struct device *new_device(struct device_list *devices,
653 u16 type, u16 num_pages, u16 features, 1014 u16 type, u16 num_pages, u16 features,
654 int fd, 1015 int fd,
@@ -661,15 +1022,21 @@ static struct device *new_device(struct device_list *devices,
661{ 1022{
662 struct device *dev = malloc(sizeof(*dev)); 1023 struct device *dev = malloc(sizeof(*dev));
663 1024
664 /* Append to device list. */ 1025 /* Append to device list. Prepending to a single-linked list is
1026 * easier, but the user expects the devices to be arranged on the bus
1027 * in command-line order. The first network device on the command line
1028 * is eth0, the first block device /dev/lgba, etc. */
665 *devices->lastdev = dev; 1029 *devices->lastdev = dev;
666 dev->next = NULL; 1030 dev->next = NULL;
667 devices->lastdev = &dev->next; 1031 devices->lastdev = &dev->next;
668 1032
1033 /* Now we populate the fields one at a time. */
669 dev->fd = fd; 1034 dev->fd = fd;
1035 /* If we have an input handler for this file descriptor, then we add it
1036 * to the device_list's fdset and maxfd. */
670 if (handle_input) 1037 if (handle_input)
671 set_fd(dev->fd, devices); 1038 set_fd(dev->fd, devices);
672 dev->desc = new_dev_desc(type, features, num_pages); 1039 dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
673 dev->mem = (void *)(dev->desc->pfn * getpagesize()); 1040 dev->mem = (void *)(dev->desc->pfn * getpagesize());
674 dev->handle_input = handle_input; 1041 dev->handle_input = handle_input;
675 dev->watch_key = (unsigned long)dev->mem + watch_off; 1042 dev->watch_key = (unsigned long)dev->mem + watch_off;
@@ -677,27 +1044,37 @@ static struct device *new_device(struct device_list *devices,
677 return dev; 1044 return dev;
678} 1045}
679 1046
1047/* Our first setup routine is the console. It's a fairly simple device, but
1048 * UNIX tty handling makes it uglier than it could be. */
680static void setup_console(struct device_list *devices) 1049static void setup_console(struct device_list *devices)
681{ 1050{
682 struct device *dev; 1051 struct device *dev;
683 1052
1053 /* If we can save the initial standard input settings... */
684 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 1054 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
685 struct termios term = orig_term; 1055 struct termios term = orig_term;
1056 /* Then we turn off echo, line buffering and ^C etc. We want a
1057 * raw input stream to the Guest. */
686 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1058 term.c_lflag &= ~(ISIG|ICANON|ECHO);
687 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1059 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1060 /* If we exit gracefully, the original settings will be
1061 * restored so the user can see what they're typing. */
688 atexit(restore_term); 1062 atexit(restore_term);
689 } 1063 }
690 1064
691 /* We don't currently require a page for the console. */ 1065 /* We don't currently require any memory for the console, so we ask for
1066 * 0 pages. */
692 dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, 1067 dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
693 STDIN_FILENO, handle_console_input, 1068 STDIN_FILENO, handle_console_input,
694 LGUEST_CONSOLE_DMA_KEY, handle_console_output); 1069 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
1070 /* We store the console state in dev->priv, and initialize it. */
695 dev->priv = malloc(sizeof(struct console_abort)); 1071 dev->priv = malloc(sizeof(struct console_abort));
696 ((struct console_abort *)dev->priv)->count = 0; 1072 ((struct console_abort *)dev->priv)->count = 0;
697 verbose("device %p: console\n", 1073 verbose("device %p: console\n",
698 (void *)(dev->desc->pfn * getpagesize())); 1074 (void *)(dev->desc->pfn * getpagesize()));
699} 1075}
700 1076
1077/* Setting up a block file is also fairly straightforward. */
701static void setup_block_file(const char *filename, struct device_list *devices) 1078static void setup_block_file(const char *filename, struct device_list *devices)
702{ 1079{
703 int fd; 1080 int fd;
@@ -705,20 +1082,47 @@ static void setup_block_file(const char *filename, struct device_list *devices)
705 off64_t *device_len; 1082 off64_t *device_len;
706 struct lguest_block_page *p; 1083 struct lguest_block_page *p;
707 1084
1085 /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
1086 * open with O_DIRECT because otherwise our benchmarks go much too
1087 * fast. */
708 fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); 1088 fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
1089
1090 /* We want one page, and have no input handler (the block file never
1091 * has anything interesting to say to us). Our timing will be quite
1092 * random, so it should be a reasonable randomness source. */
709 dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, 1093 dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
710 LGUEST_DEVICE_F_RANDOMNESS, 1094 LGUEST_DEVICE_F_RANDOMNESS,
711 fd, NULL, 0, handle_block_output); 1095 fd, NULL, 0, handle_block_output);
1096
1097 /* We store the device size in the private area */
712 device_len = dev->priv = malloc(sizeof(*device_len)); 1098 device_len = dev->priv = malloc(sizeof(*device_len));
1099 /* This is the safe way of establishing the size of our device: it
1100 * might be a normal file or an actual block device like /dev/hdb. */
713 *device_len = lseek64(fd, 0, SEEK_END); 1101 *device_len = lseek64(fd, 0, SEEK_END);
714 p = dev->mem;
715 1102
1103 /* The device memory is a "struct lguest_block_page". It's zeroed
1104 * already, we just need to put in the device size. Block devices
1105 * think in sectors (ie. 512 byte chunks), so we translate here. */
1106 p = dev->mem;
716 p->num_sectors = *device_len/512; 1107 p->num_sectors = *device_len/512;
717 verbose("device %p: block %i sectors\n", 1108 verbose("device %p: block %i sectors\n",
718 (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); 1109 (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
719} 1110}
720 1111
721/* We use fnctl locks to reserve network slots (autocleanup!) */ 1112/*
1113 * Network Devices.
1114 *
1115 * Setting up network devices is quite a pain, because we have three types.
1116 * First, we have the inter-Guest network. This is a file which is mapped into
1117 * the address space of the Guests who are on the network. Because it is a
1118 * shared mapping, the same page underlies all the devices, and they can send
1119 * DMA to each other.
1120 *
1121 * Remember from our network driver, the Guest is told what slot in the page it
1122 * is to use. We use exclusive fcntl locks to reserve a slot. If another
1123 * Guest is using a slot, the lock will fail and we try another. Because fcntl
1124 * locks are cleaned up automatically when we die, this cleverly means that our
1125 * reservation on the slot will vanish if we crash. */
722static unsigned int find_slot(int netfd, const char *filename) 1126static unsigned int find_slot(int netfd, const char *filename)
723{ 1127{
724 struct flock fl; 1128 struct flock fl;
@@ -726,26 +1130,33 @@ static unsigned int find_slot(int netfd, const char *filename)
726 fl.l_type = F_WRLCK; 1130 fl.l_type = F_WRLCK;
727 fl.l_whence = SEEK_SET; 1131 fl.l_whence = SEEK_SET;
728 fl.l_len = 1; 1132 fl.l_len = 1;
1133 /* Try a 1 byte lock in each possible position number */
729 for (fl.l_start = 0; 1134 for (fl.l_start = 0;
730 fl.l_start < getpagesize()/sizeof(struct lguest_net); 1135 fl.l_start < getpagesize()/sizeof(struct lguest_net);
731 fl.l_start++) { 1136 fl.l_start++) {
1137 /* If we succeed, return the slot number. */
732 if (fcntl(netfd, F_SETLK, &fl) == 0) 1138 if (fcntl(netfd, F_SETLK, &fl) == 0)
733 return fl.l_start; 1139 return fl.l_start;
734 } 1140 }
735 errx(1, "No free slots in network file %s", filename); 1141 errx(1, "No free slots in network file %s", filename);
736} 1142}
737 1143
1144/* This function sets up the network file */
738static void setup_net_file(const char *filename, 1145static void setup_net_file(const char *filename,
739 struct device_list *devices) 1146 struct device_list *devices)
740{ 1147{
741 int netfd; 1148 int netfd;
742 struct device *dev; 1149 struct device *dev;
743 1150
1151 /* We don't use open_or_die() here: for friendliness we create the file
1152 * if it doesn't already exist. */
744 netfd = open(filename, O_RDWR, 0); 1153 netfd = open(filename, O_RDWR, 0);
745 if (netfd < 0) { 1154 if (netfd < 0) {
746 if (errno == ENOENT) { 1155 if (errno == ENOENT) {
747 netfd = open(filename, O_RDWR|O_CREAT, 0600); 1156 netfd = open(filename, O_RDWR|O_CREAT, 0600);
748 if (netfd >= 0) { 1157 if (netfd >= 0) {
1158 /* If we succeeded, initialize the file with a
1159 * blank page. */
749 char page[getpagesize()]; 1160 char page[getpagesize()];
750 memset(page, 0, sizeof(page)); 1161 memset(page, 0, sizeof(page));
751 write(netfd, page, sizeof(page)); 1162 write(netfd, page, sizeof(page));
@@ -755,11 +1166,15 @@ static void setup_net_file(const char *filename,
755 err(1, "cannot open net file '%s'", filename); 1166 err(1, "cannot open net file '%s'", filename);
756 } 1167 }
757 1168
1169 /* We need 1 page, and the features indicate the slot to use and that
1170 * no checksum is needed. We never touch this device again; it's
1171 * between the Guests on the network, so we don't register input or
1172 * output handlers. */
758 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, 1173 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
759 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, 1174 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
760 -1, NULL, 0, NULL); 1175 -1, NULL, 0, NULL);
761 1176
762 /* We overwrite the /dev/zero mapping with the actual file. */ 1177 /* Map the shared file. */
763 if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, 1178 if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
764 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) 1179 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
765 err(1, "could not mmap '%s'", filename); 1180 err(1, "could not mmap '%s'", filename);
@@ -767,6 +1182,7 @@ static void setup_net_file(const char *filename,
767 (void *)(dev->desc->pfn * getpagesize()), filename, 1182 (void *)(dev->desc->pfn * getpagesize()), filename,
768 dev->desc->features & ~LGUEST_NET_F_NOCSUM); 1183 dev->desc->features & ~LGUEST_NET_F_NOCSUM);
769} 1184}
1185/*:*/
770 1186
771static u32 str2ip(const char *ipaddr) 1187static u32 str2ip(const char *ipaddr)
772{ 1188{
@@ -776,7 +1192,11 @@ static u32 str2ip(const char *ipaddr)
776 return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; 1192 return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
777} 1193}
778 1194
779/* adapted from libbridge */ 1195/* This code is "adapted" from libbridge: it attaches the Host end of the
1196 * network device to the bridge device specified by the command line.
1197 *
1198 * This is yet another James Morris contribution (I'm an IP-level guy, so I
1199 * dislike bridging), and I just try not to break it. */
780static void add_to_bridge(int fd, const char *if_name, const char *br_name) 1200static void add_to_bridge(int fd, const char *if_name, const char *br_name)
781{ 1201{
782 int ifidx; 1202 int ifidx;
@@ -795,12 +1215,16 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
795 err(1, "can't add %s to bridge %s", if_name, br_name); 1215 err(1, "can't add %s to bridge %s", if_name, br_name);
796} 1216}
797 1217
1218/* This sets up the Host end of the network device with an IP address, brings
1219 * it up so packets will flow, then copies the MAC address into the hwaddr
1220 * pointer (in practice, the Host's slot in the network device's memory). */
798static void configure_device(int fd, const char *devname, u32 ipaddr, 1221static void configure_device(int fd, const char *devname, u32 ipaddr,
799 unsigned char hwaddr[6]) 1222 unsigned char hwaddr[6])
800{ 1223{
801 struct ifreq ifr; 1224 struct ifreq ifr;
802 struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; 1225 struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
803 1226
1227 /* Don't read these incantations. Just cut & paste them like I did! */
804 memset(&ifr, 0, sizeof(ifr)); 1228 memset(&ifr, 0, sizeof(ifr));
805 strcpy(ifr.ifr_name, devname); 1229 strcpy(ifr.ifr_name, devname);
806 sin->sin_family = AF_INET; 1230 sin->sin_family = AF_INET;
@@ -811,12 +1235,19 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
811 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) 1235 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
812 err(1, "Bringing interface %s up", devname); 1236 err(1, "Bringing interface %s up", devname);
813 1237
1238 /* SIOC stands for Socket I/O Control. G means Get (vs S for Set
1239 * above). IF means Interface, and HWADDR is hardware address.
1240 * Simple! */
814 if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) 1241 if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
815 err(1, "getting hw address for %s", devname); 1242 err(1, "getting hw address for %s", devname);
816
817 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); 1243 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
818} 1244}
819 1245
1246/*L:195 The other kind of network is a Host<->Guest network. This can either
1247 * use bridging or routing, but the principle is the same: it uses the "tun"
1248 * device to inject packets into the Host as if they came in from a normal
1249 * network card. We just shunt packets between the Guest and the tun
1250 * device. */
820static void setup_tun_net(const char *arg, struct device_list *devices) 1251static void setup_tun_net(const char *arg, struct device_list *devices)
821{ 1252{
822 struct device *dev; 1253 struct device *dev;
@@ -825,36 +1256,56 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
825 u32 ip; 1256 u32 ip;
826 const char *br_name = NULL; 1257 const char *br_name = NULL;
827 1258
1259 /* We open the /dev/net/tun device and tell it we want a tap device. A
1260 * tap device is like a tun device, only somehow different. To tell
1261 * the truth, I completely blundered my way through this code, but it
1262 * works now! */
828 netfd = open_or_die("/dev/net/tun", O_RDWR); 1263 netfd = open_or_die("/dev/net/tun", O_RDWR);
829 memset(&ifr, 0, sizeof(ifr)); 1264 memset(&ifr, 0, sizeof(ifr));
830 ifr.ifr_flags = IFF_TAP | IFF_NO_PI; 1265 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
831 strcpy(ifr.ifr_name, "tap%d"); 1266 strcpy(ifr.ifr_name, "tap%d");
832 if (ioctl(netfd, TUNSETIFF, &ifr) != 0) 1267 if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
833 err(1, "configuring /dev/net/tun"); 1268 err(1, "configuring /dev/net/tun");
1269 /* We don't need checksums calculated for packets coming in this
1270 * device: trust us! */
834 ioctl(netfd, TUNSETNOCSUM, 1); 1271 ioctl(netfd, TUNSETNOCSUM, 1);
835 1272
836 /* You will be peer 1: we should create enough jitter to randomize */ 1273 /* We create the net device with 1 page, using the features field of
1274 * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
1275 * that the device has fairly random timing. We do *not* specify
1276 * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
1277 *
1278 * We will put our MAC address in slot 0 for the Guest to see, so
1279 * it will send packets to us using the key "peer_offset(0)": */
837 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, 1280 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
838 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, 1281 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
839 handle_tun_input, peer_offset(0), handle_tun_output); 1282 handle_tun_input, peer_offset(0), handle_tun_output);
1283
1284 /* We keep a flag which says whether we've seen packets come out from
1285 * this network device. */
840 dev->priv = malloc(sizeof(bool)); 1286 dev->priv = malloc(sizeof(bool));
841 *(bool *)dev->priv = false; 1287 *(bool *)dev->priv = false;
842 1288
1289 /* We need a socket to perform the magic network ioctls to bring up the
1290 * tap interface, connect to the bridge etc. Any socket will do! */
843 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 1291 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
844 if (ipfd < 0) 1292 if (ipfd < 0)
845 err(1, "opening IP socket"); 1293 err(1, "opening IP socket");
846 1294
1295 /* If the command line was --tunnet=bridge:<name> do bridging. */
847 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { 1296 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
848 ip = INADDR_ANY; 1297 ip = INADDR_ANY;
849 br_name = arg + strlen(BRIDGE_PFX); 1298 br_name = arg + strlen(BRIDGE_PFX);
850 add_to_bridge(ipfd, ifr.ifr_name, br_name); 1299 add_to_bridge(ipfd, ifr.ifr_name, br_name);
851 } else 1300 } else /* It is an IP address to set up the device with */
852 ip = str2ip(arg); 1301 ip = str2ip(arg);
853 1302
854 /* We are peer 0, ie. first slot. */ 1303 /* We are peer 0, ie. first slot, so we hand dev->mem to this routine
1304 * to write the MAC address at the start of the device memory. */
855 configure_device(ipfd, ifr.ifr_name, ip, dev->mem); 1305 configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
856 1306
857 /* Set "promisc" bit: we want every single packet. */ 1307 /* Set "promisc" bit: we want every single packet if we're going to
1308 * bridge to other machines (and otherwise it doesn't matter). */
858 *((u8 *)dev->mem) |= 0x1; 1309 *((u8 *)dev->mem) |= 0x1;
859 1310
860 close(ipfd); 1311 close(ipfd);
@@ -865,31 +1316,10 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
865 if (br_name) 1316 if (br_name)
866 verbose("attached to bridge: %s\n", br_name); 1317 verbose("attached to bridge: %s\n", br_name);
867} 1318}
1319/* That's the end of device setup. */
868 1320
869/* Now we know how much memory we have, we copy in device descriptors */ 1321/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
870static void map_device_descriptors(struct device_list *devs, unsigned long mem) 1322 * its input and output, and finally, lays it to rest. */
871{
872 struct device *i;
873 unsigned int num;
874 struct lguest_device_desc *descs;
875
876 /* Device descriptor array sits just above top of normal memory */
877 descs = map_zeroed_pages(mem, 1);
878
879 for (i = devs->dev, num = 0; i; i = i->next, num++) {
880 if (num == LGUEST_MAX_DEVICES)
881 errx(1, "too many devices");
882 verbose("Device %i: %s\n", num,
883 i->desc->type == LGUEST_DEVICE_T_NET ? "net"
884 : i->desc->type == LGUEST_DEVICE_T_CONSOLE ? "console"
885 : i->desc->type == LGUEST_DEVICE_T_BLOCK ? "block"
886 : "unknown");
887 descs[num] = *i->desc;
888 free(i->desc);
889 i->desc = &descs[num];
890 }
891}
892
893static void __attribute__((noreturn)) 1323static void __attribute__((noreturn))
894run_guest(int lguest_fd, struct device_list *device_list) 1324run_guest(int lguest_fd, struct device_list *device_list)
895{ 1325{
@@ -901,20 +1331,37 @@ run_guest(int lguest_fd, struct device_list *device_list)
901 /* We read from the /dev/lguest device to run the Guest. */ 1331 /* We read from the /dev/lguest device to run the Guest. */
902 readval = read(lguest_fd, arr, sizeof(arr)); 1332 readval = read(lguest_fd, arr, sizeof(arr));
903 1333
1334 /* The read can only really return sizeof(arr) (the Guest did a
1335 * SEND_DMA to us), or an error. */
1336
1337 /* For a successful read, arr[0] is the address of the "struct
1338 * lguest_dma", and arr[1] is the key the Guest sent to. */
904 if (readval == sizeof(arr)) { 1339 if (readval == sizeof(arr)) {
905 handle_output(lguest_fd, arr[0], arr[1], device_list); 1340 handle_output(lguest_fd, arr[0], arr[1], device_list);
906 continue; 1341 continue;
1342 /* ENOENT means the Guest died. Reading tells us why. */
907 } else if (errno == ENOENT) { 1343 } else if (errno == ENOENT) {
908 char reason[1024] = { 0 }; 1344 char reason[1024] = { 0 };
909 read(lguest_fd, reason, sizeof(reason)-1); 1345 read(lguest_fd, reason, sizeof(reason)-1);
910 errx(1, "%s", reason); 1346 errx(1, "%s", reason);
1347 /* EAGAIN means the waker wanted us to look at some input.
1348 * Anything else means a bug or incompatible change. */
911 } else if (errno != EAGAIN) 1349 } else if (errno != EAGAIN)
912 err(1, "Running guest failed"); 1350 err(1, "Running guest failed");
1351
1352 /* Service input, then unset the BREAK which releases
1353 * the Waker. */
913 handle_input(lguest_fd, device_list); 1354 handle_input(lguest_fd, device_list);
914 if (write(lguest_fd, args, sizeof(args)) < 0) 1355 if (write(lguest_fd, args, sizeof(args)) < 0)
915 err(1, "Resetting break"); 1356 err(1, "Resetting break");
916 } 1357 }
917} 1358}
1359/*
1360 * This is the end of the Launcher.
1361 *
1362 * But wait! We've seen I/O from the Launcher, and we've seen I/O from the
1363 * Drivers. If we were to see the Host kernel I/O code, our understanding
1364 * would be complete... :*/
918 1365
919static struct option opts[] = { 1366static struct option opts[] = {
920 { "verbose", 0, NULL, 'v' }, 1367 { "verbose", 0, NULL, 'v' },
@@ -932,19 +1379,59 @@ static void usage(void)
932 "<mem-in-mb> vmlinux [args...]"); 1379 "<mem-in-mb> vmlinux [args...]");
933} 1380}
934 1381
1382/*L:100 The Launcher code itself takes us out into userspace, that scary place
1383 * where pointers run wild and free! Unfortunately, like most userspace
1384 * programs, it's quite boring (which is why everyone likes to hack on the
1385 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
1386 * will get you through this section. Or, maybe not.
1387 *
1388 * The Launcher binary sits up high, usually starting at address 0xB8000000.
1389 * Everything below this is the "physical" memory for the Guest. For example,
1390 * if the Guest were to write a "1" at physical address 0, we would see a "1"
1391 * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
1392 *
1393 * This can be tough to get your head around, but usually it just means that we
1394 * don't need to do any conversion when the Guest gives us its "physical"
1395 * addresses.
1396 */
935int main(int argc, char *argv[]) 1397int main(int argc, char *argv[])
936{ 1398{
937 unsigned long mem, pgdir, start, page_offset, initrd_size = 0; 1399 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
938 int c, lguest_fd; 1400 * of the (optional) initrd. */
1401 unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
1402 /* A temporary and the /dev/lguest file descriptor. */
1403 int i, c, lguest_fd;
1404 /* The list of Guest devices, based on command line arguments. */
939 struct device_list device_list; 1405 struct device_list device_list;
1406 /* The boot information for the Guest: at guest-physical address 0. */
940 void *boot = (void *)0; 1407 void *boot = (void *)0;
1408 /* If they specify an initrd file to load. */
941 const char *initrd_name = NULL; 1409 const char *initrd_name = NULL;
942 1410
1411 /* First we initialize the device list. Since console and network
1412 * device receive input from a file descriptor, we keep an fdset
1413 * (infds) and the maximum fd number (max_infd) with the head of the
1414 * list. We also keep a pointer to the last device, for easy appending
1415 * to the list. */
943 device_list.max_infd = -1; 1416 device_list.max_infd = -1;
944 device_list.dev = NULL; 1417 device_list.dev = NULL;
945 device_list.lastdev = &device_list.dev; 1418 device_list.lastdev = &device_list.dev;
946 FD_ZERO(&device_list.infds); 1419 FD_ZERO(&device_list.infds);
947 1420
1421 /* We need to know how much memory so we can set up the device
1422 * descriptor and memory pages for the devices as we parse the command
1423 * line. So we quickly look through the arguments to find the amount
1424 * of memory now. */
1425 for (i = 1; i < argc; i++) {
1426 if (argv[i][0] != '-') {
1427 mem = top = atoi(argv[i]) * 1024 * 1024;
1428 device_list.descs = map_zeroed_pages(top, 1);
1429 top += getpagesize();
1430 break;
1431 }
1432 }
1433
1434 /* The options are fairly straight-forward */
948 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { 1435 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
949 switch (c) { 1436 switch (c) {
950 case 'v': 1437 case 'v':
@@ -967,46 +1454,71 @@ int main(int argc, char *argv[])
967 usage(); 1454 usage();
968 } 1455 }
969 } 1456 }
1457 /* After the other arguments we expect memory and kernel image name,
1458 * followed by command line arguments for the kernel. */
970 if (optind + 2 > argc) 1459 if (optind + 2 > argc)
971 usage(); 1460 usage();
972 1461
973 /* We need a console device */ 1462 /* We always have a console device */
974 setup_console(&device_list); 1463 setup_console(&device_list);
975 1464
976 /* First we map /dev/zero over all of guest-physical memory. */ 1465 /* We start by mapping anonymous pages over all of guest-physical
977 mem = atoi(argv[optind]) * 1024 * 1024; 1466 * memory range. This fills it with 0, and ensures that the Guest
1467 * won't be killed when it tries to access it. */
978 map_zeroed_pages(0, mem / getpagesize()); 1468 map_zeroed_pages(0, mem / getpagesize());
979 1469
980 /* Now we load the kernel */ 1470 /* Now we load the kernel */
981 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1471 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
982 &page_offset); 1472 &page_offset);
983 1473
984 /* Write the device descriptors into memory. */ 1474 /* Map the initrd image if requested (at top of physical memory) */
985 map_device_descriptors(&device_list, mem);
986
987 /* Map the initrd image if requested */
988 if (initrd_name) { 1475 if (initrd_name) {
989 initrd_size = load_initrd(initrd_name, mem); 1476 initrd_size = load_initrd(initrd_name, mem);
1477 /* These are the locations in the Linux boot header where the
1478 * start and size of the initrd are expected to be found. */
990 *(unsigned long *)(boot+0x218) = mem - initrd_size; 1479 *(unsigned long *)(boot+0x218) = mem - initrd_size;
991 *(unsigned long *)(boot+0x21c) = initrd_size; 1480 *(unsigned long *)(boot+0x21c) = initrd_size;
1481 /* The bootloader type 0xFF means "unknown"; that's OK. */
992 *(unsigned char *)(boot+0x210) = 0xFF; 1482 *(unsigned char *)(boot+0x210) = 0xFF;
993 } 1483 }
994 1484
995 /* Set up the initial linar pagetables. */ 1485 /* Set up the initial linear pagetables, starting below the initrd. */
996 pgdir = setup_pagetables(mem, initrd_size, page_offset); 1486 pgdir = setup_pagetables(mem, initrd_size, page_offset);
997 1487
998 /* E820 memory map: ours is a simple, single region. */ 1488 /* The Linux boot header contains an "E820" memory map: ours is a
1489 * simple, single region. */
999 *(char*)(boot+E820NR) = 1; 1490 *(char*)(boot+E820NR) = 1;
1000 *((struct e820entry *)(boot+E820MAP)) 1491 *((struct e820entry *)(boot+E820MAP))
1001 = ((struct e820entry) { 0, mem, E820_RAM }); 1492 = ((struct e820entry) { 0, mem, E820_RAM });
1002 /* Command line pointer and command line (at 4096) */ 1493 /* The boot header contains a command line pointer: we put the command
1494 * line after the boot header (at address 4096) */
1003 *(void **)(boot + 0x228) = boot + 4096; 1495 *(void **)(boot + 0x228) = boot + 4096;
1004 concat(boot + 4096, argv+optind+2); 1496 concat(boot + 4096, argv+optind+2);
1005 /* Paravirt type: 1 == lguest */ 1497
1498 /* The guest type value of "1" tells the Guest it's under lguest. */
1006 *(int *)(boot + 0x23c) = 1; 1499 *(int *)(boot + 0x23c) = 1;
1007 1500
1501 /* We tell the kernel to initialize the Guest: this returns the open
1502 * /dev/lguest file descriptor. */
1008 lguest_fd = tell_kernel(pgdir, start, page_offset); 1503 lguest_fd = tell_kernel(pgdir, start, page_offset);
1504
1505 /* We fork off a child process, which wakes the Launcher whenever one
1506 * of the input file descriptors needs attention. Otherwise we would
1507 * run the Guest until it tries to output something. */
1009 waker_fd = setup_waker(lguest_fd, &device_list); 1508 waker_fd = setup_waker(lguest_fd, &device_list);
1010 1509
1510 /* Finally, run the Guest. This doesn't return. */
1011 run_guest(lguest_fd, &device_list); 1511 run_guest(lguest_fd, &device_list);
1012} 1512}
1513/*:*/
1514
1515/*M:999
1516 * Mastery is done: you now know everything I do.
1517 *
1518 * But surely you have seen code, features and bugs in your wanderings which
1519 * you now yearn to attack? That is the real game, and I look forward to you
1520 * patching and forking lguest into the Your-Name-Here-visor.
1521 *
1522 * Farewell, and good coding!
1523 * Rusty Russell.
1524 */
diff --git a/Documentation/sched-stats.txt b/Documentation/sched-stats.txt
index 6f72021aae51..442e14d35dea 100644
--- a/Documentation/sched-stats.txt
+++ b/Documentation/sched-stats.txt
@@ -1,10 +1,11 @@
1Version 10 of schedstats includes support for sched_domains, which 1Version 14 of schedstats includes support for sched_domains, which hit the
2hit the mainline kernel in 2.6.7. Some counters make more sense to be 2mainline kernel in 2.6.20 although it is identical to the stats from version
3per-runqueue; other to be per-domain. Note that domains (and their associated 312 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel
4information) will only be pertinent and available on machines utilizing 4release). Some counters make more sense to be per-runqueue; others to be
5CONFIG_SMP. 5per-domain. Note that domains (and their associated information) will only
6 6be pertinent and available on machines utilizing CONFIG_SMP.
7In version 10 of schedstat, there is at least one level of domain 7
8In version 14 of schedstat, there is at least one level of domain
8statistics for each cpu listed, and there may well be more than one 9statistics for each cpu listed, and there may well be more than one
9domain. Domains have no particular names in this implementation, but 10domain. Domains have no particular names in this implementation, but
10the highest numbered one typically arbitrates balancing across all the 11the highest numbered one typically arbitrates balancing across all the
@@ -27,7 +28,7 @@ to write their own scripts, the fields are described here.
27 28
28CPU statistics 29CPU statistics
29-------------- 30--------------
30cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 31cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12
31 32
32NOTE: In the sched_yield() statistics, the active queue is considered empty 33NOTE: In the sched_yield() statistics, the active queue is considered empty
33 if it has only one process in it, since obviously the process calling 34 if it has only one process in it, since obviously the process calling
@@ -39,48 +40,20 @@ First four fields are sched_yield() statistics:
39 3) # of times just the expired queue was empty 40 3) # of times just the expired queue was empty
40 4) # of times sched_yield() was called 41 4) # of times sched_yield() was called
41 42
42Next four are schedule() statistics: 43Next three are schedule() statistics:
43 5) # of times the active queue had at least one other process on it 44 5) # of times we switched to the expired queue and reused it
44 6) # of times we switched to the expired queue and reused it 45 6) # of times schedule() was called
45 7) # of times schedule() was called 46 7) # of times schedule() left the processor idle
46 8) # of times schedule() left the processor idle
47
48Next four are active_load_balance() statistics:
49 9) # of times active_load_balance() was called
50 10) # of times active_load_balance() caused this cpu to gain a task
51 11) # of times active_load_balance() caused this cpu to lose a task
52 12) # of times active_load_balance() tried to move a task and failed
53
54Next three are try_to_wake_up() statistics:
55 13) # of times try_to_wake_up() was called
56 14) # of times try_to_wake_up() successfully moved the awakening task
57 15) # of times try_to_wake_up() attempted to move the awakening task
58
59Next two are wake_up_new_task() statistics:
60 16) # of times wake_up_new_task() was called
61 17) # of times wake_up_new_task() successfully moved the new task
62
63Next one is a sched_migrate_task() statistic:
64 18) # of times sched_migrate_task() was called
65 47
66Next one is a sched_balance_exec() statistic: 48Next two are try_to_wake_up() statistics:
67 19) # of times sched_balance_exec() was called 49 8) # of times try_to_wake_up() was called
50 9) # of times try_to_wake_up() was called to wake up the local cpu
68 51
69Next three are statistics describing scheduling latency: 52Next three are statistics describing scheduling latency:
70 20) sum of all time spent running by tasks on this processor (in ms) 53 10) sum of all time spent running by tasks on this processor (in jiffies)
71 21) sum of all time spent waiting to run by tasks on this processor (in ms) 54 11) sum of all time spent waiting to run by tasks on this processor (in
72 22) # of tasks (not necessarily unique) given to the processor 55 jiffies)
73 56 12) # of timeslices run on this cpu
74The last six are statistics dealing with pull_task():
75 23) # of times pull_task() moved a task to this cpu when newly idle
76 24) # of times pull_task() stole a task from this cpu when another cpu
77 was newly idle
78 25) # of times pull_task() moved a task to this cpu when idle
79 26) # of times pull_task() stole a task from this cpu when another cpu
80 was idle
81 27) # of times pull_task() moved a task to this cpu when busy
82 28) # of times pull_task() stole a task from this cpu when another cpu
83 was busy
84 57
85 58
86Domain statistics 59Domain statistics
@@ -89,65 +62,95 @@ One of these is produced per domain for each cpu described. (Note that if
89CONFIG_SMP is not defined, *no* domains are utilized and these lines 62CONFIG_SMP is not defined, *no* domains are utilized and these lines
90will not appear in the output.) 63will not appear in the output.)
91 64
92domain<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 65domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
93 66
94The first field is a bit mask indicating what cpus this domain operates over. 67The first field is a bit mask indicating what cpus this domain operates over.
95 68
96The next fifteen are a variety of load_balance() statistics: 69The next 24 are a variety of load_balance() statistics grouped into types
97 70of idleness (idle, busy, and newly idle):
98 1) # of times in this domain load_balance() was called when the cpu 71
99 was idle 72 1) # of times in this domain load_balance() was called when the
100 2) # of times in this domain load_balance() was called when the cpu 73 cpu was idle
101 was busy 74 2) # of times in this domain load_balance() checked but found
102 3) # of times in this domain load_balance() was called when the cpu 75 the load did not require balancing when the cpu was idle
103 was just becoming idle 76 3) # of times in this domain load_balance() tried to move one or
104 4) # of times in this domain load_balance() tried to move one or more 77 more tasks and failed, when the cpu was idle
105 tasks and failed, when the cpu was idle 78 4) sum of imbalances discovered (if any) with each call to
106 5) # of times in this domain load_balance() tried to move one or more 79 load_balance() in this domain when the cpu was idle
107 tasks and failed, when the cpu was busy 80 5) # of times in this domain pull_task() was called when the cpu
108 6) # of times in this domain load_balance() tried to move one or more 81 was idle
109 tasks and failed, when the cpu was just becoming idle 82 6) # of times in this domain pull_task() was called even though
110 7) sum of imbalances discovered (if any) with each call to 83 the target task was cache-hot when idle
111 load_balance() in this domain when the cpu was idle 84 7) # of times in this domain load_balance() was called but did
112 8) sum of imbalances discovered (if any) with each call to 85 not find a busier queue while the cpu was idle
113 load_balance() in this domain when the cpu was busy 86 8) # of times in this domain a busier queue was found while the
114 9) sum of imbalances discovered (if any) with each call to 87 cpu was idle but no busier group was found
115 load_balance() in this domain when the cpu was just becoming idle 88
116 10) # of times in this domain load_balance() was called but did not find 89 9) # of times in this domain load_balance() was called when the
117 a busier queue while the cpu was idle 90 cpu was busy
118 11) # of times in this domain load_balance() was called but did not find 91 10) # of times in this domain load_balance() checked but found the
119 a busier queue while the cpu was busy 92 load did not require balancing when busy
120 12) # of times in this domain load_balance() was called but did not find 93 11) # of times in this domain load_balance() tried to move one or
121 a busier queue while the cpu was just becoming idle 94 more tasks and failed, when the cpu was busy
122 13) # of times in this domain a busier queue was found while the cpu was 95 12) sum of imbalances discovered (if any) with each call to
123 idle but no busier group was found 96 load_balance() in this domain when the cpu was busy
124 14) # of times in this domain a busier queue was found while the cpu was 97 13) # of times in this domain pull_task() was called when busy
125 busy but no busier group was found 98 14) # of times in this domain pull_task() was called even though the
126 15) # of times in this domain a busier queue was found while the cpu was 99 target task was cache-hot when busy
127 just becoming idle but no busier group was found 100 15) # of times in this domain load_balance() was called but did not
128 101 find a busier queue while the cpu was busy
129Next two are sched_balance_exec() statistics: 102 16) # of times in this domain a busier queue was found while the cpu
130 17) # of times in this domain sched_balance_exec() successfully pushed 103 was busy but no busier group was found
131 a task to a new cpu 104
132 18) # of times in this domain sched_balance_exec() tried but failed to 105 17) # of times in this domain load_balance() was called when the
133 push a task to a new cpu 106 cpu was just becoming idle
134 107 18) # of times in this domain load_balance() checked but found the
135Next two are try_to_wake_up() statistics: 108 load did not require balancing when the cpu was just becoming idle
136 19) # of times in this domain try_to_wake_up() tried to move a task based 109 19) # of times in this domain load_balance() tried to move one or more
137 on affinity and cache warmth 110 tasks and failed, when the cpu was just becoming idle
138 20) # of times in this domain try_to_wake_up() tried to move a task based 111 20) sum of imbalances discovered (if any) with each call to
139 on load balancing 112 load_balance() in this domain when the cpu was just becoming idle
140 113 21) # of times in this domain pull_task() was called when newly idle
114 22) # of times in this domain pull_task() was called even though the
115 target task was cache-hot when just becoming idle
116 23) # of times in this domain load_balance() was called but did not
117 find a busier queue while the cpu was just becoming idle
118 24) # of times in this domain a busier queue was found while the cpu
119 was just becoming idle but no busier group was found
120
121 Next three are active_load_balance() statistics:
122 25) # of times active_load_balance() was called
123 26) # of times active_load_balance() tried to move a task and failed
124 27) # of times active_load_balance() successfully moved a task
125
126 Next three are sched_balance_exec() statistics:
127 28) sbe_cnt is not used
128 29) sbe_balanced is not used
129 30) sbe_pushed is not used
130
131 Next three are sched_balance_fork() statistics:
132 31) sbf_cnt is not used
133 32) sbf_balanced is not used
134 33) sbf_pushed is not used
135
136 Next three are try_to_wake_up() statistics:
137 34) # of times in this domain try_to_wake_up() awoke a task that
138 last ran on a different cpu in this domain
139 35) # of times in this domain try_to_wake_up() moved a task to the
140 waking cpu because it was cache-cold on its own cpu anyway
141 36) # of times in this domain try_to_wake_up() started passive balancing
141 142
142/proc/<pid>/schedstat 143/proc/<pid>/schedstat
143---------------- 144----------------
144schedstats also adds a new /proc/<pid>/schedstat file to include some of 145schedstats also adds a new /proc/<pid>/schedstat file to include some of
145the same information on a per-process level. There are three fields in 146the same information on a per-process level. There are three fields in
146this file correlating to fields 20, 21, and 22 in the CPU fields, but 147this file correlating for that process to:
147they only apply for that process. 148 1) time spent on the cpu
149 2) time spent waiting on a runqueue
150 3) # of timeslices run on this cpu
148 151
149A program could be easily written to make use of these extra fields to 152A program could be easily written to make use of these extra fields to
150report on how well a particular process or set of processes is faring 153report on how well a particular process or set of processes is faring
151under the scheduler's policies. A simple version of such a program is 154under the scheduler's policies. A simple version of such a program is
152available at 155available at
153 http://eaglet.rain.com/rick/linux/schedstat/v10/latency.c 156 http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
new file mode 100644
index 000000000000..218e86215297
--- /dev/null
+++ b/Documentation/spi/spidev_test.c
@@ -0,0 +1,202 @@
1/*
2 * SPI testing utility (using spidev driver)
3 *
4 * Copyright (c) 2007 MontaVista Software, Inc.
5 * Copyright (c) 2007 Anton Vorontsov <avorontsov@ru.mvista.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License.
10 *
11 * Cross-compile with cross-gcc -I/path/to/cross-kernel/include
12 */
13
14#include <stdint.h>
15#include <unistd.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <getopt.h>
19#include <fcntl.h>
20#include <sys/ioctl.h>
21#include <linux/types.h>
22#include <linux/spi/spidev.h>
23
24#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
25
26static void pabort(const char *s)
27{
28 perror(s);
29 abort();
30}
31
32static char *device = "/dev/spidev1.1";
33static uint8_t mode;
34static uint8_t bits = 8;
35static uint32_t speed = 500000;
36static uint16_t delay;
37
38static void transfer(int fd)
39{
40 int ret;
41 uint8_t tx[] = {
42 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
43 0x40, 0x00, 0x00, 0x00, 0x00, 0x95,
44 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
45 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
46 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
47 0xDE, 0xAD, 0xBE, 0xEF, 0xBA, 0xAD,
48 0xF0, 0x0D,
49 };
50 uint8_t rx[ARRAY_SIZE(tx)] = {0, };
51 struct spi_ioc_transfer tr = {
52 .tx_buf = (unsigned long)tx,
53 .rx_buf = (unsigned long)rx,
54 .len = ARRAY_SIZE(tx),
55 .delay_usecs = delay,
56 .speed_hz = speed,
57 .bits_per_word = bits,
58 };
59
60 ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr);
61 if (ret == 1)
62 pabort("can't send spi message");
63
64 for (ret = 0; ret < ARRAY_SIZE(tx); ret++) {
65 if (!(ret % 6))
66 puts("");
67 printf("%.2X ", rx[ret]);
68 }
69 puts("");
70}
71
72void print_usage(char *prog)
73{
74 printf("Usage: %s [-DsbdlHOLC3]\n", prog);
75 puts(" -D --device device to use (default /dev/spidev1.1)\n"
76 " -s --speed max speed (Hz)\n"
77 " -d --delay delay (usec)\n"
78 " -b --bpw bits per word \n"
79 " -l --loop loopback\n"
80 " -H --cpha clock phase\n"
81 " -O --cpol clock polarity\n"
82 " -L --lsb least significant bit first\n"
83 " -C --cs-high chip select active high\n"
84 " -3 --3wire SI/SO signals shared\n");
85 exit(1);
86}
87
88void parse_opts(int argc, char *argv[])
89{
90 while (1) {
91 static struct option lopts[] = {
92 { "device", 1, 0, 'D' },
93 { "speed", 1, 0, 's' },
94 { "delay", 1, 0, 'd' },
95 { "bpw", 1, 0, 'b' },
96 { "loop", 0, 0, 'l' },
97 { "cpha", 0, 0, 'H' },
98 { "cpol", 0, 0, 'O' },
99 { "lsb", 0, 0, 'L' },
100 { "cs-high", 0, 0, 'C' },
101 { "3wire", 0, 0, '3' },
102 { NULL, 0, 0, 0 },
103 };
104 int c;
105
106 c = getopt_long(argc, argv, "D:s:d:b:lHOLC3", lopts, NULL);
107
108 if (c == -1)
109 break;
110
111 switch (c) {
112 case 'D':
113 device = optarg;
114 break;
115 case 's':
116 speed = atoi(optarg);
117 break;
118 case 'd':
119 delay = atoi(optarg);
120 break;
121 case 'b':
122 bits = atoi(optarg);
123 break;
124 case 'l':
125 mode |= SPI_LOOP;
126 break;
127 case 'H':
128 mode |= SPI_CPHA;
129 break;
130 case 'O':
131 mode |= SPI_CPOL;
132 break;
133 case 'L':
134 mode |= SPI_LSB_FIRST;
135 break;
136 case 'C':
137 mode |= SPI_CS_HIGH;
138 break;
139 case '3':
140 mode |= SPI_3WIRE;
141 break;
142 default:
143 print_usage(argv[0]);
144 break;
145 }
146 }
147}
148
149int main(int argc, char *argv[])
150{
151 int ret = 0;
152 int fd;
153
154 parse_opts(argc, argv);
155
156 fd = open(device, O_RDWR);
157 if (fd < 0)
158 pabort("can't open device");
159
160 /*
161 * spi mode
162 */
163 ret = ioctl(fd, SPI_IOC_WR_MODE, &mode);
164 if (ret == -1)
165 pabort("can't set spi mode");
166
167 ret = ioctl(fd, SPI_IOC_RD_MODE, &mode);
168 if (ret == -1)
169 pabort("can't get spi mode");
170
171 /*
172 * bits per word
173 */
174 ret = ioctl(fd, SPI_IOC_WR_BITS_PER_WORD, &bits);
175 if (ret == -1)
176 pabort("can't set bits per word");
177
178 ret = ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits);
179 if (ret == -1)
180 pabort("can't get bits per word");
181
182 /*
183 * max speed hz
184 */
185 ret = ioctl(fd, SPI_IOC_WR_MAX_SPEED_HZ, &speed);
186 if (ret == -1)
187 pabort("can't set max speed hz");
188
189 ret = ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed);
190 if (ret == -1)
191 pabort("can't get max speed hz");
192
193 printf("spi mode: %d\n", mode);
194 printf("bits per word: %d\n", bits);
195 printf("max speed: %d Hz (%d KHz)\n", speed, speed/1000);
196
197 transfer(fd);
198
199 close(fd);
200
201 return ret;
202}
diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt
index a2afca3b2bab..847b342b7b20 100644
--- a/Documentation/stable_api_nonsense.txt
+++ b/Documentation/stable_api_nonsense.txt
@@ -10,7 +10,7 @@ kernel to userspace interfaces. The kernel to userspace interface is
10the one that application programs use, the syscall interface. That 10the one that application programs use, the syscall interface. That
11interface is _very_ stable over time, and will not break. I have old 11interface is _very_ stable over time, and will not break. I have old
12programs that were built on a pre 0.9something kernel that still work 12programs that were built on a pre 0.9something kernel that still work
13just fine on the latest 2.6 kernel release. This interface is the one 13just fine on the latest 2.6 kernel release. That interface is the one
14that users and application programmers can count on being stable. 14that users and application programmers can count on being stable.
15 15
16 16
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt
index 42861bb0bc9b..80ef562160bb 100644
--- a/Documentation/sysfs-rules.txt
+++ b/Documentation/sysfs-rules.txt
@@ -1,19 +1,18 @@
1Rules on how to access information in the Linux kernel sysfs 1Rules on how to access information in the Linux kernel sysfs
2 2
3The kernel exported sysfs exports internal kernel implementation-details 3The kernel-exported sysfs exports internal kernel implementation details
4and depends on internal kernel structures and layout. It is agreed upon 4and depends on internal kernel structures and layout. It is agreed upon
5by the kernel developers that the Linux kernel does not provide a stable 5by the kernel developers that the Linux kernel does not provide a stable
6internal API. As sysfs is a direct export of kernel internal 6internal API. As sysfs is a direct export of kernel internal
7structures, the sysfs interface can not provide a stable interface eighter, 7structures, the sysfs interface cannot provide a stable interface either;
8it may always change along with internal kernel changes. 8it may always change along with internal kernel changes.
9 9
10To minimize the risk of breaking users of sysfs, which are in most cases 10To minimize the risk of breaking users of sysfs, which are in most cases
11low-level userspace applications, with a new kernel release, the users 11low-level userspace applications, with a new kernel release, the users
12of sysfs must follow some rules to use an as abstract-as-possible way to 12of sysfs must follow some rules to use an as-abstract-as-possible way to
13access this filesystem. The current udev and HAL programs already 13access this filesystem. The current udev and HAL programs already
14implement this and users are encouraged to plug, if possible, into the 14implement this and users are encouraged to plug, if possible, into the
15abstractions these programs provide instead of accessing sysfs 15abstractions these programs provide instead of accessing sysfs directly.
16directly.
17 16
18But if you really do want or need to access sysfs directly, please follow 17But if you really do want or need to access sysfs directly, please follow
19the following rules and then your programs should work with future 18the following rules and then your programs should work with future
@@ -25,22 +24,22 @@ versions of the sysfs interface.
25 implementation details in its own API. Therefore it is not better than 24 implementation details in its own API. Therefore it is not better than
26 reading directories and opening the files yourself. 25 reading directories and opening the files yourself.
27 Also, it is not actively maintained, in the sense of reflecting the 26 Also, it is not actively maintained, in the sense of reflecting the
28 current kernel-development. The goal of providing a stable interface 27 current kernel development. The goal of providing a stable interface
29 to sysfs has failed, it causes more problems, than it solves. It 28 to sysfs has failed; it causes more problems than it solves. It
30 violates many of the rules in this document. 29 violates many of the rules in this document.
31 30
32- sysfs is always at /sys 31- sysfs is always at /sys
33 Parsing /proc/mounts is a waste of time. Other mount points are a 32 Parsing /proc/mounts is a waste of time. Other mount points are a
34 system configuration bug you should not try to solve. For test cases, 33 system configuration bug you should not try to solve. For test cases,
35 possibly support a SYSFS_PATH environment variable to overwrite the 34 possibly support a SYSFS_PATH environment variable to overwrite the
36 applications behavior, but never try to search for sysfs. Never try 35 application's behavior, but never try to search for sysfs. Never try
37 to mount it, if you are not an early boot script. 36 to mount it, if you are not an early boot script.
38 37
39- devices are only "devices" 38- devices are only "devices"
40 There is no such thing like class-, bus-, physical devices, 39 There is no such thing like class-, bus-, physical devices,
41 interfaces, and such that you can rely on in userspace. Everything is 40 interfaces, and such that you can rely on in userspace. Everything is
42 just simply a "device". Class-, bus-, physical, ... types are just 41 just simply a "device". Class-, bus-, physical, ... types are just
43 kernel implementation details, which should not be expected by 42 kernel implementation details which should not be expected by
44 applications that look for devices in sysfs. 43 applications that look for devices in sysfs.
45 44
46 The properties of a device are: 45 The properties of a device are:
@@ -48,11 +47,11 @@ versions of the sysfs interface.
48 - identical to the DEVPATH value in the event sent from the kernel 47 - identical to the DEVPATH value in the event sent from the kernel
49 at device creation and removal 48 at device creation and removal
50 - the unique key to the device at that point in time 49 - the unique key to the device at that point in time
51 - the kernels path to the device-directory without the leading 50 - the kernel's path to the device directory without the leading
52 /sys, and always starting with a slash 51 /sys, and always starting with a slash
53 - all elements of a devpath must be real directories. Symlinks 52 - all elements of a devpath must be real directories. Symlinks
54 pointing to /sys/devices must always be resolved to their real 53 pointing to /sys/devices must always be resolved to their real
55 target, and the target path must be used to access the device. 54 target and the target path must be used to access the device.
56 That way the devpath to the device matches the devpath of the 55 That way the devpath to the device matches the devpath of the
57 kernel used at event time. 56 kernel used at event time.
58 - using or exposing symlink values as elements in a devpath string 57 - using or exposing symlink values as elements in a devpath string
@@ -73,17 +72,17 @@ versions of the sysfs interface.
73 link 72 link
74 - it is retrieved by reading the "driver"-link and using only the 73 - it is retrieved by reading the "driver"-link and using only the
75 last element of the target path 74 last element of the target path
76 - devices which do not have "driver"-link, just do not have a 75 - devices which do not have "driver"-link just do not have a
77 driver; copying the driver value in a child device context, is a 76 driver; copying the driver value in a child device context is a
78 bug in the application 77 bug in the application
79 78
80 o attributes 79 o attributes
81 - the files in the device directory or files below a subdirectories 80 - the files in the device directory or files below subdirectories
82 of the same device directory 81 of the same device directory
83 - accessing attributes reached by a symlink pointing to another device, 82 - accessing attributes reached by a symlink pointing to another device,
84 like the "device"-link, is a bug in the application 83 like the "device"-link, is a bug in the application
85 84
86 Everything else is just a kernel driver-core implementation detail, 85 Everything else is just a kernel driver-core implementation detail
87 that should not be assumed to be stable across kernel releases. 86 that should not be assumed to be stable across kernel releases.
88 87
89- Properties of parent devices never belong into a child device. 88- Properties of parent devices never belong into a child device.
@@ -91,25 +90,25 @@ versions of the sysfs interface.
91 context properties. If the device 'eth0' or 'sda' does not have a 90 context properties. If the device 'eth0' or 'sda' does not have a
92 "driver"-link, then this device does not have a driver. Its value is empty. 91 "driver"-link, then this device does not have a driver. Its value is empty.
93 Never copy any property of the parent-device into a child-device. Parent 92 Never copy any property of the parent-device into a child-device. Parent
94 device-properties may change dynamically without any notice to the 93 device properties may change dynamically without any notice to the
95 child device. 94 child device.
96 95
97- Hierarchy in a single device-tree 96- Hierarchy in a single device tree
98 There is only one valid place in sysfs where hierarchy can be examined 97 There is only one valid place in sysfs where hierarchy can be examined
99 and this is below: /sys/devices. 98 and this is below: /sys/devices.
100 It is planned, that all device directories will end up in the tree 99 It is planned that all device directories will end up in the tree
101 below this directory. 100 below this directory.
102 101
103- Classification by subsystem 102- Classification by subsystem
104 There are currently three places for classification of devices: 103 There are currently three places for classification of devices:
105 /sys/block, /sys/class and /sys/bus. It is planned that these will 104 /sys/block, /sys/class and /sys/bus. It is planned that these will
106 not contain any device-directories themselves, but only flat lists of 105 not contain any device directories themselves, but only flat lists of
107 symlinks pointing to the unified /sys/devices tree. 106 symlinks pointing to the unified /sys/devices tree.
108 All three places have completely different rules on how to access 107 All three places have completely different rules on how to access
109 device information. It is planned to merge all three 108 device information. It is planned to merge all three
110 classification-directories into one place at /sys/subsystem, 109 classification directories into one place at /sys/subsystem,
111 following the layout of the bus-directories. All buses and 110 following the layout of the bus directories. All buses and
112 classes, including the converted block-subsystem, will show up 111 classes, including the converted block subsystem, will show up
113 there. 112 there.
114 The devices belonging to a subsystem will create a symlink in the 113 The devices belonging to a subsystem will create a symlink in the
115 "devices" directory at /sys/subsystem/<name>/devices. 114 "devices" directory at /sys/subsystem/<name>/devices.
@@ -121,38 +120,38 @@ versions of the sysfs interface.
121 subsystem name. 120 subsystem name.
122 121
123 Assuming /sys/class/<subsystem> and /sys/bus/<subsystem>, or 122 Assuming /sys/class/<subsystem> and /sys/bus/<subsystem>, or
124 /sys/block and /sys/class/block are not interchangeable, is a bug in 123 /sys/block and /sys/class/block are not interchangeable is a bug in
125 the application. 124 the application.
126 125
127- Block 126- Block
128 The converted block-subsystem at /sys/class/block, or 127 The converted block subsystem at /sys/class/block or
129 /sys/subsystem/block will contain the links for disks and partitions 128 /sys/subsystem/block will contain the links for disks and partitions
130 at the same level, never in a hierarchy. Assuming the block-subsystem to 129 at the same level, never in a hierarchy. Assuming the block subsystem to
131 contain only disks and not partition-devices in the same flat list is 130 contain only disks and not partition devices in the same flat list is
132 a bug in the application. 131 a bug in the application.
133 132
134- "device"-link and <subsystem>:<kernel name>-links 133- "device"-link and <subsystem>:<kernel name>-links
135 Never depend on the "device"-link. The "device"-link is a workaround 134 Never depend on the "device"-link. The "device"-link is a workaround
136 for the old layout, where class-devices are not created in 135 for the old layout, where class devices are not created in
137 /sys/devices/ like the bus-devices. If the link-resolving of a 136 /sys/devices/ like the bus devices. If the link-resolving of a
138 device-directory does not end in /sys/devices/, you can use the 137 device directory does not end in /sys/devices/, you can use the
139 "device"-link to find the parent devices in /sys/devices/. That is the 138 "device"-link to find the parent devices in /sys/devices/. That is the
140 single valid use of the "device"-link, it must never appear in any 139 single valid use of the "device"-link; it must never appear in any
141 path as an element. Assuming the existence of the "device"-link for 140 path as an element. Assuming the existence of the "device"-link for
142 a device in /sys/devices/ is a bug in the application. 141 a device in /sys/devices/ is a bug in the application.
143 Accessing /sys/class/net/eth0/device is a bug in the application. 142 Accessing /sys/class/net/eth0/device is a bug in the application.
144 143
145 Never depend on the class-specific links back to the /sys/class 144 Never depend on the class-specific links back to the /sys/class
146 directory. These links are also a workaround for the design mistake 145 directory. These links are also a workaround for the design mistake
147 that class-devices are not created in /sys/devices. If a device 146 that class devices are not created in /sys/devices. If a device
148 directory does not contain directories for child devices, these links 147 directory does not contain directories for child devices, these links
149 may be used to find the child devices in /sys/class. That is the single 148 may be used to find the child devices in /sys/class. That is the single
150 valid use of these links, they must never appear in any path as an 149 valid use of these links; they must never appear in any path as an
151 element. Assuming the existence of these links for devices which are 150 element. Assuming the existence of these links for devices which are
152 real child device directories in the /sys/devices tree, is a bug in 151 real child device directories in the /sys/devices tree is a bug in
153 the application. 152 the application.
154 153
155 It is planned to remove all these links when when all class-device 154 It is planned to remove all these links when all class device
156 directories live in /sys/devices. 155 directories live in /sys/devices.
157 156
158- Position of devices along device chain can change. 157- Position of devices along device chain can change.
@@ -161,6 +160,5 @@ versions of the sysfs interface.
161 the chain. You must always request the parent device you are looking for 160 the chain. You must always request the parent device you are looking for
162 by its subsystem value. You need to walk up the chain until you find 161 by its subsystem value. You need to walk up the chain until you find
163 the device that matches the expected subsystem. Depending on a specific 162 the device that matches the expected subsystem. Depending on a specific
164 position of a parent device, or exposing relative paths, using "../" to 163 position of a parent device or exposing relative paths using "../" to
165 access the chain of parents, is a bug in the application. 164 access the chain of parents is a bug in the application.
166