diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/ABI/testing/sysfs-firmware-sfi | 15 | ||||
-rw-r--r-- | Documentation/DMA-API-HOWTO.txt | 85 | ||||
-rw-r--r-- | Documentation/SubmittingDrivers | 5 | ||||
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 2 | ||||
-rw-r--r-- | Documentation/cgroups/memory.txt | 326 | ||||
-rw-r--r-- | Documentation/devices.txt | 2 | ||||
-rw-r--r-- | Documentation/feature-removal-schedule.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/Locking | 5 | ||||
-rw-r--r-- | Documentation/filesystems/squashfs.txt | 32 | ||||
-rw-r--r-- | Documentation/hwmon/dme1737 | 51 | ||||
-rw-r--r-- | Documentation/hwmon/lm63 | 7 | ||||
-rw-r--r-- | Documentation/hwmon/ltc4245 | 4 | ||||
-rw-r--r-- | Documentation/hwmon/sysfs-interface | 13 | ||||
-rw-r--r-- | Documentation/hwmon/tmp102 | 26 | ||||
-rw-r--r-- | Documentation/vm/numa | 186 |
15 files changed, 583 insertions, 186 deletions
diff --git a/Documentation/ABI/testing/sysfs-firmware-sfi b/Documentation/ABI/testing/sysfs-firmware-sfi new file mode 100644 index 000000000000..4be7d44aeacf --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-sfi | |||
@@ -0,0 +1,15 @@ | |||
1 | What: /sys/firmware/sfi/tables/ | ||
2 | Date: May 2010 | ||
3 | Contact: Len Brown <lenb@kernel.org> | ||
4 | Description: | ||
5 | SFI defines a number of small static memory tables | ||
6 | so the kernel can get platform information from firmware. | ||
7 | |||
8 | The tables are defined in the latest SFI specification: | ||
9 | http://simplefirmware.org/documentation | ||
10 | |||
11 | While the tables are used by the kernel, user-space | ||
12 | can observe them this way: | ||
13 | |||
14 | # cd /sys/firmware/sfi/tables | ||
15 | # cat $TABLENAME > $TABLENAME.bin | ||
diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt index 2e435adfbd6b..98ce51796f71 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/DMA-API-HOWTO.txt | |||
@@ -639,6 +639,36 @@ is planned to completely remove virt_to_bus() and bus_to_virt() as | |||
639 | they are entirely deprecated. Some ports already do not provide these | 639 | they are entirely deprecated. Some ports already do not provide these |
640 | as it is impossible to correctly support them. | 640 | as it is impossible to correctly support them. |
641 | 641 | ||
642 | Handling Errors | ||
643 | |||
644 | DMA address space is limited on some architectures and an allocation | ||
645 | failure can be determined by: | ||
646 | |||
647 | - checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0 | ||
648 | |||
649 | - checking the returned dma_addr_t of dma_map_single and dma_map_page | ||
650 | by using dma_mapping_error(): | ||
651 | |||
652 | dma_addr_t dma_handle; | ||
653 | |||
654 | dma_handle = dma_map_single(dev, addr, size, direction); | ||
655 | if (dma_mapping_error(dev, dma_handle)) { | ||
656 | /* | ||
657 | * reduce current DMA mapping usage, | ||
658 | * delay and try again later or | ||
659 | * reset driver. | ||
660 | */ | ||
661 | } | ||
662 | |||
663 | Networking drivers must call dev_kfree_skb to free the socket buffer | ||
664 | and return NETDEV_TX_OK if the DMA mapping fails on the transmit hook | ||
665 | (ndo_start_xmit). This means that the socket buffer is just dropped in | ||
666 | the failure case. | ||
667 | |||
668 | SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping | ||
669 | fails in the queuecommand hook. This means that the SCSI subsystem | ||
670 | passes the command to the driver again later. | ||
671 | |||
642 | Optimizing Unmap State Space Consumption | 672 | Optimizing Unmap State Space Consumption |
643 | 673 | ||
644 | On many platforms, dma_unmap_{single,page}() is simply a nop. | 674 | On many platforms, dma_unmap_{single,page}() is simply a nop. |
@@ -703,42 +733,25 @@ to "Closing". | |||
703 | 733 | ||
704 | 1) Struct scatterlist requirements. | 734 | 1) Struct scatterlist requirements. |
705 | 735 | ||
706 | Struct scatterlist must contain, at a minimum, the following | 736 | Don't invent the architecture specific struct scatterlist; just use |
707 | members: | 737 | <asm-generic/scatterlist.h>. You need to enable |
708 | 738 | CONFIG_NEED_SG_DMA_LENGTH if the architecture supports IOMMUs | |
709 | struct page *page; | 739 | (including software IOMMU). |
710 | unsigned int offset; | 740 | |
711 | unsigned int length; | 741 | 2) ARCH_KMALLOC_MINALIGN |
712 | 742 | ||
713 | The base address is specified by a "page+offset" pair. | 743 | Architectures must ensure that kmalloc'ed buffer is |
714 | 744 | DMA-safe. Drivers and subsystems depend on it. If an architecture | |
715 | Previous versions of struct scatterlist contained a "void *address" | 745 | isn't fully DMA-coherent (i.e. hardware doesn't ensure that data in |
716 | field that was sometimes used instead of page+offset. As of Linux | 746 | the CPU cache is identical to data in main memory), |
717 | 2.5., page+offset is always used, and the "address" field has been | 747 | ARCH_KMALLOC_MINALIGN must be set so that the memory allocator |
718 | deleted. | 748 | makes sure that kmalloc'ed buffer doesn't share a cache line with |
719 | 749 | the others. See arch/arm/include/asm/cache.h as an example. | |
720 | 2) More to come... | 750 | |
721 | 751 | Note that ARCH_KMALLOC_MINALIGN is about DMA memory alignment | |
722 | Handling Errors | 752 | constraints. You don't need to worry about the architecture data |
723 | 753 | alignment constraints (e.g. the alignment constraints about 64-bit | |
724 | DMA address space is limited on some architectures and an allocation | 754 | objects). |
725 | failure can be determined by: | ||
726 | |||
727 | - checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0 | ||
728 | |||
729 | - checking the returned dma_addr_t of dma_map_single and dma_map_page | ||
730 | by using dma_mapping_error(): | ||
731 | |||
732 | dma_addr_t dma_handle; | ||
733 | |||
734 | dma_handle = dma_map_single(dev, addr, size, direction); | ||
735 | if (dma_mapping_error(dev, dma_handle)) { | ||
736 | /* | ||
737 | * reduce current DMA mapping usage, | ||
738 | * delay and try again later or | ||
739 | * reset driver. | ||
740 | */ | ||
741 | } | ||
742 | 755 | ||
743 | Closing | 756 | Closing |
744 | 757 | ||
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index 99e72a81fa2f..4947fd8fb182 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers | |||
@@ -130,6 +130,8 @@ Linux kernel master tree: | |||
130 | ftp.??.kernel.org:/pub/linux/kernel/... | 130 | ftp.??.kernel.org:/pub/linux/kernel/... |
131 | ?? == your country code, such as "us", "uk", "fr", etc. | 131 | ?? == your country code, such as "us", "uk", "fr", etc. |
132 | 132 | ||
133 | http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git | ||
134 | |||
133 | Linux kernel mailing list: | 135 | Linux kernel mailing list: |
134 | linux-kernel@vger.kernel.org | 136 | linux-kernel@vger.kernel.org |
135 | [mail majordomo@vger.kernel.org to subscribe] | 137 | [mail majordomo@vger.kernel.org to subscribe] |
@@ -160,3 +162,6 @@ How to NOT write kernel driver by Arjan van de Ven: | |||
160 | 162 | ||
161 | Kernel Janitor: | 163 | Kernel Janitor: |
162 | http://janitor.kernelnewbies.org/ | 164 | http://janitor.kernelnewbies.org/ |
165 | |||
166 | GIT, Fast Version Control System: | ||
167 | http://git-scm.com/ | ||
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 57444c2609fc..b34823ff1646 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -339,7 +339,7 @@ To mount a cgroup hierarchy with all available subsystems, type: | |||
339 | The "xxx" is not interpreted by the cgroup code, but will appear in | 339 | The "xxx" is not interpreted by the cgroup code, but will appear in |
340 | /proc/mounts so may be any useful identifying string that you like. | 340 | /proc/mounts so may be any useful identifying string that you like. |
341 | 341 | ||
342 | To mount a cgroup hierarchy with just the cpuset and numtasks | 342 | To mount a cgroup hierarchy with just the cpuset and memory |
343 | subsystems, type: | 343 | subsystems, type: |
344 | # mount -t cgroup -o cpuset,memory hier1 /dev/cgroup | 344 | # mount -t cgroup -o cpuset,memory hier1 /dev/cgroup |
345 | 345 | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6cab1f29da4c..7781857dc940 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -1,18 +1,15 @@ | |||
1 | Memory Resource Controller | 1 | Memory Resource Controller |
2 | 2 | ||
3 | NOTE: The Memory Resource Controller has been generically been referred | 3 | NOTE: The Memory Resource Controller has been generically been referred |
4 | to as the memory controller in this document. Do not confuse memory controller | 4 | to as the memory controller in this document. Do not confuse memory |
5 | used here with the memory controller that is used in hardware. | 5 | controller used here with the memory controller that is used in hardware. |
6 | 6 | ||
7 | Salient features | 7 | (For editors) |
8 | 8 | In this document: | |
9 | a. Enable control of Anonymous, Page Cache (mapped and unmapped) and | 9 | When we mention a cgroup (cgroupfs's directory) with memory controller, |
10 | Swap Cache memory pages. | 10 | we call it "memory cgroup". When you see git-log and source code, you'll |
11 | b. The infrastructure allows easy addition of other types of memory to control | 11 | see patch's title and function names tend to use "memcg". |
12 | c. Provides *zero overhead* for non memory controller users | 12 | In this document, we avoid using it. |
13 | d. Provides a double LRU: global memory pressure causes reclaim from the | ||
14 | global LRU; a cgroup on hitting a limit, reclaims from the per | ||
15 | cgroup LRU | ||
16 | 13 | ||
17 | Benefits and Purpose of the memory controller | 14 | Benefits and Purpose of the memory controller |
18 | 15 | ||
@@ -33,6 +30,45 @@ d. A CD/DVD burner could control the amount of memory used by the | |||
33 | e. There are several other use cases, find one or use the controller just | 30 | e. There are several other use cases, find one or use the controller just |
34 | for fun (to learn and hack on the VM subsystem). | 31 | for fun (to learn and hack on the VM subsystem). |
35 | 32 | ||
33 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) | ||
34 | |||
35 | Features: | ||
36 | - accounting anonymous pages, file caches, swap caches usage and limiting them. | ||
37 | - private LRU and reclaim routine. (system's global LRU and private LRU | ||
38 | work independently from each other) | ||
39 | - optionally, memory+swap usage can be accounted and limited. | ||
40 | - hierarchical accounting | ||
41 | - soft limit | ||
42 | - moving(recharging) account at moving a task is selectable. | ||
43 | - usage threshold notifier | ||
44 | - oom-killer disable knob and oom-notifier | ||
45 | - Root cgroup has no limit controls. | ||
46 | |||
47 | Kernel memory and Hugepages are not under control yet. We just manage | ||
48 | pages on LRU. To add more controls, we have to take care of performance. | ||
49 | |||
50 | Brief summary of control files. | ||
51 | |||
52 | tasks # attach a task(thread) and show list of threads | ||
53 | cgroup.procs # show list of processes | ||
54 | cgroup.event_control # an interface for event_fd() | ||
55 | memory.usage_in_bytes # show current memory(RSS+Cache) usage. | ||
56 | memory.memsw.usage_in_bytes # show current memory+Swap usage | ||
57 | memory.limit_in_bytes # set/show limit of memory usage | ||
58 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage | ||
59 | memory.failcnt # show the number of memory usage hits limits | ||
60 | memory.memsw.failcnt # show the number of memory+Swap hits limits | ||
61 | memory.max_usage_in_bytes # show max memory usage recorded | ||
62 | memory.memsw.usage_in_bytes # show max memory+Swap usage recorded | ||
63 | memory.soft_limit_in_bytes # set/show soft limit of memory usage | ||
64 | memory.stat # show various statistics | ||
65 | memory.use_hierarchy # set/show hierarchical account enabled | ||
66 | memory.force_empty # trigger forced move charge to parent | ||
67 | memory.swappiness # set/show swappiness parameter of vmscan | ||
68 | (See sysctl's vm.swappiness) | ||
69 | memory.move_charge_at_immigrate # set/show controls of moving charges | ||
70 | memory.oom_control # set/show oom controls. | ||
71 | |||
36 | 1. History | 72 | 1. History |
37 | 73 | ||
38 | The memory controller has a long history. A request for comments for the memory | 74 | The memory controller has a long history. A request for comments for the memory |
@@ -106,14 +142,14 @@ the necessary data structures and check if the cgroup that is being charged | |||
106 | is over its limit. If it is then reclaim is invoked on the cgroup. | 142 | is over its limit. If it is then reclaim is invoked on the cgroup. |
107 | More details can be found in the reclaim section of this document. | 143 | More details can be found in the reclaim section of this document. |
108 | If everything goes well, a page meta-data-structure called page_cgroup is | 144 | If everything goes well, a page meta-data-structure called page_cgroup is |
109 | allocated and associated with the page. This routine also adds the page to | 145 | updated. page_cgroup has its own LRU on cgroup. |
110 | the per cgroup LRU. | 146 | (*) page_cgroup structure is allocated at boot/memory-hotplug time. |
111 | 147 | ||
112 | 2.2.1 Accounting details | 148 | 2.2.1 Accounting details |
113 | 149 | ||
114 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. | 150 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. |
115 | (some pages which never be reclaimable and will not be on global LRU | 151 | Some pages which are never reclaimable and will not be on the global LRU |
116 | are not accounted. we just accounts pages under usual vm management.) | 152 | are not accounted. We just account pages under usual VM management. |
117 | 153 | ||
118 | RSS pages are accounted at page_fault unless they've already been accounted | 154 | RSS pages are accounted at page_fault unless they've already been accounted |
119 | for earlier. A file page will be accounted for as Page Cache when it's | 155 | for earlier. A file page will be accounted for as Page Cache when it's |
@@ -121,12 +157,19 @@ inserted into inode (radix-tree). While it's mapped into the page tables of | |||
121 | processes, duplicate accounting is carefully avoided. | 157 | processes, duplicate accounting is carefully avoided. |
122 | 158 | ||
123 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is | 159 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is |
124 | unaccounted when it's removed from radix-tree. | 160 | unaccounted when it's removed from radix-tree. Even if RSS pages are fully |
161 | unmapped (by kswapd), they may exist as SwapCache in the system until they | ||
162 | are really freed. Such SwapCaches also also accounted. | ||
163 | A swapped-in page is not accounted until it's mapped. | ||
164 | |||
165 | Note: The kernel does swapin-readahead and read multiple swaps at once. | ||
166 | This means swapped-in pages may contain pages for other tasks than a task | ||
167 | causing page fault. So, we avoid accounting at swap-in I/O. | ||
125 | 168 | ||
126 | At page migration, accounting information is kept. | 169 | At page migration, accounting information is kept. |
127 | 170 | ||
128 | Note: we just account pages-on-lru because our purpose is to control amount | 171 | Note: we just account pages-on-LRU because our purpose is to control amount |
129 | of used pages. not-on-lru pages are tend to be out-of-control from vm view. | 172 | of used pages; not-on-LRU pages tend to be out-of-control from VM view. |
130 | 173 | ||
131 | 2.3 Shared Page Accounting | 174 | 2.3 Shared Page Accounting |
132 | 175 | ||
@@ -143,6 +186,7 @@ caller of swapoff rather than the users of shmem. | |||
143 | 186 | ||
144 | 187 | ||
145 | 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) | 188 | 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) |
189 | |||
146 | Swap Extension allows you to record charge for swap. A swapped-in page is | 190 | Swap Extension allows you to record charge for swap. A swapped-in page is |
147 | charged back to original page allocator if possible. | 191 | charged back to original page allocator if possible. |
148 | 192 | ||
@@ -150,13 +194,20 @@ When swap is accounted, following files are added. | |||
150 | - memory.memsw.usage_in_bytes. | 194 | - memory.memsw.usage_in_bytes. |
151 | - memory.memsw.limit_in_bytes. | 195 | - memory.memsw.limit_in_bytes. |
152 | 196 | ||
153 | usage of mem+swap is limited by memsw.limit_in_bytes. | 197 | memsw means memory+swap. Usage of memory+swap is limited by |
198 | memsw.limit_in_bytes. | ||
154 | 199 | ||
155 | * why 'mem+swap' rather than swap. | 200 | Example: Assume a system with 4G of swap. A task which allocates 6G of memory |
201 | (by mistake) under 2G memory limitation will use all swap. | ||
202 | In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. | ||
203 | By using memsw limit, you can avoid system OOM which can be caused by swap | ||
204 | shortage. | ||
205 | |||
206 | * why 'memory+swap' rather than swap. | ||
156 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means | 207 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means |
157 | to move account from memory to swap...there is no change in usage of | 208 | to move account from memory to swap...there is no change in usage of |
158 | mem+swap. In other words, when we want to limit the usage of swap without | 209 | memory+swap. In other words, when we want to limit the usage of swap without |
159 | affecting global LRU, mem+swap limit is better than just limiting swap from | 210 | affecting global LRU, memory+swap limit is better than just limiting swap from |
160 | OS point of view. | 211 | OS point of view. |
161 | 212 | ||
162 | * What happens when a cgroup hits memory.memsw.limit_in_bytes | 213 | * What happens when a cgroup hits memory.memsw.limit_in_bytes |
@@ -168,12 +219,12 @@ it by cgroup. | |||
168 | 219 | ||
169 | 2.5 Reclaim | 220 | 2.5 Reclaim |
170 | 221 | ||
171 | Each cgroup maintains a per cgroup LRU that consists of an active | 222 | Each cgroup maintains a per cgroup LRU which has the same structure as |
172 | and inactive list. When a cgroup goes over its limit, we first try | 223 | global VM. When a cgroup goes over its limit, we first try |
173 | to reclaim memory from the cgroup so as to make space for the new | 224 | to reclaim memory from the cgroup so as to make space for the new |
174 | pages that the cgroup has touched. If the reclaim is unsuccessful, | 225 | pages that the cgroup has touched. If the reclaim is unsuccessful, |
175 | an OOM routine is invoked to select and kill the bulkiest task in the | 226 | an OOM routine is invoked to select and kill the bulkiest task in the |
176 | cgroup. | 227 | cgroup. (See 10. OOM Control below.) |
177 | 228 | ||
178 | The reclaim algorithm has not been modified for cgroups, except that | 229 | The reclaim algorithm has not been modified for cgroups, except that |
179 | pages that are selected for reclaiming come from the per cgroup LRU | 230 | pages that are selected for reclaiming come from the per cgroup LRU |
@@ -184,13 +235,22 @@ limits on the root cgroup. | |||
184 | 235 | ||
185 | Note2: When panic_on_oom is set to "2", the whole system will panic. | 236 | Note2: When panic_on_oom is set to "2", the whole system will panic. |
186 | 237 | ||
187 | 2. Locking | 238 | When oom event notifier is registered, event will be delivered. |
239 | (See oom_control section) | ||
240 | |||
241 | 2.6 Locking | ||
188 | 242 | ||
189 | The memory controller uses the following hierarchy | 243 | lock_page_cgroup()/unlock_page_cgroup() should not be called under |
244 | mapping->tree_lock. | ||
190 | 245 | ||
191 | 1. zone->lru_lock is used for selecting pages to be isolated | 246 | Other lock order is following: |
192 | 2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) | 247 | PG_locked. |
193 | 3. lock_page_cgroup() is used to protect page->page_cgroup | 248 | mm->page_table_lock |
249 | zone->lru_lock | ||
250 | lock_page_cgroup. | ||
251 | In many cases, just lock_page_cgroup() is called. | ||
252 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by | ||
253 | zone->lru_lock, it has no lock of its own. | ||
194 | 254 | ||
195 | 3. User Interface | 255 | 3. User Interface |
196 | 256 | ||
@@ -199,6 +259,7 @@ The memory controller uses the following hierarchy | |||
199 | a. Enable CONFIG_CGROUPS | 259 | a. Enable CONFIG_CGROUPS |
200 | b. Enable CONFIG_RESOURCE_COUNTERS | 260 | b. Enable CONFIG_RESOURCE_COUNTERS |
201 | c. Enable CONFIG_CGROUP_MEM_RES_CTLR | 261 | c. Enable CONFIG_CGROUP_MEM_RES_CTLR |
262 | d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension) | ||
202 | 263 | ||
203 | 1. Prepare the cgroups | 264 | 1. Prepare the cgroups |
204 | # mkdir -p /cgroups | 265 | # mkdir -p /cgroups |
@@ -206,31 +267,28 @@ c. Enable CONFIG_CGROUP_MEM_RES_CTLR | |||
206 | 267 | ||
207 | 2. Make the new group and move bash into it | 268 | 2. Make the new group and move bash into it |
208 | # mkdir /cgroups/0 | 269 | # mkdir /cgroups/0 |
209 | # echo $$ > /cgroups/0/tasks | 270 | # echo $$ > /cgroups/0/tasks |
210 | 271 | ||
211 | Since now we're in the 0 cgroup, | 272 | Since now we're in the 0 cgroup, we can alter the memory limit: |
212 | We can alter the memory limit: | ||
213 | # echo 4M > /cgroups/0/memory.limit_in_bytes | 273 | # echo 4M > /cgroups/0/memory.limit_in_bytes |
214 | 274 | ||
215 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | 275 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, |
216 | mega or gigabytes. | 276 | mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) |
277 | |||
217 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). | 278 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). |
218 | NOTE: We cannot set limits on the root cgroup any more. | 279 | NOTE: We cannot set limits on the root cgroup any more. |
219 | 280 | ||
220 | # cat /cgroups/0/memory.limit_in_bytes | 281 | # cat /cgroups/0/memory.limit_in_bytes |
221 | 4194304 | 282 | 4194304 |
222 | 283 | ||
223 | NOTE: The interface has now changed to display the usage in bytes | ||
224 | instead of pages | ||
225 | |||
226 | We can check the usage: | 284 | We can check the usage: |
227 | # cat /cgroups/0/memory.usage_in_bytes | 285 | # cat /cgroups/0/memory.usage_in_bytes |
228 | 1216512 | 286 | 1216512 |
229 | 287 | ||
230 | A successful write to this file does not guarantee a successful set of | 288 | A successful write to this file does not guarantee a successful set of |
231 | this limit to the value written into the file. This can be due to a | 289 | this limit to the value written into the file. This can be due to a |
232 | number of factors, such as rounding up to page boundaries or the total | 290 | number of factors, such as rounding up to page boundaries or the total |
233 | availability of memory on the system. The user is required to re-read | 291 | availability of memory on the system. The user is required to re-read |
234 | this file after a write to guarantee the value committed by the kernel. | 292 | this file after a write to guarantee the value committed by the kernel. |
235 | 293 | ||
236 | # echo 1 > memory.limit_in_bytes | 294 | # echo 1 > memory.limit_in_bytes |
@@ -245,15 +303,23 @@ caches, RSS and Active pages/Inactive pages are shown. | |||
245 | 303 | ||
246 | 4. Testing | 304 | 4. Testing |
247 | 305 | ||
248 | Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. | 306 | For testing features and implementation, see memcg_test.txt. |
249 | Apart from that v6 has been tested with several applications and regular | 307 | |
250 | daily use. The controller has also been tested on the PPC64, x86_64 and | 308 | Performance test is also important. To see pure memory controller's overhead, |
251 | UML platforms. | 309 | testing on tmpfs will give you good numbers of small overheads. |
310 | Example: do kernel make on tmpfs. | ||
311 | |||
312 | Page-fault scalability is also important. At measuring parallel | ||
313 | page fault test, multi-process test may be better than multi-thread | ||
314 | test because it has noise of shared objects/status. | ||
315 | |||
316 | But the above two are testing extreme situations. | ||
317 | Trying usual test under memory controller is always helpful. | ||
252 | 318 | ||
253 | 4.1 Troubleshooting | 319 | 4.1 Troubleshooting |
254 | 320 | ||
255 | Sometimes a user might find that the application under a cgroup is | 321 | Sometimes a user might find that the application under a cgroup is |
256 | terminated. There are several causes for this: | 322 | terminated by OOM killer. There are several causes for this: |
257 | 323 | ||
258 | 1. The cgroup limit is too low (just too low to do anything useful) | 324 | 1. The cgroup limit is too low (just too low to do anything useful) |
259 | 2. The user is using anonymous memory and swap is turned off or too low | 325 | 2. The user is using anonymous memory and swap is turned off or too low |
@@ -261,6 +327,9 @@ terminated. There are several causes for this: | |||
261 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of | 327 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of |
262 | some of the pages cached in the cgroup (page cache pages). | 328 | some of the pages cached in the cgroup (page cache pages). |
263 | 329 | ||
330 | To know what happens, disable OOM_Kill by 10. OOM Control(see below) and | ||
331 | seeing what happens will be helpful. | ||
332 | |||
264 | 4.2 Task migration | 333 | 4.2 Task migration |
265 | 334 | ||
266 | When a task migrates from one cgroup to another, its charge is not | 335 | When a task migrates from one cgroup to another, its charge is not |
@@ -268,16 +337,19 @@ carried forward by default. The pages allocated from the original cgroup still | |||
268 | remain charged to it, the charge is dropped when the page is freed or | 337 | remain charged to it, the charge is dropped when the page is freed or |
269 | reclaimed. | 338 | reclaimed. |
270 | 339 | ||
271 | Note: You can move charges of a task along with task migration. See 8. | 340 | You can move charges of a task along with task migration. |
341 | See 8. "Move charges at task migration" | ||
272 | 342 | ||
273 | 4.3 Removing a cgroup | 343 | 4.3 Removing a cgroup |
274 | 344 | ||
275 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 345 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
276 | cgroup might have some charge associated with it, even though all | 346 | cgroup might have some charge associated with it, even though all |
277 | tasks have migrated away from it. | 347 | tasks have migrated away from it. (because we charge against pages, not |
278 | Such charges are freed(at default) or moved to its parent. When moved, | 348 | against tasks.) |
279 | both of RSS and CACHES are moved to parent. | 349 | |
280 | If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. | 350 | Such charges are freed or moved to their parent. At moving, both of RSS |
351 | and CACHES are moved to parent. | ||
352 | rmdir() may return -EBUSY if freeing/moving fails. See 5.1 also. | ||
281 | 353 | ||
282 | Charges recorded in swap information is not updated at removal of cgroup. | 354 | Charges recorded in swap information is not updated at removal of cgroup. |
283 | Recorded information is discarded and a cgroup which uses swap (swapcache) | 355 | Recorded information is discarded and a cgroup which uses swap (swapcache) |
@@ -293,10 +365,10 @@ will be charged as a new owner of it. | |||
293 | 365 | ||
294 | # echo 0 > memory.force_empty | 366 | # echo 0 > memory.force_empty |
295 | 367 | ||
296 | Almost all pages tracked by this memcg will be unmapped and freed. Some of | 368 | Almost all pages tracked by this memory cgroup will be unmapped and freed. |
297 | pages cannot be freed because it's locked or in-use. Such pages are moved | 369 | Some pages cannot be freed because they are locked or in-use. Such pages are |
298 | to parent and this cgroup will be empty. But this may return -EBUSY in | 370 | moved to parent and this cgroup will be empty. This may return -EBUSY if |
299 | some too busy case. | 371 | VM is too busy to free/move all pages immediately. |
300 | 372 | ||
301 | Typical use case of this interface is that calling this before rmdir(). | 373 | Typical use case of this interface is that calling this before rmdir(). |
302 | Because rmdir() moves all pages to parent, some out-of-use page caches can be | 374 | Because rmdir() moves all pages to parent, some out-of-use page caches can be |
@@ -306,19 +378,41 @@ will be charged as a new owner of it. | |||
306 | 378 | ||
307 | memory.stat file includes following statistics | 379 | memory.stat file includes following statistics |
308 | 380 | ||
381 | # per-memory cgroup local status | ||
309 | cache - # of bytes of page cache memory. | 382 | cache - # of bytes of page cache memory. |
310 | rss - # of bytes of anonymous and swap cache memory. | 383 | rss - # of bytes of anonymous and swap cache memory. |
384 | mapped_file - # of bytes of mapped file (includes tmpfs/shmem) | ||
311 | pgpgin - # of pages paged in (equivalent to # of charging events). | 385 | pgpgin - # of pages paged in (equivalent to # of charging events). |
312 | pgpgout - # of pages paged out (equivalent to # of uncharging events). | 386 | pgpgout - # of pages paged out (equivalent to # of uncharging events). |
313 | active_anon - # of bytes of anonymous and swap cache memory on active | 387 | swap - # of bytes of swap usage |
314 | lru list. | ||
315 | inactive_anon - # of bytes of anonymous memory and swap cache memory on | 388 | inactive_anon - # of bytes of anonymous memory and swap cache memory on |
316 | inactive lru list. | 389 | LRU list. |
317 | active_file - # of bytes of file-backed memory on active lru list. | 390 | active_anon - # of bytes of anonymous and swap cache memory on active |
318 | inactive_file - # of bytes of file-backed memory on inactive lru list. | 391 | inactive LRU list. |
392 | inactive_file - # of bytes of file-backed memory on inactive LRU list. | ||
393 | active_file - # of bytes of file-backed memory on active LRU list. | ||
319 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). | 394 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). |
320 | 395 | ||
321 | The following additional stats are dependent on CONFIG_DEBUG_VM. | 396 | # status considering hierarchy (see memory.use_hierarchy settings) |
397 | |||
398 | hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy | ||
399 | under which the memory cgroup is | ||
400 | hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to | ||
401 | hierarchy under which memory cgroup is. | ||
402 | |||
403 | total_cache - sum of all children's "cache" | ||
404 | total_rss - sum of all children's "rss" | ||
405 | total_mapped_file - sum of all children's "cache" | ||
406 | total_pgpgin - sum of all children's "pgpgin" | ||
407 | total_pgpgout - sum of all children's "pgpgout" | ||
408 | total_swap - sum of all children's "swap" | ||
409 | total_inactive_anon - sum of all children's "inactive_anon" | ||
410 | total_active_anon - sum of all children's "active_anon" | ||
411 | total_inactive_file - sum of all children's "inactive_file" | ||
412 | total_active_file - sum of all children's "active_file" | ||
413 | total_unevictable - sum of all children's "unevictable" | ||
414 | |||
415 | # The following additional stats are dependent on CONFIG_DEBUG_VM. | ||
322 | 416 | ||
323 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | 417 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) |
324 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) | 418 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) |
@@ -327,24 +421,37 @@ recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | |||
327 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | 421 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) |
328 | 422 | ||
329 | Memo: | 423 | Memo: |
330 | recent_rotated means recent frequency of lru rotation. | 424 | recent_rotated means recent frequency of LRU rotation. |
331 | recent_scanned means recent # of scans to lru. | 425 | recent_scanned means recent # of scans to LRU. |
332 | showing for better debug please see the code for meanings. | 426 | showing for better debug please see the code for meanings. |
333 | 427 | ||
334 | Note: | 428 | Note: |
335 | Only anonymous and swap cache memory is listed as part of 'rss' stat. | 429 | Only anonymous and swap cache memory is listed as part of 'rss' stat. |
336 | This should not be confused with the true 'resident set size' or the | 430 | This should not be confused with the true 'resident set size' or the |
337 | amount of physical memory used by the cgroup. Per-cgroup rss | 431 | amount of physical memory used by the cgroup. |
338 | accounting is not done yet. | 432 | 'rss + file_mapped" will give you resident set size of cgroup. |
433 | (Note: file and shmem may be shared among other cgroups. In that case, | ||
434 | file_mapped is accounted only when the memory cgroup is owner of page | ||
435 | cache.) | ||
339 | 436 | ||
340 | 5.3 swappiness | 437 | 5.3 swappiness |
341 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | ||
342 | 438 | ||
343 | Following cgroups' swappiness can't be changed. | 439 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
344 | - root cgroup (uses /proc/sys/vm/swappiness). | ||
345 | - a cgroup which uses hierarchy and it has child cgroup. | ||
346 | - a cgroup which uses hierarchy and not the root of hierarchy. | ||
347 | 440 | ||
441 | Following cgroups' swappiness can't be changed. | ||
442 | - root cgroup (uses /proc/sys/vm/swappiness). | ||
443 | - a cgroup which uses hierarchy and it has other cgroup(s) below it. | ||
444 | - a cgroup which uses hierarchy and not the root of hierarchy. | ||
445 | |||
446 | 5.4 failcnt | ||
447 | |||
448 | A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. | ||
449 | This failcnt(== failure count) shows the number of times that a usage counter | ||
450 | hit its limit. When a memory cgroup hits a limit, failcnt increases and | ||
451 | memory under it will be reclaimed. | ||
452 | |||
453 | You can reset failcnt by writing 0 to failcnt file. | ||
454 | # echo 0 > .../memory.failcnt | ||
348 | 455 | ||
349 | 6. Hierarchy support | 456 | 6. Hierarchy support |
350 | 457 | ||
@@ -363,13 +470,13 @@ hierarchy | |||
363 | 470 | ||
364 | In the diagram above, with hierarchical accounting enabled, all memory | 471 | In the diagram above, with hierarchical accounting enabled, all memory |
365 | usage of e, is accounted to its ancestors up until the root (i.e, c and root), | 472 | usage of e, is accounted to its ancestors up until the root (i.e, c and root), |
366 | that has memory.use_hierarchy enabled. If one of the ancestors goes over its | 473 | that has memory.use_hierarchy enabled. If one of the ancestors goes over its |
367 | limit, the reclaim algorithm reclaims from the tasks in the ancestor and the | 474 | limit, the reclaim algorithm reclaims from the tasks in the ancestor and the |
368 | children of the ancestor. | 475 | children of the ancestor. |
369 | 476 | ||
370 | 6.1 Enabling hierarchical accounting and reclaim | 477 | 6.1 Enabling hierarchical accounting and reclaim |
371 | 478 | ||
372 | The memory controller by default disables the hierarchy feature. Support | 479 | A memory cgroup by default disables the hierarchy feature. Support |
373 | can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup | 480 | can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup |
374 | 481 | ||
375 | # echo 1 > memory.use_hierarchy | 482 | # echo 1 > memory.use_hierarchy |
@@ -379,10 +486,10 @@ The feature can be disabled by | |||
379 | # echo 0 > memory.use_hierarchy | 486 | # echo 0 > memory.use_hierarchy |
380 | 487 | ||
381 | NOTE1: Enabling/disabling will fail if the cgroup already has other | 488 | NOTE1: Enabling/disabling will fail if the cgroup already has other |
382 | cgroups created below it. | 489 | cgroups created below it. |
383 | 490 | ||
384 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in | 491 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in |
385 | case of an oom event in any cgroup. | 492 | case of an OOM event in any cgroup. |
386 | 493 | ||
387 | 7. Soft limits | 494 | 7. Soft limits |
388 | 495 | ||
@@ -392,7 +499,7 @@ is to allow control groups to use as much of the memory as needed, provided | |||
392 | a. There is no memory contention | 499 | a. There is no memory contention |
393 | b. They do not exceed their hard limit | 500 | b. They do not exceed their hard limit |
394 | 501 | ||
395 | When the system detects memory contention or low memory control groups | 502 | When the system detects memory contention or low memory, control groups |
396 | are pushed back to their soft limits. If the soft limit of each control | 503 | are pushed back to their soft limits. If the soft limit of each control |
397 | group is very high, they are pushed back as much as possible to make | 504 | group is very high, they are pushed back as much as possible to make |
398 | sure that one control group does not starve the others of memory. | 505 | sure that one control group does not starve the others of memory. |
@@ -406,7 +513,7 @@ it gets invoked from balance_pgdat (kswapd). | |||
406 | 7.1 Interface | 513 | 7.1 Interface |
407 | 514 | ||
408 | Soft limits can be setup by using the following commands (in this example we | 515 | Soft limits can be setup by using the following commands (in this example we |
409 | assume a soft limit of 256 megabytes) | 516 | assume a soft limit of 256 MiB) |
410 | 517 | ||
411 | # echo 256M > memory.soft_limit_in_bytes | 518 | # echo 256M > memory.soft_limit_in_bytes |
412 | 519 | ||
@@ -442,7 +549,7 @@ Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | |||
442 | Note: If we cannot find enough space for the task in the destination cgroup, we | 549 | Note: If we cannot find enough space for the task in the destination cgroup, we |
443 | try to make space by reclaiming memory. Task migration may fail if we | 550 | try to make space by reclaiming memory. Task migration may fail if we |
444 | cannot make enough space. | 551 | cannot make enough space. |
445 | Note: It can take several seconds if you move charges in giga bytes order. | 552 | Note: It can take several seconds if you move charges much. |
446 | 553 | ||
447 | And if you want disable it again: | 554 | And if you want disable it again: |
448 | 555 | ||
@@ -451,21 +558,27 @@ And if you want disable it again: | |||
451 | 8.2 Type of charges which can be move | 558 | 8.2 Type of charges which can be move |
452 | 559 | ||
453 | Each bits of move_charge_at_immigrate has its own meaning about what type of | 560 | Each bits of move_charge_at_immigrate has its own meaning about what type of |
454 | charges should be moved. | 561 | charges should be moved. But in any cases, it must be noted that an account of |
562 | a page or a swap can be moved only when it is charged to the task's current(old) | ||
563 | memory cgroup. | ||
455 | 564 | ||
456 | bit | what type of charges would be moved ? | 565 | bit | what type of charges would be moved ? |
457 | -----+------------------------------------------------------------------------ | 566 | -----+------------------------------------------------------------------------ |
458 | 0 | A charge of an anonymous page(or swap of it) used by the target task. | 567 | 0 | A charge of an anonymous page(or swap of it) used by the target task. |
459 | | Those pages and swaps must be used only by the target task. You must | 568 | | Those pages and swaps must be used only by the target task. You must |
460 | | enable Swap Extension(see 2.4) to enable move of swap charges. | 569 | | enable Swap Extension(see 2.4) to enable move of swap charges. |
461 | 570 | -----+------------------------------------------------------------------------ | |
462 | Note: Those pages and swaps must be charged to the old cgroup. | 571 | 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) |
463 | Note: More type of pages(e.g. file cache, shmem,) will be supported by other | 572 | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of |
464 | bits in future. | 573 | | anonymous pages, file pages(and swaps) in the range mmapped by the task |
574 | | will be moved even if the task hasn't done page fault, i.e. they might | ||
575 | | not be the task's "RSS", but other task's "RSS" that maps the same file. | ||
576 | | And mapcount of the page is ignored(the page can be moved even if | ||
577 | | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to | ||
578 | | enable move of swap charges. | ||
465 | 579 | ||
466 | 8.3 TODO | 580 | 8.3 TODO |
467 | 581 | ||
468 | - Add support for other types of pages(e.g. file cache, shmem, etc.). | ||
469 | - Implement madvise(2) to let users decide the vma to be moved or not to be | 582 | - Implement madvise(2) to let users decide the vma to be moved or not to be |
470 | moved. | 583 | moved. |
471 | - All of moving charge operations are done under cgroup_mutex. It's not good | 584 | - All of moving charge operations are done under cgroup_mutex. It's not good |
@@ -473,22 +586,61 @@ Note: More type of pages(e.g. file cache, shmem,) will be supported by other | |||
473 | 586 | ||
474 | 9. Memory thresholds | 587 | 9. Memory thresholds |
475 | 588 | ||
476 | Memory controler implements memory thresholds using cgroups notification | 589 | Memory cgroup implements memory thresholds using cgroups notification |
477 | API (see cgroups.txt). It allows to register multiple memory and memsw | 590 | API (see cgroups.txt). It allows to register multiple memory and memsw |
478 | thresholds and gets notifications when it crosses. | 591 | thresholds and gets notifications when it crosses. |
479 | 592 | ||
480 | To register a threshold application need: | 593 | To register a threshold application need: |
481 | - create an eventfd using eventfd(2); | 594 | - create an eventfd using eventfd(2); |
482 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | 595 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; |
483 | - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to | 596 | - write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to |
484 | cgroup.event_control. | 597 | cgroup.event_control. |
485 | 598 | ||
486 | Application will be notified through eventfd when memory usage crosses | 599 | Application will be notified through eventfd when memory usage crosses |
487 | threshold in any direction. | 600 | threshold in any direction. |
488 | 601 | ||
489 | It's applicable for root and non-root cgroup. | 602 | It's applicable for root and non-root cgroup. |
490 | 603 | ||
491 | 10. TODO | 604 | 10. OOM Control |
605 | |||
606 | memory.oom_control file is for OOM notification and other controls. | ||
607 | |||
608 | Memory cgroup implements OOM notifier using cgroup notification | ||
609 | API (See cgroups.txt). It allows to register multiple OOM notification | ||
610 | delivery and gets notification when OOM happens. | ||
611 | |||
612 | To register a notifier, application need: | ||
613 | - create an eventfd using eventfd(2) | ||
614 | - open memory.oom_control file | ||
615 | - write string like "<event_fd> <fd of memory.oom_control>" to | ||
616 | cgroup.event_control | ||
617 | |||
618 | Application will be notified through eventfd when OOM happens. | ||
619 | OOM notification doesn't work for root cgroup. | ||
620 | |||
621 | You can disable OOM-killer by writing "1" to memory.oom_control file, as: | ||
622 | |||
623 | #echo 1 > memory.oom_control | ||
624 | |||
625 | This operation is only allowed to the top cgroup of sub-hierarchy. | ||
626 | If OOM-killer is disabled, tasks under cgroup will hang/sleep | ||
627 | in memory cgroup's OOM-waitqueue when they request accountable memory. | ||
628 | |||
629 | For running them, you have to relax the memory cgroup's OOM status by | ||
630 | * enlarge limit or reduce usage. | ||
631 | To reduce usage, | ||
632 | * kill some tasks. | ||
633 | * move some tasks to other group with account migration. | ||
634 | * remove some files (on tmpfs?) | ||
635 | |||
636 | Then, stopped tasks will work again. | ||
637 | |||
638 | At reading, current status of OOM is shown. | ||
639 | oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) | ||
640 | under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may | ||
641 | be stopped.) | ||
642 | |||
643 | 11. TODO | ||
492 | 644 | ||
493 | 1. Add support for accounting huge pages (as a separate controller) | 645 | 1. Add support for accounting huge pages (as a separate controller) |
494 | 2. Make per-cgroup scanner reclaim not-shared pages first | 646 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 53d64d382343..1d83d124056c 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt | |||
@@ -443,6 +443,8 @@ Your cooperation is appreciated. | |||
443 | 231 = /dev/snapshot System memory snapshot device | 443 | 231 = /dev/snapshot System memory snapshot device |
444 | 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) | 444 | 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) |
445 | 233 = /dev/kmview View-OS A process with a view | 445 | 233 = /dev/kmview View-OS A process with a view |
446 | 234 = /dev/btrfs-control Btrfs control device | ||
447 | 235 = /dev/autofs Autofs control device | ||
446 | 240-254 Reserved for local use | 448 | 240-254 Reserved for local use |
447 | 255 Reserved for MISC_DYNAMIC_MINOR | 449 | 255 Reserved for MISC_DYNAMIC_MINOR |
448 | 450 | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index a86152ae2f6f..672be0109d02 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -646,3 +646,13 @@ Who: Thomas Gleixner <tglx@linutronix.de> | |||
646 | 646 | ||
647 | ---------------------------- | 647 | ---------------------------- |
648 | 648 | ||
649 | What: old ieee1394 subsystem (CONFIG_IEEE1394) | ||
650 | When: 2.6.37 | ||
651 | Files: drivers/ieee1394/ except init_ohci1394_dma.c | ||
652 | Why: superseded by drivers/firewire/ (CONFIG_FIREWIRE) which offers more | ||
653 | features, better performance, and better security, all with smaller | ||
654 | and more modern code base | ||
655 | Who: Stefan Richter <stefanr@s5r6.in-berlin.de> | ||
656 | |||
657 | ---------------------------- | ||
658 | |||
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index af1608070cd5..61c98f03baa1 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -429,8 +429,9 @@ check_flags: no | |||
429 | implementations. If your fs is not using generic_file_llseek, you | 429 | implementations. If your fs is not using generic_file_llseek, you |
430 | need to acquire and release the appropriate locks in your ->llseek(). | 430 | need to acquire and release the appropriate locks in your ->llseek(). |
431 | For many filesystems, it is probably safe to acquire the inode | 431 | For many filesystems, it is probably safe to acquire the inode |
432 | mutex. Note some filesystems (i.e. remote ones) provide no | 432 | mutex or just to use i_size_read() instead. |
433 | protection for i_size so you will need to use the BKL. | 433 | Note: this does not protect the file->f_pos against concurrent modifications |
434 | since this is something the userspace has to take care about. | ||
434 | 435 | ||
435 | Note: ext2_release() was *the* source of contention on fs-intensive | 436 | Note: ext2_release() was *the* source of contention on fs-intensive |
436 | loads and dropping BKL on ->release() helps to get rid of that (we still | 437 | loads and dropping BKL on ->release() helps to get rid of that (we still |
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index b324c033035a..203f7202cc9e 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt | |||
@@ -38,7 +38,8 @@ Hard link support: yes no | |||
38 | Real inode numbers: yes no | 38 | Real inode numbers: yes no |
39 | 32-bit uids/gids: yes no | 39 | 32-bit uids/gids: yes no |
40 | File creation time: yes no | 40 | File creation time: yes no |
41 | Xattr and ACL support: no no | 41 | Xattr support: yes no |
42 | ACL support: no no | ||
42 | 43 | ||
43 | Squashfs compresses data, inodes and directories. In addition, inode and | 44 | Squashfs compresses data, inodes and directories. In addition, inode and |
44 | directory data are highly compacted, and packed on byte boundaries. Each | 45 | directory data are highly compacted, and packed on byte boundaries. Each |
@@ -58,7 +59,7 @@ obtained from this site also. | |||
58 | 3. SQUASHFS FILESYSTEM DESIGN | 59 | 3. SQUASHFS FILESYSTEM DESIGN |
59 | ----------------------------- | 60 | ----------------------------- |
60 | 61 | ||
61 | A squashfs filesystem consists of seven parts, packed together on a byte | 62 | A squashfs filesystem consists of a maximum of eight parts, packed together on a byte |
62 | alignment: | 63 | alignment: |
63 | 64 | ||
64 | --------------- | 65 | --------------- |
@@ -80,6 +81,9 @@ alignment: | |||
80 | |---------------| | 81 | |---------------| |
81 | | uid/gid | | 82 | | uid/gid | |
82 | | lookup table | | 83 | | lookup table | |
84 | |---------------| | ||
85 | | xattr | | ||
86 | | table | | ||
83 | --------------- | 87 | --------------- |
84 | 88 | ||
85 | Compressed data blocks are written to the filesystem as files are read from | 89 | Compressed data blocks are written to the filesystem as files are read from |
@@ -192,6 +196,26 @@ This table is stored compressed into metadata blocks. A second index table is | |||
192 | used to locate these. This second index table for speed of access (and because | 196 | used to locate these. This second index table for speed of access (and because |
193 | it is small) is read at mount time and cached in memory. | 197 | it is small) is read at mount time and cached in memory. |
194 | 198 | ||
199 | 3.7 Xattr table | ||
200 | --------------- | ||
201 | |||
202 | The xattr table contains extended attributes for each inode. The xattrs | ||
203 | for each inode are stored in a list, each list entry containing a type, | ||
204 | name and value field. The type field encodes the xattr prefix | ||
205 | ("user.", "trusted." etc) and it also encodes how the name/value fields | ||
206 | should be interpreted. Currently the type indicates whether the value | ||
207 | is stored inline (in which case the value field contains the xattr value), | ||
208 | or if it is stored out of line (in which case the value field stores a | ||
209 | reference to where the actual value is stored). This allows large values | ||
210 | to be stored out of line improving scanning and lookup performance and it | ||
211 | also allows values to be de-duplicated, the value being stored once, and | ||
212 | all other occurences holding an out of line reference to that value. | ||
213 | |||
214 | The xattr lists are packed into compressed 8K metadata blocks. | ||
215 | To reduce overhead in inodes, rather than storing the on-disk | ||
216 | location of the xattr list inside each inode, a 32-bit xattr id | ||
217 | is stored. This xattr id is mapped into the location of the xattr | ||
218 | list using a second xattr id lookup table. | ||
195 | 219 | ||
196 | 4. TODOS AND OUTSTANDING ISSUES | 220 | 4. TODOS AND OUTSTANDING ISSUES |
197 | ------------------------------- | 221 | ------------------------------- |
@@ -199,9 +223,7 @@ it is small) is read at mount time and cached in memory. | |||
199 | 4.1 Todo list | 223 | 4.1 Todo list |
200 | ------------- | 224 | ------------- |
201 | 225 | ||
202 | Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks | 226 | Implement ACL support. |
203 | for these but the code has not been written. Once the code has been written | ||
204 | the existing layout should not require modification. | ||
205 | 227 | ||
206 | 4.2 Squashfs internal cache | 228 | 4.2 Squashfs internal cache |
207 | --------------------------- | 229 | --------------------------- |
diff --git a/Documentation/hwmon/dme1737 b/Documentation/hwmon/dme1737 index 001d2e70bc11..fc5df7654d63 100644 --- a/Documentation/hwmon/dme1737 +++ b/Documentation/hwmon/dme1737 | |||
@@ -9,11 +9,15 @@ Supported chips: | |||
9 | * SMSC SCH3112, SCH3114, SCH3116 | 9 | * SMSC SCH3112, SCH3114, SCH3116 |
10 | Prefix: 'sch311x' | 10 | Prefix: 'sch311x' |
11 | Addresses scanned: none, address read from Super-I/O config space | 11 | Addresses scanned: none, address read from Super-I/O config space |
12 | Datasheet: http://www.nuhorizons.com/FeaturedProducts/Volume1/SMSC/311x.pdf | 12 | Datasheet: Available on the Internet |
13 | * SMSC SCH5027 | 13 | * SMSC SCH5027 |
14 | Prefix: 'sch5027' | 14 | Prefix: 'sch5027' |
15 | Addresses scanned: I2C 0x2c, 0x2d, 0x2e | 15 | Addresses scanned: I2C 0x2c, 0x2d, 0x2e |
16 | Datasheet: Provided by SMSC upon request and under NDA | 16 | Datasheet: Provided by SMSC upon request and under NDA |
17 | * SMSC SCH5127 | ||
18 | Prefix: 'sch5127' | ||
19 | Addresses scanned: none, address read from Super-I/O config space | ||
20 | Datasheet: Provided by SMSC upon request and under NDA | ||
17 | 21 | ||
18 | Authors: | 22 | Authors: |
19 | Juerg Haefliger <juergh@gmail.com> | 23 | Juerg Haefliger <juergh@gmail.com> |
@@ -36,8 +40,8 @@ Description | |||
36 | ----------- | 40 | ----------- |
37 | 41 | ||
38 | This driver implements support for the hardware monitoring capabilities of the | 42 | This driver implements support for the hardware monitoring capabilities of the |
39 | SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, and SMSC | 43 | SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, SCH311x, |
40 | SCH311x Super-I/O chips. These chips feature monitoring of 3 temp sensors | 44 | and SCH5127 Super-I/O chips. These chips feature monitoring of 3 temp sensors |
41 | temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and | 45 | temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and |
42 | 1 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement | 46 | 1 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement |
43 | up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and | 47 | up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and |
@@ -48,14 +52,14 @@ Fan[3-6] and pwm[3,5-6] are optional features and their availability depends on | |||
48 | the configuration of the chip. The driver will detect which features are | 52 | the configuration of the chip. The driver will detect which features are |
49 | present during initialization and create the sysfs attributes accordingly. | 53 | present during initialization and create the sysfs attributes accordingly. |
50 | 54 | ||
51 | For the SCH311x, fan[1-3] and pwm[1-3] are always present and fan[4-6] and | 55 | For the SCH311x and SCH5127, fan[1-3] and pwm[1-3] are always present and |
52 | pwm[5-6] don't exist. | 56 | fan[4-6] and pwm[5-6] don't exist. |
53 | 57 | ||
54 | The hardware monitoring features of the DME1737, A8000, and SCH5027 are only | 58 | The hardware monitoring features of the DME1737, A8000, and SCH5027 are only |
55 | accessible via SMBus, while the SCH311x only provides access via the ISA bus. | 59 | accessible via SMBus, while the SCH311x and SCH5127 only provide access via |
56 | The driver will therefore register itself as an I2C client driver if it detects | 60 | the ISA bus. The driver will therefore register itself as an I2C client driver |
57 | a DME1737, A8000, or SCH5027 and as a platform driver if it detects a SCH311x | 61 | if it detects a DME1737, A8000, or SCH5027 and as a platform driver if it |
58 | chip. | 62 | detects a SCH311x or SCH5127 chip. |
59 | 63 | ||
60 | 64 | ||
61 | Voltage Monitoring | 65 | Voltage Monitoring |
@@ -76,7 +80,7 @@ DME1737, A8000: | |||
76 | in6: Vbat (+3.0V) 0V - 4.38V | 80 | in6: Vbat (+3.0V) 0V - 4.38V |
77 | 81 | ||
78 | SCH311x: | 82 | SCH311x: |
79 | in0: +2.5V 0V - 6.64V | 83 | in0: +2.5V 0V - 3.32V |
80 | in1: Vccp (processor core) 0V - 2V | 84 | in1: Vccp (processor core) 0V - 2V |
81 | in2: VCC (internal +3.3V) 0V - 4.38V | 85 | in2: VCC (internal +3.3V) 0V - 4.38V |
82 | in3: +5V 0V - 6.64V | 86 | in3: +5V 0V - 6.64V |
@@ -93,6 +97,15 @@ SCH5027: | |||
93 | in5: VTR (+3.3V standby) 0V - 4.38V | 97 | in5: VTR (+3.3V standby) 0V - 4.38V |
94 | in6: Vbat (+3.0V) 0V - 4.38V | 98 | in6: Vbat (+3.0V) 0V - 4.38V |
95 | 99 | ||
100 | SCH5127: | ||
101 | in0: +2.5 0V - 3.32V | ||
102 | in1: Vccp (processor core) 0V - 3V | ||
103 | in2: VCC (internal +3.3V) 0V - 4.38V | ||
104 | in3: V2_IN 0V - 1.5V | ||
105 | in4: V1_IN 0V - 1.5V | ||
106 | in5: VTR (+3.3V standby) 0V - 4.38V | ||
107 | in6: Vbat (+3.0V) 0V - 4.38V | ||
108 | |||
96 | Each voltage input has associated min and max limits which trigger an alarm | 109 | Each voltage input has associated min and max limits which trigger an alarm |
97 | when crossed. | 110 | when crossed. |
98 | 111 | ||
@@ -293,3 +306,21 @@ pwm[1-3]_auto_point1_pwm RW Auto PWM pwm point. Auto_point1 is the | |||
293 | pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the | 306 | pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the |
294 | full-speed duty-cycle which is hard- | 307 | full-speed duty-cycle which is hard- |
295 | wired to 255 (100% duty-cycle). | 308 | wired to 255 (100% duty-cycle). |
309 | |||
310 | Chip Differences | ||
311 | ---------------- | ||
312 | |||
313 | Feature dme1737 sch311x sch5027 sch5127 | ||
314 | ------------------------------------------------------- | ||
315 | temp[1-3]_offset yes yes | ||
316 | vid yes | ||
317 | zone3 yes yes yes | ||
318 | zone[1-3]_hyst yes yes | ||
319 | pwm min/off yes yes | ||
320 | fan3 opt yes opt yes | ||
321 | pwm3 opt yes opt yes | ||
322 | fan4 opt opt | ||
323 | fan5 opt opt | ||
324 | pwm5 opt opt | ||
325 | fan6 opt opt | ||
326 | pwm6 opt opt | ||
diff --git a/Documentation/hwmon/lm63 b/Documentation/hwmon/lm63 index 31660bf97979..b9843eab1afb 100644 --- a/Documentation/hwmon/lm63 +++ b/Documentation/hwmon/lm63 | |||
@@ -7,6 +7,11 @@ Supported chips: | |||
7 | Addresses scanned: I2C 0x4c | 7 | Addresses scanned: I2C 0x4c |
8 | Datasheet: Publicly available at the National Semiconductor website | 8 | Datasheet: Publicly available at the National Semiconductor website |
9 | http://www.national.com/pf/LM/LM63.html | 9 | http://www.national.com/pf/LM/LM63.html |
10 | * National Semiconductor LM64 | ||
11 | Prefix: 'lm64' | ||
12 | Addresses scanned: I2C 0x18 and 0x4e | ||
13 | Datasheet: Publicly available at the National Semiconductor website | ||
14 | http://www.national.com/pf/LM/LM64.html | ||
10 | 15 | ||
11 | Author: Jean Delvare <khali@linux-fr.org> | 16 | Author: Jean Delvare <khali@linux-fr.org> |
12 | 17 | ||
@@ -55,3 +60,5 @@ The lm63 driver will not update its values more frequently than every | |||
55 | second; reading them more often will do no harm, but will return 'old' | 60 | second; reading them more often will do no harm, but will return 'old' |
56 | values. | 61 | values. |
57 | 62 | ||
63 | The LM64 is effectively an LM63 with GPIO lines. The driver does not | ||
64 | support these GPIO lines at present. | ||
diff --git a/Documentation/hwmon/ltc4245 b/Documentation/hwmon/ltc4245 index 02838a47d862..86b5880d8502 100644 --- a/Documentation/hwmon/ltc4245 +++ b/Documentation/hwmon/ltc4245 | |||
@@ -72,9 +72,7 @@ in6_min_alarm 5v output undervoltage alarm | |||
72 | in7_min_alarm 3v output undervoltage alarm | 72 | in7_min_alarm 3v output undervoltage alarm |
73 | in8_min_alarm Vee (-12v) output undervoltage alarm | 73 | in8_min_alarm Vee (-12v) output undervoltage alarm |
74 | 74 | ||
75 | in9_input GPIO #1 voltage data | 75 | in9_input GPIO voltage data |
76 | in10_input GPIO #2 voltage data | ||
77 | in11_input GPIO #3 voltage data | ||
78 | 76 | ||
79 | power1_input 12v power usage (mW) | 77 | power1_input 12v power usage (mW) |
80 | power2_input 5v power usage (mW) | 78 | power2_input 5v power usage (mW) |
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface index 3de6b0bcb147..d4e2917c6f18 100644 --- a/Documentation/hwmon/sysfs-interface +++ b/Documentation/hwmon/sysfs-interface | |||
@@ -80,9 +80,9 @@ All entries (except name) are optional, and should only be created in a | |||
80 | given driver if the chip has the feature. | 80 | given driver if the chip has the feature. |
81 | 81 | ||
82 | 82 | ||
83 | ******** | 83 | ********************* |
84 | * Name * | 84 | * Global attributes * |
85 | ******** | 85 | ********************* |
86 | 86 | ||
87 | name The chip name. | 87 | name The chip name. |
88 | This should be a short, lowercase string, not containing | 88 | This should be a short, lowercase string, not containing |
@@ -91,6 +91,13 @@ name The chip name. | |||
91 | I2C devices get this attribute created automatically. | 91 | I2C devices get this attribute created automatically. |
92 | RO | 92 | RO |
93 | 93 | ||
94 | update_rate The rate at which the chip will update readings. | ||
95 | Unit: millisecond | ||
96 | RW | ||
97 | Some devices have a variable update rate. This attribute | ||
98 | can be used to change the update rate to the desired | ||
99 | frequency. | ||
100 | |||
94 | 101 | ||
95 | ************ | 102 | ************ |
96 | * Voltages * | 103 | * Voltages * |
diff --git a/Documentation/hwmon/tmp102 b/Documentation/hwmon/tmp102 new file mode 100644 index 000000000000..8454a7763122 --- /dev/null +++ b/Documentation/hwmon/tmp102 | |||
@@ -0,0 +1,26 @@ | |||
1 | Kernel driver tmp102 | ||
2 | ==================== | ||
3 | |||
4 | Supported chips: | ||
5 | * Texas Instruments TMP102 | ||
6 | Prefix: 'tmp102' | ||
7 | Addresses scanned: none | ||
8 | Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp102.html | ||
9 | |||
10 | Author: | ||
11 | Steven King <sfking@fdwdc.com> | ||
12 | |||
13 | Description | ||
14 | ----------- | ||
15 | |||
16 | The Texas Instruments TMP102 implements one temperature sensor. Limits can be | ||
17 | set through the Overtemperature Shutdown register and Hysteresis register. The | ||
18 | sensor is accurate to 0.5 degree over the range of -25 to +85 C, and to 1.0 | ||
19 | degree from -40 to +125 C. Resolution of the sensor is 0.0625 degree. The | ||
20 | operating temperature has a minimum of -55 C and a maximum of +150 C. | ||
21 | |||
22 | The TMP102 has a programmable update rate that can select between 8, 4, 1, and | ||
23 | 0.5 Hz. (Currently the driver only supports the default of 4 Hz). | ||
24 | |||
25 | The driver provides the common sysfs-interface for temperatures (see | ||
26 | Documentation/hwmon/sysfs-interface under Temperatures). | ||
diff --git a/Documentation/vm/numa b/Documentation/vm/numa index e93ad9425e2a..a200a386429d 100644 --- a/Documentation/vm/numa +++ b/Documentation/vm/numa | |||
@@ -1,41 +1,149 @@ | |||
1 | Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> | 1 | Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> |
2 | 2 | ||
3 | The intent of this file is to have an uptodate, running commentary | 3 | What is NUMA? |
4 | from different people about NUMA specific code in the Linux vm. | 4 | |
5 | 5 | This question can be answered from a couple of perspectives: the | |
6 | What is NUMA? It is an architecture where the memory access times | 6 | hardware view and the Linux software view. |
7 | for different regions of memory from a given processor varies | 7 | |
8 | according to the "distance" of the memory region from the processor. | 8 | From the hardware perspective, a NUMA system is a computer platform that |
9 | Each region of memory to which access times are the same from any | 9 | comprises multiple components or assemblies each of which may contain 0 |
10 | cpu, is called a node. On such architectures, it is beneficial if | 10 | or more CPUs, local memory, and/or IO buses. For brevity and to |
11 | the kernel tries to minimize inter node communications. Schemes | 11 | disambiguate the hardware view of these physical components/assemblies |
12 | for this range from kernel text and read-only data replication | 12 | from the software abstraction thereof, we'll call the components/assemblies |
13 | across nodes, and trying to house all the data structures that | 13 | 'cells' in this document. |
14 | key components of the kernel need on memory on that node. | 14 | |
15 | 15 | Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset | |
16 | Currently, all the numa support is to provide efficient handling | 16 | of the system--although some components necessary for a stand-alone SMP system |
17 | of widely discontiguous physical memory, so architectures which | 17 | may not be populated on any given cell. The cells of the NUMA system are |
18 | are not NUMA but can have huge holes in the physical address space | 18 | connected together with some sort of system interconnect--e.g., a crossbar or |
19 | can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM. | 19 | point-to-point link are common types of NUMA system interconnects. Both of |
20 | 20 | these types of interconnects can be aggregated to create NUMA platforms with | |
21 | The initial port includes NUMAizing the bootmem allocator code by | 21 | cells at multiple distances from other cells. |
22 | encapsulating all the pieces of information into a bootmem_data_t | 22 | |
23 | structure. Node specific calls have been added to the allocator. | 23 | For Linux, the NUMA platforms of interest are primarily what is known as Cache |
24 | In theory, any platform which uses the bootmem allocator should | 24 | Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible |
25 | be able to put the bootmem and mem_map data structures anywhere | 25 | to and accessible from any CPU attached to any cell and cache coherency |
26 | it deems best. | 26 | is handled in hardware by the processor caches and/or the system interconnect. |
27 | 27 | ||
28 | Each node's page allocation data structures have also been encapsulated | 28 | Memory access time and effective memory bandwidth varies depending on how far |
29 | into a pg_data_t. The bootmem_data_t is just one part of this. To | 29 | away the cell containing the CPU or IO bus making the memory access is from the |
30 | make the code look uniform between NUMA and regular UMA platforms, | 30 | cell containing the target memory. For example, access to memory by CPUs |
31 | UMA platforms have a statically allocated pg_data_t too (contig_page_data). | 31 | attached to the same cell will experience faster access times and higher |
32 | For the sake of uniformity, the function num_online_nodes() is also defined | 32 | bandwidths than accesses to memory on other, remote cells. NUMA platforms |
33 | for all platforms. As we run benchmarks, we might decide to NUMAize | 33 | can have cells at multiple remote distances from any given cell. |
34 | more variables like low_on_memory, nr_free_pages etc into the pg_data_t. | 34 | |
35 | 35 | Platform vendors don't build NUMA systems just to make software developers' | |
36 | The NUMA aware page allocation code currently tries to allocate pages | 36 | lives interesting. Rather, this architecture is a means to provide scalable |
37 | from different nodes in a round robin manner. This will be changed to | 37 | memory bandwidth. However, to achieve scalable memory bandwidth, system and |
38 | do concentratic circle search, starting from current node, once the | 38 | application software must arrange for a large majority of the memory references |
39 | NUMA port achieves more maturity. The call alloc_pages_node has been | 39 | [cache misses] to be to "local" memory--memory on the same cell, if any--or |
40 | added, so that drivers can make the call and not worry about whether | 40 | to the closest cell with memory. |
41 | it is running on a NUMA or UMA platform. | 41 | |
42 | This leads to the Linux software view of a NUMA system: | ||
43 | |||
44 | Linux divides the system's hardware resources into multiple software | ||
45 | abstractions called "nodes". Linux maps the nodes onto the physical cells | ||
46 | of the hardware platform, abstracting away some of the details for some | ||
47 | architectures. As with physical cells, software nodes may contain 0 or more | ||
48 | CPUs, memory and/or IO buses. And, again, memory accesses to memory on | ||
49 | "closer" nodes--nodes that map to closer cells--will generally experience | ||
50 | faster access times and higher effective bandwidth than accesses to more | ||
51 | remote cells. | ||
52 | |||
53 | For some architectures, such as x86, Linux will "hide" any node representing a | ||
54 | physical cell that has no memory attached, and reassign any CPUs attached to | ||
55 | that cell to a node representing a cell that does have memory. Thus, on | ||
56 | these architectures, one cannot assume that all CPUs that Linux associates with | ||
57 | a given node will see the same local memory access times and bandwidth. | ||
58 | |||
59 | In addition, for some architectures, again x86 is an example, Linux supports | ||
60 | the emulation of additional nodes. For NUMA emulation, linux will carve up | ||
61 | the existing nodes--or the system memory for non-NUMA platforms--into multiple | ||
62 | nodes. Each emulated node will manage a fraction of the underlying cells' | ||
63 | physical memory. NUMA emluation is useful for testing NUMA kernel and | ||
64 | application features on non-NUMA platforms, and as a sort of memory resource | ||
65 | management mechanism when used together with cpusets. | ||
66 | [see Documentation/cgroups/cpusets.txt] | ||
67 | |||
68 | For each node with memory, Linux constructs an independent memory management | ||
69 | subsystem, complete with its own free page lists, in-use page lists, usage | ||
70 | statistics and locks to mediate access. In addition, Linux constructs for | ||
71 | each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], | ||
72 | an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a | ||
73 | selected zone/node cannot satisfy the allocation request. This situation, | ||
74 | when a zone has no available memory to satisfy a request, is called | ||
75 | "overflow" or "fallback". | ||
76 | |||
77 | Because some nodes contain multiple zones containing different types of | ||
78 | memory, Linux must decide whether to order the zonelists such that allocations | ||
79 | fall back to the same zone type on a different node, or to a different zone | ||
80 | type on the same node. This is an important consideration because some zones, | ||
81 | such as DMA or DMA32, represent relatively scarce resources. Linux chooses | ||
82 | a default zonelist order based on the sizes of the various zone types relative | ||
83 | to the total memory of the node and the total memory of the system. The | ||
84 | default zonelist order may be overridden using the numa_zonelist_order kernel | ||
85 | boot parameter or sysctl. [see Documentation/kernel-parameters.txt and | ||
86 | Documentation/sysctl/vm.txt] | ||
87 | |||
88 | By default, Linux will attempt to satisfy memory allocation requests from the | ||
89 | node to which the CPU that executes the request is assigned. Specifically, | ||
90 | Linux will attempt to allocate from the first node in the appropriate zonelist | ||
91 | for the node where the request originates. This is called "local allocation." | ||
92 | If the "local" node cannot satisfy the request, the kernel will examine other | ||
93 | nodes' zones in the selected zonelist looking for the first zone in the list | ||
94 | that can satisfy the request. | ||
95 | |||
96 | Local allocation will tend to keep subsequent access to the allocated memory | ||
97 | "local" to the underlying physical resources and off the system interconnect-- | ||
98 | as long as the task on whose behalf the kernel allocated some memory does not | ||
99 | later migrate away from that memory. The Linux scheduler is aware of the | ||
100 | NUMA topology of the platform--embodied in the "scheduling domains" data | ||
101 | structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler | ||
102 | attempts to minimize task migration to distant scheduling domains. However, | ||
103 | the scheduler does not take a task's NUMA footprint into account directly. | ||
104 | Thus, under sufficient imbalance, tasks can migrate between nodes, remote | ||
105 | from their initial node and kernel data structures. | ||
106 | |||
107 | System administrators and application designers can restrict a task's migration | ||
108 | to improve NUMA locality using various CPU affinity command line interfaces, | ||
109 | such as taskset(1) and numactl(1), and program interfaces such as | ||
110 | sched_setaffinity(2). Further, one can modify the kernel's default local | ||
111 | allocation behavior using Linux NUMA memory policy. | ||
112 | [see Documentation/vm/numa_memory_policy.] | ||
113 | |||
114 | System administrators can restrict the CPUs and nodes' memories that a non- | ||
115 | privileged user can specify in the scheduling or NUMA commands and functions | ||
116 | using control groups and CPUsets. [see Documentation/cgroups/CPUsets.txt] | ||
117 | |||
118 | On architectures that do not hide memoryless nodes, Linux will include only | ||
119 | zones [nodes] with memory in the zonelists. This means that for a memoryless | ||
120 | node the "local memory node"--the node of the first zone in CPU's node's | ||
121 | zonelist--will not be the node itself. Rather, it will be the node that the | ||
122 | kernel selected as the nearest node with memory when it built the zonelists. | ||
123 | So, default, local allocations will succeed with the kernel supplying the | ||
124 | closest available memory. This is a consequence of the same mechanism that | ||
125 | allows such allocations to fallback to other nearby nodes when a node that | ||
126 | does contain memory overflows. | ||
127 | |||
128 | Some kernel allocations do not want or cannot tolerate this allocation fallback | ||
129 | behavior. Rather they want to be sure they get memory from the specified node | ||
130 | or get notified that the node has no free memory. This is usually the case when | ||
131 | a subsystem allocates per CPU memory resources, for example. | ||
132 | |||
133 | A typical model for making such an allocation is to obtain the node id of the | ||
134 | node to which the "current CPU" is attached using one of the kernel's | ||
135 | numa_node_id() or CPU_to_node() functions and then request memory from only | ||
136 | the node id returned. When such an allocation fails, the requesting subsystem | ||
137 | may revert to its own fallback path. The slab kernel memory allocator is an | ||
138 | example of this. Or, the subsystem may choose to disable or not to enable | ||
139 | itself on allocation failure. The kernel profiling subsystem is an example of | ||
140 | this. | ||
141 | |||
142 | If the architecture supports--does not hide--memoryless nodes, then CPUs | ||
143 | attached to memoryless nodes would always incur the fallback path overhead | ||
144 | or some subsystems would fail to initialize if they attempted to allocated | ||
145 | memory exclusively from a node without memory. To support such | ||
146 | architectures transparently, kernel subsystems can use the numa_mem_id() | ||
147 | or cpu_to_mem() function to locate the "local memory node" for the calling or | ||
148 | specified CPU. Again, this is the same node from which default, local page | ||
149 | allocations will be attempted. | ||