diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/DocBook/kernel-api.tmpl | 2 | ||||
-rw-r--r-- | Documentation/accounting/cgroupstats.txt | 27 | ||||
-rw-r--r-- | Documentation/cachetlb.txt | 27 | ||||
-rw-r--r-- | Documentation/cgroups.txt | 545 | ||||
-rw-r--r-- | Documentation/cpu-hotplug.txt | 4 | ||||
-rw-r--r-- | Documentation/cpusets.txt | 226 | ||||
-rw-r--r-- | Documentation/device-mapper/dm-uevent.txt | 97 | ||||
-rw-r--r-- | Documentation/input/input-programming.txt | 15 | ||||
-rw-r--r-- | Documentation/kbuild/kconfig-language.txt | 14 | ||||
-rw-r--r-- | Documentation/kbuild/makefiles.txt | 22 | ||||
-rw-r--r-- | Documentation/kdump/kdump.txt | 26 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 13 | ||||
-rw-r--r-- | Documentation/markers.txt | 81 | ||||
-rw-r--r-- | Documentation/mips/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/mips/time.README | 173 | ||||
-rw-r--r-- | Documentation/thinkpad-acpi.txt | 25 |
16 files changed, 1033 insertions, 266 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index d3290c46af51..aa38cc5692a0 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl | |||
@@ -46,7 +46,7 @@ | |||
46 | 46 | ||
47 | <sect1><title>Atomic and pointer manipulation</title> | 47 | <sect1><title>Atomic and pointer manipulation</title> |
48 | !Iinclude/asm-x86/atomic_32.h | 48 | !Iinclude/asm-x86/atomic_32.h |
49 | !Iinclude/asm-x86/unaligned_32.h | 49 | !Iinclude/asm-x86/unaligned.h |
50 | </sect1> | 50 | </sect1> |
51 | 51 | ||
52 | <sect1><title>Delaying, scheduling, and timer routines</title> | 52 | <sect1><title>Delaying, scheduling, and timer routines</title> |
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt new file mode 100644 index 000000000000..eda40fd39cad --- /dev/null +++ b/Documentation/accounting/cgroupstats.txt | |||
@@ -0,0 +1,27 @@ | |||
1 | Control Groupstats is inspired by the discussion at | ||
2 | http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as | ||
3 | suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. | ||
4 | |||
5 | Per cgroup statistics infrastructure re-uses code from the taskstats | ||
6 | interface. A new set of cgroup operations are registered with commands | ||
7 | and attributes specific to cgroups. It should be very easy to | ||
8 | extend per cgroup statistics, by adding members to the cgroupstats | ||
9 | structure. | ||
10 | |||
11 | The current model for cgroupstats is a pull, a push model (to post | ||
12 | statistics on interesting events), should be very easy to add. Currently | ||
13 | user space requests for statistics by passing the cgroup path. | ||
14 | Statistics about the state of all the tasks in the cgroup is returned to | ||
15 | user space. | ||
16 | |||
17 | NOTE: We currently rely on delay accounting for extracting information | ||
18 | about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this | ||
19 | information will not be available. | ||
20 | |||
21 | To extract cgroup statistics a utility very similar to getdelays.c | ||
22 | has been developed, the sample output of the utility is shown below | ||
23 | |||
24 | ~/balbir/cgroupstats # ./getdelays -C "/cgroup/a" | ||
25 | sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 | ||
26 | ~/balbir/cgroupstats # ./getdelays -C "/cgroup" | ||
27 | sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2 | ||
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 552cabac0608..da42ab414c48 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt | |||
@@ -87,30 +87,7 @@ changes occur: | |||
87 | 87 | ||
88 | This is used primarily during fault processing. | 88 | This is used primarily during fault processing. |
89 | 89 | ||
90 | 5) void flush_tlb_pgtables(struct mm_struct *mm, | 90 | 5) void update_mmu_cache(struct vm_area_struct *vma, |
91 | unsigned long start, unsigned long end) | ||
92 | |||
93 | The software page tables for address space 'mm' for virtual | ||
94 | addresses in the range 'start' to 'end-1' are being torn down. | ||
95 | |||
96 | Some platforms cache the lowest level of the software page tables | ||
97 | in a linear virtually mapped array, to make TLB miss processing | ||
98 | more efficient. On such platforms, since the TLB is caching the | ||
99 | software page table structure, it needs to be flushed when parts | ||
100 | of the software page table tree are unlinked/freed. | ||
101 | |||
102 | Sparc64 is one example of a platform which does this. | ||
103 | |||
104 | Usually, when munmap()'ing an area of user virtual address | ||
105 | space, the kernel leaves the page table parts around and just | ||
106 | marks the individual pte's as invalid. However, if very large | ||
107 | portions of the address space are unmapped, the kernel frees up | ||
108 | those portions of the software page tables to prevent potential | ||
109 | excessive kernel memory usage caused by erratic mmap/mmunmap | ||
110 | sequences. It is at these times that flush_tlb_pgtables will | ||
111 | be invoked. | ||
112 | |||
113 | 6) void update_mmu_cache(struct vm_area_struct *vma, | ||
114 | unsigned long address, pte_t pte) | 91 | unsigned long address, pte_t pte) |
115 | 92 | ||
116 | At the end of every page fault, this routine is invoked to | 93 | At the end of every page fault, this routine is invoked to |
@@ -123,7 +100,7 @@ changes occur: | |||
123 | translations for software managed TLB configurations. | 100 | translations for software managed TLB configurations. |
124 | The sparc64 port currently does this. | 101 | The sparc64 port currently does this. |
125 | 102 | ||
126 | 7) void tlb_migrate_finish(struct mm_struct *mm) | 103 | 6) void tlb_migrate_finish(struct mm_struct *mm) |
127 | 104 | ||
128 | This interface is called at the end of an explicit | 105 | This interface is called at the end of an explicit |
129 | process migration. This interface provides a hook | 106 | process migration. This interface provides a hook |
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt new file mode 100644 index 000000000000..98a26f81fa75 --- /dev/null +++ b/Documentation/cgroups.txt | |||
@@ -0,0 +1,545 @@ | |||
1 | CGROUPS | ||
2 | ------- | ||
3 | |||
4 | Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt | ||
5 | |||
6 | Original copyright statements from cpusets.txt: | ||
7 | Portions Copyright (C) 2004 BULL SA. | ||
8 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | ||
9 | Modified by Paul Jackson <pj@sgi.com> | ||
10 | Modified by Christoph Lameter <clameter@sgi.com> | ||
11 | |||
12 | CONTENTS: | ||
13 | ========= | ||
14 | |||
15 | 1. Control Groups | ||
16 | 1.1 What are cgroups ? | ||
17 | 1.2 Why are cgroups needed ? | ||
18 | 1.3 How are cgroups implemented ? | ||
19 | 1.4 What does notify_on_release do ? | ||
20 | 1.5 How do I use cgroups ? | ||
21 | 2. Usage Examples and Syntax | ||
22 | 2.1 Basic Usage | ||
23 | 2.2 Attaching processes | ||
24 | 3. Kernel API | ||
25 | 3.1 Overview | ||
26 | 3.2 Synchronization | ||
27 | 3.3 Subsystem API | ||
28 | 4. Questions | ||
29 | |||
30 | 1. Control Groups | ||
31 | ========== | ||
32 | |||
33 | 1.1 What are cgroups ? | ||
34 | ---------------------- | ||
35 | |||
36 | Control Groups provide a mechanism for aggregating/partitioning sets of | ||
37 | tasks, and all their future children, into hierarchical groups with | ||
38 | specialized behaviour. | ||
39 | |||
40 | Definitions: | ||
41 | |||
42 | A *cgroup* associates a set of tasks with a set of parameters for one | ||
43 | or more subsystems. | ||
44 | |||
45 | A *subsystem* is a module that makes use of the task grouping | ||
46 | facilities provided by cgroups to treat groups of tasks in | ||
47 | particular ways. A subsystem is typically a "resource controller" that | ||
48 | schedules a resource or applies per-cgroup limits, but it may be | ||
49 | anything that wants to act on a group of processes, e.g. a | ||
50 | virtualization subsystem. | ||
51 | |||
52 | A *hierarchy* is a set of cgroups arranged in a tree, such that | ||
53 | every task in the system is in exactly one of the cgroups in the | ||
54 | hierarchy, and a set of subsystems; each subsystem has system-specific | ||
55 | state attached to each cgroup in the hierarchy. Each hierarchy has | ||
56 | an instance of the cgroup virtual filesystem associated with it. | ||
57 | |||
58 | At any one time there may be multiple active hierachies of task | ||
59 | cgroups. Each hierarchy is a partition of all tasks in the system. | ||
60 | |||
61 | User level code may create and destroy cgroups by name in an | ||
62 | instance of the cgroup virtual file system, specify and query to | ||
63 | which cgroup a task is assigned, and list the task pids assigned to | ||
64 | a cgroup. Those creations and assignments only affect the hierarchy | ||
65 | associated with that instance of the cgroup file system. | ||
66 | |||
67 | On their own, the only use for cgroups is for simple job | ||
68 | tracking. The intention is that other subsystems hook into the generic | ||
69 | cgroup support to provide new attributes for cgroups, such as | ||
70 | accounting/limiting the resources which processes in a cgroup can | ||
71 | access. For example, cpusets (see Documentation/cpusets.txt) allows | ||
72 | you to associate a set of CPUs and a set of memory nodes with the | ||
73 | tasks in each cgroup. | ||
74 | |||
75 | 1.2 Why are cgroups needed ? | ||
76 | ---------------------------- | ||
77 | |||
78 | There are multiple efforts to provide process aggregations in the | ||
79 | Linux kernel, mainly for resource tracking purposes. Such efforts | ||
80 | include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server | ||
81 | namespaces. These all require the basic notion of a | ||
82 | grouping/partitioning of processes, with newly forked processes ending | ||
83 | in the same group (cgroup) as their parent process. | ||
84 | |||
85 | The kernel cgroup patch provides the minimum essential kernel | ||
86 | mechanisms required to efficiently implement such groups. It has | ||
87 | minimal impact on the system fast paths, and provides hooks for | ||
88 | specific subsystems such as cpusets to provide additional behaviour as | ||
89 | desired. | ||
90 | |||
91 | Multiple hierarchy support is provided to allow for situations where | ||
92 | the division of tasks into cgroups is distinctly different for | ||
93 | different subsystems - having parallel hierarchies allows each | ||
94 | hierarchy to be a natural division of tasks, without having to handle | ||
95 | complex combinations of tasks that would be present if several | ||
96 | unrelated subsystems needed to be forced into the same tree of | ||
97 | cgroups. | ||
98 | |||
99 | At one extreme, each resource controller or subsystem could be in a | ||
100 | separate hierarchy; at the other extreme, all subsystems | ||
101 | would be attached to the same hierarchy. | ||
102 | |||
103 | As an example of a scenario (originally proposed by vatsa@in.ibm.com) | ||
104 | that can benefit from multiple hierarchies, consider a large | ||
105 | university server with various users - students, professors, system | ||
106 | tasks etc. The resource planning for this server could be along the | ||
107 | following lines: | ||
108 | |||
109 | CPU : Top cpuset | ||
110 | / \ | ||
111 | CPUSet1 CPUSet2 | ||
112 | | | | ||
113 | (Profs) (Students) | ||
114 | |||
115 | In addition (system tasks) are attached to topcpuset (so | ||
116 | that they can run anywhere) with a limit of 20% | ||
117 | |||
118 | Memory : Professors (50%), students (30%), system (20%) | ||
119 | |||
120 | Disk : Prof (50%), students (30%), system (20%) | ||
121 | |||
122 | Network : WWW browsing (20%), Network File System (60%), others (20%) | ||
123 | / \ | ||
124 | Prof (15%) students (5%) | ||
125 | |||
126 | Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go | ||
127 | into NFS network class. | ||
128 | |||
129 | At the same time firefox/lynx will share an appropriate CPU/Memory class | ||
130 | depending on who launched it (prof/student). | ||
131 | |||
132 | With the ability to classify tasks differently for different resources | ||
133 | (by putting those resource subsystems in different hierarchies) then | ||
134 | the admin can easily set up a script which receives exec notifications | ||
135 | and depending on who is launching the browser he can | ||
136 | |||
137 | # echo browser_pid > /mnt/<restype>/<userclass>/tasks | ||
138 | |||
139 | With only a single hierarchy, he now would potentially have to create | ||
140 | a separate cgroup for every browser launched and associate it with | ||
141 | approp network and other resource class. This may lead to | ||
142 | proliferation of such cgroups. | ||
143 | |||
144 | Also lets say that the administrator would like to give enhanced network | ||
145 | access temporarily to a student's browser (since it is night and the user | ||
146 | wants to do online gaming :) OR give one of the students simulation | ||
147 | apps enhanced CPU power, | ||
148 | |||
149 | With ability to write pids directly to resource classes, its just a | ||
150 | matter of : | ||
151 | |||
152 | # echo pid > /mnt/network/<new_class>/tasks | ||
153 | (after some time) | ||
154 | # echo pid > /mnt/network/<orig_class>/tasks | ||
155 | |||
156 | Without this ability, he would have to split the cgroup into | ||
157 | multiple separate ones and then associate the new cgroups with the | ||
158 | new resource classes. | ||
159 | |||
160 | |||
161 | |||
162 | 1.3 How are cgroups implemented ? | ||
163 | --------------------------------- | ||
164 | |||
165 | Control Groups extends the kernel as follows: | ||
166 | |||
167 | - Each task in the system has a reference-counted pointer to a | ||
168 | css_set. | ||
169 | |||
170 | - A css_set contains a set of reference-counted pointers to | ||
171 | cgroup_subsys_state objects, one for each cgroup subsystem | ||
172 | registered in the system. There is no direct link from a task to | ||
173 | the cgroup of which it's a member in each hierarchy, but this | ||
174 | can be determined by following pointers through the | ||
175 | cgroup_subsys_state objects. This is because accessing the | ||
176 | subsystem state is something that's expected to happen frequently | ||
177 | and in performance-critical code, whereas operations that require a | ||
178 | task's actual cgroup assignments (in particular, moving between | ||
179 | cgroups) are less common. A linked list runs through the cg_list | ||
180 | field of each task_struct using the css_set, anchored at | ||
181 | css_set->tasks. | ||
182 | |||
183 | - A cgroup hierarchy filesystem can be mounted for browsing and | ||
184 | manipulation from user space. | ||
185 | |||
186 | - You can list all the tasks (by pid) attached to any cgroup. | ||
187 | |||
188 | The implementation of cgroups requires a few, simple hooks | ||
189 | into the rest of the kernel, none in performance critical paths: | ||
190 | |||
191 | - in init/main.c, to initialize the root cgroups and initial | ||
192 | css_set at system boot. | ||
193 | |||
194 | - in fork and exit, to attach and detach a task from its css_set. | ||
195 | |||
196 | In addition a new file system, of type "cgroup" may be mounted, to | ||
197 | enable browsing and modifying the cgroups presently known to the | ||
198 | kernel. When mounting a cgroup hierarchy, you may specify a | ||
199 | comma-separated list of subsystems to mount as the filesystem mount | ||
200 | options. By default, mounting the cgroup filesystem attempts to | ||
201 | mount a hierarchy containing all registered subsystems. | ||
202 | |||
203 | If an active hierarchy with exactly the same set of subsystems already | ||
204 | exists, it will be reused for the new mount. If no existing hierarchy | ||
205 | matches, and any of the requested subsystems are in use in an existing | ||
206 | hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy | ||
207 | is activated, associated with the requested subsystems. | ||
208 | |||
209 | It's not currently possible to bind a new subsystem to an active | ||
210 | cgroup hierarchy, or to unbind a subsystem from an active cgroup | ||
211 | hierarchy. This may be possible in future, but is fraught with nasty | ||
212 | error-recovery issues. | ||
213 | |||
214 | When a cgroup filesystem is unmounted, if there are any | ||
215 | child cgroups created below the top-level cgroup, that hierarchy | ||
216 | will remain active even though unmounted; if there are no | ||
217 | child cgroups then the hierarchy will be deactivated. | ||
218 | |||
219 | No new system calls are added for cgroups - all support for | ||
220 | querying and modifying cgroups is via this cgroup file system. | ||
221 | |||
222 | Each task under /proc has an added file named 'cgroup' displaying, | ||
223 | for each active hierarchy, the subsystem names and the cgroup name | ||
224 | as the path relative to the root of the cgroup file system. | ||
225 | |||
226 | Each cgroup is represented by a directory in the cgroup file system | ||
227 | containing the following files describing that cgroup: | ||
228 | |||
229 | - tasks: list of tasks (by pid) attached to that cgroup | ||
230 | - notify_on_release flag: run /sbin/cgroup_release_agent on exit? | ||
231 | |||
232 | Other subsystems such as cpusets may add additional files in each | ||
233 | cgroup dir | ||
234 | |||
235 | New cgroups are created using the mkdir system call or shell | ||
236 | command. The properties of a cgroup, such as its flags, are | ||
237 | modified by writing to the appropriate file in that cgroups | ||
238 | directory, as listed above. | ||
239 | |||
240 | The named hierarchical structure of nested cgroups allows partitioning | ||
241 | a large system into nested, dynamically changeable, "soft-partitions". | ||
242 | |||
243 | The attachment of each task, automatically inherited at fork by any | ||
244 | children of that task, to a cgroup allows organizing the work load | ||
245 | on a system into related sets of tasks. A task may be re-attached to | ||
246 | any other cgroup, if allowed by the permissions on the necessary | ||
247 | cgroup file system directories. | ||
248 | |||
249 | When a task is moved from one cgroup to another, it gets a new | ||
250 | css_set pointer - if there's an already existing css_set with the | ||
251 | desired collection of cgroups then that group is reused, else a new | ||
252 | css_set is allocated. Note that the current implementation uses a | ||
253 | linear search to locate an appropriate existing css_set, so isn't | ||
254 | very efficient. A future version will use a hash table for better | ||
255 | performance. | ||
256 | |||
257 | To allow access from a cgroup to the css_sets (and hence tasks) | ||
258 | that comprise it, a set of cg_cgroup_link objects form a lattice; | ||
259 | each cg_cgroup_link is linked into a list of cg_cgroup_links for | ||
260 | a single cgroup on its cont_link_list field, and a list of | ||
261 | cg_cgroup_links for a single css_set on its cg_link_list. | ||
262 | |||
263 | Thus the set of tasks in a cgroup can be listed by iterating over | ||
264 | each css_set that references the cgroup, and sub-iterating over | ||
265 | each css_set's task set. | ||
266 | |||
267 | The use of a Linux virtual file system (vfs) to represent the | ||
268 | cgroup hierarchy provides for a familiar permission and name space | ||
269 | for cgroups, with a minimum of additional kernel code. | ||
270 | |||
271 | 1.4 What does notify_on_release do ? | ||
272 | ------------------------------------ | ||
273 | |||
274 | *** notify_on_release is disabled in the current patch set. It will be | ||
275 | *** reactivated in a future patch in a less-intrusive manner | ||
276 | |||
277 | If the notify_on_release flag is enabled (1) in a cgroup, then | ||
278 | whenever the last task in the cgroup leaves (exits or attaches to | ||
279 | some other cgroup) and the last child cgroup of that cgroup | ||
280 | is removed, then the kernel runs the command specified by the contents | ||
281 | of the "release_agent" file in that hierarchy's root directory, | ||
282 | supplying the pathname (relative to the mount point of the cgroup | ||
283 | file system) of the abandoned cgroup. This enables automatic | ||
284 | removal of abandoned cgroups. The default value of | ||
285 | notify_on_release in the root cgroup at system boot is disabled | ||
286 | (0). The default value of other cgroups at creation is the current | ||
287 | value of their parents notify_on_release setting. The default value of | ||
288 | a cgroup hierarchy's release_agent path is empty. | ||
289 | |||
290 | 1.5 How do I use cgroups ? | ||
291 | -------------------------- | ||
292 | |||
293 | To start a new job that is to be contained within a cgroup, using | ||
294 | the "cpuset" cgroup subsystem, the steps are something like: | ||
295 | |||
296 | 1) mkdir /dev/cgroup | ||
297 | 2) mount -t cgroup -ocpuset cpuset /dev/cgroup | ||
298 | 3) Create the new cgroup by doing mkdir's and write's (or echo's) in | ||
299 | the /dev/cgroup virtual file system. | ||
300 | 4) Start a task that will be the "founding father" of the new job. | ||
301 | 5) Attach that task to the new cgroup by writing its pid to the | ||
302 | /dev/cgroup tasks file for that cgroup. | ||
303 | 6) fork, exec or clone the job tasks from this founding father task. | ||
304 | |||
305 | For example, the following sequence of commands will setup a cgroup | ||
306 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | ||
307 | and then start a subshell 'sh' in that cgroup: | ||
308 | |||
309 | mount -t cgroup cpuset -ocpuset /dev/cgroup | ||
310 | cd /dev/cgroup | ||
311 | mkdir Charlie | ||
312 | cd Charlie | ||
313 | /bin/echo 2-3 > cpus | ||
314 | /bin/echo 1 > mems | ||
315 | /bin/echo $$ > tasks | ||
316 | sh | ||
317 | # The subshell 'sh' is now running in cgroup Charlie | ||
318 | # The next line should display '/Charlie' | ||
319 | cat /proc/self/cgroup | ||
320 | |||
321 | 2. Usage Examples and Syntax | ||
322 | ============================ | ||
323 | |||
324 | 2.1 Basic Usage | ||
325 | --------------- | ||
326 | |||
327 | Creating, modifying, using the cgroups can be done through the cgroup | ||
328 | virtual filesystem. | ||
329 | |||
330 | To mount a cgroup hierarchy will all available subsystems, type: | ||
331 | # mount -t cgroup xxx /dev/cgroup | ||
332 | |||
333 | The "xxx" is not interpreted by the cgroup code, but will appear in | ||
334 | /proc/mounts so may be any useful identifying string that you like. | ||
335 | |||
336 | To mount a cgroup hierarchy with just the cpuset and numtasks | ||
337 | subsystems, type: | ||
338 | # mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup | ||
339 | |||
340 | To change the set of subsystems bound to a mounted hierarchy, just | ||
341 | remount with different options: | ||
342 | |||
343 | # mount -o remount,cpuset,ns /dev/cgroup | ||
344 | |||
345 | Note that changing the set of subsystems is currently only supported | ||
346 | when the hierarchy consists of a single (root) cgroup. Supporting | ||
347 | the ability to arbitrarily bind/unbind subsystems from an existing | ||
348 | cgroup hierarchy is intended to be implemented in the future. | ||
349 | |||
350 | Then under /dev/cgroup you can find a tree that corresponds to the | ||
351 | tree of the cgroups in the system. For instance, /dev/cgroup | ||
352 | is the cgroup that holds the whole system. | ||
353 | |||
354 | If you want to create a new cgroup under /dev/cgroup: | ||
355 | # cd /dev/cgroup | ||
356 | # mkdir my_cgroup | ||
357 | |||
358 | Now you want to do something with this cgroup. | ||
359 | # cd my_cgroup | ||
360 | |||
361 | In this directory you can find several files: | ||
362 | # ls | ||
363 | notify_on_release release_agent tasks | ||
364 | (plus whatever files are added by the attached subsystems) | ||
365 | |||
366 | Now attach your shell to this cgroup: | ||
367 | # /bin/echo $$ > tasks | ||
368 | |||
369 | You can also create cgroups inside your cgroup by using mkdir in this | ||
370 | directory. | ||
371 | # mkdir my_sub_cs | ||
372 | |||
373 | To remove a cgroup, just use rmdir: | ||
374 | # rmdir my_sub_cs | ||
375 | |||
376 | This will fail if the cgroup is in use (has cgroups inside, or | ||
377 | has processes attached, or is held alive by other subsystem-specific | ||
378 | reference). | ||
379 | |||
380 | 2.2 Attaching processes | ||
381 | ----------------------- | ||
382 | |||
383 | # /bin/echo PID > tasks | ||
384 | |||
385 | Note that it is PID, not PIDs. You can only attach ONE task at a time. | ||
386 | If you have several tasks to attach, you have to do it one after another: | ||
387 | |||
388 | # /bin/echo PID1 > tasks | ||
389 | # /bin/echo PID2 > tasks | ||
390 | ... | ||
391 | # /bin/echo PIDn > tasks | ||
392 | |||
393 | 3. Kernel API | ||
394 | ============= | ||
395 | |||
396 | 3.1 Overview | ||
397 | ------------ | ||
398 | |||
399 | Each kernel subsystem that wants to hook into the generic cgroup | ||
400 | system needs to create a cgroup_subsys object. This contains | ||
401 | various methods, which are callbacks from the cgroup system, along | ||
402 | with a subsystem id which will be assigned by the cgroup system. | ||
403 | |||
404 | Other fields in the cgroup_subsys object include: | ||
405 | |||
406 | - subsys_id: a unique array index for the subsystem, indicating which | ||
407 | entry in cgroup->subsys[] this subsystem should be | ||
408 | managing. Initialized by cgroup_register_subsys(); prior to this | ||
409 | it should be initialized to -1 | ||
410 | |||
411 | - hierarchy: an index indicating which hierarchy, if any, this | ||
412 | subsystem is currently attached to. If this is -1, then the | ||
413 | subsystem is not attached to any hierarchy, and all tasks should be | ||
414 | considered to be members of the subsystem's top_cgroup. It should | ||
415 | be initialized to -1. | ||
416 | |||
417 | - name: should be initialized to a unique subsystem name prior to | ||
418 | calling cgroup_register_subsystem. Should be no longer than | ||
419 | MAX_CGROUP_TYPE_NAMELEN | ||
420 | |||
421 | Each cgroup object created by the system has an array of pointers, | ||
422 | indexed by subsystem id; this pointer is entirely managed by the | ||
423 | subsystem; the generic cgroup code will never touch this pointer. | ||
424 | |||
425 | 3.2 Synchronization | ||
426 | ------------------- | ||
427 | |||
428 | There is a global mutex, cgroup_mutex, used by the cgroup | ||
429 | system. This should be taken by anything that wants to modify a | ||
430 | cgroup. It may also be taken to prevent cgroups from being | ||
431 | modified, but more specific locks may be more appropriate in that | ||
432 | situation. | ||
433 | |||
434 | See kernel/cgroup.c for more details. | ||
435 | |||
436 | Subsystems can take/release the cgroup_mutex via the functions | ||
437 | cgroup_lock()/cgroup_unlock(), and can | ||
438 | take/release the callback_mutex via the functions | ||
439 | cgroup_lock()/cgroup_unlock(). | ||
440 | |||
441 | Accessing a task's cgroup pointer may be done in the following ways: | ||
442 | - while holding cgroup_mutex | ||
443 | - while holding the task's alloc_lock (via task_lock()) | ||
444 | - inside an rcu_read_lock() section via rcu_dereference() | ||
445 | |||
446 | 3.3 Subsystem API | ||
447 | -------------------------- | ||
448 | |||
449 | Each subsystem should: | ||
450 | |||
451 | - add an entry in linux/cgroup_subsys.h | ||
452 | - define a cgroup_subsys object called <name>_subsys | ||
453 | |||
454 | Each subsystem may export the following methods. The only mandatory | ||
455 | methods are create/destroy. Any others that are null are presumed to | ||
456 | be successful no-ops. | ||
457 | |||
458 | struct cgroup_subsys_state *create(struct cgroup *cont) | ||
459 | LL=cgroup_mutex | ||
460 | |||
461 | Called to create a subsystem state object for a cgroup. The | ||
462 | subsystem should allocate its subsystem state object for the passed | ||
463 | cgroup, returning a pointer to the new object on success or a | ||
464 | negative error code. On success, the subsystem pointer should point to | ||
465 | a structure of type cgroup_subsys_state (typically embedded in a | ||
466 | larger subsystem-specific object), which will be initialized by the | ||
467 | cgroup system. Note that this will be called at initialization to | ||
468 | create the root subsystem state for this subsystem; this case can be | ||
469 | identified by the passed cgroup object having a NULL parent (since | ||
470 | it's the root of the hierarchy) and may be an appropriate place for | ||
471 | initialization code. | ||
472 | |||
473 | void destroy(struct cgroup *cont) | ||
474 | LL=cgroup_mutex | ||
475 | |||
476 | The cgroup system is about to destroy the passed cgroup; the | ||
477 | subsystem should do any necessary cleanup | ||
478 | |||
479 | int can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
480 | struct task_struct *task) | ||
481 | LL=cgroup_mutex | ||
482 | |||
483 | Called prior to moving a task into a cgroup; if the subsystem | ||
484 | returns an error, this will abort the attach operation. If a NULL | ||
485 | task is passed, then a successful result indicates that *any* | ||
486 | unspecified task can be moved into the cgroup. Note that this isn't | ||
487 | called on a fork. If this method returns 0 (success) then this should | ||
488 | remain valid while the caller holds cgroup_mutex. | ||
489 | |||
490 | void attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
491 | struct cgroup *old_cont, struct task_struct *task) | ||
492 | LL=cgroup_mutex | ||
493 | |||
494 | |||
495 | Called after the task has been attached to the cgroup, to allow any | ||
496 | post-attachment activity that requires memory allocations or blocking. | ||
497 | |||
498 | void fork(struct cgroup_subsy *ss, struct task_struct *task) | ||
499 | LL=callback_mutex, maybe read_lock(tasklist_lock) | ||
500 | |||
501 | Called when a task is forked into a cgroup. Also called during | ||
502 | registration for all existing tasks. | ||
503 | |||
504 | void exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
505 | LL=callback_mutex | ||
506 | |||
507 | Called during task exit | ||
508 | |||
509 | int populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
510 | LL=none | ||
511 | |||
512 | Called after creation of a cgroup to allow a subsystem to populate | ||
513 | the cgroup directory with file entries. The subsystem should make | ||
514 | calls to cgroup_add_file() with objects of type cftype (see | ||
515 | include/linux/cgroup.h for details). Note that although this | ||
516 | method can return an error code, the error code is currently not | ||
517 | always handled well. | ||
518 | |||
519 | void post_clone(struct cgroup_subsys *ss, struct cgroup *cont) | ||
520 | |||
521 | Called at the end of cgroup_clone() to do any paramater | ||
522 | initialization which might be required before a task could attach. For | ||
523 | example in cpusets, no task may attach before 'cpus' and 'mems' are set | ||
524 | up. | ||
525 | |||
526 | void bind(struct cgroup_subsys *ss, struct cgroup *root) | ||
527 | LL=callback_mutex | ||
528 | |||
529 | Called when a cgroup subsystem is rebound to a different hierarchy | ||
530 | and root cgroup. Currently this will only involve movement between | ||
531 | the default hierarchy (which never has sub-cgroups) and a hierarchy | ||
532 | that is being created/destroyed (and hence has no sub-cgroups). | ||
533 | |||
534 | 4. Questions | ||
535 | ============ | ||
536 | |||
537 | Q: what's up with this '/bin/echo' ? | ||
538 | A: bash's builtin 'echo' command does not check calls to write() against | ||
539 | errors. If you use it in the cgroup file system, you won't be | ||
540 | able to tell whether a command succeeded or failed. | ||
541 | |||
542 | Q: When I attach processes, only the first of the line gets really attached ! | ||
543 | A: We can only return one error code per call to write(). So you should also | ||
544 | put only ONE pid. | ||
545 | |||
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index b6d24c22274b..a741f658a3c9 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt | |||
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-) | |||
220 | CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the | 220 | CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the |
221 | CPU is being offlined while tasks are frozen due to a suspend operation in | 221 | CPU is being offlined while tasks are frozen due to a suspend operation in |
222 | progress | 222 | progress |
223 | - All process is migrated away from this outgoing CPU to a new CPU | 223 | - All processes are migrated away from this outgoing CPU to new CPUs. |
224 | The new CPU is chosen from each process' current cpuset, which may be | ||
225 | a subset of all online CPUs. | ||
224 | - All interrupts targeted to this CPU is migrated to a new CPU | 226 | - All interrupts targeted to this CPU is migrated to a new CPU |
225 | - timers/bottom half/task lets are also migrated to a new CPU | 227 | - timers/bottom half/task lets are also migrated to a new CPU |
226 | - Once all services are migrated, kernel calls an arch specific routine | 228 | - Once all services are migrated, kernel calls an arch specific routine |
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ec9de6917f01..141bef1c8599 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net | |||
7 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | 7 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. |
8 | Modified by Paul Jackson <pj@sgi.com> | 8 | Modified by Paul Jackson <pj@sgi.com> |
9 | Modified by Christoph Lameter <clameter@sgi.com> | 9 | Modified by Christoph Lameter <clameter@sgi.com> |
10 | Modified by Paul Menage <menage@google.com> | ||
10 | 11 | ||
11 | CONTENTS: | 12 | CONTENTS: |
12 | ========= | 13 | ========= |
@@ -16,9 +17,9 @@ CONTENTS: | |||
16 | 1.2 Why are cpusets needed ? | 17 | 1.2 Why are cpusets needed ? |
17 | 1.3 How are cpusets implemented ? | 18 | 1.3 How are cpusets implemented ? |
18 | 1.4 What are exclusive cpusets ? | 19 | 1.4 What are exclusive cpusets ? |
19 | 1.5 What does notify_on_release do ? | 20 | 1.5 What is memory_pressure ? |
20 | 1.6 What is memory_pressure ? | 21 | 1.6 What is memory spread ? |
21 | 1.7 What is memory spread ? | 22 | 1.7 What is sched_load_balance ? |
22 | 1.8 How do I use cpusets ? | 23 | 1.8 How do I use cpusets ? |
23 | 2. Usage Examples and Syntax | 24 | 2. Usage Examples and Syntax |
24 | 2.1 Basic Usage | 25 | 2.1 Basic Usage |
@@ -44,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential | |||
44 | hooks, beyond what is already present, required to manage dynamic | 45 | hooks, beyond what is already present, required to manage dynamic |
45 | job placement on large systems. | 46 | job placement on large systems. |
46 | 47 | ||
47 | Each task has a pointer to a cpuset. Multiple tasks may reference | 48 | Cpusets use the generic cgroup subsystem described in |
48 | the same cpuset. Requests by a task, using the sched_setaffinity(2) | 49 | Documentation/cgroup.txt. |
49 | system call to include CPUs in its CPU affinity mask, and using the | 50 | |
50 | mbind(2) and set_mempolicy(2) system calls to include Memory Nodes | 51 | Requests by a task, using the sched_setaffinity(2) system call to |
51 | in its memory policy, are both filtered through that tasks cpuset, | 52 | include CPUs in its CPU affinity mask, and using the mbind(2) and |
52 | filtering out any CPUs or Memory Nodes not in that cpuset. The | 53 | set_mempolicy(2) system calls to include Memory Nodes in its memory |
53 | scheduler will not schedule a task on a CPU that is not allowed in | 54 | policy, are both filtered through that tasks cpuset, filtering out any |
54 | its cpus_allowed vector, and the kernel page allocator will not | 55 | CPUs or Memory Nodes not in that cpuset. The scheduler will not |
55 | allocate a page on a node that is not allowed in the requesting tasks | 56 | schedule a task on a CPU that is not allowed in its cpus_allowed |
56 | mems_allowed vector. | 57 | vector, and the kernel page allocator will not allocate a page on a |
57 | 58 | node that is not allowed in the requesting tasks mems_allowed vector. | |
58 | User level code may create and destroy cpusets by name in the cpuset | 59 | |
60 | User level code may create and destroy cpusets by name in the cgroup | ||
59 | virtual file system, manage the attributes and permissions of these | 61 | virtual file system, manage the attributes and permissions of these |
60 | cpusets and which CPUs and Memory Nodes are assigned to each cpuset, | 62 | cpusets and which CPUs and Memory Nodes are assigned to each cpuset, |
61 | specify and query to which cpuset a task is assigned, and list the | 63 | specify and query to which cpuset a task is assigned, and list the |
@@ -115,7 +117,7 @@ Cpusets extends these two mechanisms as follows: | |||
115 | - Cpusets are sets of allowed CPUs and Memory Nodes, known to the | 117 | - Cpusets are sets of allowed CPUs and Memory Nodes, known to the |
116 | kernel. | 118 | kernel. |
117 | - Each task in the system is attached to a cpuset, via a pointer | 119 | - Each task in the system is attached to a cpuset, via a pointer |
118 | in the task structure to a reference counted cpuset structure. | 120 | in the task structure to a reference counted cgroup structure. |
119 | - Calls to sched_setaffinity are filtered to just those CPUs | 121 | - Calls to sched_setaffinity are filtered to just those CPUs |
120 | allowed in that tasks cpuset. | 122 | allowed in that tasks cpuset. |
121 | - Calls to mbind and set_mempolicy are filtered to just | 123 | - Calls to mbind and set_mempolicy are filtered to just |
@@ -145,15 +147,10 @@ into the rest of the kernel, none in performance critical paths: | |||
145 | - in page_alloc.c, to restrict memory to allowed nodes. | 147 | - in page_alloc.c, to restrict memory to allowed nodes. |
146 | - in vmscan.c, to restrict page recovery to the current cpuset. | 148 | - in vmscan.c, to restrict page recovery to the current cpuset. |
147 | 149 | ||
148 | In addition a new file system, of type "cpuset" may be mounted, | 150 | You should mount the "cgroup" filesystem type in order to enable |
149 | typically at /dev/cpuset, to enable browsing and modifying the cpusets | 151 | browsing and modifying the cpusets presently known to the kernel. No |
150 | presently known to the kernel. No new system calls are added for | 152 | new system calls are added for cpusets - all support for querying and |
151 | cpusets - all support for querying and modifying cpusets is via | 153 | modifying cpusets is via this cpuset file system. |
152 | this cpuset file system. | ||
153 | |||
154 | Each task under /proc has an added file named 'cpuset', displaying | ||
155 | the cpuset name, as the path relative to the root of the cpuset file | ||
156 | system. | ||
157 | 154 | ||
158 | The /proc/<pid>/status file for each task has two added lines, | 155 | The /proc/<pid>/status file for each task has two added lines, |
159 | displaying the tasks cpus_allowed (on which CPUs it may be scheduled) | 156 | displaying the tasks cpus_allowed (on which CPUs it may be scheduled) |
@@ -163,16 +160,15 @@ in the format seen in the following example: | |||
163 | Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff | 160 | Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff |
164 | Mems_allowed: ffffffff,ffffffff | 161 | Mems_allowed: ffffffff,ffffffff |
165 | 162 | ||
166 | Each cpuset is represented by a directory in the cpuset file system | 163 | Each cpuset is represented by a directory in the cgroup file system |
167 | containing the following files describing that cpuset: | 164 | containing (on top of the standard cgroup files) the following |
165 | files describing that cpuset: | ||
168 | 166 | ||
169 | - cpus: list of CPUs in that cpuset | 167 | - cpus: list of CPUs in that cpuset |
170 | - mems: list of Memory Nodes in that cpuset | 168 | - mems: list of Memory Nodes in that cpuset |
171 | - memory_migrate flag: if set, move pages to cpusets nodes | 169 | - memory_migrate flag: if set, move pages to cpusets nodes |
172 | - cpu_exclusive flag: is cpu placement exclusive? | 170 | - cpu_exclusive flag: is cpu placement exclusive? |
173 | - mem_exclusive flag: is memory placement exclusive? | 171 | - mem_exclusive flag: is memory placement exclusive? |
174 | - tasks: list of tasks (by pid) attached to that cpuset | ||
175 | - notify_on_release flag: run /sbin/cpuset_release_agent on exit? | ||
176 | - memory_pressure: measure of how much paging pressure in cpuset | 172 | - memory_pressure: measure of how much paging pressure in cpuset |
177 | 173 | ||
178 | In addition, the root cpuset only has the following file: | 174 | In addition, the root cpuset only has the following file: |
@@ -237,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken | |||
237 | outside even a mem_exclusive cpuset. | 233 | outside even a mem_exclusive cpuset. |
238 | 234 | ||
239 | 235 | ||
240 | 1.5 What does notify_on_release do ? | 236 | 1.5 What is memory_pressure ? |
241 | ------------------------------------ | ||
242 | |||
243 | If the notify_on_release flag is enabled (1) in a cpuset, then whenever | ||
244 | the last task in the cpuset leaves (exits or attaches to some other | ||
245 | cpuset) and the last child cpuset of that cpuset is removed, then | ||
246 | the kernel runs the command /sbin/cpuset_release_agent, supplying the | ||
247 | pathname (relative to the mount point of the cpuset file system) of the | ||
248 | abandoned cpuset. This enables automatic removal of abandoned cpusets. | ||
249 | The default value of notify_on_release in the root cpuset at system | ||
250 | boot is disabled (0). The default value of other cpusets at creation | ||
251 | is the current value of their parents notify_on_release setting. | ||
252 | |||
253 | |||
254 | 1.6 What is memory_pressure ? | ||
255 | ----------------------------- | 237 | ----------------------------- |
256 | The memory_pressure of a cpuset provides a simple per-cpuset metric | 238 | The memory_pressure of a cpuset provides a simple per-cpuset metric |
257 | of the rate that the tasks in a cpuset are attempting to free up in | 239 | of the rate that the tasks in a cpuset are attempting to free up in |
@@ -308,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second, | |||
308 | times 1000. | 290 | times 1000. |
309 | 291 | ||
310 | 292 | ||
311 | 1.7 What is memory spread ? | 293 | 1.6 What is memory spread ? |
312 | --------------------------- | 294 | --------------------------- |
313 | There are two boolean flag files per cpuset that control where the | 295 | There are two boolean flag files per cpuset that control where the |
314 | kernel allocates pages for the file system buffers and related in | 296 | kernel allocates pages for the file system buffers and related in |
@@ -378,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the | |||
378 | data set, the memory allocation across the nodes in the jobs cpuset | 360 | data set, the memory allocation across the nodes in the jobs cpuset |
379 | can become very uneven. | 361 | can become very uneven. |
380 | 362 | ||
363 | 1.7 What is sched_load_balance ? | ||
364 | -------------------------------- | ||
365 | |||
366 | The kernel scheduler (kernel/sched.c) automatically load balances | ||
367 | tasks. If one CPU is underutilized, kernel code running on that | ||
368 | CPU will look for tasks on other more overloaded CPUs and move those | ||
369 | tasks to itself, within the constraints of such placement mechanisms | ||
370 | as cpusets and sched_setaffinity. | ||
371 | |||
372 | The algorithmic cost of load balancing and its impact on key shared | ||
373 | kernel data structures such as the task list increases more than | ||
374 | linearly with the number of CPUs being balanced. So the scheduler | ||
375 | has support to partition the systems CPUs into a number of sched | ||
376 | domains such that it only load balances within each sched domain. | ||
377 | Each sched domain covers some subset of the CPUs in the system; | ||
378 | no two sched domains overlap; some CPUs might not be in any sched | ||
379 | domain and hence won't be load balanced. | ||
380 | |||
381 | Put simply, it costs less to balance between two smaller sched domains | ||
382 | than one big one, but doing so means that overloads in one of the | ||
383 | two domains won't be load balanced to the other one. | ||
384 | |||
385 | By default, there is one sched domain covering all CPUs, except those | ||
386 | marked isolated using the kernel boot time "isolcpus=" argument. | ||
387 | |||
388 | This default load balancing across all CPUs is not well suited for | ||
389 | the following two situations: | ||
390 | 1) On large systems, load balancing across many CPUs is expensive. | ||
391 | If the system is managed using cpusets to place independent jobs | ||
392 | on separate sets of CPUs, full load balancing is unnecessary. | ||
393 | 2) Systems supporting realtime on some CPUs need to minimize | ||
394 | system overhead on those CPUs, including avoiding task load | ||
395 | balancing if that is not needed. | ||
396 | |||
397 | When the per-cpuset flag "sched_load_balance" is enabled (the default | ||
398 | setting), it requests that all the CPUs in that cpusets allowed 'cpus' | ||
399 | be contained in a single sched domain, ensuring that load balancing | ||
400 | can move a task (not otherwised pinned, as by sched_setaffinity) | ||
401 | from any CPU in that cpuset to any other. | ||
402 | |||
403 | When the per-cpuset flag "sched_load_balance" is disabled, then the | ||
404 | scheduler will avoid load balancing across the CPUs in that cpuset, | ||
405 | --except-- in so far as is necessary because some overlapping cpuset | ||
406 | has "sched_load_balance" enabled. | ||
407 | |||
408 | So, for example, if the top cpuset has the flag "sched_load_balance" | ||
409 | enabled, then the scheduler will have one sched domain covering all | ||
410 | CPUs, and the setting of the "sched_load_balance" flag in any other | ||
411 | cpusets won't matter, as we're already fully load balancing. | ||
412 | |||
413 | Therefore in the above two situations, the top cpuset flag | ||
414 | "sched_load_balance" should be disabled, and only some of the smaller, | ||
415 | child cpusets have this flag enabled. | ||
416 | |||
417 | When doing this, you don't usually want to leave any unpinned tasks in | ||
418 | the top cpuset that might use non-trivial amounts of CPU, as such tasks | ||
419 | may be artificially constrained to some subset of CPUs, depending on | ||
420 | the particulars of this flag setting in descendent cpusets. Even if | ||
421 | such a task could use spare CPU cycles in some other CPUs, the kernel | ||
422 | scheduler might not consider the possibility of load balancing that | ||
423 | task to that underused CPU. | ||
424 | |||
425 | Of course, tasks pinned to a particular CPU can be left in a cpuset | ||
426 | that disables "sched_load_balance" as those tasks aren't going anywhere | ||
427 | else anyway. | ||
428 | |||
429 | There is an impedance mismatch here, between cpusets and sched domains. | ||
430 | Cpusets are hierarchical and nest. Sched domains are flat; they don't | ||
431 | overlap and each CPU is in at most one sched domain. | ||
432 | |||
433 | It is necessary for sched domains to be flat because load balancing | ||
434 | across partially overlapping sets of CPUs would risk unstable dynamics | ||
435 | that would be beyond our understanding. So if each of two partially | ||
436 | overlapping cpusets enables the flag 'sched_load_balance', then we | ||
437 | form a single sched domain that is a superset of both. We won't move | ||
438 | a task to a CPU outside it cpuset, but the scheduler load balancing | ||
439 | code might waste some compute cycles considering that possibility. | ||
440 | |||
441 | This mismatch is why there is not a simple one-to-one relation | ||
442 | between which cpusets have the flag "sched_load_balance" enabled, | ||
443 | and the sched domain configuration. If a cpuset enables the flag, it | ||
444 | will get balancing across all its CPUs, but if it disables the flag, | ||
445 | it will only be assured of no load balancing if no other overlapping | ||
446 | cpuset enables the flag. | ||
447 | |||
448 | If two cpusets have partially overlapping 'cpus' allowed, and only | ||
449 | one of them has this flag enabled, then the other may find its | ||
450 | tasks only partially load balanced, just on the overlapping CPUs. | ||
451 | This is just the general case of the top_cpuset example given a few | ||
452 | paragraphs above. In the general case, as in the top cpuset case, | ||
453 | don't leave tasks that might use non-trivial amounts of CPU in | ||
454 | such partially load balanced cpusets, as they may be artificially | ||
455 | constrained to some subset of the CPUs allowed to them, for lack of | ||
456 | load balancing to the other CPUs. | ||
457 | |||
458 | 1.7.1 sched_load_balance implementation details. | ||
459 | ------------------------------------------------ | ||
460 | |||
461 | The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary | ||
462 | to most cpuset flags.) When enabled for a cpuset, the kernel will | ||
463 | ensure that it can load balance across all the CPUs in that cpuset | ||
464 | (makes sure that all the CPUs in the cpus_allowed of that cpuset are | ||
465 | in the same sched domain.) | ||
466 | |||
467 | If two overlapping cpusets both have 'sched_load_balance' enabled, | ||
468 | then they will be (must be) both in the same sched domain. | ||
469 | |||
470 | If, as is the default, the top cpuset has 'sched_load_balance' enabled, | ||
471 | then by the above that means there is a single sched domain covering | ||
472 | the whole system, regardless of any other cpuset settings. | ||
473 | |||
474 | The kernel commits to user space that it will avoid load balancing | ||
475 | where it can. It will pick as fine a granularity partition of sched | ||
476 | domains as it can while still providing load balancing for any set | ||
477 | of CPUs allowed to a cpuset having 'sched_load_balance' enabled. | ||
478 | |||
479 | The internal kernel cpuset to scheduler interface passes from the | ||
480 | cpuset code to the scheduler code a partition of the load balanced | ||
481 | CPUs in the system. This partition is a set of subsets (represented | ||
482 | as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all | ||
483 | the CPUs that must be load balanced. | ||
484 | |||
485 | Whenever the 'sched_load_balance' flag changes, or CPUs come or go | ||
486 | from a cpuset with this flag enabled, or a cpuset with this flag | ||
487 | enabled is removed, the cpuset code builds a new such partition and | ||
488 | passes it to the scheduler sched domain setup code, to have the sched | ||
489 | domains rebuilt as necessary. | ||
490 | |||
491 | This partition exactly defines what sched domains the scheduler should | ||
492 | setup - one sched domain for each element (cpumask_t) in the partition. | ||
493 | |||
494 | The scheduler remembers the currently active sched domain partitions. | ||
495 | When the scheduler routine partition_sched_domains() is invoked from | ||
496 | the cpuset code to update these sched domains, it compares the new | ||
497 | partition requested with the current, and updates its sched domains, | ||
498 | removing the old and adding the new, for each change. | ||
381 | 499 | ||
382 | 1.8 How do I use cpusets ? | 500 | 1.8 How do I use cpusets ? |
383 | -------------------------- | 501 | -------------------------- |
@@ -469,7 +587,7 @@ than stress the kernel. | |||
469 | To start a new job that is to be contained within a cpuset, the steps are: | 587 | To start a new job that is to be contained within a cpuset, the steps are: |
470 | 588 | ||
471 | 1) mkdir /dev/cpuset | 589 | 1) mkdir /dev/cpuset |
472 | 2) mount -t cpuset none /dev/cpuset | 590 | 2) mount -t cgroup -ocpuset cpuset /dev/cpuset |
473 | 3) Create the new cpuset by doing mkdir's and write's (or echo's) in | 591 | 3) Create the new cpuset by doing mkdir's and write's (or echo's) in |
474 | the /dev/cpuset virtual file system. | 592 | the /dev/cpuset virtual file system. |
475 | 4) Start a task that will be the "founding father" of the new job. | 593 | 4) Start a task that will be the "founding father" of the new job. |
@@ -481,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset | |||
481 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | 599 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, |
482 | and then start a subshell 'sh' in that cpuset: | 600 | and then start a subshell 'sh' in that cpuset: |
483 | 601 | ||
484 | mount -t cpuset none /dev/cpuset | 602 | mount -t cgroup -ocpuset cpuset /dev/cpuset |
485 | cd /dev/cpuset | 603 | cd /dev/cpuset |
486 | mkdir Charlie | 604 | mkdir Charlie |
487 | cd Charlie | 605 | cd Charlie |
@@ -513,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset | |||
513 | virtual filesystem. | 631 | virtual filesystem. |
514 | 632 | ||
515 | To mount it, type: | 633 | To mount it, type: |
516 | # mount -t cpuset none /dev/cpuset | 634 | # mount -t cgroup -o cpuset cpuset /dev/cpuset |
517 | 635 | ||
518 | Then under /dev/cpuset you can find a tree that corresponds to the | 636 | Then under /dev/cpuset you can find a tree that corresponds to the |
519 | tree of the cpusets in the system. For instance, /dev/cpuset | 637 | tree of the cpusets in the system. For instance, /dev/cpuset |
@@ -556,6 +674,18 @@ To remove a cpuset, just use rmdir: | |||
556 | This will fail if the cpuset is in use (has cpusets inside, or has | 674 | This will fail if the cpuset is in use (has cpusets inside, or has |
557 | processes attached). | 675 | processes attached). |
558 | 676 | ||
677 | Note that for legacy reasons, the "cpuset" filesystem exists as a | ||
678 | wrapper around the cgroup filesystem. | ||
679 | |||
680 | The command | ||
681 | |||
682 | mount -t cpuset X /dev/cpuset | ||
683 | |||
684 | is equivalent to | ||
685 | |||
686 | mount -t cgroup -ocpuset X /dev/cpuset | ||
687 | echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent | ||
688 | |||
559 | 2.2 Adding/removing cpus | 689 | 2.2 Adding/removing cpus |
560 | ------------------------ | 690 | ------------------------ |
561 | 691 | ||
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt new file mode 100644 index 000000000000..07edbd85c714 --- /dev/null +++ b/Documentation/device-mapper/dm-uevent.txt | |||
@@ -0,0 +1,97 @@ | |||
1 | The device-mapper uevent code adds the capability to device-mapper to create | ||
2 | and send kobject uevents (uevents). Previously device-mapper events were only | ||
3 | available through the ioctl interface. The advantage of the uevents interface | ||
4 | is the event contains environment attributes providing increased context for | ||
5 | the event avoiding the need to query the state of the device-mapper device after | ||
6 | the event is received. | ||
7 | |||
8 | There are two functions currently for device-mapper events. The first function | ||
9 | listed creates the event and the second function sends the event(s). | ||
10 | |||
11 | void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, | ||
12 | const char *path, unsigned nr_valid_paths) | ||
13 | |||
14 | void dm_send_uevents(struct list_head *events, struct kobject *kobj) | ||
15 | |||
16 | |||
17 | The variables added to the uevent environment are: | ||
18 | |||
19 | Variable Name: DM_TARGET | ||
20 | Uevent Action(s): KOBJ_CHANGE | ||
21 | Type: string | ||
22 | Description: | ||
23 | Value: Name of device-mapper target that generated the event. | ||
24 | |||
25 | Variable Name: DM_ACTION | ||
26 | Uevent Action(s): KOBJ_CHANGE | ||
27 | Type: string | ||
28 | Description: | ||
29 | Value: Device-mapper specific action that caused the uevent action. | ||
30 | PATH_FAILED - A path has failed. | ||
31 | PATH_REINSTATED - A path has been reinstated. | ||
32 | |||
33 | Variable Name: DM_SEQNUM | ||
34 | Uevent Action(s): KOBJ_CHANGE | ||
35 | Type: unsigned integer | ||
36 | Description: A sequence number for this specific device-mapper device. | ||
37 | Value: Valid unsigned integer range. | ||
38 | |||
39 | Variable Name: DM_PATH | ||
40 | Uevent Action(s): KOBJ_CHANGE | ||
41 | Type: string | ||
42 | Description: Major and minor number of the path device pertaining to this | ||
43 | event. | ||
44 | Value: Path name in the form of "Major:Minor" | ||
45 | |||
46 | Variable Name: DM_NR_VALID_PATHS | ||
47 | Uevent Action(s): KOBJ_CHANGE | ||
48 | Type: unsigned integer | ||
49 | Description: | ||
50 | Value: Valid unsigned integer range. | ||
51 | |||
52 | Variable Name: DM_NAME | ||
53 | Uevent Action(s): KOBJ_CHANGE | ||
54 | Type: string | ||
55 | Description: Name of the device-mapper device. | ||
56 | Value: Name | ||
57 | |||
58 | Variable Name: DM_UUID | ||
59 | Uevent Action(s): KOBJ_CHANGE | ||
60 | Type: string | ||
61 | Description: UUID of the device-mapper device. | ||
62 | Value: UUID. (Empty string if there isn't one.) | ||
63 | |||
64 | An example of the uevents generated as captured by udevmonitor is shown | ||
65 | below. | ||
66 | |||
67 | 1.) Path failure. | ||
68 | UEVENT[1192521009.711215] change@/block/dm-3 | ||
69 | ACTION=change | ||
70 | DEVPATH=/block/dm-3 | ||
71 | SUBSYSTEM=block | ||
72 | DM_TARGET=multipath | ||
73 | DM_ACTION=PATH_FAILED | ||
74 | DM_SEQNUM=1 | ||
75 | DM_PATH=8:32 | ||
76 | DM_NR_VALID_PATHS=0 | ||
77 | DM_NAME=mpath2 | ||
78 | DM_UUID=mpath-35333333000002328 | ||
79 | MINOR=3 | ||
80 | MAJOR=253 | ||
81 | SEQNUM=1130 | ||
82 | |||
83 | 2.) Path reinstate. | ||
84 | UEVENT[1192521132.989927] change@/block/dm-3 | ||
85 | ACTION=change | ||
86 | DEVPATH=/block/dm-3 | ||
87 | SUBSYSTEM=block | ||
88 | DM_TARGET=multipath | ||
89 | DM_ACTION=PATH_REINSTATED | ||
90 | DM_SEQNUM=2 | ||
91 | DM_PATH=8:32 | ||
92 | DM_NR_VALID_PATHS=1 | ||
93 | DM_NAME=mpath2 | ||
94 | DM_UUID=mpath-35333333000002328 | ||
95 | MINOR=3 | ||
96 | MAJOR=253 | ||
97 | SEQNUM=1131 | ||
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt index d9d523099bb7..4d932dc66098 100644 --- a/Documentation/input/input-programming.txt +++ b/Documentation/input/input-programming.txt | |||
@@ -42,8 +42,8 @@ static int __init button_init(void) | |||
42 | goto err_free_irq; | 42 | goto err_free_irq; |
43 | } | 43 | } |
44 | 44 | ||
45 | button_dev->evbit[0] = BIT(EV_KEY); | 45 | button_dev->evbit[0] = BIT_MASK(EV_KEY); |
46 | button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); | 46 | button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); |
47 | 47 | ||
48 | error = input_register_device(button_dev); | 48 | error = input_register_device(button_dev); |
49 | if (error) { | 49 | if (error) { |
@@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean | |||
217 | that the thing is precise and always returns to exactly the center position | 217 | that the thing is precise and always returns to exactly the center position |
218 | (if it has any). | 218 | (if it has any). |
219 | 219 | ||
220 | 1.4 NBITS(), LONG(), BIT() | 220 | 1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK() |
221 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ | 221 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ |
222 | 222 | ||
223 | These three macros from input.h help some bitfield computations: | 223 | These three macros from bitops.h help some bitfield computations: |
224 | 224 | ||
225 | NBITS(x) - returns the length of a bitfield array in longs for x bits | 225 | BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for |
226 | LONG(x) - returns the index in the array in longs for bit x | 226 | x bits |
227 | BIT(x) - returns the index in a long for bit x | 227 | BIT_WORD(x) - returns the index in the array in longs for bit x |
228 | BIT_MASK(x) - returns the index in a long for bit x | ||
228 | 229 | ||
229 | 1.5 The id* and name fields | 230 | 1.5 The id* and name fields |
230 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 231 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index fe8b0c4892cf..616043a6da99 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt | |||
@@ -77,7 +77,12 @@ applicable everywhere (see syntax). | |||
77 | Optionally, dependencies only for this default value can be added with | 77 | Optionally, dependencies only for this default value can be added with |
78 | "if". | 78 | "if". |
79 | 79 | ||
80 | - dependencies: "depends on"/"requires" <expr> | 80 | - type definition + default value: |
81 | "def_bool"/"def_tristate" <expr> ["if" <expr>] | ||
82 | This is a shorthand notation for a type definition plus a value. | ||
83 | Optionally dependencies for this default value can be added with "if". | ||
84 | |||
85 | - dependencies: "depends on" <expr> | ||
81 | This defines a dependency for this menu entry. If multiple | 86 | This defines a dependency for this menu entry. If multiple |
82 | dependencies are defined, they are connected with '&&'. Dependencies | 87 | dependencies are defined, they are connected with '&&'. Dependencies |
83 | are applied to all other options within this menu entry (which also | 88 | are applied to all other options within this menu entry (which also |
@@ -289,3 +294,10 @@ source: | |||
289 | "source" <prompt> | 294 | "source" <prompt> |
290 | 295 | ||
291 | This reads the specified configuration file. This file is always parsed. | 296 | This reads the specified configuration file. This file is always parsed. |
297 | |||
298 | mainmenu: | ||
299 | |||
300 | "mainmenu" <prompt> | ||
301 | |||
302 | This sets the config program's title bar if the config program chooses | ||
303 | to use it. | ||
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index f099b814d383..6166e2d7da76 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt | |||
@@ -518,6 +518,28 @@ more details, with real examples. | |||
518 | In this example for a specific GCC version the build will error out explaining | 518 | In this example for a specific GCC version the build will error out explaining |
519 | to the user why it stops. | 519 | to the user why it stops. |
520 | 520 | ||
521 | cc-cross-prefix | ||
522 | cc-cross-prefix is used to check if there exist a $(CC) in path with | ||
523 | one of the listed prefixes. The first prefix where there exist a | ||
524 | prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found | ||
525 | then nothing is returned. | ||
526 | Additional prefixes are separated by a single space in the | ||
527 | call of cc-cross-prefix. | ||
528 | This functionality is usefull for architecture Makefile that try | ||
529 | to set CROSS_COMPILE to well know values but may have several | ||
530 | values to select between. | ||
531 | It is recommended only to try to set CROSS_COMPILE is it is a cross | ||
532 | build (host arch is different from target arch). And is CROSS_COMPILE | ||
533 | is already set then leave it with the old value. | ||
534 | |||
535 | Example: | ||
536 | #arch/m68k/Makefile | ||
537 | ifneq ($(SUBARCH),$(ARCH)) | ||
538 | ifeq ($(CROSS_COMPILE),) | ||
539 | CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-) | ||
540 | endif | ||
541 | endif | ||
542 | |||
521 | === 4 Host Program support | 543 | === 4 Host Program support |
522 | 544 | ||
523 | Kbuild supports building executables on the host for use during the | 545 | Kbuild supports building executables on the host for use during the |
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 1b37b28cc234..d0ac72cc19ff 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt | |||
@@ -231,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64) | |||
231 | any space below the alignment point will be wasted. | 231 | any space below the alignment point will be wasted. |
232 | 232 | ||
233 | 233 | ||
234 | Extended crashkernel syntax | ||
235 | =========================== | ||
236 | |||
237 | While the "crashkernel=size[@offset]" syntax is sufficient for most | ||
238 | configurations, sometimes it's handy to have the reserved memory dependent | ||
239 | on the value of System RAM -- that's mostly for distributors that pre-setup | ||
240 | the kernel command line to avoid a unbootable system after some memory has | ||
241 | been removed from the machine. | ||
242 | |||
243 | The syntax is: | ||
244 | |||
245 | crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset] | ||
246 | range=start-[end] | ||
247 | |||
248 | For example: | ||
249 | |||
250 | crashkernel=512M-2G:64M,2G-:128M | ||
251 | |||
252 | This would mean: | ||
253 | |||
254 | 1) if the RAM is smaller than 512M, then don't reserve anything | ||
255 | (this is the "rescue" case) | ||
256 | 2) if the RAM size is between 512M and 2G, then reserve 64M | ||
257 | 3) if the RAM size is larger than 2G, then reserve 128M | ||
258 | |||
259 | |||
234 | Boot into System Kernel | 260 | Boot into System Kernel |
235 | ======================= | 261 | ======================= |
236 | 262 | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 189df0bcab99..7bf6bd2f530b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -431,8 +431,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
431 | over the 8254 in addition to over the IO-APIC. The | 431 | over the 8254 in addition to over the IO-APIC. The |
432 | kernel tries to set a sensible default. | 432 | kernel tries to set a sensible default. |
433 | 433 | ||
434 | hpet= [X86-32,HPET] option to disable HPET and use PIT. | 434 | hpet= [X86-32,HPET] option to control HPET usage |
435 | Format: disable | 435 | Format: { enable (default) | disable | force } |
436 | disable: disable HPET and use PIT instead | ||
437 | force: allow force enabled of undocumented chips (ICH4, VIA) | ||
436 | 438 | ||
437 | com20020= [HW,NET] ARCnet - COM20020 chipset | 439 | com20020= [HW,NET] ARCnet - COM20020 chipset |
438 | Format: | 440 | Format: |
@@ -497,6 +499,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
497 | [KNL] Reserve a chunk of physical memory to | 499 | [KNL] Reserve a chunk of physical memory to |
498 | hold a kernel to switch to with kexec on panic. | 500 | hold a kernel to switch to with kexec on panic. |
499 | 501 | ||
502 | crashkernel=range1:size1[,range2:size2,...][@offset] | ||
503 | [KNL] Same as above, but depends on the memory | ||
504 | in the running system. The syntax of range is | ||
505 | start-[end] where start and end are both | ||
506 | a memory unit (amount[KMG]). See also | ||
507 | Documentation/kdump/kdump.txt for a example. | ||
508 | |||
500 | cs4232= [HW,OSS] | 509 | cs4232= [HW,OSS] |
501 | Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> | 510 | Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> |
502 | 511 | ||
diff --git a/Documentation/markers.txt b/Documentation/markers.txt new file mode 100644 index 000000000000..295a71bc301e --- /dev/null +++ b/Documentation/markers.txt | |||
@@ -0,0 +1,81 @@ | |||
1 | Using the Linux Kernel Markers | ||
2 | |||
3 | Mathieu Desnoyers | ||
4 | |||
5 | |||
6 | This document introduces Linux Kernel Markers and their use. It provides | ||
7 | examples of how to insert markers in the kernel and connect probe functions to | ||
8 | them and provides some examples of probe functions. | ||
9 | |||
10 | |||
11 | * Purpose of markers | ||
12 | |||
13 | A marker placed in code provides a hook to call a function (probe) that you can | ||
14 | provide at runtime. A marker can be "on" (a probe is connected to it) or "off" | ||
15 | (no probe is attached). When a marker is "off" it has no effect, except for | ||
16 | adding a tiny time penalty (checking a condition for a branch) and space | ||
17 | penalty (adding a few bytes for the function call at the end of the | ||
18 | instrumented function and adds a data structure in a separate section). When a | ||
19 | marker is "on", the function you provide is called each time the marker is | ||
20 | executed, in the execution context of the caller. When the function provided | ||
21 | ends its execution, it returns to the caller (continuing from the marker site). | ||
22 | |||
23 | You can put markers at important locations in the code. Markers are | ||
24 | lightweight hooks that can pass an arbitrary number of parameters, | ||
25 | described in a printk-like format string, to the attached probe function. | ||
26 | |||
27 | They can be used for tracing and performance accounting. | ||
28 | |||
29 | |||
30 | * Usage | ||
31 | |||
32 | In order to use the macro trace_mark, you should include linux/marker.h. | ||
33 | |||
34 | #include <linux/marker.h> | ||
35 | |||
36 | And, | ||
37 | |||
38 | trace_mark(subsystem_event, "%d %s", someint, somestring); | ||
39 | Where : | ||
40 | - subsystem_event is an identifier unique to your event | ||
41 | - subsystem is the name of your subsystem. | ||
42 | - event is the name of the event to mark. | ||
43 | - "%d %s" is the formatted string for the serializer. | ||
44 | - someint is an integer. | ||
45 | - somestring is a char pointer. | ||
46 | |||
47 | Connecting a function (probe) to a marker is done by providing a probe (function | ||
48 | to call) for the specific marker through marker_probe_register() and can be | ||
49 | activated by calling marker_arm(). Marker deactivation can be done by calling | ||
50 | marker_disarm() as many times as marker_arm() has been called. Removing a probe | ||
51 | is done through marker_probe_unregister(); it will disarm the probe and make | ||
52 | sure there is no caller left using the probe when it returns. Probe removal is | ||
53 | preempt-safe because preemption is disabled around the probe call. See the | ||
54 | "Probe example" section below for a sample probe module. | ||
55 | |||
56 | The marker mechanism supports inserting multiple instances of the same marker. | ||
57 | Markers can be put in inline functions, inlined static functions, and | ||
58 | unrolled loops as well as regular functions. | ||
59 | |||
60 | The naming scheme "subsystem_event" is suggested here as a convention intended | ||
61 | to limit collisions. Marker names are global to the kernel: they are considered | ||
62 | as being the same whether they are in the core kernel image or in modules. | ||
63 | Conflicting format strings for markers with the same name will cause the markers | ||
64 | to be detected to have a different format string not to be armed and will output | ||
65 | a printk warning which identifies the inconsistency: | ||
66 | |||
67 | "Format mismatch for probe probe_name (format), marker (format)" | ||
68 | |||
69 | |||
70 | * Probe / marker example | ||
71 | |||
72 | See the example provided in samples/markers/src | ||
73 | |||
74 | Compile them with your kernel. | ||
75 | |||
76 | Run, as root : | ||
77 | modprobe marker-example (insmod order is not important) | ||
78 | modprobe probe-example | ||
79 | cat /proc/marker-example (returns an expected error) | ||
80 | rmmod marker-example probe-example | ||
81 | dmesg | ||
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX index 9df8a2eac7b4..3f13bf8043d2 100644 --- a/Documentation/mips/00-INDEX +++ b/Documentation/mips/00-INDEX | |||
@@ -4,5 +4,3 @@ AU1xxx_IDE.README | |||
4 | - README for MIPS AU1XXX IDE driver. | 4 | - README for MIPS AU1XXX IDE driver. |
5 | GT64120.README | 5 | GT64120.README |
6 | - README for dir with info on MIPS boards using GT-64120 or GT-64120A. | 6 | - README for dir with info on MIPS boards using GT-64120 or GT-64120A. |
7 | time.README | ||
8 | - README for MIPS time services. | ||
diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README deleted file mode 100644 index a4ce603ed3b3..000000000000 --- a/Documentation/mips/time.README +++ /dev/null | |||
@@ -1,173 +0,0 @@ | |||
1 | README for MIPS time services | ||
2 | |||
3 | Jun Sun | ||
4 | jsun@mvista.com or jsun@junsun.net | ||
5 | |||
6 | |||
7 | ABOUT | ||
8 | ----- | ||
9 | This file describes the new arch/mips/kernel/time.c, related files and the | ||
10 | services they provide. | ||
11 | |||
12 | If you are short in patience and just want to know how to use time.c for a | ||
13 | new board or convert an existing board, go to the last section. | ||
14 | |||
15 | |||
16 | FILES, COMPATABILITY AND CONFIGS | ||
17 | --------------------------------- | ||
18 | |||
19 | The old arch/mips/kernel/time.c is renamed to old-time.c. | ||
20 | |||
21 | A new time.c is put there, together with include/asm-mips/time.h. | ||
22 | |||
23 | Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C. | ||
24 | So we allow boards using | ||
25 | |||
26 | 1) old time.c (CONFIG_OLD_TIME_C) | ||
27 | 2) new time.c (CONFIG_NEW_TIME_C) | ||
28 | 3) neither (their own private time.c) | ||
29 | |||
30 | However, it is expected every board will move to the new time.c in the near | ||
31 | future. | ||
32 | |||
33 | |||
34 | WHAT THE NEW CODE PROVIDES? | ||
35 | --------------------------- | ||
36 | |||
37 | The new time code provide the following services: | ||
38 | |||
39 | a) Implements functions required by Linux common code: | ||
40 | time_init | ||
41 | |||
42 | b) provides an abstraction of RTC and null RTC implementation as default. | ||
43 | extern unsigned long (*rtc_get_time)(void); | ||
44 | extern int (*rtc_set_time)(unsigned long); | ||
45 | |||
46 | c) high-level and low-level timer interrupt routines where the timer | ||
47 | interrupt source may or may not be the CPU timer. The high-level | ||
48 | routine is dispatched through do_IRQ() while the low-level is | ||
49 | dispatched in assemably code (usually int-handler.S) | ||
50 | |||
51 | |||
52 | WHAT THE NEW CODE REQUIRES? | ||
53 | --------------------------- | ||
54 | |||
55 | For the new code to work properly, each board implementation needs to supply | ||
56 | the following functions or values: | ||
57 | |||
58 | a) board_time_init - a function pointer. Invoked at the beginnig of | ||
59 | time_init(). It is optional. | ||
60 | 1. (optional) set up RTC routines | ||
61 | 2. (optional) calibrate and set the mips_hpt_frequency | ||
62 | |||
63 | b) plat_timer_setup - a function pointer. Invoked at the end of time_init() | ||
64 | 1. (optional) over-ride any decisions made in time_init() | ||
65 | 2. set up the irqaction for timer interrupt. | ||
66 | 3. enable the timer interrupt | ||
67 | |||
68 | c) (optional) board-specific RTC routines. | ||
69 | |||
70 | d) (optional) mips_hpt_frequency - It must be definied if the board | ||
71 | is using CPU counter for timer interrupt. | ||
72 | |||
73 | |||
74 | PORTING GUIDE | ||
75 | ------------- | ||
76 | |||
77 | Step 1: decide how you like to implement the time services. | ||
78 | |||
79 | a) does this board have a RTC? If yes, implement the two RTC funcs. | ||
80 | |||
81 | b) does the CPU have counter/compare registers? | ||
82 | |||
83 | If the answer is no, you need a timer to provide the timer interrupt | ||
84 | at 100 HZ speed. | ||
85 | |||
86 | c) The following sub steps assume your CPU has counter register. | ||
87 | Do you plan to use the CPU counter register as the timer interrupt | ||
88 | or use an exnternal timer? | ||
89 | |||
90 | In order to use CPU counter register as the timer interrupt source, you | ||
91 | must know the counter speed (mips_hpt_frequency). It is usually the | ||
92 | same as the CPU speed or an integral divisor of it. | ||
93 | |||
94 | d) decide on whether you want to use high-level or low-level timer | ||
95 | interrupt routines. The low-level one is presumably faster, but should | ||
96 | not make too mcuh difference. | ||
97 | |||
98 | |||
99 | Step 2: the machine setup() function | ||
100 | |||
101 | If you supply board_time_init(), set the function poointer. | ||
102 | |||
103 | |||
104 | Step 3: implement rtc routines, board_time_init() and plat_timer_setup() | ||
105 | if needed. | ||
106 | |||
107 | board_time_init() - | ||
108 | a) (optional) set up RTC routines, | ||
109 | b) (optional) calibrate and set the mips_hpt_frequency | ||
110 | (only needed if you intended to use cpu counter as timer interrupt | ||
111 | source) | ||
112 | |||
113 | plat_timer_setup() - | ||
114 | a) (optional) over-write any choices made above by time_init(). | ||
115 | b) machine specific code should setup the timer irqaction. | ||
116 | c) enable the timer interrupt | ||
117 | |||
118 | |||
119 | If the RTC chip is a common chip, I suggest the routines are put under | ||
120 | arch/mips/libs. For example, for DS1386 chip, one would create | ||
121 | rtc-ds1386.c under arch/mips/lib directory. Add the following line to | ||
122 | the arch/mips/lib/Makefile: | ||
123 | |||
124 | obj-$(CONFIG_DDB5476) += rtc-ds1386.o | ||
125 | |||
126 | Step 4: if you are using low-level timer interrupt, change your interrupt | ||
127 | dispathcing code to check for timer interrupt and jump to | ||
128 | ll_timer_interrupt() directly if one is detected. | ||
129 | |||
130 | Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine. | ||
131 | Modify the appropriate defconfig if applicable. | ||
132 | |||
133 | Final notes: | ||
134 | |||
135 | For some tricky cases, you may need to add your own wrapper functions | ||
136 | for some of the functions in time.c. | ||
137 | |||
138 | For example, you may define your own timer interrupt routine, which does | ||
139 | some of its own processing and then calls timer_interrupt(). | ||
140 | |||
141 | You can also over-ride any of the built-in functions (RTC routines | ||
142 | and/or timer interrupt routine). | ||
143 | |||
144 | |||
145 | PORTING NOTES FOR SMP | ||
146 | ---------------------- | ||
147 | |||
148 | If you have a SMP box, things are slightly more complicated. | ||
149 | |||
150 | The time service running every jiffy is logically divided into two parts: | ||
151 | |||
152 | 1) the one for the whole system (defined in timer_interrupt()) | ||
153 | 2) the one that should run for each CPU (defined in local_timer_interrupt()) | ||
154 | |||
155 | You need to decide on your timer interrupt sources. | ||
156 | |||
157 | case 1) - whole system has only one timer interrupt delivered to one CPU | ||
158 | |||
159 | In this case, you set up timer interrupt as in UP systems. In addtion, | ||
160 | you need to set emulate_local_timer_interrupt to 1 so that other | ||
161 | CPUs get to call local_timer_interrupt(). | ||
162 | |||
163 | THIS IS CURRENTLY NOT IMPLEMNETED. However, it is rather easy to write | ||
164 | one should such a need arise. You simply make a IPI call. | ||
165 | |||
166 | case 2) - each CPU has a separate timer interrupt | ||
167 | |||
168 | In this case, you need to set up IRQ such that each of them will | ||
169 | call local_timer_interrupt(). In addition, you need to arrange | ||
170 | one and only one of them to call timer_interrupt(). | ||
171 | |||
172 | You can also do the low-level version of those interrupt routines, | ||
173 | following similar dispatching routes described above. | ||
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt index 60953d6c919d..3b95bbacc775 100644 --- a/Documentation/thinkpad-acpi.txt +++ b/Documentation/thinkpad-acpi.txt | |||
@@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver | |||
105 | as a driver attribute (see below). | 105 | as a driver attribute (see below). |
106 | 106 | ||
107 | Sysfs driver attributes are on the driver's sysfs attribute space, | 107 | Sysfs driver attributes are on the driver's sysfs attribute space, |
108 | for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/. | 108 | for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and |
109 | /sys/bus/platform/drivers/thinkpad_hwmon/ | ||
109 | 110 | ||
110 | Sysfs device attributes are on the driver's sysfs attribute space, | 111 | Sysfs device attributes are on the thinkpad_acpi device sysfs attribute |
111 | for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/. | 112 | space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/. |
113 | |||
114 | Sysfs device attributes for the sensors and fan are on the | ||
115 | thinkpad_hwmon device's sysfs attribute space, but you should locate it | ||
116 | looking for a hwmon device with the name attribute of "thinkpad". | ||
112 | 117 | ||
113 | Driver version | 118 | Driver version |
114 | -------------- | 119 | -------------- |
@@ -766,7 +771,7 @@ Temperature sensors | |||
766 | ------------------- | 771 | ------------------- |
767 | 772 | ||
768 | procfs: /proc/acpi/ibm/thermal | 773 | procfs: /proc/acpi/ibm/thermal |
769 | sysfs device attributes: (hwmon) temp*_input | 774 | sysfs device attributes: (hwmon "thinkpad") temp*_input |
770 | 775 | ||
771 | Most ThinkPads include six or more separate temperature sensors but only | 776 | Most ThinkPads include six or more separate temperature sensors but only |
772 | expose the CPU temperature through the standard ACPI methods. This | 777 | expose the CPU temperature through the standard ACPI methods. This |
@@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable | |||
989 | --------------------------------------------------------- | 994 | --------------------------------------------------------- |
990 | 995 | ||
991 | procfs: /proc/acpi/ibm/fan | 996 | procfs: /proc/acpi/ibm/fan |
992 | sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable | 997 | sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, |
998 | pwm1_enable | ||
999 | sysfs hwmon driver attributes: fan_watchdog | ||
993 | 1000 | ||
994 | NOTE NOTE NOTE: fan control operations are disabled by default for | 1001 | NOTE NOTE NOTE: fan control operations are disabled by default for |
995 | safety reasons. To enable them, the module parameter "fan_control=1" | 1002 | safety reasons. To enable them, the module parameter "fan_control=1" |
@@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input: | |||
1131 | which can take up to two minutes. May return rubbish on older | 1138 | which can take up to two minutes. May return rubbish on older |
1132 | ThinkPads. | 1139 | ThinkPads. |
1133 | 1140 | ||
1134 | driver attribute fan_watchdog: | 1141 | hwmon driver attribute fan_watchdog: |
1135 | Fan safety watchdog timer interval, in seconds. Minimum is | 1142 | Fan safety watchdog timer interval, in seconds. Minimum is |
1136 | 1 second, maximum is 120 seconds. 0 disables the watchdog. | 1143 | 1 second, maximum is 120 seconds. 0 disables the watchdog. |
1137 | 1144 | ||
@@ -1233,3 +1240,9 @@ Sysfs interface changelog: | |||
1233 | layer, the radio switch generates input event EV_RADIO, | 1240 | layer, the radio switch generates input event EV_RADIO, |
1234 | and the driver enables hot key handling by default in | 1241 | and the driver enables hot key handling by default in |
1235 | the firmware. | 1242 | the firmware. |
1243 | |||
1244 | 0x020000: ABI fix: added a separate hwmon platform device and | ||
1245 | driver, which must be located by name (thinkpad) | ||
1246 | and the hwmon class for libsensors4 (lm-sensors 3) | ||
1247 | compatibility. Moved all hwmon attributes to this | ||
1248 | new platform device. | ||