diff options
Diffstat (limited to 'Documentation')
| -rw-r--r-- | Documentation/auxdisplay/cfag12864b-example.c | 1 | ||||
| -rw-r--r-- | Documentation/cgroups/cgroups.txt | 32 | ||||
| -rw-r--r-- | Documentation/cgroups/memory.txt | 41 | ||||
| -rw-r--r-- | Documentation/crypto/async-tx-api.txt | 75 | ||||
| -rw-r--r-- | Documentation/filesystems/sharedsubtree.txt | 220 | ||||
| -rw-r--r-- | Documentation/filesystems/vfs.txt | 7 | ||||
| -rw-r--r-- | Documentation/ioctl/ioctl-number.txt | 1 | ||||
| -rw-r--r-- | Documentation/sysctl/fs.txt | 17 | ||||
| -rw-r--r-- | Documentation/sysctl/kernel.txt | 22 | ||||
| -rw-r--r-- | Documentation/sysctl/vm.txt | 41 | ||||
| -rw-r--r-- | Documentation/vm/.gitignore | 1 | ||||
| -rw-r--r-- | Documentation/vm/page-types.c | 200 |
12 files changed, 417 insertions, 241 deletions
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c index 1d2c010bae12..e7823ffb1ca0 100644 --- a/Documentation/auxdisplay/cfag12864b-example.c +++ b/Documentation/auxdisplay/cfag12864b-example.c | |||
| @@ -194,7 +194,6 @@ static void cfag12864b_blit(void) | |||
| 194 | */ | 194 | */ |
| 195 | 195 | ||
| 196 | #include <stdio.h> | 196 | #include <stdio.h> |
| 197 | #include <string.h> | ||
| 198 | 197 | ||
| 199 | #define EXAMPLES 6 | 198 | #define EXAMPLES 6 |
| 200 | 199 | ||
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 6eb1a97e88ce..455d4e6d346d 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
| @@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0: | |||
| 408 | 408 | ||
| 409 | # echo 0 > tasks | 409 | # echo 0 > tasks |
| 410 | 410 | ||
| 411 | 2.3 Mounting hierarchies by name | ||
| 412 | -------------------------------- | ||
| 413 | |||
| 414 | Passing the name=<x> option when mounting a cgroups hierarchy | ||
| 415 | associates the given name with the hierarchy. This can be used when | ||
| 416 | mounting a pre-existing hierarchy, in order to refer to it by name | ||
| 417 | rather than by its set of active subsystems. Each hierarchy is either | ||
| 418 | nameless, or has a unique name. | ||
| 419 | |||
| 420 | The name should match [\w.-]+ | ||
| 421 | |||
| 422 | When passing a name=<x> option for a new hierarchy, you need to | ||
| 423 | specify subsystems manually; the legacy behaviour of mounting all | ||
| 424 | subsystems when none are explicitly specified is not supported when | ||
| 425 | you give a hierarchy a name. | ||
| 426 | |||
| 427 | The name of the hierarchy appears as part of the hierarchy description | ||
| 428 | in /proc/mounts and /proc/<pid>/cgroups. | ||
| 429 | |||
| 430 | |||
| 411 | 3. Kernel API | 431 | 3. Kernel API |
| 412 | ============= | 432 | ============= |
| 413 | 433 | ||
| @@ -501,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be | |||
| 501 | called multiple times against a cgroup. | 521 | called multiple times against a cgroup. |
| 502 | 522 | ||
| 503 | int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 523 | int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
| 504 | struct task_struct *task) | 524 | struct task_struct *task, bool threadgroup) |
| 505 | (cgroup_mutex held by caller) | 525 | (cgroup_mutex held by caller) |
| 506 | 526 | ||
| 507 | Called prior to moving a task into a cgroup; if the subsystem | 527 | Called prior to moving a task into a cgroup; if the subsystem |
| @@ -509,14 +529,20 @@ returns an error, this will abort the attach operation. If a NULL | |||
| 509 | task is passed, then a successful result indicates that *any* | 529 | task is passed, then a successful result indicates that *any* |
| 510 | unspecified task can be moved into the cgroup. Note that this isn't | 530 | unspecified task can be moved into the cgroup. Note that this isn't |
| 511 | called on a fork. If this method returns 0 (success) then this should | 531 | called on a fork. If this method returns 0 (success) then this should |
| 512 | remain valid while the caller holds cgroup_mutex. | 532 | remain valid while the caller holds cgroup_mutex. If threadgroup is |
| 533 | true, then a successful result indicates that all threads in the given | ||
| 534 | thread's threadgroup can be moved together. | ||
| 513 | 535 | ||
| 514 | void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 536 | void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
| 515 | struct cgroup *old_cgrp, struct task_struct *task) | 537 | struct cgroup *old_cgrp, struct task_struct *task, |
| 538 | bool threadgroup) | ||
| 516 | (cgroup_mutex held by caller) | 539 | (cgroup_mutex held by caller) |
| 517 | 540 | ||
| 518 | Called after the task has been attached to the cgroup, to allow any | 541 | Called after the task has been attached to the cgroup, to allow any |
| 519 | post-attachment activity that requires memory allocations or blocking. | 542 | post-attachment activity that requires memory allocations or blocking. |
| 543 | If threadgroup is true, the subsystem should take care of all threads | ||
| 544 | in the specified thread's threadgroup. Currently does not support any | ||
| 545 | subsystem that might need the old_cgrp for every thread in the group. | ||
| 520 | 546 | ||
| 521 | void fork(struct cgroup_subsys *ss, struct task_struct *task) | 547 | void fork(struct cgroup_subsys *ss, struct task_struct *task) |
| 522 | 548 | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 23d1262c0775..b871f2552b45 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
| @@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that | |||
| 179 | pages that are selected for reclaiming come from the per cgroup LRU | 179 | pages that are selected for reclaiming come from the per cgroup LRU |
| 180 | list. | 180 | list. |
| 181 | 181 | ||
| 182 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any | ||
| 183 | limits on the root cgroup. | ||
| 184 | |||
| 182 | 2. Locking | 185 | 2. Locking |
| 183 | 186 | ||
| 184 | The memory controller uses the following hierarchy | 187 | The memory controller uses the following hierarchy |
| @@ -210,6 +213,7 @@ We can alter the memory limit: | |||
| 210 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | 213 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, |
| 211 | mega or gigabytes. | 214 | mega or gigabytes. |
| 212 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). | 215 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). |
| 216 | NOTE: We cannot set limits on the root cgroup any more. | ||
| 213 | 217 | ||
| 214 | # cat /cgroups/0/memory.limit_in_bytes | 218 | # cat /cgroups/0/memory.limit_in_bytes |
| 215 | 4194304 | 219 | 4194304 |
| @@ -375,7 +379,42 @@ cgroups created below it. | |||
| 375 | 379 | ||
| 376 | NOTE2: This feature can be enabled/disabled per subtree. | 380 | NOTE2: This feature can be enabled/disabled per subtree. |
| 377 | 381 | ||
| 378 | 7. TODO | 382 | 7. Soft limits |
| 383 | |||
| 384 | Soft limits allow for greater sharing of memory. The idea behind soft limits | ||
| 385 | is to allow control groups to use as much of the memory as needed, provided | ||
| 386 | |||
| 387 | a. There is no memory contention | ||
| 388 | b. They do not exceed their hard limit | ||
| 389 | |||
| 390 | When the system detects memory contention or low memory, control groups | ||
| 391 | are pushed back to their soft limits. If the soft limit of each control | ||
| 392 | group is very high, they are pushed back as much as possible to make | ||
| 393 | sure that one control group does not starve the others of memory. | ||
| 394 | |||
| 395 | Please note that soft limits is a best-effort feature; it comes with | ||
| 396 | no guarantees, but it does its best to make sure that when memory is | ||
| 397 | heavily contended for, memory is allocated based on the soft limit | ||
| 398 | hints/setup. Currently soft limit based reclaim is set up such that | ||
| 399 | it gets invoked from balance_pgdat (kswapd). | ||
| 400 | |||
| 401 | 7.1 Interface | ||
| 402 | |||
| 403 | Soft limits can be set up by using the following commands (in this example we | ||
| 404 | assume a soft limit of 256 megabytes) | ||
| 405 | |||
| 406 | # echo 256M > memory.soft_limit_in_bytes | ||
| 407 | |||
| 408 | If we want to change this to 1G, we can at any time use | ||
| 409 | |||
| 410 | # echo 1G > memory.soft_limit_in_bytes | ||
| 411 | |||
| 412 | NOTE1: Soft limits take effect over a long period of time, since they involve | ||
| 413 | reclaiming memory for balancing between memory cgroups | ||
| 414 | NOTE2: It is recommended to set the soft limit always below the hard limit, | ||
| 415 | otherwise the hard limit will take precedence. | ||
| 416 | |||
| 417 | 8. TODO | ||
| 379 | 418 | ||
| 380 | 1. Add support for accounting huge pages (as a separate controller) | 419 | 1. Add support for accounting huge pages (as a separate controller) |
| 381 | 2. Make per-cgroup scanner reclaim not-shared pages first | 420 | 2. Make per-cgroup scanner reclaim not-shared pages first |
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index 9f59fcbf5d82..ba046b8fa92f 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt | |||
| @@ -54,20 +54,23 @@ features surfaced as a result: | |||
| 54 | 54 | ||
| 55 | 3.1 General format of the API: | 55 | 3.1 General format of the API: |
| 56 | struct dma_async_tx_descriptor * | 56 | struct dma_async_tx_descriptor * |
| 57 | async_<operation>(<op specific parameters>, | 57 | async_<operation>(<op specific parameters>, struct async_submit_ctl *submit) |
| 58 | enum async_tx_flags flags, | ||
| 59 | struct dma_async_tx_descriptor *dependency, | ||
| 60 | dma_async_tx_callback callback_routine, | ||
| 61 | void *callback_parameter); | ||
| 62 | 58 | ||
| 63 | 3.2 Supported operations: | 59 | 3.2 Supported operations: |
| 64 | memcpy - memory copy between a source and a destination buffer | 60 | memcpy - memory copy between a source and a destination buffer |
| 65 | memset - fill a destination buffer with a byte value | 61 | memset - fill a destination buffer with a byte value |
| 66 | xor - xor a series of source buffers and write the result to a | 62 | xor - xor a series of source buffers and write the result to a |
| 67 | destination buffer | 63 | destination buffer |
| 68 | xor_zero_sum - xor a series of source buffers and set a flag if the | 64 | xor_val - xor a series of source buffers and set a flag if the |
| 69 | result is zero. The implementation attempts to prevent | 65 | result is zero. The implementation attempts to prevent |
| 70 | writes to memory | 66 | writes to memory |
| 67 | pq - generate the p+q (raid6 syndrome) from a series of source buffers | ||
| 68 | pq_val - validate that a p and/or q buffer are in sync with a given series of | ||
| 69 | sources | ||
| 70 | datap - (raid6_datap_recov) recover a raid6 data block and the p block | ||
| 71 | from the given sources | ||
| 72 | 2data - (raid6_2data_recov) recover 2 raid6 data blocks from the given | ||
| 73 | sources | ||
| 71 | 74 | ||
| 72 | 3.3 Descriptor management: | 75 | 3.3 Descriptor management: |
| 73 | The return value is non-NULL and points to a 'descriptor' when the operation | 76 | The return value is non-NULL and points to a 'descriptor' when the operation |
| @@ -80,8 +83,8 @@ acknowledged by the application before the offload engine driver is allowed to | |||
| 80 | recycle (or free) the descriptor. A descriptor can be acked by one of the | 83 | recycle (or free) the descriptor. A descriptor can be acked by one of the |
| 81 | following methods: | 84 | following methods: |
| 82 | 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted | 85 | 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted |
| 83 | 2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent | 86 | 2/ submitting an unacknowledged descriptor as a dependency to another |
| 84 | descriptor of a new operation. | 87 | async_tx call will implicitly set the acknowledged state. |
| 85 | 3/ calling async_tx_ack() on the descriptor. | 88 | 3/ calling async_tx_ack() on the descriptor. |
| 86 | 89 | ||
| 87 | 3.4 When does the operation execute? | 90 | 3.4 When does the operation execute? |
| @@ -119,30 +122,42 @@ of an operation. | |||
| 119 | Perform a xor->copy->xor operation where each operation depends on the | 122 | Perform a xor->copy->xor operation where each operation depends on the |
| 120 | result from the previous operation: | 123 | result from the previous operation: |
| 121 | 124 | ||
| 122 | void complete_xor_copy_xor(void *param) | 125 | void callback(void *param) |
| 123 | { | 126 | { |
| 124 | printk("complete\n"); | 127 | struct completion *cmp = param; |
| 128 | |||
| 129 | complete(cmp); | ||
| 125 | } | 130 | } |
| 126 | 131 | ||
| 127 | int run_xor_copy_xor(struct page **xor_srcs, | 132 | void run_xor_copy_xor(struct page **xor_srcs, |
| 128 | int xor_src_cnt, | 133 | int xor_src_cnt, |
| 129 | struct page *xor_dest, | 134 | struct page *xor_dest, |
| 130 | size_t xor_len, | 135 | size_t xor_len, |
| 131 | struct page *copy_src, | 136 | struct page *copy_src, |
| 132 | struct page *copy_dest, | 137 | struct page *copy_dest, |
| 133 | size_t copy_len) | 138 | size_t copy_len) |
| 134 | { | 139 | { |
| 135 | struct dma_async_tx_descriptor *tx; | 140 | struct dma_async_tx_descriptor *tx; |
| 141 | addr_conv_t addr_conv[xor_src_cnt]; | ||
| 142 | struct async_submit_ctl submit; | ||
| 143 | addr_conv_t addr_conv[NDISKS]; | ||
| 144 | struct completion cmp; | ||
| 145 | |||
| 146 | init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL, | ||
| 147 | addr_conv); | ||
| 148 | tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit); | ||
| 136 | 149 | ||
| 137 | tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, | 150 | submit.depend_tx = tx; |
| 138 | ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL); | 151 | tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit); |
| 139 | tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, | 152 | |
| 140 | ASYNC_TX_DEP_ACK, tx, NULL, NULL); | 153 | init_completion(&cmp); |
| 141 | tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, | 154 | init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx, |
| 142 | ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, | 155 | callback, &cmp, addr_conv); |
| 143 | tx, complete_xor_copy_xor, NULL); | 156 | tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit); |
| 144 | 157 | ||
| 145 | async_tx_issue_pending_all(); | 158 | async_tx_issue_pending_all(); |
| 159 | |||
| 160 | wait_for_completion(&cmp); | ||
| 146 | } | 161 | } |
| 147 | 162 | ||
| 148 | See include/linux/async_tx.h for more information on the flags. See the | 163 | See include/linux/async_tx.h for more information on the flags. See the |
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 736540045dc7..23a181074f94 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt | |||
| @@ -4,7 +4,7 @@ Shared Subtrees | |||
| 4 | Contents: | 4 | Contents: |
| 5 | 1) Overview | 5 | 1) Overview |
| 6 | 2) Features | 6 | 2) Features |
| 7 | 3) smount command | 7 | 3) Setting mount states |
| 8 | 4) Use-case | 8 | 4) Use-case |
| 9 | 5) Detailed semantics | 9 | 5) Detailed semantics |
| 10 | 6) Quiz | 10 | 6) Quiz |
| @@ -41,14 +41,14 @@ replicas continue to be exactly same. | |||
| 41 | 41 | ||
| 42 | Here is an example: | 42 | Here is an example: |
| 43 | 43 | ||
| 44 | Lets say /mnt has a mount that is shared. | 44 | Let's say /mnt has a mount that is shared. |
| 45 | mount --make-shared /mnt | 45 | mount --make-shared /mnt |
| 46 | 46 | ||
| 47 | note: mount command does not yet support the --make-shared flag. | 47 | Note: mount(8) command now supports the --make-shared flag, |
| 48 | I have included a small C program which does the same by executing | 48 | so the sample 'smount' program is no longer needed and has been |
| 49 | 'smount /mnt shared' | 49 | removed. |
| 50 | 50 | ||
| 51 | #mount --bind /mnt /tmp | 51 | # mount --bind /mnt /tmp |
| 52 | The above command replicates the mount at /mnt to the mountpoint /tmp | 52 | The above command replicates the mount at /mnt to the mountpoint /tmp |
| 53 | and the contents of both the mounts remain identical. | 53 | and the contents of both the mounts remain identical. |
| 54 | 54 | ||
| @@ -58,8 +58,8 @@ replicas continue to be exactly same. | |||
| 58 | #ls /tmp | 58 | #ls /tmp |
| 59 | a b c | 59 | a b c |
| 60 | 60 | ||
| 61 | Now lets say we mount a device at /tmp/a | 61 | Now let's say we mount a device at /tmp/a |
| 62 | #mount /dev/sd0 /tmp/a | 62 | # mount /dev/sd0 /tmp/a |
| 63 | 63 | ||
| 64 | #ls /tmp/a | 64 | #ls /tmp/a |
| 65 | t1 t2 t2 | 65 | t1 t2 t2 |
| @@ -80,21 +80,20 @@ replicas continue to be exactly same. | |||
| 80 | 80 | ||
| 81 | Here is an example: | 81 | Here is an example: |
| 82 | 82 | ||
| 83 | Lets say /mnt has a mount which is shared. | 83 | Let's say /mnt has a mount which is shared. |
| 84 | #mount --make-shared /mnt | 84 | # mount --make-shared /mnt |
| 85 | 85 | ||
| 86 | Lets bind mount /mnt to /tmp | 86 | Let's bind mount /mnt to /tmp |
| 87 | #mount --bind /mnt /tmp | 87 | # mount --bind /mnt /tmp |
| 88 | 88 | ||
| 89 | the new mount at /tmp becomes a shared mount and it is a replica of | 89 | the new mount at /tmp becomes a shared mount and it is a replica of |
| 90 | the mount at /mnt. | 90 | the mount at /mnt. |
| 91 | 91 | ||
| 92 | Now lets make the mount at /tmp; a slave of /mnt | 92 | Now let's make the mount at /tmp a slave of /mnt |
| 93 | #mount --make-slave /tmp | 93 | # mount --make-slave /tmp |
| 94 | [or smount /tmp slave] | ||
| 95 | 94 | ||
| 96 | lets mount /dev/sd0 on /mnt/a | 95 | let's mount /dev/sd0 on /mnt/a |
| 97 | #mount /dev/sd0 /mnt/a | 96 | # mount /dev/sd0 /mnt/a |
| 98 | 97 | ||
| 99 | #ls /mnt/a | 98 | #ls /mnt/a |
| 100 | t1 t2 t3 | 99 | t1 t2 t3 |
| @@ -104,9 +103,9 @@ replicas continue to be exactly same. | |||
| 104 | 103 | ||
| 105 | Note the mount event has propagated to the mount at /tmp | 104 | Note the mount event has propagated to the mount at /tmp |
| 106 | 105 | ||
| 107 | However lets see what happens if we mount something on the mount at /tmp | 106 | However let's see what happens if we mount something on the mount at /tmp |
| 108 | 107 | ||
| 109 | #mount /dev/sd1 /tmp/b | 108 | # mount /dev/sd1 /tmp/b |
| 110 | 109 | ||
| 111 | #ls /tmp/b | 110 | #ls /tmp/b |
| 112 | s1 s2 s3 | 111 | s1 s2 s3 |
| @@ -124,12 +123,11 @@ replicas continue to be exactly same. | |||
| 124 | 123 | ||
| 125 | 2d) An unbindable mount is an unbindable private mount | 124 | 2d) An unbindable mount is an unbindable private mount |
| 126 | 125 | ||
| 127 | lets say we have a mount at /mnt and we make is unbindable | 126 | let's say we have a mount at /mnt and we make it unbindable |
| 128 | 127 | ||
| 129 | #mount --make-unbindable /mnt | 128 | # mount --make-unbindable /mnt |
| 130 | [ smount /mnt unbindable ] | ||
| 131 | 129 | ||
| 132 | Lets try to bind mount this mount somewhere else. | 130 | Let's try to bind mount this mount somewhere else. |
| 133 | # mount --bind /mnt /tmp | 131 | # mount --bind /mnt /tmp |
| 134 | mount: wrong fs type, bad option, bad superblock on /mnt, | 132 | mount: wrong fs type, bad option, bad superblock on /mnt, |
| 135 | or too many mounted file systems | 133 | or too many mounted file systems |
| @@ -137,149 +135,15 @@ replicas continue to be exactly same. | |||
| 137 | Binding an unbindable mount is an invalid operation. | 135 | Binding an unbindable mount is an invalid operation. |
| 138 | 136 | ||
| 139 | 137 | ||
| 140 | 3) smount command | 138 | 3) Setting mount states |
| 141 | 139 | ||
| 142 | Currently the mount command is not aware of shared subtree features. | 140 | The mount command (util-linux package) can be used to set mount |
| 143 | Work is in progress to add the support in mount ( util-linux package ). | 141 | states: |
| 144 | Till then use the following program. | ||
| 145 | 142 | ||
| 146 | ------------------------------------------------------------------------ | 143 | mount --make-shared mountpoint |
| 147 | // | 144 | mount --make-slave mountpoint |
| 148 | //this code was developed my Miklos Szeredi <miklos@szeredi.hu> | 145 | mount --make-private mountpoint |
| 149 | //and modified by Ram Pai <linuxram@us.ibm.com> | 146 | mount --make-unbindable mountpoint |
| 150 | // sample usage: | ||
| 151 | // smount /tmp shared | ||
| 152 | // | ||
| 153 | #include <stdio.h> | ||
| 154 | #include <stdlib.h> | ||
| 155 | #include <unistd.h> | ||
| 156 | #include <string.h> | ||
| 157 | #include <sys/mount.h> | ||
| 158 | #include <sys/fsuid.h> | ||
| 159 | |||
| 160 | #ifndef MS_REC | ||
| 161 | #define MS_REC 0x4000 /* 16384: Recursive loopback */ | ||
| 162 | #endif | ||
| 163 | |||
| 164 | #ifndef MS_SHARED | ||
| 165 | #define MS_SHARED 1<<20 /* Shared */ | ||
| 166 | #endif | ||
| 167 | |||
| 168 | #ifndef MS_PRIVATE | ||
| 169 | #define MS_PRIVATE 1<<18 /* Private */ | ||
| 170 | #endif | ||
| 171 | |||
| 172 | #ifndef MS_SLAVE | ||
| 173 | #define MS_SLAVE 1<<19 /* Slave */ | ||
| 174 | #endif | ||
| 175 | |||
| 176 | #ifndef MS_UNBINDABLE | ||
| 177 | #define MS_UNBINDABLE 1<<17 /* Unbindable */ | ||
| 178 | #endif | ||
| 179 | |||
| 180 | int main(int argc, char *argv[]) | ||
| 181 | { | ||
| 182 | int type; | ||
| 183 | if(argc != 3) { | ||
| 184 | fprintf(stderr, "usage: %s dir " | ||
| 185 | "<rshared|rslave|rprivate|runbindable|shared|slave" | ||
| 186 | "|private|unbindable>\n" , argv[0]); | ||
| 187 | return 1; | ||
| 188 | } | ||
| 189 | |||
| 190 | fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]); | ||
| 191 | |||
| 192 | if (strcmp(argv[2],"rshared")==0) | ||
| 193 | type=(MS_SHARED|MS_REC); | ||
| 194 | else if (strcmp(argv[2],"rslave")==0) | ||
| 195 | type=(MS_SLAVE|MS_REC); | ||
| 196 | else if (strcmp(argv[2],"rprivate")==0) | ||
| 197 | type=(MS_PRIVATE|MS_REC); | ||
| 198 | else if (strcmp(argv[2],"runbindable")==0) | ||
| 199 | type=(MS_UNBINDABLE|MS_REC); | ||
| 200 | else if (strcmp(argv[2],"shared")==0) | ||
| 201 | type=MS_SHARED; | ||
| 202 | else if (strcmp(argv[2],"slave")==0) | ||
| 203 | type=MS_SLAVE; | ||
| 204 | else if (strcmp(argv[2],"private")==0) | ||
| 205 | type=MS_PRIVATE; | ||
| 206 | else if (strcmp(argv[2],"unbindable")==0) | ||
| 207 | type=MS_UNBINDABLE; | ||
| 208 | else { | ||
| 209 | fprintf(stderr, "invalid operation: %s\n", argv[2]); | ||
| 210 | return 1; | ||
| 211 | } | ||
| 212 | setfsuid(getuid()); | ||
| 213 | |||
| 214 | if(mount("", argv[1], "dontcare", type, "") == -1) { | ||
| 215 | perror("mount"); | ||
| 216 | return 1; | ||
| 217 | } | ||
| 218 | return 0; | ||
| 219 | } | ||
| 220 | ----------------------------------------------------------------------- | ||
| 221 | |||
| 222 | Copy the above code snippet into smount.c | ||
| 223 | gcc -o smount smount.c | ||
| 224 | |||
| 225 | |||
| 226 | (i) To mark all the mounts under /mnt as shared execute the following | ||
| 227 | command: | ||
| 228 | |||
| 229 | smount /mnt rshared | ||
| 230 | the corresponding syntax planned for mount command is | ||
| 231 | mount --make-rshared /mnt | ||
| 232 | |||
| 233 | just to mark a mount /mnt as shared, execute the following | ||
| 234 | command: | ||
| 235 | smount /mnt shared | ||
| 236 | the corresponding syntax planned for mount command is | ||
| 237 | mount --make-shared /mnt | ||
| 238 | |||
| 239 | (ii) To mark all the shared mounts under /mnt as slave execute the | ||
| 240 | following | ||
| 241 | |||
| 242 | command: | ||
| 243 | smount /mnt rslave | ||
| 244 | the corresponding syntax planned for mount command is | ||
| 245 | mount --make-rslave /mnt | ||
| 246 | |||
| 247 | just to mark a mount /mnt as slave, execute the following | ||
| 248 | command: | ||
| 249 | smount /mnt slave | ||
| 250 | the corresponding syntax planned for mount command is | ||
| 251 | mount --make-slave /mnt | ||
| 252 | |||
| 253 | (iii) To mark all the mounts under /mnt as private execute the | ||
| 254 | following command: | ||
| 255 | |||
| 256 | smount /mnt rprivate | ||
| 257 | the corresponding syntax planned for mount command is | ||
| 258 | mount --make-rprivate /mnt | ||
| 259 | |||
| 260 | just to mark a mount /mnt as private, execute the following | ||
| 261 | command: | ||
| 262 | smount /mnt private | ||
| 263 | the corresponding syntax planned for mount command is | ||
| 264 | mount --make-private /mnt | ||
| 265 | |||
| 266 | NOTE: by default all the mounts are created as private. But if | ||
| 267 | you want to change some shared/slave/unbindable mount as | ||
| 268 | private at a later point in time, this command can help. | ||
| 269 | |||
| 270 | (iv) To mark all the mounts under /mnt as unbindable execute the | ||
| 271 | following | ||
| 272 | |||
| 273 | command: | ||
| 274 | smount /mnt runbindable | ||
| 275 | the corresponding syntax planned for mount command is | ||
| 276 | mount --make-runbindable /mnt | ||
| 277 | |||
| 278 | just to mark a mount /mnt as unbindable, execute the following | ||
| 279 | command: | ||
| 280 | smount /mnt unbindable | ||
| 281 | the corresponding syntax planned for mount command is | ||
| 282 | mount --make-unbindable /mnt | ||
| 283 | 147 | ||
| 284 | 148 | ||
| 285 | 4) Use cases | 149 | 4) Use cases |
| @@ -350,7 +214,7 @@ replicas continue to be exactly same. | |||
| 350 | mount --rbind / /view/v3 | 214 | mount --rbind / /view/v3 |
| 351 | mount --rbind / /view/v4 | 215 | mount --rbind / /view/v4 |
| 352 | 216 | ||
| 353 | and if /usr has a versioning filesystem mounted, than that | 217 | and if /usr has a versioning filesystem mounted, then that |
| 354 | mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and | 218 | mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and |
| 355 | /view/v4/usr too | 219 | /view/v4/usr too |
| 356 | 220 | ||
| @@ -390,7 +254,7 @@ replicas continue to be exactly same. | |||
| 390 | 254 | ||
| 391 | For example: | 255 | For example: |
| 392 | mount --make-shared /mnt | 256 | mount --make-shared /mnt |
| 393 | mount --bin /mnt /tmp | 257 | mount --bind /mnt /tmp |
| 394 | 258 | ||
| 395 | The mount at /mnt and that at /tmp are both shared and belong | 259 | The mount at /mnt and that at /tmp are both shared and belong |
| 396 | to the same peer group. Anything mounted or unmounted under | 260 | to the same peer group. Anything mounted or unmounted under |
| @@ -558,7 +422,7 @@ replicas continue to be exactly same. | |||
| 558 | then the subtree under the unbindable mount is pruned in the new | 422 | then the subtree under the unbindable mount is pruned in the new |
| 559 | location. | 423 | location. |
| 560 | 424 | ||
| 561 | eg: lets say we have the following mount tree. | 425 | eg: let's say we have the following mount tree. |
| 562 | 426 | ||
| 563 | A | 427 | A |
| 564 | / \ | 428 | / \ |
| @@ -566,7 +430,7 @@ replicas continue to be exactly same. | |||
| 566 | / \ / \ | 430 | / \ / \ |
| 567 | D E F G | 431 | D E F G |
| 568 | 432 | ||
| 569 | Lets say all the mount except the mount C in the tree are | 433 | Let's say all the mounts except the mount C in the tree are |
| 570 | of a type other than unbindable. | 434 | of a type other than unbindable. |
| 571 | 435 | ||
| 572 | If this tree is rbound to say Z | 436 | If this tree is rbound to say Z |
| @@ -683,13 +547,13 @@ replicas continue to be exactly same. | |||
| 683 | 'b' on mounts that receive propagation from mount 'B' and does not have | 547 | 'b' on mounts that receive propagation from mount 'B' and does not have |
| 684 | sub-mounts within them are unmounted. | 548 | sub-mounts within them are unmounted. |
| 685 | 549 | ||
| 686 | Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to | 550 | Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to |
| 687 | each other. | 551 | each other. |
| 688 | 552 | ||
| 689 | lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount | 553 | let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount |
| 690 | 'B1', 'B2' and 'B3' respectively. | 554 | 'B1', 'B2' and 'B3' respectively. |
| 691 | 555 | ||
| 692 | lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on | 556 | let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on |
| 693 | mount 'B1', 'B2' and 'B3' respectively. | 557 | mount 'B1', 'B2' and 'B3' respectively. |
| 694 | 558 | ||
| 695 | if 'C1' is unmounted, all the mounts that are most-recently-mounted on | 559 | if 'C1' is unmounted, all the mounts that are most-recently-mounted on |
| @@ -710,7 +574,7 @@ replicas continue to be exactly same. | |||
| 710 | A cloned namespace contains all the mounts as that of the parent | 574 | A cloned namespace contains all the mounts as that of the parent |
| 711 | namespace. | 575 | namespace. |
| 712 | 576 | ||
| 713 | Lets say 'A' and 'B' are the corresponding mounts in the parent and the | 577 | Let's say 'A' and 'B' are the corresponding mounts in the parent and the |
| 714 | child namespace. | 578 | child namespace. |
| 715 | 579 | ||
| 716 | If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to | 580 | If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to |
| @@ -759,11 +623,11 @@ replicas continue to be exactly same. | |||
| 759 | mount --make-slave /mnt | 623 | mount --make-slave /mnt |
| 760 | 624 | ||
| 761 | At this point we have the first mount at /tmp and | 625 | At this point we have the first mount at /tmp and |
| 762 | its root dentry is 1. Lets call this mount 'A' | 626 | its root dentry is 1. Let's call this mount 'A' |
| 763 | And then we have a second mount at /tmp1 with root | 627 | And then we have a second mount at /tmp1 with root |
| 764 | dentry 2. Lets call this mount 'B' | 628 | dentry 2. Let's call this mount 'B' |
| 765 | Next we have a third mount at /mnt with root dentry | 629 | Next we have a third mount at /mnt with root dentry |
| 766 | mnt. Lets call this mount 'C' | 630 | mnt. Let's call this mount 'C' |
| 767 | 631 | ||
| 768 | 'B' is the slave of 'A' and 'C' is a slave of 'B' | 632 | 'B' is the slave of 'A' and 'C' is a slave of 'B' |
| 769 | A -> B -> C | 633 | A -> B -> C |
| @@ -794,7 +658,7 @@ replicas continue to be exactly same. | |||
| 794 | 658 | ||
| 795 | Q3 Why is unbindable mount needed? | 659 | Q3 Why is unbindable mount needed? |
| 796 | 660 | ||
| 797 | Lets say we want to replicate the mount tree at multiple | 661 | Let's say we want to replicate the mount tree at multiple |
| 798 | locations within the same subtree. | 662 | locations within the same subtree. |
| 799 | 663 | ||
| 800 | if one rbind mounts a tree within the same subtree 'n' times | 664 | if one rbind mounts a tree within the same subtree 'n' times |
| @@ -803,7 +667,7 @@ replicas continue to be exactly same. | |||
| 803 | mounts. Here is an example. | 667 | mounts. Here is an example. |
| 804 | 668 | ||
| 805 | step 1: | 669 | step 1: |
| 806 | lets say the root tree has just two directories with | 670 | let's say the root tree has just two directories with |
| 807 | one vfsmount. | 671 | one vfsmount. |
| 808 | root | 672 | root |
| 809 | / \ | 673 | / \ |
| @@ -875,7 +739,7 @@ replicas continue to be exactly same. | |||
| 875 | Unclonable mounts come in handy here. | 739 | Unclonable mounts come in handy here. |
| 876 | 740 | ||
| 877 | step 1: | 741 | step 1: |
| 878 | lets say the root tree has just two directories with | 742 | let's say the root tree has just two directories with |
| 879 | one vfsmount. | 743 | one vfsmount. |
| 880 | root | 744 | root |
| 881 | / \ | 745 | / \ |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index f49eecf2e573..623f094c9d8d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
| @@ -536,6 +536,7 @@ struct address_space_operations { | |||
| 536 | /* migrate the contents of a page to the specified target */ | 536 | /* migrate the contents of a page to the specified target */ |
| 537 | int (*migratepage) (struct page *, struct page *); | 537 | int (*migratepage) (struct page *, struct page *); |
| 538 | int (*launder_page) (struct page *); | 538 | int (*launder_page) (struct page *); |
| 539 | int (*error_remove_page) (struct mapping *mapping, struct page *page); | ||
| 539 | }; | 540 | }; |
| 540 | 541 | ||
| 541 | writepage: called by the VM to write a dirty page to backing store. | 542 | writepage: called by the VM to write a dirty page to backing store. |
| @@ -694,6 +695,12 @@ struct address_space_operations { | |||
| 694 | prevent redirtying the page, it is kept locked during the whole | 695 | prevent redirtying the page, it is kept locked during the whole |
| 695 | operation. | 696 | operation. |
| 696 | 697 | ||
| 698 | error_remove_page: normally set to generic_error_remove_page if truncation | ||
| 699 | is ok for this address space. Used for memory failure handling. | ||
| 700 | Setting this implies you deal with pages going away under you, | ||
| 701 | unless you have them locked or reference counts increased. | ||
| 702 | |||
| 703 | |||
| 697 | The File Object | 704 | The File Object |
| 698 | =============== | 705 | =============== |
| 699 | 706 | ||
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index aafca0a8f66a..947374977ca5 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt | |||
| @@ -135,6 +135,7 @@ Code Seq# Include File Comments | |||
| 135 | <http://mikonos.dia.unisa.it/tcfs> | 135 | <http://mikonos.dia.unisa.it/tcfs> |
| 136 | 'l' 40-7F linux/udf_fs_i.h in development: | 136 | 'l' 40-7F linux/udf_fs_i.h in development: |
| 137 | <http://sourceforge.net/projects/linux-udf/> | 137 | <http://sourceforge.net/projects/linux-udf/> |
| 138 | 'm' 00-09 linux/mmtimer.h | ||
| 138 | 'm' all linux/mtio.h conflict! | 139 | 'm' all linux/mtio.h conflict! |
| 139 | 'm' all linux/soundcard.h conflict! | 140 | 'm' all linux/soundcard.h conflict! |
| 140 | 'm' all linux/synclink.h conflict! | 141 | 'm' all linux/synclink.h conflict! |
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 1458448436cc..62682500878a 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt | |||
| @@ -96,13 +96,16 @@ handles that the Linux kernel will allocate. When you get lots | |||
| 96 | of error messages about running out of file handles, you might | 96 | of error messages about running out of file handles, you might |
| 97 | want to increase this limit. | 97 | want to increase this limit. |
| 98 | 98 | ||
| 99 | The three values in file-nr denote the number of allocated | 99 | Historically, the three values in file-nr denoted the number of |
| 100 | file handles, the number of unused file handles and the maximum | 100 | allocated file handles, the number of allocated but unused file |
| 101 | number of file handles. When the allocated file handles come | 101 | handles, and the maximum number of file handles. Linux 2.6 always |
| 102 | close to the maximum, but the number of unused file handles is | 102 | reports 0 as the number of free file handles -- this is not an |
| 103 | significantly greater than 0, you've encountered a peak in your | 103 | error, it just means that the number of allocated file handles |
| 104 | usage of file handles and you don't need to increase the maximum. | 104 | exactly matches the number of used file handles. |
| 105 | 105 | ||
| 106 | Attempts to allocate more file descriptors than file-max are | ||
| 107 | reported with printk, look for "VFS: file-max limit <number> | ||
| 108 | reached". | ||
| 106 | ============================================================== | 109 | ============================================================== |
| 107 | 110 | ||
| 108 | nr_open: | 111 | nr_open: |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index b3d8b4922740..a028b92001ed 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
| @@ -22,6 +22,7 @@ show up in /proc/sys/kernel: | |||
| 22 | - callhome [ S390 only ] | 22 | - callhome [ S390 only ] |
| 23 | - auto_msgmni | 23 | - auto_msgmni |
| 24 | - core_pattern | 24 | - core_pattern |
| 25 | - core_pipe_limit | ||
| 25 | - core_uses_pid | 26 | - core_uses_pid |
| 26 | - ctrl-alt-del | 27 | - ctrl-alt-del |
| 27 | - dentry-state | 28 | - dentry-state |
| @@ -135,6 +136,27 @@ core_pattern is used to specify a core dumpfile pattern name. | |||
| 135 | 136 | ||
| 136 | ============================================================== | 137 | ============================================================== |
| 137 | 138 | ||
| 139 | core_pipe_limit: | ||
| 140 | |||
| 141 | This sysctl is only applicable when core_pattern is configured to pipe core | ||
| 142 | files to a user space helper (when the first character of core_pattern is a '|', | ||
| 143 | see above). When collecting cores via a pipe to an application, it is | ||
| 144 | occasionally useful for the collecting application to gather data about the | ||
| 145 | crashing process from its /proc/pid directory. In order to do this safely, the | ||
| 146 | kernel must wait for the collecting process to exit, so as not to remove the | ||
| 147 | crashing process's proc files prematurely. This in turn creates the possibility | ||
| 148 | that a misbehaving userspace collecting process can block the reaping of a | ||
| 149 | crashed process simply by never exiting. This sysctl defends against that. It | ||
| 150 | defines how many concurrent crashing processes may be piped to user space | ||
| 151 | applications in parallel. If this value is exceeded, then those crashing | ||
| 152 | processes above that value are noted via the kernel log and their cores are | ||
| 153 | skipped. 0 is a special value, indicating that unlimited processes may be | ||
| 154 | captured in parallel, but that no waiting will take place (i.e. the collecting | ||
| 155 | process is not guaranteed access to /proc/<crashing pid>/). This value defaults | ||
| 156 | to 0. | ||
| 157 | |||
| 158 | ============================================================== | ||
| 159 | |||
| 138 | core_uses_pid: | 160 | core_uses_pid: |
| 139 | 161 | ||
| 140 | The default coredump filename is "core". By setting | 162 | The default coredump filename is "core". By setting |
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index e6fb1ec2744b..a6e360d2055c 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
| @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm: | |||
| 32 | - legacy_va_layout | 32 | - legacy_va_layout |
| 33 | - lowmem_reserve_ratio | 33 | - lowmem_reserve_ratio |
| 34 | - max_map_count | 34 | - max_map_count |
| 35 | - memory_failure_early_kill | ||
| 36 | - memory_failure_recovery | ||
| 35 | - min_free_kbytes | 37 | - min_free_kbytes |
| 36 | - min_slab_ratio | 38 | - min_slab_ratio |
| 37 | - min_unmapped_ratio | 39 | - min_unmapped_ratio |
| @@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm: | |||
| 53 | - vfs_cache_pressure | 55 | - vfs_cache_pressure |
| 54 | - zone_reclaim_mode | 56 | - zone_reclaim_mode |
| 55 | 57 | ||
| 56 | |||
| 57 | ============================================================== | 58 | ============================================================== |
| 58 | 59 | ||
| 59 | block_dump | 60 | block_dump |
| @@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation. | |||
| 275 | 276 | ||
| 276 | The default value is 65536. | 277 | The default value is 65536. |
| 277 | 278 | ||
| 279 | ============================================================= | ||
| 280 | |||
| 281 | memory_failure_early_kill: | ||
| 282 | |||
| 283 | Control how to kill processes when an uncorrected memory error (typically | ||
| 284 | a 2bit error in a memory module) is detected in the background by hardware | ||
| 285 | that cannot be handled by the kernel. In some cases (like the page | ||
| 286 | still having a valid copy on disk) the kernel will handle the failure | ||
| 287 | transparently without affecting any applications. But if there is | ||
| 288 | no other uptodate copy of the data it will kill to prevent any data | ||
| 289 | corruptions from propagating. | ||
| 290 | |||
| 291 | 1: Kill all processes that have the corrupted and not reloadable page mapped | ||
| 292 | as soon as the corruption is detected. Note this is not supported | ||
| 293 | for a few types of pages, like kernel internally allocated data or | ||
| 294 | the swap cache, but works for the majority of user pages. | ||
| 295 | |||
| 296 | 0: Only unmap the corrupted page from all processes and only kill a process | ||
| 297 | who tries to access it. | ||
| 298 | |||
| 299 | The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can | ||
| 300 | handle this if they want to. | ||
| 301 | |||
| 302 | This is only active on architectures/platforms with advanced machine | ||
| 303 | check handling and depends on the hardware capabilities. | ||
| 304 | |||
| 305 | Applications can override this setting individually with the PR_MCE_KILL prctl | ||
| 306 | |||
| 307 | ============================================================== | ||
| 308 | |||
| 309 | memory_failure_recovery | ||
| 310 | |||
| 311 | Enable memory failure recovery (when supported by the platform) | ||
| 312 | |||
| 313 | 1: Attempt recovery. | ||
| 314 | |||
| 315 | 0: Always panic on a memory failure. | ||
| 316 | |||
| 278 | ============================================================== | 317 | ============================================================== |
| 279 | 318 | ||
| 280 | min_free_kbytes: | 319 | min_free_kbytes: |
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore index 33e8a023df02..09b164a5700f 100644 --- a/Documentation/vm/.gitignore +++ b/Documentation/vm/.gitignore | |||
| @@ -1 +1,2 @@ | |||
| 1 | page-types | ||
| 1 | slabinfo | 2 | slabinfo |
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 3eda8ea00852..fa1a30d9e9d5 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> | 5 | * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> |
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #define _LARGEFILE64_SOURCE | ||
| 8 | #include <stdio.h> | 9 | #include <stdio.h> |
| 9 | #include <stdlib.h> | 10 | #include <stdlib.h> |
| 10 | #include <unistd.h> | 11 | #include <unistd.h> |
| @@ -13,12 +14,33 @@ | |||
| 13 | #include <string.h> | 14 | #include <string.h> |
| 14 | #include <getopt.h> | 15 | #include <getopt.h> |
| 15 | #include <limits.h> | 16 | #include <limits.h> |
| 17 | #include <assert.h> | ||
| 16 | #include <sys/types.h> | 18 | #include <sys/types.h> |
| 17 | #include <sys/errno.h> | 19 | #include <sys/errno.h> |
| 18 | #include <sys/fcntl.h> | 20 | #include <sys/fcntl.h> |
| 19 | 21 | ||
| 20 | 22 | ||
| 21 | /* | 23 | /* |
| 24 | * pagemap kernel ABI bits | ||
| 25 | */ | ||
| 26 | |||
| 27 | #define PM_ENTRY_BYTES sizeof(uint64_t) | ||
| 28 | #define PM_STATUS_BITS 3 | ||
| 29 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) | ||
| 30 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) | ||
| 31 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) | ||
| 32 | #define PM_PSHIFT_BITS 6 | ||
| 33 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) | ||
| 34 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) | ||
| 35 | #define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) | ||
| 36 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) | ||
| 37 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) | ||
| 38 | |||
| 39 | #define PM_PRESENT PM_STATUS(4LL) | ||
| 40 | #define PM_SWAP PM_STATUS(2LL) | ||
| 41 | |||
| 42 | |||
| 43 | /* | ||
| 22 | * kernel page flags | 44 | * kernel page flags |
| 23 | */ | 45 | */ |
| 24 | 46 | ||
| @@ -126,6 +148,14 @@ static int nr_addr_ranges; | |||
| 126 | static unsigned long opt_offset[MAX_ADDR_RANGES]; | 148 | static unsigned long opt_offset[MAX_ADDR_RANGES]; |
| 127 | static unsigned long opt_size[MAX_ADDR_RANGES]; | 149 | static unsigned long opt_size[MAX_ADDR_RANGES]; |
| 128 | 150 | ||
| 151 | #define MAX_VMAS 10240 | ||
| 152 | static int nr_vmas; | ||
| 153 | static unsigned long pg_start[MAX_VMAS]; | ||
| 154 | static unsigned long pg_end[MAX_VMAS]; | ||
| 155 | static unsigned long voffset; | ||
| 156 | |||
| 157 | static int pagemap_fd; | ||
| 158 | |||
| 129 | #define MAX_BIT_FILTERS 64 | 159 | #define MAX_BIT_FILTERS 64 |
| 130 | static int nr_bit_filters; | 160 | static int nr_bit_filters; |
| 131 | static uint64_t opt_mask[MAX_BIT_FILTERS]; | 161 | static uint64_t opt_mask[MAX_BIT_FILTERS]; |
| @@ -135,7 +165,6 @@ static int page_size; | |||
| 135 | 165 | ||
| 136 | #define PAGES_BATCH (64 << 10) /* 64k pages */ | 166 | #define PAGES_BATCH (64 << 10) /* 64k pages */ |
| 137 | static int kpageflags_fd; | 167 | static int kpageflags_fd; |
| 138 | static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; | ||
| 139 | 168 | ||
| 140 | #define HASH_SHIFT 13 | 169 | #define HASH_SHIFT 13 |
| 141 | #define HASH_SIZE (1 << HASH_SHIFT) | 170 | #define HASH_SIZE (1 << HASH_SHIFT) |
| @@ -158,6 +187,11 @@ static uint64_t page_flags[HASH_SIZE]; | |||
| 158 | type __min2 = (y); \ | 187 | type __min2 = (y); \ |
| 159 | __min1 < __min2 ? __min1 : __min2; }) | 188 | __min1 < __min2 ? __min1 : __min2; }) |
| 160 | 189 | ||
| 190 | #define max_t(type, x, y) ({ \ | ||
| 191 | type __max1 = (x); \ | ||
| 192 | type __max2 = (y); \ | ||
| 193 | __max1 > __max2 ? __max1 : __max2; }) | ||
| 194 | |||
| 161 | static unsigned long pages2mb(unsigned long pages) | 195 | static unsigned long pages2mb(unsigned long pages) |
| 162 | { | 196 | { |
| 163 | return (pages * page_size) >> 20; | 197 | return (pages * page_size) >> 20; |
| @@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags) | |||
| 224 | static void show_page_range(unsigned long offset, uint64_t flags) | 258 | static void show_page_range(unsigned long offset, uint64_t flags) |
| 225 | { | 259 | { |
| 226 | static uint64_t flags0; | 260 | static uint64_t flags0; |
| 261 | static unsigned long voff; | ||
| 227 | static unsigned long index; | 262 | static unsigned long index; |
| 228 | static unsigned long count; | 263 | static unsigned long count; |
| 229 | 264 | ||
| 230 | if (flags == flags0 && offset == index + count) { | 265 | if (flags == flags0 && offset == index + count && |
| 266 | (!opt_pid || voffset == voff + count)) { | ||
| 231 | count++; | 267 | count++; |
| 232 | return; | 268 | return; |
| 233 | } | 269 | } |
| 234 | 270 | ||
| 235 | if (count) | 271 | if (count) { |
| 236 | printf("%lu\t%lu\t%s\n", | 272 | if (opt_pid) |
| 273 | printf("%lx\t", voff); | ||
| 274 | printf("%lx\t%lx\t%s\n", | ||
| 237 | index, count, page_flag_name(flags0)); | 275 | index, count, page_flag_name(flags0)); |
| 276 | } | ||
| 238 | 277 | ||
| 239 | flags0 = flags; | 278 | flags0 = flags; |
| 240 | index = offset; | 279 | index = offset; |
| 280 | voff = voffset; | ||
| 241 | count = 1; | 281 | count = 1; |
| 242 | } | 282 | } |
| 243 | 283 | ||
| 244 | static void show_page(unsigned long offset, uint64_t flags) | 284 | static void show_page(unsigned long offset, uint64_t flags) |
| 245 | { | 285 | { |
| 246 | printf("%lu\t%s\n", offset, page_flag_name(flags)); | 286 | if (opt_pid) |
| 287 | printf("%lx\t", voffset); | ||
| 288 | printf("%lx\t%s\n", offset, page_flag_name(flags)); | ||
| 247 | } | 289 | } |
| 248 | 290 | ||
| 249 | static void show_summary(void) | 291 | static void show_summary(void) |
| @@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count) | |||
| 383 | lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); | 425 | lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); |
| 384 | 426 | ||
| 385 | while (count) { | 427 | while (count) { |
| 428 | uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; | ||
| 429 | |||
| 386 | batch = min_t(unsigned long, count, PAGES_BATCH); | 430 | batch = min_t(unsigned long, count, PAGES_BATCH); |
| 387 | n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); | 431 | n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); |
| 388 | if (n == 0) | 432 | if (n == 0) |
| @@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count) | |||
| 404 | } | 448 | } |
| 405 | } | 449 | } |
| 406 | 450 | ||
| 451 | |||
| 452 | #define PAGEMAP_BATCH 4096 | ||
| 453 | static unsigned long task_pfn(unsigned long pgoff) | ||
| 454 | { | ||
| 455 | static uint64_t buf[PAGEMAP_BATCH]; | ||
| 456 | static unsigned long start; | ||
| 457 | static long count; | ||
| 458 | uint64_t pfn; | ||
| 459 | |||
| 460 | if (pgoff < start || pgoff >= start + count) { | ||
| 461 | if (lseek64(pagemap_fd, | ||
| 462 | (uint64_t)pgoff * PM_ENTRY_BYTES, | ||
| 463 | SEEK_SET) < 0) { | ||
| 464 | perror("pagemap seek"); | ||
| 465 | exit(EXIT_FAILURE); | ||
| 466 | } | ||
| 467 | count = read(pagemap_fd, buf, sizeof(buf)); | ||
| 468 | if (count == 0) | ||
| 469 | return 0; | ||
| 470 | if (count < 0) { | ||
| 471 | perror("pagemap read"); | ||
| 472 | exit(EXIT_FAILURE); | ||
| 473 | } | ||
| 474 | if (count % PM_ENTRY_BYTES) { | ||
| 475 | fatal("pagemap read not aligned.\n"); | ||
| 476 | exit(EXIT_FAILURE); | ||
| 477 | } | ||
| 478 | count /= PM_ENTRY_BYTES; | ||
| 479 | start = pgoff; | ||
| 480 | } | ||
| 481 | |||
| 482 | pfn = buf[pgoff - start]; | ||
| 483 | if (pfn & PM_PRESENT) | ||
| 484 | pfn = PM_PFRAME(pfn); | ||
| 485 | else | ||
| 486 | pfn = 0; | ||
| 487 | |||
| 488 | return pfn; | ||
| 489 | } | ||
| 490 | |||
| 491 | static void walk_task(unsigned long index, unsigned long count) | ||
| 492 | { | ||
| 493 | int i = 0; | ||
| 494 | const unsigned long end = index + count; | ||
| 495 | |||
| 496 | while (index < end) { | ||
| 497 | |||
| 498 | while (pg_end[i] <= index) | ||
| 499 | if (++i >= nr_vmas) | ||
| 500 | return; | ||
| 501 | if (pg_start[i] >= end) | ||
| 502 | return; | ||
| 503 | |||
| 504 | voffset = max_t(unsigned long, pg_start[i], index); | ||
| 505 | index = min_t(unsigned long, pg_end[i], end); | ||
| 506 | |||
| 507 | assert(voffset < index); | ||
| 508 | for (; voffset < index; voffset++) { | ||
| 509 | unsigned long pfn = task_pfn(voffset); | ||
| 510 | if (pfn) | ||
| 511 | walk_pfn(pfn, 1); | ||
| 512 | } | ||
| 513 | } | ||
| 514 | } | ||
| 515 | |||
| 516 | static void add_addr_range(unsigned long offset, unsigned long size) | ||
| 517 | { | ||
| 518 | if (nr_addr_ranges >= MAX_ADDR_RANGES) | ||
| 519 | fatal("too many addr ranges\n"); | ||
| 520 | |||
| 521 | opt_offset[nr_addr_ranges] = offset; | ||
| 522 | opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); | ||
| 523 | nr_addr_ranges++; | ||
| 524 | } | ||
| 525 | |||
| 407 | static void walk_addr_ranges(void) | 526 | static void walk_addr_ranges(void) |
| 408 | { | 527 | { |
| 409 | int i; | 528 | int i; |
| @@ -415,10 +534,13 @@ static void walk_addr_ranges(void) | |||
| 415 | } | 534 | } |
| 416 | 535 | ||
| 417 | if (!nr_addr_ranges) | 536 | if (!nr_addr_ranges) |
| 418 | walk_pfn(0, ULONG_MAX); | 537 | add_addr_range(0, ULONG_MAX); |
| 419 | 538 | ||
| 420 | for (i = 0; i < nr_addr_ranges; i++) | 539 | for (i = 0; i < nr_addr_ranges; i++) |
| 421 | walk_pfn(opt_offset[i], opt_size[i]); | 540 | if (!opt_pid) |
| 541 | walk_pfn(opt_offset[i], opt_size[i]); | ||
| 542 | else | ||
| 543 | walk_task(opt_offset[i], opt_size[i]); | ||
| 422 | 544 | ||
| 423 | close(kpageflags_fd); | 545 | close(kpageflags_fd); |
| 424 | } | 546 | } |
| @@ -446,8 +568,8 @@ static void usage(void) | |||
| 446 | " -r|--raw Raw mode, for kernel developers\n" | 568 | " -r|--raw Raw mode, for kernel developers\n" |
| 447 | " -a|--addr addr-spec Walk a range of pages\n" | 569 | " -a|--addr addr-spec Walk a range of pages\n" |
| 448 | " -b|--bits bits-spec Walk pages with specified bits\n" | 570 | " -b|--bits bits-spec Walk pages with specified bits\n" |
| 449 | #if 0 /* planned features */ | ||
| 450 | " -p|--pid pid Walk process address space\n" | 571 | " -p|--pid pid Walk process address space\n" |
| 572 | #if 0 /* planned features */ | ||
| 451 | " -f|--file filename Walk file address space\n" | 573 | " -f|--file filename Walk file address space\n" |
| 452 | #endif | 574 | #endif |
| 453 | " -l|--list Show page details in ranges\n" | 575 | " -l|--list Show page details in ranges\n" |
| @@ -459,7 +581,7 @@ static void usage(void) | |||
| 459 | " N+M pages range from N to N+M-1\n" | 581 | " N+M pages range from N to N+M-1\n" |
| 460 | " N,M pages range from N to M-1\n" | 582 | " N,M pages range from N to M-1\n" |
| 461 | " N, pages range from N to end\n" | 583 | " N, pages range from N to end\n" |
| 462 | " ,M pages range from 0 to M\n" | 584 | " ,M pages range from 0 to M-1\n" |
| 463 | "bits-spec:\n" | 585 | "bits-spec:\n" |
| 464 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" | 586 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" |
| 465 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" | 587 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" |
| @@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str) | |||
| 496 | 618 | ||
| 497 | static void parse_pid(const char *str) | 619 | static void parse_pid(const char *str) |
| 498 | { | 620 | { |
| 621 | FILE *file; | ||
| 622 | char buf[5000]; | ||
| 623 | |||
| 499 | opt_pid = parse_number(str); | 624 | opt_pid = parse_number(str); |
| 500 | } | ||
| 501 | 625 | ||
| 502 | static void parse_file(const char *name) | 626 | sprintf(buf, "/proc/%d/pagemap", opt_pid); |
| 503 | { | 627 | pagemap_fd = open(buf, O_RDONLY); |
| 628 | if (pagemap_fd < 0) { | ||
| 629 | perror(buf); | ||
| 630 | exit(EXIT_FAILURE); | ||
| 631 | } | ||
| 632 | |||
| 633 | sprintf(buf, "/proc/%d/maps", opt_pid); | ||
| 634 | file = fopen(buf, "r"); | ||
| 635 | if (!file) { | ||
| 636 | perror(buf); | ||
| 637 | exit(EXIT_FAILURE); | ||
| 638 | } | ||
| 639 | |||
| 640 | while (fgets(buf, sizeof(buf), file) != NULL) { | ||
| 641 | unsigned long vm_start; | ||
| 642 | unsigned long vm_end; | ||
| 643 | unsigned long long pgoff; | ||
| 644 | int major, minor; | ||
| 645 | char r, w, x, s; | ||
| 646 | unsigned long ino; | ||
| 647 | int n; | ||
| 648 | |||
| 649 | n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", | ||
| 650 | &vm_start, | ||
| 651 | &vm_end, | ||
| 652 | &r, &w, &x, &s, | ||
| 653 | &pgoff, | ||
| 654 | &major, &minor, | ||
| 655 | &ino); | ||
| 656 | if (n < 10) { | ||
| 657 | fprintf(stderr, "unexpected line: %s\n", buf); | ||
| 658 | continue; | ||
| 659 | } | ||
| 660 | pg_start[nr_vmas] = vm_start / page_size; | ||
| 661 | pg_end[nr_vmas] = vm_end / page_size; | ||
| 662 | if (++nr_vmas >= MAX_VMAS) { | ||
| 663 | fprintf(stderr, "too many VMAs\n"); | ||
| 664 | break; | ||
| 665 | } | ||
| 666 | } | ||
| 667 | fclose(file); | ||
| 504 | } | 668 | } |
| 505 | 669 | ||
| 506 | static void add_addr_range(unsigned long offset, unsigned long size) | 670 | static void parse_file(const char *name) |
| 507 | { | 671 | { |
| 508 | if (nr_addr_ranges >= MAX_ADDR_RANGES) | ||
| 509 | fatal("too much addr ranges\n"); | ||
| 510 | |||
| 511 | opt_offset[nr_addr_ranges] = offset; | ||
| 512 | opt_size[nr_addr_ranges] = size; | ||
| 513 | nr_addr_ranges++; | ||
| 514 | } | 672 | } |
| 515 | 673 | ||
| 516 | static void parse_addr_range(const char *optarg) | 674 | static void parse_addr_range(const char *optarg) |
| @@ -676,8 +834,10 @@ int main(int argc, char *argv[]) | |||
| 676 | } | 834 | } |
| 677 | } | 835 | } |
| 678 | 836 | ||
| 837 | if (opt_list && opt_pid) | ||
| 838 | printf("voffset\t"); | ||
| 679 | if (opt_list == 1) | 839 | if (opt_list == 1) |
| 680 | printf("offset\tcount\tflags\n"); | 840 | printf("offset\tlen\tflags\n"); |
| 681 | if (opt_list == 2) | 841 | if (opt_list == 2) |
| 682 | printf("offset\tflags\n"); | 842 | printf("offset\tflags\n"); |
| 683 | 843 | ||
