diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/00-INDEX | 4 | ||||
-rw-r--r-- | Documentation/filesystems/Locking | 18 | ||||
-rw-r--r-- | Documentation/filesystems/Makefile | 8 | ||||
-rw-r--r-- | Documentation/filesystems/dentry-locking.txt | 3 | ||||
-rw-r--r-- | Documentation/filesystems/dnotify.txt | 39 | ||||
-rw-r--r-- | Documentation/filesystems/dnotify_test.c | 34 | ||||
-rw-r--r-- | Documentation/filesystems/logfs.txt | 241 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/nfs41-server.txt | 5 | ||||
-rw-r--r-- | Documentation/filesystems/nilfs2.txt | 3 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 55 | ||||
-rw-r--r-- | Documentation/filesystems/sharedsubtree.txt | 16 |
11 files changed, 367 insertions, 59 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 875d49696b6e..3bae418c6ad3 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
@@ -32,6 +32,8 @@ dlmfs.txt | |||
32 | - info on the userspace interface to the OCFS2 DLM. | 32 | - info on the userspace interface to the OCFS2 DLM. |
33 | dnotify.txt | 33 | dnotify.txt |
34 | - info about directory notification in Linux. | 34 | - info about directory notification in Linux. |
35 | dnotify_test.c | ||
36 | - example program for dnotify | ||
35 | ecryptfs.txt | 37 | ecryptfs.txt |
36 | - docs on eCryptfs: stacked cryptographic filesystem for Linux. | 38 | - docs on eCryptfs: stacked cryptographic filesystem for Linux. |
37 | exofs.txt | 39 | exofs.txt |
@@ -62,6 +64,8 @@ jfs.txt | |||
62 | - info and mount options for the JFS filesystem. | 64 | - info and mount options for the JFS filesystem. |
63 | locks.txt | 65 | locks.txt |
64 | - info on file locking implementations, flock() vs. fcntl(), etc. | 66 | - info on file locking implementations, flock() vs. fcntl(), etc. |
67 | logfs.txt | ||
68 | - info on the LogFS flash filesystem. | ||
65 | mandatory-locking.txt | 69 | mandatory-locking.txt |
66 | - info on the Linux implementation of Sys V mandatory file locking. | 70 | - info on the Linux implementation of Sys V mandatory file locking. |
67 | ncpfs.txt | 71 | ncpfs.txt |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca0630..06bbbed71206 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -460,13 +460,6 @@ in sys_read() and friends. | |||
460 | 460 | ||
461 | --------------------------- dquot_operations ------------------------------- | 461 | --------------------------- dquot_operations ------------------------------- |
462 | prototypes: | 462 | prototypes: |
463 | int (*initialize) (struct inode *, int); | ||
464 | int (*drop) (struct inode *); | ||
465 | int (*alloc_space) (struct inode *, qsize_t, int); | ||
466 | int (*alloc_inode) (const struct inode *, unsigned long); | ||
467 | int (*free_space) (struct inode *, qsize_t); | ||
468 | int (*free_inode) (const struct inode *, unsigned long); | ||
469 | int (*transfer) (struct inode *, struct iattr *); | ||
470 | int (*write_dquot) (struct dquot *); | 463 | int (*write_dquot) (struct dquot *); |
471 | int (*acquire_dquot) (struct dquot *); | 464 | int (*acquire_dquot) (struct dquot *); |
472 | int (*release_dquot) (struct dquot *); | 465 | int (*release_dquot) (struct dquot *); |
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. | |||
479 | What filesystem should expect from the generic quota functions: | 472 | What filesystem should expect from the generic quota functions: |
480 | 473 | ||
481 | FS recursion Held locks when called | 474 | FS recursion Held locks when called |
482 | initialize: yes maybe dqonoff_sem | ||
483 | drop: yes - | ||
484 | alloc_space: ->mark_dirty() - | ||
485 | alloc_inode: ->mark_dirty() - | ||
486 | free_space: ->mark_dirty() - | ||
487 | free_inode: ->mark_dirty() - | ||
488 | transfer: yes - | ||
489 | write_dquot: yes dqonoff_sem or dqptr_sem | 475 | write_dquot: yes dqonoff_sem or dqptr_sem |
490 | acquire_dquot: yes dqonoff_sem or dqptr_sem | 476 | acquire_dquot: yes dqonoff_sem or dqptr_sem |
491 | release_dquot: yes dqonoff_sem or dqptr_sem | 477 | release_dquot: yes dqonoff_sem or dqptr_sem |
@@ -495,10 +481,6 @@ write_info: yes dqonoff_sem | |||
495 | FS recursion means calling ->quota_read() and ->quota_write() from superblock | 481 | FS recursion means calling ->quota_read() and ->quota_write() from superblock |
496 | operations. | 482 | operations. |
497 | 483 | ||
498 | ->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called | ||
499 | only directly by the filesystem and do not call any fs functions only | ||
500 | the ->mark_dirty() operation. | ||
501 | |||
502 | More details about quota locking can be found in fs/dquot.c. | 484 | More details about quota locking can be found in fs/dquot.c. |
503 | 485 | ||
504 | --------------------------- vm_operations_struct ----------------------------- | 486 | --------------------------- vm_operations_struct ----------------------------- |
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile new file mode 100644 index 000000000000..a5dd114da14f --- /dev/null +++ b/Documentation/filesystems/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # kbuild trick to avoid linker error. Can be omitted if a module is built. | ||
2 | obj- := dummy.o | ||
3 | |||
4 | # List of programs to build | ||
5 | hostprogs-y := dnotify_test | ||
6 | |||
7 | # Tell kbuild to always build the programs | ||
8 | always := $(hostprogs-y) | ||
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 4c0c575a4012..79334ed5daa7 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt | |||
@@ -62,7 +62,8 @@ changes are : | |||
62 | 2. Insertion of a dentry into the hash table is done using | 62 | 2. Insertion of a dentry into the hash table is done using |
63 | hlist_add_head_rcu() which take care of ordering the writes - the | 63 | hlist_add_head_rcu() which take care of ordering the writes - the |
64 | writes to the dentry must be visible before the dentry is | 64 | writes to the dentry must be visible before the dentry is |
65 | inserted. This works in conjunction with hlist_for_each_rcu() while | 65 | inserted. This works in conjunction with hlist_for_each_rcu(), |
66 | which has since been replaced by hlist_for_each_entry_rcu(), while | ||
66 | walking the hash chain. The only requirement is that all | 67 | walking the hash chain. The only requirement is that all |
67 | initialization to the dentry must be done before | 68 | initialization to the dentry must be done before |
68 | hlist_add_head_rcu() since we don't have dcache_lock protection | 69 | hlist_add_head_rcu() since we don't have dcache_lock protection |
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 9f5d338ddbb8..6baf88f46859 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt | |||
@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. | |||
62 | 62 | ||
63 | Example | 63 | Example |
64 | ------- | 64 | ------- |
65 | See Documentation/filesystems/dnotify_test.c for an example. | ||
65 | 66 | ||
66 | #define _GNU_SOURCE /* needed to get the defines */ | 67 | NOTE |
67 | #include <fcntl.h> /* in glibc 2.2 this has the needed | 68 | ---- |
68 | values defined */ | 69 | Beginning with Linux 2.6.13, dnotify has been replaced by inotify. |
69 | #include <signal.h> | 70 | See Documentation/filesystems/inotify.txt for more information on it. |
70 | #include <stdio.h> | ||
71 | #include <unistd.h> | ||
72 | |||
73 | static volatile int event_fd; | ||
74 | |||
75 | static void handler(int sig, siginfo_t *si, void *data) | ||
76 | { | ||
77 | event_fd = si->si_fd; | ||
78 | } | ||
79 | |||
80 | int main(void) | ||
81 | { | ||
82 | struct sigaction act; | ||
83 | int fd; | ||
84 | |||
85 | act.sa_sigaction = handler; | ||
86 | sigemptyset(&act.sa_mask); | ||
87 | act.sa_flags = SA_SIGINFO; | ||
88 | sigaction(SIGRTMIN + 1, &act, NULL); | ||
89 | |||
90 | fd = open(".", O_RDONLY); | ||
91 | fcntl(fd, F_SETSIG, SIGRTMIN + 1); | ||
92 | fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); | ||
93 | /* we will now be notified if any of the files | ||
94 | in "." is modified or new files are created */ | ||
95 | while (1) { | ||
96 | pause(); | ||
97 | printf("Got event on fd=%d\n", event_fd); | ||
98 | } | ||
99 | } | ||
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c new file mode 100644 index 000000000000..8b37b4a1e18d --- /dev/null +++ b/Documentation/filesystems/dnotify_test.c | |||
@@ -0,0 +1,34 @@ | |||
1 | #define _GNU_SOURCE /* needed to get the defines */ | ||
2 | #include <fcntl.h> /* in glibc 2.2 this has the needed | ||
3 | values defined */ | ||
4 | #include <signal.h> | ||
5 | #include <stdio.h> | ||
6 | #include <unistd.h> | ||
7 | |||
8 | static volatile int event_fd; | ||
9 | |||
10 | static void handler(int sig, siginfo_t *si, void *data) | ||
11 | { | ||
12 | event_fd = si->si_fd; | ||
13 | } | ||
14 | |||
15 | int main(void) | ||
16 | { | ||
17 | struct sigaction act; | ||
18 | int fd; | ||
19 | |||
20 | act.sa_sigaction = handler; | ||
21 | sigemptyset(&act.sa_mask); | ||
22 | act.sa_flags = SA_SIGINFO; | ||
23 | sigaction(SIGRTMIN + 1, &act, NULL); | ||
24 | |||
25 | fd = open(".", O_RDONLY); | ||
26 | fcntl(fd, F_SETSIG, SIGRTMIN + 1); | ||
27 | fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); | ||
28 | /* we will now be notified if any of the files | ||
29 | in "." is modified or new files are created */ | ||
30 | while (1) { | ||
31 | pause(); | ||
32 | printf("Got event on fd=%d\n", event_fd); | ||
33 | } | ||
34 | } | ||
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 000000000000..e64c94ba401a --- /dev/null +++ b/Documentation/filesystems/logfs.txt | |||
@@ -0,0 +1,241 @@ | |||
1 | |||
2 | The LogFS Flash Filesystem | ||
3 | ========================== | ||
4 | |||
5 | Specification | ||
6 | ============= | ||
7 | |||
8 | Superblocks | ||
9 | ----------- | ||
10 | |||
11 | Two superblocks exist at the beginning and end of the filesystem. | ||
12 | Each superblock is 256 Bytes large, with another 3840 Bytes reserved | ||
13 | for future purposes, making a total of 4096 Bytes. | ||
14 | |||
15 | Superblock locations may differ for MTD and block devices. On MTD the | ||
16 | first non-bad block contains a superblock in the first 4096 Bytes and | ||
17 | the last non-bad block contains a superblock in the last 4096 Bytes. | ||
18 | On block devices, the first 4096 Bytes of the device contain the first | ||
19 | superblock and the last aligned 4096 Byte-block contains the second | ||
20 | superblock. | ||
21 | |||
22 | For the most part, the superblocks can be considered read-only. They | ||
23 | are written only to correct errors detected within the superblocks, | ||
24 | move the journal and change the filesystem parameters through tunefs. | ||
25 | As a result, the superblock does not contain any fields that require | ||
26 | constant updates, like the amount of free space, etc. | ||
27 | |||
28 | Segments | ||
29 | -------- | ||
30 | |||
31 | The space in the device is split up into equal-sized segments. | ||
32 | Segments are the primary write unit of LogFS. Within each segment, | ||
33 | writes happen from front (low addresses) to back (high addresses). If | ||
34 | only a partial segment has been written, the segment number, the | ||
35 | current position within and optionally a write buffer are stored in | ||
36 | the journal. | ||
37 | |||
38 | Segments are erased as a whole. Therefore Garbage Collection may be | ||
39 | required to completely free a segment before doing so. | ||
40 | |||
41 | Journal | ||
42 | -------- | ||
43 | |||
44 | The journal contains all global information about the filesystem that | ||
45 | is subject to frequent change. At mount time, it has to be scanned | ||
46 | for the most recent commit entry, which contains a list of pointers to | ||
47 | all currently valid entries. | ||
48 | |||
49 | Object Store | ||
50 | ------------ | ||
51 | |||
52 | All space except for the superblocks and journal is part of the object | ||
53 | store. Each segment contains a segment header and a number of | ||
54 | objects, each consisting of the object header and the payload. | ||
55 | Objects are either inodes, directory entries (dentries), file data | ||
56 | blocks or indirect blocks. | ||
57 | |||
58 | Levels | ||
59 | ------ | ||
60 | |||
61 | Garbage collection (GC) may fail if all data is written | ||
62 | indiscriminately. One requirement of GC is that data is separated | ||
63 | roughly according to the distance between the tree root and the data. | ||
64 | Effectively that means all file data is on level 0, indirect blocks | ||
65 | are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, | ||
66 | respectively. Inode file data is on level 6 for the inodes and 7-11 | ||
67 | for indirect blocks. | ||
68 | |||
69 | Each segment contains objects of a single level only. As a result, | ||
70 | each level requires its own separate segment to be open for writing. | ||
71 | |||
72 | Inode File | ||
73 | ---------- | ||
74 | |||
75 | All inodes are stored in a special file, the inode file. Single | ||
76 | exception is the inode file's inode (master inode) which for obvious | ||
77 | reasons is stored in the journal instead. Instead of data blocks, the | ||
78 | leaf nodes of the inode files are inodes. | ||
79 | |||
80 | Aliases | ||
81 | ------- | ||
82 | |||
83 | Writes in LogFS are done by means of a wandering tree. A naïve | ||
84 | implementation would require that for each write of a block, all | ||
85 | parent blocks are written as well, since the block pointers have | ||
86 | changed. Such an implementation would not be very efficient. | ||
87 | |||
88 | In LogFS, the block pointer changes are cached in the journal by means | ||
89 | of alias entries. Each alias consists of its logical address - inode | ||
90 | number, block index, level and child number (index into block) - and | ||
91 | the changed data. Any 8-byte word can be changed in this manner. | ||
92 | |||
93 | Currently aliases are used for block pointers, file size, file used | ||
94 | bytes and the height of an inode's indirect tree. | ||
95 | |||
96 | Segment Aliases | ||
97 | --------------- | ||
98 | |||
99 | Related to regular aliases, these are used to handle bad blocks. | ||
100 | Initially, bad blocks are handled by moving the affected segment | ||
101 | content to a spare segment and noting this move in the journal with a | ||
102 | segment alias, a simple (to, from) tuple. GC will later empty this | ||
103 | segment and the alias can be removed again. This is used on MTD only. | ||
104 | |||
105 | Vim | ||
106 | --- | ||
107 | |||
108 | By cleverly predicting the life time of data, it is possible to | ||
109 | separate long-living data from short-living data and thereby reduce | ||
110 | the GC overhead later. Each type of distinct life expectancy (vim) can | ||
111 | have a separate segment open for writing. Each (level, vim) tuple can | ||
112 | be open just once. If an open segment with unknown vim is encountered | ||
113 | at mount time, it is closed and ignored henceforth. | ||
114 | |||
115 | Indirect Tree | ||
116 | ------------- | ||
117 | |||
118 | Inodes in LogFS are similar to FFS-style filesystems with direct and | ||
119 | indirect block pointers. One difference is that LogFS uses a single | ||
120 | indirect pointer that can be either a 1x, 2x, etc. indirect pointer. | ||
121 | A height field in the inode defines the height of the indirect tree | ||
122 | and thereby the indirection of the pointer. | ||
123 | |||
124 | Another difference is the addressing of indirect blocks. In LogFS, | ||
125 | the first 16 pointers in the first indirect block are left empty, | ||
126 | corresponding to the 16 direct pointers in the inode. In ext2 (maybe | ||
127 | others as well) the first pointer in the first indirect block | ||
128 | corresponds to logical block 12, skipping the 12 direct pointers. | ||
129 | So where ext2 is using arithmetic to better utilize space, LogFS keeps | ||
130 | arithmetic simple and uses compression to save space. | ||
131 | |||
132 | Compression | ||
133 | ----------- | ||
134 | |||
135 | Both file data and metadata can be compressed. Compression for file | ||
136 | data can be enabled with chattr +c and disabled with chattr -c. Doing | ||
137 | so has no effect on existing data, but new data will be stored | ||
138 | accordingly. New inodes will inherit the compression flag of the | ||
139 | parent directory. | ||
140 | |||
141 | Metadata is always compressed. However, the space accounting ignores | ||
142 | this and charges for the uncompressed size. Failing to do so could | ||
143 | result in GC failures when, after moving some data, indirect blocks | ||
144 | compress worse than previously. Even on a 100% full medium, GC may | ||
145 | not consume any extra space, so the compression gains are lost space | ||
146 | to the user. | ||
147 | |||
148 | However, they are not lost space to the filesystem internals. By | ||
149 | cheating the user for those bytes, the filesystem gained some slack | ||
150 | space and GC will run less often and faster. | ||
151 | |||
152 | Garbage Collection and Wear Leveling | ||
153 | ------------------------------------ | ||
154 | |||
155 | Garbage collection is invoked whenever the number of free segments | ||
156 | falls below a threshold. The best (known) candidate is picked based | ||
157 | on the least amount of valid data contained in the segment. All | ||
158 | remaining valid data is copied elsewhere, thereby invalidating it. | ||
159 | |||
160 | The GC code also checks for aliases and writes them back if their | ||
161 | number gets too large. | ||
162 | |||
163 | Wear leveling is done by occasionally picking a suboptimal segment for | ||
164 | garbage collection. If a stale segment's erase count is significantly | ||
165 | lower than the active segments' erase counts, it will be picked. Wear | ||
166 | leveling is rate limited, so it will never monopolize the device for | ||
167 | more than one segment worth at a time. | ||
168 | |||
169 | Values for "occasionally", "significantly lower" are compile time | ||
170 | constants. | ||
171 | |||
172 | Hashed directories | ||
173 | ------------------ | ||
174 | |||
175 | To satisfy efficient lookup(), directory entries are hashed and | ||
176 | located based on the hash. In order to both support large directories | ||
177 | and not be overly inefficient for small directories, several hash | ||
178 | tables of increasing size are used. For each table, the hash value | ||
179 | modulo the table size gives the table index. | ||
180 | |||
181 | Table sizes are chosen to limit the number of indirect blocks with a | ||
182 | fully populated table to 0, 1, 2 or 3 respectively. So the first | ||
183 | table contains 16 entries, the second 512-16, etc. | ||
184 | |||
185 | The last table is special in several ways. First its size depends on | ||
186 | the effective 32bit limit on telldir/seekdir cookies. Since logfs | ||
187 | uses the upper half of the address space for indirect blocks, the size | ||
188 | is limited to 2^31. Secondly the table contains hash buckets with 16 | ||
189 | entries each. | ||
190 | |||
191 | Using single-entry buckets would result in birthday "attacks". At | ||
192 | just 2^16 used entries, hash collisions would be likely (P >= 0.5). | ||
193 | My math skills are insufficient to do the combinatorics for the 17x | ||
194 | collisions necessary to overflow a bucket, but testing showed that in | ||
195 | 10,000 runs the lowest directory fill before a bucket overflow was | ||
196 | 188,057,130 entries with an average of 315,149,915 entries. So for | ||
197 | directory sizes of up to a million, bucket overflows should be | ||
198 | virtually impossible under normal circumstances. | ||
199 | |||
200 | With carefully chosen filenames, it is obviously possible to cause an | ||
201 | overflow with just 21 entries (4 higher tables + 16 entries + 1). So | ||
202 | there may be a security concern if a malicious user has write access | ||
203 | to a directory. | ||
204 | |||
205 | Open For Discussion | ||
206 | =================== | ||
207 | |||
208 | Device Address Space | ||
209 | -------------------- | ||
210 | |||
211 | A device address space is used for caching. Both block devices and | ||
212 | MTD provide functions to either read a single page or write a segment. | ||
213 | Partial segments may be written for data integrity, but where possible | ||
214 | complete segments are written for performance on simple block device | ||
215 | flash media. | ||
216 | |||
217 | Meta Inodes | ||
218 | ----------- | ||
219 | |||
220 | Inodes are stored in the inode file, which is just a regular file for | ||
221 | most purposes. At umount time, however, the inode file needs to | ||
222 | remain open until all dirty inodes are written. So | ||
223 | generic_shutdown_super() may not close this inode, but shouldn't | ||
224 | complain about remaining inodes due to the inode file either. Same | ||
225 | goes for mapping inode of the device address space. | ||
226 | |||
227 | Currently logfs uses a hack that essentially copies part of fs/inode.c | ||
228 | code over. A general solution would be preferred. | ||
229 | |||
230 | Indirect block mapping | ||
231 | ---------------------- | ||
232 | |||
233 | With compression, the block device (or mapping inode) cannot be used | ||
234 | to cache indirect blocks. Some other place is required. Currently | ||
235 | logfs uses the top half of each inode's address space. The low 8TB | ||
236 | (on 32bit) are filled with file data, the high 8TB are used for | ||
237 | indirect blocks. | ||
238 | |||
239 | One problem is that 16TB files created on 64bit systems actually have | ||
240 | data in the top 8TB. But files >16TB would cause problems anyway, so | ||
241 | only the limit has changed. | ||
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 1bd0d0c05171..6a53a84afc72 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt | |||
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4 | |||
17 | on or off; rpc.nfsd does this correctly.) | 17 | on or off; rpc.nfsd does this correctly.) |
18 | 18 | ||
19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based | 19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based |
20 | on the latest NFSv4.1 Internet Draft: | 20 | on RFC 5661. |
21 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 | ||
22 | 21 | ||
23 | From the many new features in NFSv4.1 the current implementation | 22 | From the many new features in NFSv4.1 the current implementation |
24 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing | 23 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing |
@@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues: | |||
44 | trunking, but this is a mandatory feature, and its use is | 43 | trunking, but this is a mandatory feature, and its use is |
45 | recommended to clients in a number of places. (E.g. to ensure | 44 | recommended to clients in a number of places. (E.g. to ensure |
46 | timely renewal in case an existing connection's retry timeouts | 45 | timely renewal in case an existing connection's retry timeouts |
47 | have gotten too long; see section 8.3 of the draft.) | 46 | have gotten too long; see section 8.3 of the RFC.) |
48 | Therefore, lack of this feature may cause future clients to | 47 | Therefore, lack of this feature may cause future clients to |
49 | fail. | 48 | fail. |
50 | - Incomplete backchannel support: incomplete backchannel gss | 49 | - Incomplete backchannel support: incomplete backchannel gss |
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 839efd8a8a8c..cf6d0d85ca82 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt | |||
@@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount. | |||
74 | This disables every write access on the device for | 74 | This disables every write access on the device for |
75 | read-only mounts or snapshots. This option will fail | 75 | read-only mounts or snapshots. This option will fail |
76 | for r/w mounts on an unclean volume. | 76 | for r/w mounts on an unclean volume. |
77 | discard Issue discard/TRIM commands to the underlying block | ||
78 | device when blocks are freed. This is useful for SSD | ||
79 | devices and sparse/thinly-provisioned LUNs. | ||
77 | 80 | ||
78 | NILFS2 usage | 81 | NILFS2 usage |
79 | ============ | 82 | ============ |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a6..a4f30faa4f1f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -164,6 +164,7 @@ read the file /proc/PID/status: | |||
164 | VmExe: 68 kB | 164 | VmExe: 68 kB |
165 | VmLib: 1412 kB | 165 | VmLib: 1412 kB |
166 | VmPTE: 20 kb | 166 | VmPTE: 20 kb |
167 | VmSwap: 0 kB | ||
167 | Threads: 1 | 168 | Threads: 1 |
168 | SigQ: 0/28578 | 169 | SigQ: 0/28578 |
169 | SigPnd: 0000000000000000 | 170 | SigPnd: 0000000000000000 |
@@ -188,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file | |||
188 | contains details information about the process itself. Its fields are | 189 | contains details information about the process itself. Its fields are |
189 | explained in Table 1-4. | 190 | explained in Table 1-4. |
190 | 191 | ||
191 | Table 1-2: Contents of the statm files (as of 2.6.30-rc7) | 192 | (for SMP CONFIG users) |
193 | For making accounting scalable, RSS related information is handled in an | ||
194 | asynchronous manner and the value may not be very precise. To see a precise | ||
195 | snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. | ||
196 | It's slow but very precise. | ||
197 | |||
198 | Table 1-2: Contents of the status files (as of 2.6.30-rc7) | ||
192 | .............................................................................. | 199 | .............................................................................. |
193 | Field Content | 200 | Field Content |
194 | Name filename of the executable | 201 | Name filename of the executable |
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) | |||
213 | VmExe size of text segment | 220 | VmExe size of text segment |
214 | VmLib size of shared library code | 221 | VmLib size of shared library code |
215 | VmPTE size of page table entries | 222 | VmPTE size of page table entries |
223 | VmSwap size of swap usage (the number of referred swapents) | ||
216 | Threads number of threads | 224 | Threads number of threads |
217 | SigQ number of signals queued/max. number for queue | 225 | SigQ number of signals queued/max. number for queue |
218 | SigPnd bitmap of pending signals for the thread | 226 | SigPnd bitmap of pending signals for the thread |
@@ -430,6 +438,7 @@ Table 1-5: Kernel info in /proc | |||
430 | modules List of loaded modules | 438 | modules List of loaded modules |
431 | mounts Mounted filesystems | 439 | mounts Mounted filesystems |
432 | net Networking info (see text) | 440 | net Networking info (see text) |
441 | pagetypeinfo Additional page allocator information (see text) (2.5) | ||
433 | partitions Table of partitions known to the system | 442 | partitions Table of partitions known to the system |
434 | pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, | 443 | pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, |
435 | decoupled by lspci (2.4) | 444 | decoupled by lspci (2.4) |
@@ -584,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ... | |||
584 | Node 0, zone Normal 1 0 0 1 101 8 ... | 593 | Node 0, zone Normal 1 0 0 1 101 8 ... |
585 | Node 0, zone HighMem 2 0 0 1 1 0 ... | 594 | Node 0, zone HighMem 2 0 0 1 1 0 ... |
586 | 595 | ||
587 | Memory fragmentation is a problem under some workloads, and buddyinfo is a | 596 | External fragmentation is a problem under some workloads, and buddyinfo is a |
588 | useful tool for helping diagnose these problems. Buddyinfo will give you a | 597 | useful tool for helping diagnose these problems. Buddyinfo will give you a |
589 | clue as to how big an area you can safely allocate, or why a previous | 598 | clue as to how big an area you can safely allocate, or why a previous |
590 | allocation failed. | 599 | allocation failed. |
@@ -594,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in | |||
594 | ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE | 603 | ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE |
595 | available in ZONE_NORMAL, etc... | 604 | available in ZONE_NORMAL, etc... |
596 | 605 | ||
606 | More information relevant to external fragmentation can be found in | ||
607 | pagetypeinfo. | ||
608 | |||
609 | > cat /proc/pagetypeinfo | ||
610 | Page block order: 9 | ||
611 | Pages per block: 512 | ||
612 | |||
613 | Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 | ||
614 | Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0 | ||
615 | Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 | ||
616 | Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2 | ||
617 | Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0 | ||
618 | Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 | ||
619 | Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9 | ||
620 | Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0 | ||
621 | Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452 | ||
622 | Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0 | ||
623 | Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 | ||
624 | |||
625 | Number of blocks type Unmovable Reclaimable Movable Reserve Isolate | ||
626 | Node 0, zone DMA 2 0 5 1 0 | ||
627 | Node 0, zone DMA32 41 6 967 2 0 | ||
628 | |||
629 | Fragmentation avoidance in the kernel works by grouping pages of different | ||
630 | migrate types into the same contiguous regions of memory called page blocks. | ||
631 | A page block is typically the size of the default hugepage size e.g. 2MB on | ||
632 | X86-64. By keeping pages grouped based on their ability to move, the kernel | ||
633 | can reclaim pages within a page block to satisfy a high-order allocation. | ||
634 | |||
635 | The pagetypeinfo begins with information on the size of a page block. It | ||
636 | then gives the same type of information as buddyinfo except broken down | ||
637 | by migrate-type and finishes with details on how many page blocks of each | ||
638 | type exist. | ||
639 | |||
640 | If min_free_kbytes has been tuned correctly (recommendations made by hugeadm | ||
641 | from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can | ||
642 | make an estimate of the likely number of huge pages that can be allocated | ||
643 | at a given point in time. All the "Movable" blocks should be allocatable | ||
644 | unless memory has been mlock()'d. Some of the Reclaimable blocks should | ||
645 | also be allocatable although a lot of filesystem metadata may have to be | ||
646 | reclaimed to achieve this. | ||
647 | |||
597 | .............................................................................. | 648 | .............................................................................. |
598 | 649 | ||
599 | meminfo: | 650 | meminfo: |
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 23a181074f94..fc0e39af43c3 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt | |||
@@ -837,6 +837,9 @@ replicas continue to be exactly same. | |||
837 | individual lists does not affect propagation or the way propagation | 837 | individual lists does not affect propagation or the way propagation |
838 | tree is modified by operations. | 838 | tree is modified by operations. |
839 | 839 | ||
840 | All vfsmounts in a peer group have the same ->mnt_master. If it is | ||
841 | non-NULL, they form a contiguous (ordered) segment of slave list. | ||
842 | |||
840 | A example propagation tree looks as shown in the figure below. | 843 | A example propagation tree looks as shown in the figure below. |
841 | [ NOTE: Though it looks like a forest, if we consider all the shared | 844 | [ NOTE: Though it looks like a forest, if we consider all the shared |
842 | mounts as a conceptual entity called 'pnode', it becomes a tree] | 845 | mounts as a conceptual entity called 'pnode', it becomes a tree] |
@@ -874,8 +877,19 @@ replicas continue to be exactly same. | |||
874 | 877 | ||
875 | NOTE: The propagation tree is orthogonal to the mount tree. | 878 | NOTE: The propagation tree is orthogonal to the mount tree. |
876 | 879 | ||
880 | 8B Locking: | ||
881 | |||
882 | ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected | ||
883 | by namespace_sem (exclusive for modifications, shared for reading). | ||
884 | |||
885 | Normally we have ->mnt_flags modifications serialized by vfsmount_lock. | ||
886 | There are two exceptions: do_add_mount() and clone_mnt(). | ||
887 | The former modifies a vfsmount that has not been visible in any shared | ||
888 | data structures yet. | ||
889 | The latter holds namespace_sem and the only references to vfsmount | ||
890 | are in lists that can't be traversed without namespace_sem. | ||
877 | 891 | ||
878 | 8B Algorithm: | 892 | 8C Algorithm: |
879 | 893 | ||
880 | The crux of the implementation resides in rbind/move operation. | 894 | The crux of the implementation resides in rbind/move operation. |
881 | 895 | ||