diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/00-INDEX | 4 | ||||
-rw-r--r-- | Documentation/filesystems/Locking | 3 | ||||
-rw-r--r-- | Documentation/filesystems/configfs/configfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/dnotify.txt | 99 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 20 | ||||
-rw-r--r-- | Documentation/filesystems/ocfs2.txt | 16 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 36 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 155 | ||||
-rw-r--r-- | Documentation/filesystems/ramfs-rootfs-initramfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/relay.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/sharedsubtree.txt | 1061 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 17 |
12 files changed, 1352 insertions, 65 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 1de155e2dc36..e68021c08fbd 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
@@ -32,6 +32,8 @@ directory-locking | |||
32 | - info about the locking scheme used for directory operations. | 32 | - info about the locking scheme used for directory operations. |
33 | dlmfs.txt | 33 | dlmfs.txt |
34 | - info on the userspace interface to the OCFS2 DLM. | 34 | - info on the userspace interface to the OCFS2 DLM. |
35 | dnotify.txt | ||
36 | - info about directory notification in Linux. | ||
35 | ecryptfs.txt | 37 | ecryptfs.txt |
36 | - docs on eCryptfs: stacked cryptographic filesystem for Linux. | 38 | - docs on eCryptfs: stacked cryptographic filesystem for Linux. |
37 | ext2.txt | 39 | ext2.txt |
@@ -80,6 +82,8 @@ relay.txt | |||
80 | - info on relay, for efficient streaming from kernel to user space. | 82 | - info on relay, for efficient streaming from kernel to user space. |
81 | romfs.txt | 83 | romfs.txt |
82 | - description of the ROMFS filesystem. | 84 | - description of the ROMFS filesystem. |
85 | sharedsubtree.txt | ||
86 | - a description of shared subtrees for namespaces. | ||
83 | smbfs.txt | 87 | smbfs.txt |
84 | - info on using filesystems with the SMB protocol (Win 3.11 and NT). | 88 | - info on using filesystems with the SMB protocol (Win 3.11 and NT). |
85 | spufs.txt | 89 | spufs.txt |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 37c10cba7177..42d4b30b1045 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -90,7 +90,6 @@ of the locking scheme for directory operations. | |||
90 | prototypes: | 90 | prototypes: |
91 | struct inode *(*alloc_inode)(struct super_block *sb); | 91 | struct inode *(*alloc_inode)(struct super_block *sb); |
92 | void (*destroy_inode)(struct inode *); | 92 | void (*destroy_inode)(struct inode *); |
93 | void (*read_inode) (struct inode *); | ||
94 | void (*dirty_inode) (struct inode *); | 93 | void (*dirty_inode) (struct inode *); |
95 | int (*write_inode) (struct inode *, int); | 94 | int (*write_inode) (struct inode *, int); |
96 | void (*put_inode) (struct inode *); | 95 | void (*put_inode) (struct inode *); |
@@ -114,7 +113,6 @@ locking rules: | |||
114 | BKL s_lock s_umount | 113 | BKL s_lock s_umount |
115 | alloc_inode: no no no | 114 | alloc_inode: no no no |
116 | destroy_inode: no | 115 | destroy_inode: no |
117 | read_inode: no (see below) | ||
118 | dirty_inode: no (must not sleep) | 116 | dirty_inode: no (must not sleep) |
119 | write_inode: no | 117 | write_inode: no |
120 | put_inode: no | 118 | put_inode: no |
@@ -133,7 +131,6 @@ show_options: no (vfsmount->sem) | |||
133 | quota_read: no no no (see below) | 131 | quota_read: no no no (see below) |
134 | quota_write: no no no (see below) | 132 | quota_write: no no no (see below) |
135 | 133 | ||
136 | ->read_inode() is not a method - it's a callback used in iget(). | ||
137 | ->remount_fs() will have the s_umount lock if it's already mounted. | 134 | ->remount_fs() will have the s_umount lock if it's already mounted. |
138 | When called from get_sb_single, it does NOT have the s_umount lock. | 135 | When called from get_sb_single, it does NOT have the s_umount lock. |
139 | ->quota_read() and ->quota_write() functions are both guaranteed to | 136 | ->quota_read() and ->quota_write() functions are both guaranteed to |
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index d1b98257d000..44c97e6accb2 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt | |||
@@ -377,7 +377,7 @@ more explicit to have a method whereby userspace sees this divergence. | |||
377 | Rather than have a group where some items behave differently than | 377 | Rather than have a group where some items behave differently than |
378 | others, configfs provides a method whereby one or many subgroups are | 378 | others, configfs provides a method whereby one or many subgroups are |
379 | automatically created inside the parent at its creation. Thus, | 379 | automatically created inside the parent at its creation. Thus, |
380 | mkdir("parent) results in "parent", "parent/subgroup1", up through | 380 | mkdir("parent") results in "parent", "parent/subgroup1", up through |
381 | "parent/subgroupN". Items of type 1 can now be created in | 381 | "parent/subgroupN". Items of type 1 can now be created in |
382 | "parent/subgroup1", and items of type N can be created in | 382 | "parent/subgroup1", and items of type N can be created in |
383 | "parent/subgroupN". | 383 | "parent/subgroupN". |
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt new file mode 100644 index 000000000000..9f5d338ddbb8 --- /dev/null +++ b/Documentation/filesystems/dnotify.txt | |||
@@ -0,0 +1,99 @@ | |||
1 | Linux Directory Notification | ||
2 | ============================ | ||
3 | |||
4 | Stephen Rothwell <sfr@canb.auug.org.au> | ||
5 | |||
6 | The intention of directory notification is to allow user applications | ||
7 | to be notified when a directory, or any of the files in it, are changed. | ||
8 | The basic mechanism involves the application registering for notification | ||
9 | on a directory using a fcntl(2) call and the notifications themselves | ||
10 | being delivered using signals. | ||
11 | |||
12 | The application decides which "events" it wants to be notified about. | ||
13 | The currently defined events are: | ||
14 | |||
15 | DN_ACCESS A file in the directory was accessed (read) | ||
16 | DN_MODIFY A file in the directory was modified (write,truncate) | ||
17 | DN_CREATE A file was created in the directory | ||
18 | DN_DELETE A file was unlinked from directory | ||
19 | DN_RENAME A file in the directory was renamed | ||
20 | DN_ATTRIB A file in the directory had its attributes | ||
21 | changed (chmod,chown) | ||
22 | |||
23 | Usually, the application must reregister after each notification, but | ||
24 | if DN_MULTISHOT is or'ed with the event mask, then the registration will | ||
25 | remain until explicitly removed (by registering for no events). | ||
26 | |||
27 | By default, SIGIO will be delivered to the process and no other useful | ||
28 | information. However, if the F_SETSIG fcntl(2) call is used to let the | ||
29 | kernel know which signal to deliver, a siginfo structure will be passed to | ||
30 | the signal handler and the si_fd member of that structure will contain the | ||
31 | file descriptor associated with the directory in which the event occurred. | ||
32 | |||
33 | Preferably the application will choose one of the real time signals | ||
34 | (SIGRTMIN + <n>) so that the notifications may be queued. This is | ||
35 | especially important if DN_MULTISHOT is specified. Note that SIGRTMIN | ||
36 | is often blocked, so it is better to use (at least) SIGRTMIN + 1. | ||
37 | |||
38 | Implementation expectations (features and bugs :-)) | ||
39 | --------------------------- | ||
40 | |||
41 | The notification should work for any local access to files even if the | ||
42 | actual file system is on a remote server. This implies that remote | ||
43 | access to files served by local user mode servers should be notified. | ||
44 | Also, remote accesses to files served by a local kernel NFS server should | ||
45 | be notified. | ||
46 | |||
47 | In order to make the impact on the file system code as small as possible, | ||
48 | the problem of hard links to files has been ignored. So if a file (x) | ||
49 | exists in two directories (a and b) then a change to the file using the | ||
50 | name "a/x" should be notified to a program expecting notifications on | ||
51 | directory "a", but will not be notified to one expecting notifications on | ||
52 | directory "b". | ||
53 | |||
54 | Also, files that are unlinked, will still cause notifications in the | ||
55 | last directory that they were linked to. | ||
56 | |||
57 | Configuration | ||
58 | ------------- | ||
59 | |||
60 | Dnotify is controlled via the CONFIG_DNOTIFY configuration option. When | ||
61 | disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. | ||
62 | |||
63 | Example | ||
64 | ------- | ||
65 | |||
66 | #define _GNU_SOURCE /* needed to get the defines */ | ||
67 | #include <fcntl.h> /* in glibc 2.2 this has the needed | ||
68 | values defined */ | ||
69 | #include <signal.h> | ||
70 | #include <stdio.h> | ||
71 | #include <unistd.h> | ||
72 | |||
73 | static volatile int event_fd; | ||
74 | |||
75 | static void handler(int sig, siginfo_t *si, void *data) | ||
76 | { | ||
77 | event_fd = si->si_fd; | ||
78 | } | ||
79 | |||
80 | int main(void) | ||
81 | { | ||
82 | struct sigaction act; | ||
83 | int fd; | ||
84 | |||
85 | act.sa_sigaction = handler; | ||
86 | sigemptyset(&act.sa_mask); | ||
87 | act.sa_flags = SA_SIGINFO; | ||
88 | sigaction(SIGRTMIN + 1, &act, NULL); | ||
89 | |||
90 | fd = open(".", O_RDONLY); | ||
91 | fcntl(fd, F_SETSIG, SIGRTMIN + 1); | ||
92 | fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); | ||
93 | /* we will now be notified if any of the files | ||
94 | in "." is modified or new files are created */ | ||
95 | while (1) { | ||
96 | pause(); | ||
97 | printf("Got event on fd=%d\n", event_fd); | ||
98 | } | ||
99 | } | ||
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6a4adcae9f9a..560f88dc7090 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -86,9 +86,21 @@ Alex is working on a new set of patches right now. | |||
86 | When mounting an ext4 filesystem, the following option are accepted: | 86 | When mounting an ext4 filesystem, the following option are accepted: |
87 | (*) == default | 87 | (*) == default |
88 | 88 | ||
89 | extents ext4 will use extents to address file data. The | 89 | extents (*) ext4 will use extents to address file data. The |
90 | file system will no longer be mountable by ext3. | 90 | file system will no longer be mountable by ext3. |
91 | 91 | ||
92 | noextents ext4 will not use extents for newly created files | ||
93 | |||
94 | journal_checksum Enable checksumming of the journal transactions. | ||
95 | This will allow the recovery code in e2fsck and the | ||
96 | kernel to detect corruption in the journal. It is a | |
97 | compatible change and will be ignored by older kernels. | ||
98 | |||
99 | journal_async_commit Commit block can be written to disk without waiting | ||
100 | for descriptor blocks. If enabled older kernels cannot | ||
101 | mount the device. This will enable 'journal_checksum' | ||
102 | internally. | ||
103 | |||
92 | journal=update Update the ext4 file system's journal to the current | 104 | journal=update Update the ext4 file system's journal to the current |
93 | format. | 105 | format. |
94 | 106 | ||
@@ -196,6 +208,12 @@ nobh (a) cache disk block mapping information | |||
196 | "nobh" option tries to avoid associating buffer | 208 | "nobh" option tries to avoid associating buffer |
197 | heads (supported only for "writeback" mode). | 209 | heads (supported only for "writeback" mode). |
198 | 210 | ||
211 | mballoc (*) Use the multiple block allocator for block allocation | ||
212 | nomballoc Disable the multiple block allocator for block allocation. | |
213 | stripe=n Number of filesystem blocks that mballoc will try | ||
214 | to use for allocation size and alignment. For RAID5/6 | ||
215 | systems this should be the number of data | ||
216 | disks * RAID chunk size in file system blocks. | ||
199 | 217 | ||
200 | Data Mode | 218 | Data Mode |
201 | --------- | 219 | --------- |
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index ed55238023a9..c318a8bbb1ef 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt | |||
@@ -35,7 +35,6 @@ Features which OCFS2 does not support yet: | |||
35 | - Directory change notification (F_NOTIFY) | 35 | - Directory change notification (F_NOTIFY) |
36 | - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) | 36 | - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) |
37 | - POSIX ACLs | 37 | - POSIX ACLs |
38 | - readpages / writepages (not user visible) | ||
39 | 38 | ||
40 | Mount options | 39 | Mount options |
41 | ============= | 40 | ============= |
@@ -62,3 +61,18 @@ data=writeback Data ordering is not preserved, data may be written | |||
62 | preferred_slot=0(*) During mount, try to use this filesystem slot first. If | 61 | preferred_slot=0(*) During mount, try to use this filesystem slot first. If |
63 | it is in use by another node, the first empty one found | 62 | it is in use by another node, the first empty one found |
64 | will be chosen. Invalid values will be ignored. | 63 | will be chosen. Invalid values will be ignored. |
64 | commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata | ||
65 | every 'nrsec' seconds. The default value is 5 seconds. | ||
66 | This means that if you lose your power, you will lose | ||
67 | as much as the latest 5 seconds of work (your | ||
68 | filesystem will not be damaged though, thanks to the | ||
69 | journaling). This default value (or any low value) | ||
70 | will hurt performance, but it's good for data-safety. | ||
71 | Setting it to 0 will have the same effect as leaving | ||
72 | it at the default (5 seconds). | ||
73 | Setting it to very large values will improve | ||
74 | performance. | ||
75 | localalloc=8(*) Allows custom localalloc size in MB. If the value is too | ||
76 | large, the fs will silently revert it to the default. | ||
77 | Localalloc is not enabled for local mounts. | ||
78 | localflocks This disables cluster aware flock. | ||
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index dac45c92d872..92b888d540a6 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
@@ -1,6 +1,6 @@ | |||
1 | Changes since 2.5.0: | 1 | Changes since 2.5.0: |
2 | 2 | ||
3 | --- | 3 | --- |
4 | [recommended] | 4 | [recommended] |
5 | 5 | ||
6 | New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(), | 6 | New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(), |
@@ -10,7 +10,7 @@ Use them. | |||
10 | 10 | ||
11 | (sb_find_get_block() replaces 2.4's get_hash_table()) | 11 | (sb_find_get_block() replaces 2.4's get_hash_table()) |
12 | 12 | ||
13 | --- | 13 | --- |
14 | [recommended] | 14 | [recommended] |
15 | 15 | ||
16 | New methods: ->alloc_inode() and ->destroy_inode(). | 16 | New methods: ->alloc_inode() and ->destroy_inode(). |
@@ -28,14 +28,14 @@ Declare | |||
28 | 28 | ||
29 | Use FOO_I(inode) instead of &inode->u.foo_inode_i; | 29 | Use FOO_I(inode) instead of &inode->u.foo_inode_i; |
30 | 30 | ||
31 | Add foo_alloc_inode() and foo_destory_inode() - the former should allocate | 31 | Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate |
32 | foo_inode_info and return the address of ->vfs_inode, the latter should free | 32 | foo_inode_info and return the address of ->vfs_inode, the latter should free |
33 | FOO_I(inode) (see in-tree filesystems for examples). | 33 | FOO_I(inode) (see in-tree filesystems for examples). |
34 | 34 | ||
35 | Make them ->alloc_inode and ->destroy_inode in your super_operations. | 35 | Make them ->alloc_inode and ->destroy_inode in your super_operations. |
36 | 36 | ||
37 | Keep in mind that now you need explicit initialization of private data - | 37 | Keep in mind that now you need explicit initialization of private data |
38 | typically in ->read_inode() and after getting an inode from new_inode(). | 38 | typically between calling iget_locked() and unlocking the inode. |
39 | 39 | ||
40 | At some point that will become mandatory. | 40 | At some point that will become mandatory. |
41 | 41 | ||
@@ -173,10 +173,10 @@ should be a non-blocking function that initializes those parts of a | |||
173 | newly created inode to allow the test function to succeed. 'data' is | 173 | newly created inode to allow the test function to succeed. 'data' is |
174 | passed as an opaque value to both test and set functions. | 174 | passed as an opaque value to both test and set functions. |
175 | 175 | ||
176 | When the inode has been created by iget5_locked(), it will be returned with | 176 | When the inode has been created by iget5_locked(), it will be returned with the |
177 | the I_NEW flag set and will still be locked. read_inode has not been | 177 | I_NEW flag set and will still be locked. The filesystem then needs to finalize |
178 | called so the file system still has to finalize the initialization. Once | 178 | the initialization. Once the inode is initialized it must be unlocked by |
179 | the inode is initialized it must be unlocked by calling unlock_new_inode(). | 179 | calling unlock_new_inode(). |
180 | 180 | ||
181 | The filesystem is responsible for setting (and possibly testing) i_ino | 181 | The filesystem is responsible for setting (and possibly testing) i_ino |
182 | when appropriate. There is also a simpler iget_locked function that | 182 | when appropriate. There is also a simpler iget_locked function that |
@@ -184,11 +184,19 @@ just takes the superblock and inode number as arguments and does the | |||
184 | test and set for you. | 184 | test and set for you. |
185 | 185 | ||
186 | e.g. | 186 | e.g. |
187 | inode = iget_locked(sb, ino); | 187 | inode = iget_locked(sb, ino); |
188 | if (inode->i_state & I_NEW) { | 188 | if (inode->i_state & I_NEW) { |
189 | read_inode_from_disk(inode); | 189 | err = read_inode_from_disk(inode); |
190 | unlock_new_inode(inode); | 190 | if (err < 0) { |
191 | } | 191 | iget_failed(inode); |
192 | return err; | ||
193 | } | ||
194 | unlock_new_inode(inode); | ||
195 | } | ||
196 | |||
197 | Note that if the process of setting up a new inode fails, then iget_failed() | ||
198 | should be called on the inode to render it dead, and an appropriate error | ||
199 | should be passed back to the caller. | ||
192 | 200 | ||
193 | --- | 201 | --- |
194 | [recommended] | 202 | [recommended] |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dec99455321f..5681e2fa1496 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -216,6 +216,7 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3) | |||
216 | priority priority level | 216 | priority priority level |
217 | nice nice level | 217 | nice nice level |
218 | num_threads number of threads | 218 | num_threads number of threads |
219 | it_real_value (obsolete, always 0) | ||
219 | start_time time the process started after system boot | 220 | start_time time the process started after system boot |
220 | vsize virtual memory size | 221 | vsize virtual memory size |
221 | rss resident set memory size | 222 | rss resident set memory size |
@@ -857,6 +858,45 @@ CPUs. | |||
857 | The "procs_blocked" line gives the number of processes currently blocked, | 858 | The "procs_blocked" line gives the number of processes currently blocked, |
858 | waiting for I/O to complete. | 859 | waiting for I/O to complete. |
859 | 860 | ||
861 | 1.9 Ext4 file system parameters | ||
862 | ------------------------------ | ||
863 | Ext4 file systems have one directory per partition under /proc/fs/ext4/ | |
864 | # ls /proc/fs/ext4/hdc/ | ||
865 | group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req | ||
866 | stats stream_req | ||
867 | |||
868 | mb_groups: | ||
869 | This file gives the details of the multiblock allocator buddy cache of free blocks | |
870 | |||
871 | mb_history: | ||
872 | Multiblock allocation history. | ||
873 | |||
874 | stats: | ||
875 | This file indicates whether the multiblock allocator should start collecting | |
876 | statistics. The statistics are shown during unmount | ||
877 | |||
878 | group_prealloc: | ||
879 | The multiblock allocator normalizes the block allocation request to | |
880 | group_prealloc filesystem blocks if we don't have a stripe value set. | |
881 | The stripe value can be specified at mount time or during mke2fs. | ||
882 | |||
883 | max_to_scan: | ||
884 | How long multiblock allocator can look for a best extent (in found extents) | ||
885 | |||
886 | min_to_scan: | ||
887 | How long multiblock allocator must look for a best extent | ||
888 | |||
889 | order2_req: | ||
890 | The multiblock allocator uses 2^N search using buddies only for requests greater | |
891 | than or equal to order2_req. The request size is specified in file system | |
892 | blocks. A value of 2 means this applies only to requests greater than or equal | |
893 | to 4 blocks. | |
894 | |||
895 | stream_req: | ||
896 | Files smaller than stream_req are served by the stream allocator, whose | ||
897 | purpose is to pack requests as close to each other as possible to | |
898 | produce smooth I/O traffic. A value of 16 indicates that files smaller than 16 | |
899 | filesystem blocks will use group based preallocation. | |
860 | 900 | ||
861 | ------------------------------------------------------------------------------ | 901 | ------------------------------------------------------------------------------ |
862 | Summary | 902 | Summary |
@@ -989,6 +1029,14 @@ nr_inodes | |||
989 | Denotes the number of inodes the system has allocated. This number will | 1029 | Denotes the number of inodes the system has allocated. This number will |
990 | grow and shrink dynamically. | 1030 | grow and shrink dynamically. |
991 | 1031 | ||
1032 | nr_open | ||
1033 | ------- | ||
1034 | |||
1035 | Denotes the maximum number of file-handles a process can | ||
1036 | allocate. Default value is 1024*1024 (1048576) which should be | ||
1037 | enough for most machines. Actual limit depends on RLIMIT_NOFILE | ||
1038 | resource limit. | ||
1039 | |||
992 | nr_free_inodes | 1040 | nr_free_inodes |
993 | -------------- | 1041 | -------------- |
994 | 1042 | ||
@@ -1095,13 +1143,6 @@ check the amount of free space (value is in seconds). Default settings are: 4, | |||
1095 | resume it if we have a value of 3 or more percent; consider information about | 1143 | resume it if we have a value of 3 or more percent; consider information about |
1096 | the amount of free space valid for 30 seconds | 1144 | the amount of free space valid for 30 seconds |
1097 | 1145 | ||
1098 | audit_argv_kb | ||
1099 | ------------- | ||
1100 | |||
1101 | The file contains a single value denoting the limit on the argv array size | ||
1102 | for execve (in KiB). This limit is only applied when system call auditing for | ||
1103 | execve is enabled, otherwise the value is ignored. | ||
1104 | |||
1105 | ctrl-alt-del | 1146 | ctrl-alt-del |
1106 | ------------ | 1147 | ------------ |
1107 | 1148 | ||
@@ -1282,13 +1323,28 @@ for writeout by the pdflush daemons. It is expressed in 100'ths of a second. | |||
1282 | Data which has been dirty in-memory for longer than this interval will be | 1323 | Data which has been dirty in-memory for longer than this interval will be |
1283 | written out next time a pdflush daemon wakes up. | 1324 | written out next time a pdflush daemon wakes up. |
1284 | 1325 | ||
1326 | highmem_is_dirtyable | ||
1327 | -------------------- | ||
1328 | |||
1329 | Only present if CONFIG_HIGHMEM is set. | ||
1330 | |||
1331 | This defaults to 0 (false), meaning that the ratios set above are calculated | ||
1332 | as a percentage of lowmem only. This protects against excessive scanning | ||
1333 | in page reclaim, swapping and general VM distress. | ||
1334 | |||
1335 | Setting this to 1 can be useful on 32 bit machines where you want to make | ||
1336 | random changes within an MMAPed file that is larger than your available | ||
1337 | lowmem without causing large quantities of random IO. It is safe if the | |
1338 | behavior of all programs running on the machine is known and memory will | ||
1339 | not be otherwise stressed. | ||
1340 | |||
1285 | legacy_va_layout | 1341 | legacy_va_layout |
1286 | ---------------- | 1342 | ---------------- |
1287 | 1343 | ||
1288 | If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel | 1344 | If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel |
1289 | will use the legacy (2.4) layout for all processes. | 1345 | will use the legacy (2.4) layout for all processes. |
1290 | 1346 | ||
1291 | lower_zone_protection | 1347 | lowmem_reserve_ratio |
1292 | --------------------- | 1348 | --------------------- |
1293 | 1349 | ||
1294 | For some specialised workloads on highmem machines it is dangerous for | 1350 | For some specialised workloads on highmem machines it is dangerous for |
@@ -1308,25 +1364,71 @@ captured into pinned user memory. | |||
1308 | mechanism will also defend that region from allocations which could use | 1364 | mechanism will also defend that region from allocations which could use |
1309 | highmem or lowmem). | 1365 | highmem or lowmem). |
1310 | 1366 | ||
1311 | The `lower_zone_protection' tunable determines how aggressive the kernel is | 1367 | The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is |
1312 | in defending these lower zones. The default value is zero - no | 1368 | in defending these lower zones. |
1313 | protection at all. | ||
1314 | 1369 | ||
1315 | If you have a machine which uses highmem or ISA DMA and your | 1370 | If you have a machine which uses highmem or ISA DMA and your |
1316 | applications are using mlock(), or if you are running with no swap then | 1371 | applications are using mlock(), or if you are running with no swap then |
1317 | you probably should increase the lower_zone_protection setting. | 1372 | you probably should change the lowmem_reserve_ratio setting. |
1318 | 1373 | ||
1319 | The units of this tunable are fairly vague. It is approximately equal | 1374 | The lowmem_reserve_ratio is an array. You can see them by reading this file. |
1320 | to "megabytes," so setting lower_zone_protection=100 will protect around 100 | 1375 | - |
1321 | megabytes of the lowmem zone from user allocations. It will also make | 1376 | % cat /proc/sys/vm/lowmem_reserve_ratio |
1322 | those 100 megabytes unavailable for use by applications and by | 1377 | 256 256 32 |
1323 | pagecache, so there is a cost. | 1378 | - |
1324 | 1379 | Note: the number of these elements is one fewer than the number of zones, because the highest |
1325 | The effects of this tunable may be observed by monitoring | 1380 | zone's value is not necessary for the following calculation. |
1326 | /proc/meminfo:LowFree. Write a single huge file and observe the point | 1381 | |
1327 | at which LowFree ceases to fall. | 1382 | But, these values are not used directly. The kernel calculates # of protection |
1328 | 1383 | pages for each zone from them. These are shown as an array of protection pages |
1329 | A reasonable value for lower_zone_protection is 100. | 1384 | in /proc/zoneinfo as follows. (This is an example of x86-64 box). |
1385 | Each zone has an array of protection pages like this. | ||
1386 | |||
1387 | - | ||
1388 | Node 0, zone DMA | ||
1389 | pages free 1355 | ||
1390 | min 3 | ||
1391 | low 3 | ||
1392 | high 4 | ||
1393 | : | ||
1394 | : | ||
1395 | numa_other 0 | ||
1396 | protection: (0, 2004, 2004, 2004) | ||
1397 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
1398 | pagesets | ||
1399 | cpu: 0 pcp: 0 | ||
1400 | : | ||
1401 | - | ||
1402 | These protections are added to score to judge whether this zone should be used | ||
1403 | for page allocation or should be reclaimed. | ||
1404 | |||
1405 | In this example, if normal pages (index=2) are required to this DMA zone and | ||
1406 | pages_high is used for watermark, the kernel judges this zone should not be | ||
1407 | used because pages_free(1355) is smaller than watermark + protection[2] | ||
1408 | (4 + 2004 = 2008). If this protection value is 0, this zone would be used for | ||
1409 | normal page requirement. If requirement is DMA zone(index=0), protection[0] | ||
1410 | (=0) is used. | ||
1411 | |||
1412 | zone[i]'s protection[j] is calculated by the following expression. | |
1413 | |||
1414 | (i < j): | ||
1415 | zone[i]->protection[j] | ||
1416 | = (total sums of present_pages from zone[i+1] to zone[j] on the node) | ||
1417 | / lowmem_reserve_ratio[i]; | ||
1418 | (i = j): | ||
1419 | (should not be protected) = 0; | |
1420 | (i > j): | ||
1421 | (not necessary, but looks 0) | ||
1422 | |||
1423 | The default values of lowmem_reserve_ratio[i] are | ||
1424 | 256 (if zone[i] means DMA or DMA32 zone) | ||
1425 | 32 (others). | ||
1426 | As the above expression shows, they are the reciprocals of the ratios. | |
1427 | 256 means 1/256. # of protection pages becomes about "0.39%" of total present | ||
1428 | pages of higher zones on the node. | ||
1429 | |||
1430 | If you would like to protect more pages, smaller values are effective. | ||
1431 | The minimum value is 1 (1/1 -> 100%). | ||
1330 | 1432 | ||
1331 | page-cluster | 1433 | page-cluster |
1332 | ------------ | 1434 | ------------ |
@@ -1880,11 +1982,6 @@ max_size | |||
1880 | Maximum size of the routing cache. Old entries will be purged once the cache | 1982 | Maximum size of the routing cache. Old entries will be purged once the cache |
1881 | reached has this size. | 1983 | reached has this size. |
1882 | 1984 | ||
1883 | max_delay, min_delay | ||
1884 | -------------------- | ||
1885 | |||
1886 | Delays for flushing the routing cache. | ||
1887 | |||
1888 | redirect_load, redirect_number | 1985 | redirect_load, redirect_number |
1889 | ------------------------------ | 1986 | ------------------------------ |
1890 | 1987 | ||
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index 339c6a4f220e..7be232b44ee4 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt | |||
@@ -118,7 +118,7 @@ All this differs from the old initrd in several ways: | |||
118 | with the new root (cd /newmount; mount --move . /; chroot .), attach | 118 | with the new root (cd /newmount; mount --move . /; chroot .), attach |
119 | stdin/stdout/stderr to the new /dev/console, and exec the new init. | 119 | stdin/stdout/stderr to the new /dev/console, and exec the new init. |
120 | 120 | ||
121 | Since this is a remarkably persnickity process (and involves deleting | 121 | Since this is a remarkably persnickety process (and involves deleting |
122 | commands before you can run them), the klibc package introduced a helper | 122 | commands before you can run them), the klibc package introduced a helper |
123 | program (utils/run_init.c) to do all this for you. Most other packages | 123 | program (utils/run_init.c) to do all this for you. Most other packages |
124 | (such as busybox) have named this command "switch_root". | 124 | (such as busybox) have named this command "switch_root". |
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt index 18d23f9a18c7..094f2d2f38b1 100644 --- a/Documentation/filesystems/relay.txt +++ b/Documentation/filesystems/relay.txt | |||
@@ -140,7 +140,7 @@ close() decrements the channel buffer's refcount. When the refcount | |||
140 | In order for a user application to make use of relay files, the | 140 | In order for a user application to make use of relay files, the |
141 | host filesystem must be mounted. For example, | 141 | host filesystem must be mounted. For example, |
142 | 142 | ||
143 | mount -t debugfs debugfs /debug | 143 | mount -t debugfs debugfs /sys/kernel/debug |
144 | 144 | ||
145 | NOTE: the host filesystem doesn't need to be mounted for kernel | 145 | NOTE: the host filesystem doesn't need to be mounted for kernel |
146 | clients to create or use channels - it only needs to be | 146 | clients to create or use channels - it only needs to be |
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt new file mode 100644 index 000000000000..736540045dc7 --- /dev/null +++ b/Documentation/filesystems/sharedsubtree.txt | |||
@@ -0,0 +1,1061 @@ | |||
1 | Shared Subtrees | ||
2 | --------------- | ||
3 | |||
4 | Contents: | ||
5 | 1) Overview | ||
6 | 2) Features | ||
7 | 3) smount command | ||
8 | 4) Use-case | ||
9 | 5) Detailed semantics | ||
10 | 6) Quiz | ||
11 | 7) FAQ | ||
12 | 8) Implementation | ||
13 | |||
14 | |||
15 | 1) Overview | ||
16 | ----------- | ||
17 | |||
18 | Consider the following situation: | ||
19 | |||
20 | A process wants to clone its own namespace, but still wants to access the CD | ||
21 | that got mounted recently. Shared subtree semantics provide the necessary | ||
22 | mechanism to accomplish the above. | ||
23 | |||
24 | It provides the necessary building blocks for features like per-user-namespace | ||
25 | and versioned filesystem. | ||
26 | |||
27 | 2) Features | ||
28 | ----------- | ||
29 | |||
30 | Shared subtree provides four different flavors of mounts; struct vfsmount to be | ||
31 | precise | ||
32 | |||
33 | a. shared mount | ||
34 | b. slave mount | ||
35 | c. private mount | ||
36 | d. unbindable mount | ||
37 | |||
38 | |||
39 | 2a) A shared mount can be replicated to as many mountpoints as needed, and | ||
40 | all the replicas continue to be exactly the same. | ||
41 | |||
42 | Here is an example: | ||
43 | |||
44 | Lets say /mnt has a mount that is shared. | ||
45 | mount --make-shared /mnt | ||
46 | |||
47 | note: mount command does not yet support the --make-shared flag. | ||
48 | I have included a small C program which does the same by executing | ||
49 | 'smount /mnt shared' | ||
50 | |||
51 | #mount --bind /mnt /tmp | ||
52 | The above command replicates the mount at /mnt to the mountpoint /tmp | ||
53 | and the contents of both the mounts remain identical. | ||
54 | |||
55 | #ls /mnt | ||
56 | a b c | ||
57 | |||
58 | #ls /tmp | ||
59 | a b c | ||
60 | |||
61 | Now lets say we mount a device at /tmp/a | ||
62 | #mount /dev/sd0 /tmp/a | ||
63 | |||
64 | #ls /tmp/a | ||
65 | t1 t2 t3 | ||
66 | |||
67 | #ls /mnt/a | ||
68 | t1 t2 t3 | ||
69 | |||
70 | Note that the mount has propagated to the mount at /mnt as well. | ||
71 | |||
72 | And the same is true even when /dev/sd0 is mounted on /mnt/a. The | ||
73 | contents will be visible under /tmp/a too. | ||
74 | |||
75 | |||
76 | 2b) A slave mount is like a shared mount except that mount and umount events | ||
77 | only propagate towards it. | ||
78 | |||
79 | All slave mounts have a master mount which is shared. | ||
80 | |||
81 | Here is an example: | ||
82 | |||
83 | Lets say /mnt has a mount which is shared. | ||
84 | #mount --make-shared /mnt | ||
85 | |||
86 | Lets bind mount /mnt to /tmp | ||
87 | #mount --bind /mnt /tmp | ||
88 | |||
89 | the new mount at /tmp becomes a shared mount and it is a replica of | ||
90 | the mount at /mnt. | ||
91 | |||
92 | Now lets make the mount at /tmp; a slave of /mnt | ||
93 | #mount --make-slave /tmp | ||
94 | [or smount /tmp slave] | ||
95 | |||
96 | lets mount /dev/sd0 on /mnt/a | ||
97 | #mount /dev/sd0 /mnt/a | ||
98 | |||
99 | #ls /mnt/a | ||
100 | t1 t2 t3 | ||
101 | |||
102 | #ls /tmp/a | ||
103 | t1 t2 t3 | ||
104 | |||
105 | Note the mount event has propagated to the mount at /tmp | ||
106 | |||
107 | However lets see what happens if we mount something on the mount at /tmp | ||
108 | |||
109 | #mount /dev/sd1 /tmp/b | ||
110 | |||
111 | #ls /tmp/b | ||
112 | s1 s2 s3 | ||
113 | |||
114 | #ls /mnt/b | ||
115 | |||
116 | Note how the mount event has not propagated to the mount at | ||
117 | /mnt | ||
118 | |||
119 | |||
120 | 2c) A private mount does not forward or receive propagation. | ||
121 | |||
122 | This is the mount we are familiar with. It's the default type. | ||
123 | |||
124 | |||
125 | 2d) An unbindable mount is an unbindable private mount | ||
126 | |||
127 | lets say we have a mount at /mnt and we make it unbindable | ||
128 | |||
129 | #mount --make-unbindable /mnt | ||
130 | [ smount /mnt unbindable ] | ||
131 | |||
132 | Lets try to bind mount this mount somewhere else. | ||
133 | # mount --bind /mnt /tmp | ||
134 | mount: wrong fs type, bad option, bad superblock on /mnt, | ||
135 | or too many mounted file systems | ||
136 | |||
137 | Binding an unbindable mount is an invalid operation. | ||
138 | |||
139 | |||
140 | 3) smount command | ||
141 | |||
142 | Currently the mount command is not aware of shared subtree features. | ||
143 | Work is in progress to add the support in mount ( util-linux package ). | ||
144 | Till then use the following program. | ||
145 | |||
146 | ------------------------------------------------------------------------ | ||
147 | // | ||
148 | //this code was developed by Miklos Szeredi <miklos@szeredi.hu> | ||
149 | //and modified by Ram Pai <linuxram@us.ibm.com> | ||
150 | // sample usage: | ||
151 | // smount /tmp shared | ||
152 | // | ||
153 | #include <stdio.h> | ||
154 | #include <stdlib.h> | ||
155 | #include <unistd.h> | ||
156 | #include <string.h> | ||
157 | #include <sys/mount.h> | ||
158 | #include <sys/fsuid.h> | ||
159 | |||
160 | #ifndef MS_REC | ||
161 | #define MS_REC 0x4000 /* 16384: Recursive loopback */ | ||
162 | #endif | ||
163 | |||
164 | #ifndef MS_SHARED | ||
165 | #define MS_SHARED 1<<20 /* Shared */ | ||
166 | #endif | ||
167 | |||
168 | #ifndef MS_PRIVATE | ||
169 | #define MS_PRIVATE 1<<18 /* Private */ | ||
170 | #endif | ||
171 | |||
172 | #ifndef MS_SLAVE | ||
173 | #define MS_SLAVE 1<<19 /* Slave */ | ||
174 | #endif | ||
175 | |||
176 | #ifndef MS_UNBINDABLE | ||
177 | #define MS_UNBINDABLE 1<<17 /* Unbindable */ | ||
178 | #endif | ||
179 | |||
180 | int main(int argc, char *argv[]) | ||
181 | { | ||
182 | int type; | ||
183 | if(argc != 3) { | ||
184 | fprintf(stderr, "usage: %s dir " | ||
185 | "<rshared|rslave|rprivate|runbindable|shared|slave" | ||
186 | "|private|unbindable>\n" , argv[0]); | ||
187 | return 1; | ||
188 | } | ||
189 | |||
190 | fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]); | ||
191 | |||
192 | if (strcmp(argv[2],"rshared")==0) | ||
193 | type=(MS_SHARED|MS_REC); | ||
194 | else if (strcmp(argv[2],"rslave")==0) | ||
195 | type=(MS_SLAVE|MS_REC); | ||
196 | else if (strcmp(argv[2],"rprivate")==0) | ||
197 | type=(MS_PRIVATE|MS_REC); | ||
198 | else if (strcmp(argv[2],"runbindable")==0) | ||
199 | type=(MS_UNBINDABLE|MS_REC); | ||
200 | else if (strcmp(argv[2],"shared")==0) | ||
201 | type=MS_SHARED; | ||
202 | else if (strcmp(argv[2],"slave")==0) | ||
203 | type=MS_SLAVE; | ||
204 | else if (strcmp(argv[2],"private")==0) | ||
205 | type=MS_PRIVATE; | ||
206 | else if (strcmp(argv[2],"unbindable")==0) | ||
207 | type=MS_UNBINDABLE; | ||
208 | else { | ||
209 | fprintf(stderr, "invalid operation: %s\n", argv[2]); | ||
210 | return 1; | ||
211 | } | ||
212 | setfsuid(getuid()); | ||
213 | |||
214 | if(mount("", argv[1], "dontcare", type, "") == -1) { | ||
215 | perror("mount"); | ||
216 | return 1; | ||
217 | } | ||
218 | return 0; | ||
219 | } | ||
220 | ----------------------------------------------------------------------- | ||
221 | |||
222 | Copy the above code snippet into smount.c | ||
223 | gcc -o smount smount.c | ||
224 | |||
225 | |||
226 | (i) To mark all the mounts under /mnt as shared execute the following | ||
227 | command: | ||
228 | |||
229 | smount /mnt rshared | ||
230 | the corresponding syntax planned for mount command is | ||
231 | mount --make-rshared /mnt | ||
232 | |||
233 | just to mark a mount /mnt as shared, execute the following | ||
234 | command: | ||
235 | smount /mnt shared | ||
236 | the corresponding syntax planned for mount command is | ||
237 | mount --make-shared /mnt | ||
238 | |||
239 | (ii) To mark all the shared mounts under /mnt as slave execute the | ||
240 | following | ||
241 | |||
242 | command: | ||
243 | smount /mnt rslave | ||
244 | the corresponding syntax planned for mount command is | ||
245 | mount --make-rslave /mnt | ||
246 | |||
247 | just to mark a mount /mnt as slave, execute the following | ||
248 | command: | ||
249 | smount /mnt slave | ||
250 | the corresponding syntax planned for mount command is | ||
251 | mount --make-slave /mnt | ||
252 | |||
253 | (iii) To mark all the mounts under /mnt as private execute the | ||
254 | following command: | ||
255 | |||
256 | smount /mnt rprivate | ||
257 | the corresponding syntax planned for mount command is | ||
258 | mount --make-rprivate /mnt | ||
259 | |||
260 | just to mark a mount /mnt as private, execute the following | ||
261 | command: | ||
262 | smount /mnt private | ||
263 | the corresponding syntax planned for mount command is | ||
264 | mount --make-private /mnt | ||
265 | |||
266 | NOTE: by default all the mounts are created as private. But if | ||
267 | you want to change some shared/slave/unbindable mount as | ||
268 | private at a later point in time, this command can help. | ||
269 | |||
270 | (iv) To mark all the mounts under /mnt as unbindable execute the | ||
271 | following | ||
272 | |||
273 | command: | ||
274 | smount /mnt runbindable | ||
275 | the corresponding syntax planned for mount command is | ||
276 | mount --make-runbindable /mnt | ||
277 | |||
278 | just to mark a mount /mnt as unbindable, execute the following | ||
279 | command: | ||
280 | smount /mnt unbindable | ||
281 | the corresponding syntax planned for mount command is | ||
282 | mount --make-unbindable /mnt | ||
283 | |||
284 | |||
285 | 4) Use cases | ||
286 | ------------ | ||
287 | |||
288 | A) A process wants to clone its own namespace, but still wants to | ||
289 | access the CD that got mounted recently. | ||
290 | |||
291 | Solution: | ||
292 | |||
293 | The system administrator can make the mount at /cdrom shared | ||
294 | mount --bind /cdrom /cdrom | ||
295 | mount --make-shared /cdrom | ||
296 | |||
297 | Now any process that clones off a new namespace will have a | ||
298 | mount at /cdrom which is a replica of the same mount in the | ||
299 | parent namespace. | ||
300 | |||
301 | So when a CD is inserted and mounted at /cdrom that mount gets | ||
302 | propagated to the other mount at /cdrom in all the other clone | ||
303 | namespaces. | ||
304 | |||
305 | B) A process wants its mounts invisible to any other process, but | ||
306 | still be able to see the other system mounts. | ||
307 | |||
308 | Solution: | ||
309 | |||
310 | To begin with, the administrator can mark the entire mount tree | ||
311 | as shareable. | ||
312 | |||
313 | mount --make-rshared / | ||
314 | |||
315 | A new process can clone off a new namespace. And mark some part | ||
316 | of its namespace as slave | ||
317 | |||
318 | mount --make-rslave /myprivatetree | ||
319 | |||
320 | Hence forth any mounts within the /myprivatetree done by the | ||
321 | process will not show up in any other namespace. However mounts | ||
322 | done in the parent namespace under /myprivatetree still show | ||
323 | up in the process's namespace. | ||
324 | |||
325 | |||
326 | Apart from the above semantics this feature provides the | ||
327 | building blocks to solve the following problems: | ||
328 | |||
329 | C) Per-user namespace | ||
330 | |||
331 | The above semantics allows a way to share mounts across | ||
332 | namespaces. But namespaces are associated with processes. If | ||
333 | namespaces are made first class objects with user API to | ||
334 | associate/disassociate a namespace with userid, then each user | ||
335 | could have his/her own namespace and tailor it to his/her | ||
336 | requirements. Of course, this needs support from PAM. | ||
337 | |||
338 | D) Versioned files | ||
339 | |||
340 | If the entire mount tree is visible at multiple locations, then | ||
341 | an underlying versioning file system can return different | ||
342 | version of the file depending on the path used to access that | ||
343 | file. | ||
344 | |||
345 | An example is: | ||
346 | |||
347 | mount --make-shared / | ||
348 | mount --rbind / /view/v1 | ||
349 | mount --rbind / /view/v2 | ||
350 | mount --rbind / /view/v3 | ||
351 | mount --rbind / /view/v4 | ||
352 | |||
353 | and if /usr has a versioning filesystem mounted, then that | ||
354 | mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and | ||
355 | /view/v4/usr too | ||
356 | |||
357 | A user can request v3 version of the file /usr/fs/namespace.c | ||
358 | by accessing /view/v3/usr/fs/namespace.c . The underlying | ||
359 | versioning filesystem can then decipher that v3 version of the | ||
360 | filesystem is being requested and return the corresponding | ||
361 | inode. | ||
362 | |||
363 | 5) Detailed semantics: | ||
364 | ------------------- | ||
365 | The section below explains the detailed semantics of | ||
366 | bind, rbind, move, mount, umount and clone-namespace operations. | ||
367 | |||
368 | Note: the word 'vfsmount' and the noun 'mount' have been used | ||
369 | to mean the same thing, throughout this document. | ||
370 | |||
371 | 5a) Mount states | ||
372 | |||
373 | A given mount can be in one of the following states | ||
374 | 1) shared | ||
375 | 2) slave | ||
376 | 3) shared and slave | ||
377 | 4) private | ||
378 | 5) unbindable | ||
379 | |||
380 | A 'propagation event' is defined as event generated on a vfsmount | ||
381 | that leads to mount or unmount actions in other vfsmounts. | ||
382 | |||
383 | A 'peer group' is defined as a group of vfsmounts that propagate | ||
384 | events to each other. | ||
385 | |||
386 | (1) Shared mounts | ||
387 | |||
388 | A 'shared mount' is defined as a vfsmount that belongs to a | ||
389 | 'peer group'. | ||
390 | |||
391 | For example: | ||
392 | mount --make-shared /mnt | ||
393 | mount --bind /mnt /tmp | ||
394 | |||
395 | The mount at /mnt and that at /tmp are both shared and belong | ||
396 | to the same peer group. Anything mounted or unmounted under | ||
397 | /mnt or /tmp reflect in all the other mounts of its peer | ||
398 | group. | ||
399 | |||
400 | |||
401 | (2) Slave mounts | ||
402 | |||
403 | A 'slave mount' is defined as a vfsmount that receives | ||
404 | propagation events and does not forward propagation events. | ||
405 | |||
406 | A slave mount as the name implies has a master mount from which | ||
407 | mount/unmount events are received. Events do not propagate from | ||
408 | the slave mount to the master. Only a shared mount can be made | ||
409 | a slave by executing the following command | ||
410 | |||
411 | mount --make-slave mount | ||
412 | |||
413 | A shared mount that is made as a slave is no more shared unless | ||
414 | modified to become shared. | ||
415 | |||
416 | (3) Shared and Slave | ||
417 | |||
418 | A vfsmount can be both shared as well as slave. This state | ||
419 | indicates that the mount is a slave of some vfsmount, and | ||
420 | has its own peer group too. This vfsmount receives propagation | ||
421 | events from its master vfsmount, and also forwards propagation | ||
422 | events to its 'peer group' and to its slave vfsmounts. | ||
423 | |||
424 | Strictly speaking, the vfsmount is shared having its own | ||
425 | peer group, and this peer-group is a slave of some other | ||
426 | peer group. | ||
427 | |||
428 | Only a slave vfsmount can be made as 'shared and slave' by | ||
429 | either executing the following command | ||
430 | mount --make-shared mount | ||
431 | or by moving the slave vfsmount under a shared vfsmount. | ||
432 | |||
433 | (4) Private mount | ||
434 | |||
435 | A 'private mount' is defined as vfsmount that does not | ||
436 | receive or forward any propagation events. | ||
437 | |||
438 | (5) Unbindable mount | ||
439 | |||
440 | A 'unbindable mount' is defined as vfsmount that does not | ||
441 | receive or forward any propagation events and cannot | ||
442 | be bind mounted. | ||
443 | |||
444 | |||
445 | State diagram: | ||
446 | The state diagram below explains the state transition of a mount, | ||
447 | in response to various commands. | ||
448 | ------------------------------------------------------------------------ | ||
449 | | |make-shared | make-slave | make-private |make-unbindab| | ||
450 | --------------|------------|--------------|--------------|-------------| | ||
451 | |shared |shared |*slave/private| private | unbindable | | ||
452 | | | | | | | | ||
453 | |-------------|------------|--------------|--------------|-------------| | ||
454 | |slave |shared | **slave | private | unbindable | | ||
455 | | |and slave | | | | | ||
456 | |-------------|------------|--------------|--------------|-------------| | ||
457 | |shared |shared | slave | private | unbindable | | ||
458 | |and slave |and slave | | | | | ||
459 | |-------------|------------|--------------|--------------|-------------| | ||
460 | |private |shared | **private | private | unbindable | | ||
461 | |-------------|------------|--------------|--------------|-------------| | ||
462 | |unbindable |shared |**unbindable | private | unbindable | | ||
463 | ------------------------------------------------------------------------ | ||
464 | |||
465 | * if the shared mount is the only mount in its peer group, making it | ||
466 | slave, makes it private automatically. Note that there is no master to | ||
467 | which it can be slaved to. | ||
468 | |||
469 | ** slaving a non-shared mount has no effect on the mount. | ||
470 | |||
471 | Apart from the commands listed above, the 'move' operation also changes | ||
472 | the state of a mount depending on type of the destination mount. It's | ||
473 | explained in section 5d. | ||
474 | |||
475 | 5b) Bind semantics | ||
476 | |||
477 | Consider the following command | ||
478 | |||
479 | mount --bind A/a B/b | ||
480 | |||
481 | where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' | ||
482 | is the destination mount and 'b' is the dentry in the destination mount. | ||
483 | |||
484 | The outcome depends on the type of mount of 'A' and 'B'. The table | ||
485 | below contains quick reference. | ||
486 | --------------------------------------------------------------------------- | ||
487 | | BIND MOUNT OPERATION | | ||
488 | |************************************************************************** | ||
489 | |source(A)->| shared | private | slave | unbindable | | ||
490 | | dest(B) | | | | | | ||
491 | | | | | | | | | ||
492 | | v | | | | | | ||
493 | |************************************************************************** | ||
494 | | shared | shared | shared | shared & slave | invalid | | ||
495 | | | | | | | | ||
496 | |non-shared| shared | private | slave | invalid | | ||
497 | *************************************************************************** | ||
498 | |||
499 | Details: | ||
500 | |||
501 | 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' | ||
502 | which is clone of 'A', is created. Its root dentry is 'a' . 'C' is | ||
503 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | ||
504 | are created and mounted at the dentry 'b' on all mounts where 'B' | ||
505 | propagates to. A new propagation tree containing 'C1',..,'Cn' is | ||
506 | created. This propagation tree is identical to the propagation tree of | ||
507 | 'B'. And finally the peer-group of 'C' is merged with the peer group | ||
508 | of 'A'. | ||
509 | |||
510 | 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' | ||
511 | which is clone of 'A', is created. Its root dentry is 'a'. 'C' is | ||
512 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | ||
513 | are created and mounted at the dentry 'b' on all mounts where 'B' | ||
514 | propagates to. A new propagation tree is set containing all new mounts | ||
515 | 'C', 'C1', .., 'Cn' with exactly the same configuration as the | ||
516 | propagation tree for 'B'. | ||
517 | |||
518 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new | ||
519 | mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . | ||
520 | 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', | ||
521 | 'C3' ... are created and mounted at the dentry 'b' on all mounts where | ||
522 | 'B' propagates to. A new propagation tree containing the new mounts | ||
523 | 'C','C1',.. 'Cn' is created. This propagation tree is identical to the | ||
524 | propagation tree for 'B'. And finally the mount 'C' and its peer group | ||
525 | is made the slave of mount 'Z'. In other words, mount 'C' is in the | ||
526 | state 'slave and shared'. | ||
527 | |||
528 | 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a | ||
529 | invalid operation. | ||
530 | |||
531 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | ||
532 | unbindable) mount. A new mount 'C' which is clone of 'A', is created. | ||
533 | Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. | ||
534 | |||
535 | 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' | ||
536 | which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is | ||
537 | mounted on mount 'B' at dentry 'b'. 'C' is made a member of the | ||
538 | peer-group of 'A'. | ||
539 | |||
540 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A | ||
541 | new mount 'C' which is a clone of 'A' is created. Its root dentry is | ||
542 | 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a | ||
543 | slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of | ||
544 | 'Z'. All mount/unmount events on 'Z' propagate to 'A' and 'C'. But | ||
545 | mount/unmount on 'A' do not propagate anywhere else. Similarly | ||
546 | mount/unmount on 'C' do not propagate anywhere else. | ||
547 | |||
548 | 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a | ||
549 | invalid operation. A unbindable mount cannot be bind mounted. | ||
550 | |||
551 | 5c) Rbind semantics | ||
552 | |||
553 | rbind is same as bind. Bind replicates the specified mount. Rbind | ||
554 | replicates all the mounts in the tree belonging to the specified mount. | ||
555 | Rbind mount is bind mount applied to all the mounts in the tree. | ||
556 | |||
557 | If the source tree that is rbind has some unbindable mounts, | ||
558 | then the subtree under the unbindable mount is pruned in the new | ||
559 | location. | ||
560 | |||
561 | eg: lets say we have the following mount tree. | ||
562 | |||
563 | A | ||
564 | / \ | ||
565 | B C | ||
566 | / \ / \ | ||
567 | D E F G | ||
568 | |||
569 | Lets say all the mount except the mount C in the tree are | ||
570 | of a type other than unbindable. | ||
571 | |||
572 | If this tree is rbound to say Z | ||
573 | |||
574 | We will have the following tree at the new location. | ||
575 | |||
576 | Z | ||
577 | | | ||
578 | A' | ||
579 | / | ||
580 | B' Note how the tree under C is pruned | ||
581 | / \ in the new location. | ||
582 | D' E' | ||
583 | |||
584 | |||
585 | |||
586 | 5d) Move semantics | ||
587 | |||
588 | Consider the following command | ||
589 | |||
590 | mount --move A B/b | ||
591 | |||
592 | where 'A' is the source mount, 'B' is the destination mount and 'b' is | ||
593 | the dentry in the destination mount. | ||
594 | |||
595 | The outcome depends on the type of the mount of 'A' and 'B'. The table | ||
596 | below is a quick reference. | ||
597 | --------------------------------------------------------------------------- | ||
598 | | MOVE MOUNT OPERATION | | ||
599 | |************************************************************************** | ||
600 | | source(A)->| shared | private | slave | unbindable | | ||
601 | | dest(B) | | | | | | ||
602 | | | | | | | | | ||
603 | | v | | | | | | ||
604 | |************************************************************************** | ||
605 | | shared | shared | shared |shared and slave| invalid | | ||
606 | | | | | | | | ||
607 | |non-shared| shared | private | slave | unbindable | | ||
608 | *************************************************************************** | ||
609 | NOTE: moving a mount residing under a shared mount is invalid. | ||
610 | |||
611 | Details follow: | ||
612 | |||
613 | 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is | ||
614 | mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' | ||
615 | are created and mounted at dentry 'b' on all mounts that receive | ||
616 | propagation from mount 'B'. A new propagation tree is created in the | ||
617 | exact same configuration as that of 'B'. This new propagation tree | ||
618 | contains all the new mounts 'A1', 'A2'... 'An'. And this new | ||
619 | propagation tree is appended to the already existing propagation tree | ||
620 | of 'A'. | ||
621 | |||
622 | 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is | ||
623 | mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An' | ||
624 | are created and mounted at dentry 'b' on all mounts that receive | ||
625 | propagation from mount 'B'. The mount 'A' becomes a shared mount and a | ||
626 | propagation tree is created which is identical to that of | ||
627 | 'B'. This new propagation tree contains all the new mounts 'A1', | ||
628 | 'A2'... 'An'. | ||
629 | |||
630 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The | ||
631 | mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', | ||
632 | 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that | ||
633 | receive propagation from mount 'B'. A new propagation tree is created | ||
634 | in the exact same configuration as that of 'B'. This new propagation | ||
635 | tree contains all the new mounts 'A1', 'A2'... 'An'. And this new | ||
636 | propagation tree is appended to the already existing propagation tree of | ||
637 | 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also | ||
638 | becomes 'shared'. | ||
639 | |||
640 | 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation | ||
641 | is invalid. Because mounting anything on the shared mount 'B' can | ||
642 | create new mounts that get mounted on the mounts that receive | ||
643 | propagation from 'B'. And since the mount 'A' is unbindable, cloning | ||
644 | it to mount at other mountpoints is not possible. | ||
645 | |||
646 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | ||
647 | unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. | ||
648 | |||
649 | 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' | ||
650 | is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a | ||
651 | shared mount. | ||
652 | |||
653 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. | ||
654 | The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' | ||
655 | continues to be a slave mount of mount 'Z'. | ||
656 | |||
657 | 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount | ||
658 | 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a | ||
659 | unbindable mount. | ||
660 | |||
661 | 5e) Mount semantics | ||
662 | |||
663 | Consider the following command | ||
664 | |||
665 | mount device B/b | ||
666 | |||
667 | 'B' is the destination mount and 'b' is the dentry in the destination | ||
668 | mount. | ||
669 | |||
670 | The above operation is the same as bind operation with the exception | ||
671 | that the source mount is always a private mount. | ||
672 | |||
673 | |||
674 | 5f) Unmount semantics | ||
675 | |||
676 | Consider the following command | ||
677 | |||
678 | umount A | ||
679 | |||
680 | where 'A' is a mount mounted on mount 'B' at dentry 'b'. | ||
681 | |||
682 | If mount 'B' is shared, then all most-recently-mounted mounts at dentry | ||
683 | 'b' on mounts that receive propagation from mount 'B' and do not have | ||
684 | sub-mounts within them are unmounted. | ||
685 | |||
686 | Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to | ||
687 | each other. | ||
688 | |||
689 | lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount | ||
690 | 'B1', 'B2' and 'B3' respectively. | ||
691 | |||
692 | lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on | ||
693 | mount 'B1', 'B2' and 'B3' respectively. | ||
694 | |||
695 | if 'C1' is unmounted, all the mounts that are most-recently-mounted on | ||
696 | 'B1' and on the mounts that 'B1' propagates-to are unmounted. | ||
697 | |||
698 | 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount | ||
699 | on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'. | ||
700 | |||
701 | So all 'C1', 'C2' and 'C3' should be unmounted. | ||
702 | |||
703 | If any of 'C2' or 'C3' has some child mounts, then that mount is not | ||
704 | unmounted, but all other mounts are unmounted. However if 'C1' is told | ||
705 | to be unmounted and 'C1' has some sub-mounts, the umount operation | ||
706 | fails entirely. | ||
707 | |||
708 | 5g) Clone Namespace | ||
709 | |||
710 | A cloned namespace contains all the mounts as that of the parent | ||
711 | namespace. | ||
712 | |||
713 | Lets say 'A' and 'B' are the corresponding mounts in the parent and the | ||
714 | child namespace. | ||
715 | |||
716 | If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to | ||
717 | each other. | ||
718 | |||
719 | If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of | ||
720 | 'Z'. | ||
721 | |||
722 | If 'A' is a private mount, then 'B' is a private mount too. | ||
723 | |||
724 | If 'A' is unbindable mount, then 'B' is a unbindable mount too. | ||
725 | |||
726 | |||
727 | 6) Quiz | ||
728 | |||
729 | A. What is the result of the following command sequence? | ||
730 | |||
731 | mount --bind /mnt /mnt | ||
732 | mount --make-shared /mnt | ||
733 | mount --bind /mnt /tmp | ||
734 | mount --move /tmp /mnt/1 | ||
735 | |||
736 | what should the contents of /mnt, /mnt/1 and /mnt/1/1 be? | ||
737 | Should they all be identical? or should /mnt and /mnt/1 be | ||
738 | identical only? | ||
739 | |||
740 | |||
741 | B. What is the result of the following command sequence? | ||
742 | |||
743 | mount --make-rshared / | ||
744 | mkdir -p /v/1 | ||
745 | mount --rbind / /v/1 | ||
746 | |||
747 | what should the content of /v/1/v/1 be? | ||
748 | |||
749 | |||
750 | C. What is the result of the following command sequence? | ||
751 | |||
752 | mount --bind /mnt /mnt | ||
753 | mount --make-shared /mnt | ||
754 | mkdir -p /mnt/1/2/3 /mnt/1/test | ||
755 | mount --bind /mnt/1 /tmp | ||
756 | mount --make-slave /mnt | ||
757 | mount --make-shared /mnt | ||
758 | mount --bind /mnt/1/2 /tmp1 | ||
759 | mount --make-slave /mnt | ||
760 | |||
761 | At this point we have the first mount at /tmp and | ||
762 | its root dentry is 1. Lets call this mount 'A' | ||
763 | And then we have a second mount at /tmp1 with root | ||
764 | dentry 2. Lets call this mount 'B' | ||
765 | Next we have a third mount at /mnt with root dentry | ||
766 | mnt. Lets call this mount 'C' | ||
767 | |||
768 | 'B' is the slave of 'A' and 'C' is a slave of 'B' | ||
769 | A -> B -> C | ||
770 | |||
771 | at this point if we execute the following command | ||
772 | |||
773 | mount --bind /bin /tmp/test | ||
774 | |||
775 | The mount is attempted on 'A' | ||
776 | |||
777 | will the mount propagate to 'B' and 'C' ? | ||
778 | |||
779 | What would the contents of | ||
780 | /mnt/1/test be? | ||
781 | |||
782 | 7) FAQ | ||
783 | |||
784 | Q1. Why is bind mount needed? How is it different from symbolic links? | ||
785 | symbolic links can get stale if the destination mount gets | ||
786 | unmounted or moved. Bind mounts continue to exist even if the | ||
787 | other mount is unmounted or moved. | ||
788 | |||
789 | Q2. Why can't the shared subtree be implemented using exportfs? | ||
790 | |||
791 | exportfs is a heavyweight way of accomplishing part of what | ||
792 | shared subtree can do. I cannot imagine a way to implement the | ||
793 | semantics of slave mount using exportfs. | ||
794 | |||
795 | Q3. Why is unbindable mount needed? | ||
796 | |||
797 | Let's say we want to replicate the mount tree at multiple | ||
798 | locations within the same subtree. | ||
799 | |||
800 | If one rbind mounts a tree within the same subtree 'n' times, | ||
801 | the number of mounts created is an exponential function of 'n'. | ||
802 | Having unbindable mounts can help prune the unneeded bind | ||
803 | mounts. Here is an example. | ||
804 | |||
805 | step 1: | ||
806 | lets say the root tree has just two directories with | ||
807 | one vfsmount. | ||
808 | root | ||
809 | / \ | ||
810 | tmp usr | ||
811 | |||
812 | And we want to replicate the tree at multiple | ||
813 | mountpoints under /root/tmp | ||
814 | |||
815 | step2: | ||
816 | mount --make-shared /root | ||
817 | |||
818 | mkdir -p /tmp/m1 | ||
819 | |||
820 | mount --rbind /root /tmp/m1 | ||
821 | |||
822 | the new tree now looks like this: | ||
823 | |||
824 | root | ||
825 | / \ | ||
826 | tmp usr | ||
827 | / | ||
828 | m1 | ||
829 | / \ | ||
830 | tmp usr | ||
831 | / | ||
832 | m1 | ||
833 | |||
834 | it has two vfsmounts | ||
835 | |||
836 | step3: | ||
837 | mkdir -p /tmp/m2 | ||
838 | mount --rbind /root /tmp/m2 | ||
839 | |||
840 | the new tree now looks like this: | ||
841 | |||
842 | root | ||
843 | / \ | ||
844 | tmp usr | ||
845 | / \ | ||
846 | m1 m2 | ||
847 | / \ / \ | ||
848 | tmp usr tmp usr | ||
849 | / \ / | ||
850 | m1 m2 m1 | ||
851 | / \ / \ | ||
852 | tmp usr tmp usr | ||
853 | / / \ | ||
854 | m1 m1 m2 | ||
855 | / \ | ||
856 | tmp usr | ||
857 | / \ | ||
858 | m1 m2 | ||
859 | |||
860 | it has 6 vfsmounts | ||
861 | |||
862 | step 4: | ||
863 | mkdir -p /tmp/m3 | ||
864 | mount --rbind /root /tmp/m3 | ||
865 | |||
866 | I won't draw the tree, but it has 24 vfsmounts. | ||
867 | |||
868 | |||
869 | At step i the number of vfsmounts is V[i] = i*V[i-1], i.e. it | ||
870 | grows factorially (even faster than an exponential function). And | ||
871 | this tree has way more mounts than we really needed in the first place. | ||
872 | |||
873 | One could use a series of umounts at each step to prune | ||
874 | out the unneeded mounts. But there is a better solution. | ||
875 | Unbindable mounts come in handy here. | ||
876 | |||
877 | step 1: | ||
878 | lets say the root tree has just two directories with | ||
879 | one vfsmount. | ||
880 | root | ||
881 | / \ | ||
882 | tmp usr | ||
883 | |||
884 | How do we set up the same tree at multiple locations under | ||
885 | /root/tmp | ||
886 | |||
887 | step2: | ||
888 | mount --bind /root/tmp /root/tmp | ||
889 | |||
890 | mount --make-rshared /root | ||
891 | mount --make-unbindable /root/tmp | ||
892 | |||
893 | mkdir -p /tmp/m1 | ||
894 | |||
895 | mount --rbind /root /tmp/m1 | ||
896 | |||
897 | the new tree now looks like this: | ||
898 | |||
899 | root | ||
900 | / \ | ||
901 | tmp usr | ||
902 | / | ||
903 | m1 | ||
904 | / \ | ||
905 | tmp usr | ||
906 | |||
907 | step3: | ||
908 | mkdir -p /tmp/m2 | ||
909 | mount --rbind /root /tmp/m2 | ||
910 | |||
911 | the new tree now looks like this: | ||
912 | |||
913 | root | ||
914 | / \ | ||
915 | tmp usr | ||
916 | / \ | ||
917 | m1 m2 | ||
918 | / \ / \ | ||
919 | tmp usr tmp usr | ||
920 | |||
921 | step4: | ||
922 | |||
923 | mkdir -p /tmp/m3 | ||
924 | mount --rbind /root /tmp/m3 | ||
925 | |||
926 | the new tree now looks like this: | ||
927 | |||
928 | root | ||
929 | / \ | ||
930 | tmp usr | ||
931 | / \ \ | ||
932 | m1 m2 m3 | ||
933 | / \ / \ / \ | ||
934 | tmp usr tmp usr tmp usr | ||
935 | |||
936 | 8) Implementation | ||
937 | |||
938 | 8A) Datastructure | ||
939 | |||
940 | 4 new fields are introduced to struct vfsmount | ||
941 | ->mnt_share | ||
942 | ->mnt_slave_list | ||
943 | ->mnt_slave | ||
944 | ->mnt_master | ||
945 | |||
946 | ->mnt_share links together all the mounts to/from which this vfsmount | ||
947 | sends/receives propagation events. | ||
948 | |||
949 | ->mnt_slave_list links all the mounts to which this vfsmount | ||
950 | propagates. | ||
951 | |||
952 | ->mnt_slave links together all the slaves that its master vfsmount | ||
953 | propagates to. | ||
954 | |||
955 | ->mnt_master points to the master vfsmount from which this vfsmount | ||
956 | receives propagation. | ||
957 | |||
958 | ->mnt_flags takes two more flags to indicate the propagation status of | ||
959 | the vfsmount. MNT_SHARED indicates that the vfsmount is a shared | ||
960 | vfsmount. MNT_UNBINDABLE indicates that the vfsmount cannot be | ||
961 | replicated. | ||
962 | |||
963 | All the shared vfsmounts in a peer group form a cyclic list through | ||
964 | ->mnt_share. | ||
965 | |||
966 | All vfsmounts with the same ->mnt_master form a cyclic list anchored | ||
967 | in ->mnt_master->mnt_slave_list and going through ->mnt_slave. | ||
968 | |||
969 | ->mnt_master can point to arbitrary (and possibly different) members | ||
970 | of master peer group. To find all immediate slaves of a peer group | ||
971 | you need to go through _all_ ->mnt_slave_list of its members. | ||
972 | Conceptually it's just a single set - distribution among the | ||
973 | individual lists does not affect propagation or the way propagation | ||
974 | tree is modified by operations. | ||
975 | |||
976 | An example propagation tree looks as shown in the figure below. | ||
977 | [ NOTE: Though it looks like a forest, if we consider all the shared | ||
978 | mounts as a conceptual entity called 'pnode', it becomes a tree] | ||
979 | |||
980 | |||
981 | A <--> B <--> C <---> D | ||
982 | /|\ /| |\ | ||
983 | / F G J K H I | ||
984 | / | ||
985 | E<-->K | ||
986 | /|\ | ||
987 | M L N | ||
988 | |||
989 | In the above figure A, B, C and D are all shared and propagate to each | ||
990 | other. 'A' has got 3 slave mounts 'E', 'F' and 'G'; 'C' has got 2 slave | ||
991 | mounts 'J' and 'K'; and 'D' has got two slave mounts 'H' and 'I'. | ||
992 | 'E' is also shared with 'K' and they propagate to each other. And | ||
993 | 'K' has 3 slaves 'M', 'L' and 'N'. | ||
994 | |||
995 | A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D' | ||
996 | |||
997 | A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' | ||
998 | |||
999 | E's ->mnt_share links with ->mnt_share of K | ||
1000 | 'E', 'K', 'F', 'G' have their ->mnt_master point to struct | ||
1001 | vfsmount of 'A' | ||
1002 | 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' | ||
1003 | K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' | ||
1004 | |||
1005 | C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' | ||
1006 | J and K's ->mnt_master points to struct vfsmount of C | ||
1007 | and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' | ||
1008 | 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. | ||
1009 | |||
1010 | |||
1011 | NOTE: The propagation tree is orthogonal to the mount tree. | ||
1012 | |||
1013 | |||
1014 | 8B) Algorithm: | ||
1015 | |||
1016 | The crux of the implementation resides in rbind/move operation. | ||
1017 | |||
1018 | The overall algorithm breaks the operation into 3 phases: (look at | ||
1019 | attach_recursive_mnt() and propagate_mnt()) | ||
1020 | |||
1021 | 1. prepare phase. | ||
1022 | 2. commit phase. | ||
1023 | 3. abort phase. | ||
1024 | |||
1025 | Prepare phase: | ||
1026 | |||
1027 | for each mount in the source tree: | ||
1028 | a) Create the necessary number of mount trees to | ||
1029 | be attached to each of the mounts that receive | ||
1030 | propagation from the destination mount. | ||
1031 | b) Do not attach any of the trees to its destination. | ||
1032 | However note down its ->mnt_parent and ->mnt_mountpoint | ||
1033 | c) Link all the new mounts to form a propagation tree that | ||
1034 | is identical to the propagation tree of the destination | ||
1035 | mount. | ||
1036 | |||
1037 | If this phase is successful, there should be 'n' new | ||
1038 | propagation trees; where 'n' is the number of mounts in the | ||
1039 | source tree. Go to the commit phase. | ||
1040 | |||
1041 | Also there should be 'm' new mount trees, where 'm' is | ||
1042 | the number of mounts to which the destination mount | ||
1043 | propagates to. | ||
1044 | |||
1045 | if any memory allocations fail, go to the abort phase. | ||
1046 | |||
1047 | Commit phase | ||
1048 | attach each of the mount trees to their corresponding | ||
1049 | destination mounts. | ||
1050 | |||
1051 | Abort phase | ||
1052 | delete all the newly created trees. | ||
1053 | |||
1054 | NOTE: all the propagation related functionality resides in the file | ||
1055 | pnode.c | ||
1056 | |||
1057 | |||
1058 | ------------------------------------------------------------------------ | ||
1059 | |||
1060 | version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com) | ||
1061 | version 0.2 (Incorporated comments from Al Viro) | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 9d019d35728f..bd55038b56f5 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -203,8 +203,6 @@ struct super_operations { | |||
203 | struct inode *(*alloc_inode)(struct super_block *sb); | 203 | struct inode *(*alloc_inode)(struct super_block *sb); |
204 | void (*destroy_inode)(struct inode *); | 204 | void (*destroy_inode)(struct inode *); |
205 | 205 | ||
206 | void (*read_inode) (struct inode *); | ||
207 | |||
208 | void (*dirty_inode) (struct inode *); | 206 | void (*dirty_inode) (struct inode *); |
209 | int (*write_inode) (struct inode *, int); | 207 | int (*write_inode) (struct inode *, int); |
210 | void (*put_inode) (struct inode *); | 208 | void (*put_inode) (struct inode *); |
@@ -242,15 +240,6 @@ or bottom half). | |||
242 | ->alloc_inode was defined and simply undoes anything done by | 240 | ->alloc_inode was defined and simply undoes anything done by |
243 | ->alloc_inode. | 241 | ->alloc_inode. |
244 | 242 | ||
245 | read_inode: this method is called to read a specific inode from the | ||
246 | mounted filesystem. The i_ino member in the struct inode is | ||
247 | initialized by the VFS to indicate which inode to read. Other | ||
248 | members are filled in by this method. | ||
249 | |||
250 | You can set this to NULL and use iget5_locked() instead of iget() | ||
251 | to read inodes. This is necessary for filesystems for which the | ||
252 | inode number is not sufficient to identify an inode. | ||
253 | |||
254 | dirty_inode: this method is called by the VFS to mark an inode dirty. | 243 | dirty_inode: this method is called by the VFS to mark an inode dirty. |
255 | 244 | ||
256 | write_inode: this method is called when the VFS needs to write an | 245 | write_inode: this method is called when the VFS needs to write an |
@@ -308,9 +297,9 @@ or bottom half). | |||
308 | 297 | ||
309 | quota_write: called by the VFS to write to filesystem quota file. | 298 | quota_write: called by the VFS to write to filesystem quota file. |
310 | 299 | ||
311 | The read_inode() method is responsible for filling in the "i_op" | 300 | Whoever sets up the inode is responsible for filling in the "i_op" field. This |
312 | field. This is a pointer to a "struct inode_operations" which | 301 | is a pointer to a "struct inode_operations" which describes the methods that |
313 | describes the methods that can be performed on individual inodes. | 302 | can be performed on individual inodes. |
314 | 303 | ||
315 | 304 | ||
316 | The Inode Object | 305 | The Inode Object |