diff options
-rw-r--r-- | Documentation/sharedsubtree.txt | 1060 |
1 files changed, 1060 insertions, 0 deletions
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt new file mode 100644 index 000000000000..2d8f403eb6eb --- /dev/null +++ b/Documentation/sharedsubtree.txt | |||
@@ -0,0 +1,1060 @@ | |||
1 | Shared Subtrees | ||
2 | --------------- | ||
3 | |||
4 | Contents: | ||
5 | 1) Overview | ||
6 | 2) Features | ||
7 | 3) smount command | ||
8 | 4) Use cases | ||
9 | 5) Detailed semantics | ||
10 | 6) Quiz | ||
11 | 7) FAQ | ||
12 | 8) Implementation | ||
13 | |||
14 | |||
15 | 1) Overview | ||
16 | ----------- | ||
17 | |||
18 | Consider the following situation: | ||
19 | |||
20 | A process wants to clone its own namespace, but still wants to access the CD | ||
21 | that got mounted recently. Shared subtree semantics provide the necessary | ||
22 | mechanism to accomplish the above. | ||
23 | |||
24 | It provides the necessary building blocks for features like per-user-namespace | ||
25 | and versioned filesystem. | ||
26 | |||
27 | 2) Features | ||
28 | ----------- | ||
29 | |||
30 | Shared subtree provides four different flavors of mounts; struct vfsmount to be | ||
31 | precise | ||
32 | |||
33 | a. shared mount | ||
34 | b. slave mount | ||
35 | c. private mount | ||
36 | d. unbindable mount | ||
37 | |||
38 | |||
39 | 2a) A shared mount can be replicated to as many mountpoints as needed, and all | ||
40 | the replicas continue to be exactly the same. | ||
41 | |||
42 | Here is an example: | ||
43 | |||
44 | Lets say /mnt has a mount that is shared. | ||
45 | mount --make-shared /mnt | ||
46 | |||
47 | note: mount command does not yet support the --make-shared flag. | ||
48 | I have included a small C program which does the same by executing | ||
49 | 'smount /mnt shared' | ||
50 | |||
51 | #mount --bind /mnt /tmp | ||
52 | The above command replicates the mount at /mnt to the mountpoint /tmp | ||
53 | and the contents of both the mounts remain identical. | ||
54 | |||
55 | #ls /mnt | ||
56 | a b c | ||
57 | |||
58 | #ls /tmp | ||
59 | a b c | ||
60 | |||
61 | Now lets say we mount a device at /tmp/a | ||
62 | #mount /dev/sd0 /tmp/a | ||
63 | |||
64 | #ls /tmp/a | ||
65 | t1 t2 t3 | ||
66 | |||
67 | #ls /mnt/a | ||
68 | t1 t2 t3 | ||
69 | |||
70 | Note that the mount has propagated to the mount at /mnt as well. | ||
71 | |||
72 | And the same is true even when /dev/sd0 is mounted on /mnt/a. The | ||
73 | contents will be visible under /tmp/a too. | ||
74 | |||
75 | |||
76 | 2b) A slave mount is like a shared mount except that mount and umount events | ||
77 | only propagate towards it. | ||
78 | |||
79 | All slave mounts have a master mount which is a shared mount. | ||
80 | |||
81 | Here is an example: | ||
82 | |||
83 | Lets say /mnt has a mount which is shared. | ||
84 | #mount --make-shared /mnt | ||
85 | |||
86 | Lets bind mount /mnt to /tmp | ||
87 | #mount --bind /mnt /tmp | ||
88 | |||
89 | the new mount at /tmp becomes a shared mount and it is a replica of | ||
90 | the mount at /mnt. | ||
91 | |||
92 | Now let's make the mount at /tmp a slave of /mnt | ||
93 | #mount --make-slave /tmp | ||
94 | [or smount /tmp slave] | ||
95 | |||
96 | lets mount /dev/sd0 on /mnt/a | ||
97 | #mount /dev/sd0 /mnt/a | ||
98 | |||
99 | #ls /mnt/a | ||
100 | t1 t2 t3 | ||
101 | |||
102 | #ls /tmp/a | ||
103 | t1 t2 t3 | ||
104 | |||
105 | Note the mount event has propagated to the mount at /tmp | ||
106 | |||
107 | However lets see what happens if we mount something on the mount at /tmp | ||
108 | |||
109 | #mount /dev/sd1 /tmp/b | ||
110 | |||
111 | #ls /tmp/b | ||
112 | s1 s2 s3 | ||
113 | |||
114 | #ls /mnt/b | ||
115 | |||
116 | Note how the mount event has not propagated to the mount at | ||
117 | /mnt | ||
118 | |||
119 | |||
120 | 2c) A private mount does not forward or receive propagation. | ||
121 | |||
122 | This is the mount we are familiar with. It's the default type. | ||
123 | |||
124 | |||
125 | 2d) An unbindable mount is an unbindable private mount | ||
126 | |||
127 | Let's say we have a mount at /mnt and we make it unbindable | ||
128 | |||
129 | #mount --make-unbindable /mnt | ||
130 | [ smount /mnt unbindable ] | ||
131 | |||
132 | Lets try to bind mount this mount somewhere else. | ||
133 | # mount --bind /mnt /tmp | ||
134 | mount: wrong fs type, bad option, bad superblock on /mnt, | ||
135 | or too many mounted file systems | ||
136 | |||
137 | Binding an unbindable mount is an invalid operation. | ||
138 | |||
139 | |||
140 | 3) smount command | ||
141 | |||
142 | Currently the mount command is not aware of shared subtree features. | ||
143 | Work is in progress to add the support in mount ( util-linux package ). | ||
144 | Till then use the following program. | ||
145 | |||
146 | ------------------------------------------------------------------------ | ||
147 | // | ||
148 | //this code was developed by Miklos Szeredi <miklos@szeredi.hu> | ||
149 | //and modified by Ram Pai <linuxram@us.ibm.com> | ||
150 | // sample usage: | ||
151 | // smount /tmp shared | ||
152 | // | ||
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/fsuid.h>
158 | |||
/*
 * Fallback definitions of the mount flags used by smount, for builds where
 * the included headers do not already define them.  Every expansion is fully
 * parenthesized so that it remains a single value inside a larger expression
 * (e.g. MS_SHARED + 1 must not parse as 1 << (20 + 1)).
 */
#ifndef MS_REC
#define MS_REC		0x4000		/* 16384: Recursive loopback */
#endif

#ifndef MS_SHARED
#define MS_SHARED	(1 << 20)	/* Shared */
#endif

#ifndef MS_PRIVATE
#define MS_PRIVATE	(1 << 18)	/* Private */
#endif

#ifndef MS_SLAVE
#define MS_SLAVE	(1 << 19)	/* Slave */
#endif

#ifndef MS_UNBINDABLE
#define MS_UNBINDABLE	(1 << 17)	/* Unbindable */
#endif
178 | |||
/*
 * smount: change the propagation type of an existing mount.
 *
 * Usage: smount <dir> <type>
 *
 * <type> is one of shared, slave, private or unbindable, or the same word
 * prefixed with 'r' (rshared, rslave, ...) to additionally pass MS_REC.
 *
 * Returns 0 on success, and 1 on a usage error, an unrecognized <type>, or a
 * failed mount() call (the latter is reported through perror()).
 */
int main(int argc, char *argv[])
{
	/* Maps each accepted <type> word to the flags handed to mount(). */
	static const struct {
		const char *name;
		unsigned long flags;
	} types[] = {
		{ "rshared",     MS_SHARED | MS_REC },
		{ "rslave",      MS_SLAVE | MS_REC },
		{ "rprivate",    MS_PRIVATE | MS_REC },
		{ "runbindable", MS_UNBINDABLE | MS_REC },
		{ "shared",      MS_SHARED },
		{ "slave",       MS_SLAVE },
		{ "private",     MS_PRIVATE },
		{ "unbindable",  MS_UNBINDABLE },
	};
	const size_t ntypes = sizeof(types) / sizeof(types[0]);
	unsigned long type = 0;	/* mount() takes its flags as unsigned long */
	size_t i;

	if (argc != 3) {
		fprintf(stderr, "usage: %s dir "
			"<rshared|rslave|rprivate|runbindable|shared|slave"
			"|private|unbindable>\n", argv[0]);
		return 1;
	}

	/* Echo the invocation before validating the requested type. */
	fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);

	for (i = 0; i < ntypes; i++) {
		if (strcmp(argv[2], types[i].name) == 0) {
			type = types[i].flags;
			break;
		}
	}
	if (i == ntypes) {
		fprintf(stderr, "invalid operation: %s\n", argv[2]);
		return 1;
	}

	/* Set the filesystem uid to the caller's real uid before mounting. */
	setfsuid(getuid());

	/*
	 * Only the target directory and the flags carry information here; the
	 * source, filesystem type and data arguments are placeholders.
	 */
	if (mount("", argv[1], "dontcare", type, "") == -1) {
		perror("mount");
		return 1;
	}
	return 0;
}
219 | ----------------------------------------------------------------------- | ||
220 | |||
221 | Copy the above code snippet into smount.c | ||
222 | gcc -o smount smount.c | ||
223 | |||
224 | |||
225 | (i) To mark all the mounts under /mnt as shared execute the following | ||
226 | command: | ||
227 | |||
228 | smount /mnt rshared | ||
229 | the corresponding syntax planned for mount command is | ||
230 | mount --make-rshared /mnt | ||
231 | |||
232 | just to mark a mount /mnt as shared, execute the following | ||
233 | command: | ||
234 | smount /mnt shared | ||
235 | the corresponding syntax planned for mount command is | ||
236 | mount --make-shared /mnt | ||
237 | |||
238 | (ii) To mark all the shared mounts under /mnt as slave execute the | ||
239 | following | ||
240 | |||
241 | command: | ||
242 | smount /mnt rslave | ||
243 | the corresponding syntax planned for mount command is | ||
244 | mount --make-rslave /mnt | ||
245 | |||
246 | just to mark a mount /mnt as slave, execute the following | ||
247 | command: | ||
248 | smount /mnt slave | ||
249 | the corresponding syntax planned for mount command is | ||
250 | mount --make-slave /mnt | ||
251 | |||
252 | (iii) To mark all the mounts under /mnt as private execute the | ||
253 | following command: | ||
254 | |||
255 | smount /mnt rprivate | ||
256 | the corresponding syntax planned for mount command is | ||
257 | mount --make-rprivate /mnt | ||
258 | |||
259 | just to mark a mount /mnt as private, execute the following | ||
260 | command: | ||
261 | smount /mnt private | ||
262 | the corresponding syntax planned for mount command is | ||
263 | mount --make-private /mnt | ||
264 | |||
265 | NOTE: by default all the mounts are created as private. But if | ||
266 | you want to change some shared/slave/unbindable mount as | ||
267 | private at a later point in time, this command can help. | ||
268 | |||
269 | (iv) To mark all the mounts under /mnt as unbindable execute the | ||
270 | following | ||
271 | |||
272 | command: | ||
273 | smount /mnt runbindable | ||
274 | the corresponding syntax planned for mount command is | ||
275 | mount --make-runbindable /mnt | ||
276 | |||
277 | just to mark a mount /mnt as unbindable, execute the following | ||
278 | command: | ||
279 | smount /mnt unbindable | ||
280 | the corresponding syntax planned for mount command is | ||
281 | mount --make-unbindable /mnt | ||
282 | |||
283 | |||
284 | 4) Use cases | ||
285 | ------------ | ||
286 | |||
287 | A) A process wants to clone its own namespace, but still wants to | ||
288 | access the CD that got mounted recently. | ||
289 | |||
290 | Solution: | ||
291 | |||
292 | The system administrator can make the mount at /cdrom shared | ||
293 | mount --bind /cdrom /cdrom | ||
294 | mount --make-shared /cdrom | ||
295 | |||
296 | Now any process that clones off a new namespace will have a | ||
297 | mount at /cdrom which is a replica of the same mount in the | ||
298 | parent namespace. | ||
299 | |||
300 | So when a CD is inserted and mounted at /cdrom that mount gets | ||
301 | propagated to the other mount at /cdrom in all the other clone | ||
302 | namespaces. | ||
303 | |||
304 | B) A process wants its mounts invisible to any other process, but | ||
305 | still be able to see the other system mounts. | ||
306 | |||
307 | Solution: | ||
308 | |||
309 | To begin with, the administrator can mark the entire mount tree | ||
310 | as shareable. | ||
311 | |||
312 | mount --make-rshared / | ||
313 | |||
314 | A new process can clone off a new namespace. And mark some part | ||
315 | of its namespace as slave | ||
316 | |||
317 | mount --make-rslave /myprivatetree | ||
318 | |||
319 | Henceforth any mounts within the /myprivatetree done by the | ||
320 | process will not show up in any other namespace. However mounts | ||
321 | done in the parent namespace under /myprivatetree still show | ||
322 | up in the process's namespace. | ||
323 | |||
324 | |||
325 | Apart from the above semantics this feature provides the | ||
326 | building blocks to solve the following problems: | ||
327 | |||
328 | C) Per-user namespace | ||
329 | |||
330 | The above semantics allows a way to share mounts across | ||
331 | namespaces. But namespaces are associated with processes. If | ||
332 | namespaces are made first class objects with user API to | ||
333 | associate/disassociate a namespace with userid, then each user | ||
334 | could have his/her own namespace and tailor it to his/her | ||
335 | requirements. Of course, it needs support from PAM. | ||
336 | |||
337 | D) Versioned files | ||
338 | |||
339 | If the entire mount tree is visible at multiple locations, then | ||
340 | an underlying versioning file system can return a different | ||
341 | version of the file depending on the path used to access that | ||
342 | file. | ||
343 | |||
344 | An example is: | ||
345 | |||
346 | mount --make-shared / | ||
347 | mount --rbind / /view/v1 | ||
348 | mount --rbind / /view/v2 | ||
349 | mount --rbind / /view/v3 | ||
350 | mount --rbind / /view/v4 | ||
351 | |||
352 | and if /usr has a versioning filesystem mounted, then that | ||
353 | mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and | ||
354 | /view/v4/usr too | ||
355 | |||
356 | A user can request v3 version of the file /usr/fs/namespace.c | ||
357 | by accessing /view/v3/usr/fs/namespace.c . The underlying | ||
358 | versioning filesystem can then decipher that v3 version of the | ||
359 | filesystem is being requested and return the corresponding | ||
360 | inode. | ||
361 | |||
362 | 5) Detailed semantics: | ||
363 | ------------------- | ||
364 | The section below explains the detailed semantics of | ||
365 | bind, rbind, move, mount, umount and clone-namespace operations. | ||
366 | |||
367 | Note: the word 'vfsmount' and the noun 'mount' have been used | ||
368 | to mean the same thing, throughout this document. | ||
369 | |||
370 | 5a) Mount states | ||
371 | |||
372 | A given mount can be in one of the following states | ||
373 | 1) shared | ||
374 | 2) slave | ||
375 | 3) shared and slave | ||
376 | 4) private | ||
377 | 5) unbindable | ||
378 | |||
379 | A 'propagation event' is defined as event generated on a vfsmount | ||
380 | that leads to mount or unmount actions in other vfsmounts. | ||
381 | |||
382 | A 'peer group' is defined as a group of vfsmounts that propagate | ||
383 | events to each other. | ||
384 | |||
385 | (1) Shared mounts | ||
386 | |||
387 | A 'shared mount' is defined as a vfsmount that belongs to a | ||
388 | 'peer group'. | ||
389 | |||
390 | For example: | ||
391 | mount --make-shared /mnt | ||
392 | mount --bind /mnt /tmp | ||
393 | |||
394 | The mount at /mnt and that at /tmp are both shared and belong | ||
395 | to the same peer group. Anything mounted or unmounted under | ||
396 | /mnt or /tmp reflect in all the other mounts of its peer | ||
397 | group. | ||
398 | |||
399 | |||
400 | (2) Slave mounts | ||
401 | |||
402 | A 'slave mount' is defined as a vfsmount that receives | ||
403 | propagation events and does not forward propagation events. | ||
404 | |||
405 | A slave mount as the name implies has a master mount from which | ||
406 | mount/unmount events are received. Events do not propagate from | ||
407 | the slave mount to the master. Only a shared mount can be made | ||
408 | a slave by executing the following command | ||
409 | |||
410 | mount --make-slave mount | ||
411 | |||
412 | A shared mount that is made as a slave is no longer shared unless | ||
413 | modified to become shared. | ||
414 | |||
415 | (3) Shared and Slave | ||
416 | |||
417 | A vfsmount can be both shared as well as slave. This state | ||
418 | indicates that the mount is a slave of some vfsmount, and | ||
419 | has its own peer group too. This vfsmount receives propagation | ||
420 | events from its master vfsmount, and also forwards propagation | ||
421 | events to its 'peer group' and to its slave vfsmounts. | ||
422 | |||
423 | Strictly speaking, the vfsmount is shared having its own | ||
424 | peer group, and this peer-group is a slave of some other | ||
425 | peer group. | ||
426 | |||
427 | Only a slave vfsmount can be made as 'shared and slave' by | ||
428 | either executing the following command | ||
429 | mount --make-shared mount | ||
430 | or by moving the slave vfsmount under a shared vfsmount. | ||
431 | |||
432 | (4) Private mount | ||
433 | |||
434 | A 'private mount' is defined as a vfsmount that does not | ||
435 | receive or forward any propagation events. | ||
436 | |||
437 | (5) Unbindable mount | ||
438 | |||
439 | An 'unbindable mount' is defined as a vfsmount that does not | ||
440 | receive or forward any propagation events and cannot | ||
441 | be bind mounted. | ||
442 | |||
443 | |||
444 | State diagram: | ||
445 | The state diagram below explains the state transition of a mount, | ||
446 | in response to various commands. | ||
447 | ------------------------------------------------------------------------ | ||
448 | | |make-shared | make-slave | make-private |make-unbindab| | ||
449 | --------------|------------|--------------|--------------|-------------| | ||
450 | |shared |shared |*slave/private| private | unbindable | | ||
451 | | | | | | | | ||
452 | |-------------|------------|--------------|--------------|-------------| | ||
453 | |slave |shared | **slave | private | unbindable | | ||
454 | | |and slave | | | | | ||
455 | |-------------|------------|--------------|--------------|-------------| | ||
456 | |shared |shared | slave | private | unbindable | | ||
457 | |and slave |and slave | | | | | ||
458 | |-------------|------------|--------------|--------------|-------------| | ||
459 | |private |shared | **private | private | unbindable | | ||
460 | |-------------|------------|--------------|--------------|-------------| | ||
461 | |unbindable |shared |**unbindable | private | unbindable | | ||
462 | ------------------------------------------------------------------------ | ||
463 | |||
464 | * if the shared mount is the only mount in its peer group, making it | ||
465 | slave, makes it private automatically. Note that there is no master to | ||
466 | which it can be slaved to. | ||
467 | |||
468 | ** slaving a non-shared mount has no effect on the mount. | ||
469 | |||
470 | Apart from the commands listed above, the 'move' operation also changes | ||
471 | the state of a mount depending on type of the destination mount. It's | ||
472 | explained in section 5d. | ||
473 | |||
474 | 5b) Bind semantics | ||
475 | |||
476 | Consider the following command | ||
477 | |||
478 | mount --bind A/a B/b | ||
479 | |||
480 | where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' | ||
481 | is the destination mount and 'b' is the dentry in the destination mount. | ||
482 | |||
483 | The outcome depends on the type of mount of 'A' and 'B'. The table | ||
484 | below contains quick reference. | ||
485 | --------------------------------------------------------------------------- | ||
486 | | BIND MOUNT OPERATION | | ||
487 | |************************************************************************** | ||
488 | |source(A)->| shared | private | slave | unbindable | | ||
489 | | dest(B) | | | | | | ||
490 | | | | | | | | | ||
491 | | v | | | | | | ||
492 | |************************************************************************** | ||
493 | | shared | shared | shared | shared & slave | invalid | | ||
494 | | | | | | | | ||
495 | |non-shared| shared | private | slave | invalid | | ||
496 | *************************************************************************** | ||
497 | |||
498 | Details: | ||
499 | |||
500 | 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' | ||
501 | which is clone of 'A', is created. Its root dentry is 'a' . 'C' is | ||
502 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | ||
503 | are created and mounted at the dentry 'b' on all mounts where 'B' | ||
504 | propagates to. A new propagation tree containing 'C1',..,'Cn' is | ||
505 | created. This propagation tree is identical to the propagation tree of | ||
506 | 'B'. And finally the peer-group of 'C' is merged with the peer group | ||
507 | of 'A'. | ||
508 | |||
509 | 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' | ||
510 | which is clone of 'A', is created. Its root dentry is 'a'. 'C' is | ||
511 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | ||
512 | are created and mounted at the dentry 'b' on all mounts where 'B' | ||
513 | propagates to. A new propagation tree is set containing all new mounts | ||
514 | 'C', 'C1', .., 'Cn' with exactly the same configuration as the | ||
515 | propagation tree for 'B'. | ||
516 | |||
517 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new | ||
518 | mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . | ||
519 | 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', | ||
520 | 'C3' ... are created and mounted at the dentry 'b' on all mounts where | ||
521 | 'B' propagates to. A new propagation tree containing the new mounts | ||
522 | 'C','C1',.. 'Cn' is created. This propagation tree is identical to the | ||
523 | propagation tree for 'B'. And finally the mount 'C' and its peer group | ||
524 | is made the slave of mount 'Z'. In other words, mount 'C' is in the | ||
525 | state 'slave and shared'. | ||
526 | |||
527 | 4. 'A' is an unbindable mount and 'B' is a shared mount. This is an | ||
528 | invalid operation. | ||
529 | |||
530 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | ||
531 | unbindable) mount. A new mount 'C' which is clone of 'A', is created. | ||
532 | Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. | ||
533 | |||
534 | 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' | ||
535 | which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is | ||
536 | mounted on mount 'B' at dentry 'b'. 'C' is made a member of the | ||
537 | peer-group of 'A'. | ||
538 | |||
539 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A | ||
540 | new mount 'C' which is a clone of 'A' is created. Its root dentry is | ||
541 | 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a | ||
542 | slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of | ||
543 | 'Z'. All mount/unmount events on 'Z' propagates to 'A' and 'C'. But | ||
544 | mount/unmount on 'A' do not propagate anywhere else. Similarly | ||
545 | mount/unmount on 'C' do not propagate anywhere else. | ||
546 | |||
547 | 8. 'A' is an unbindable mount and 'B' is a non-shared mount. This is an | ||
548 | invalid operation. An unbindable mount cannot be bind mounted. | ||
549 | |||
550 | 5c) Rbind semantics | ||
551 | |||
552 | rbind is same as bind. Bind replicates the specified mount. Rbind | ||
553 | replicates all the mounts in the tree belonging to the specified mount. | ||
554 | Rbind mount is bind mount applied to all the mounts in the tree. | ||
555 | |||
556 | If the source tree that is rbind has some unbindable mounts, | ||
557 | then the subtree under the unbindable mount is pruned in the new | ||
558 | location. | ||
559 | |||
560 | eg: lets say we have the following mount tree. | ||
561 | |||
562 | A | ||
563 | / \ | ||
564 | B C | ||
565 | / \ / \ | ||
566 | D E F G | ||
567 | |||
568 | Let's say all the mounts except the mount C in the tree are | ||
569 | of a type other than unbindable. | ||
570 | |||
571 | If this tree is rbound to say Z | ||
572 | |||
573 | We will have the following tree at the new location. | ||
574 | |||
575 | Z | ||
576 | | | ||
577 | A' | ||
578 | / | ||
579 | B' Note how the tree under C is pruned | ||
580 | / \ in the new location. | ||
581 | D' E' | ||
582 | |||
583 | |||
584 | |||
585 | 5d) Move semantics | ||
586 | |||
587 | Consider the following command | ||
588 | |||
589 | mount --move A B/b | ||
590 | |||
591 | where 'A' is the source mount, 'B' is the destination mount and 'b' is | ||
592 | the dentry in the destination mount. | ||
593 | |||
594 | The outcome depends on the type of the mount of 'A' and 'B'. The table | ||
595 | below is a quick reference. | ||
596 | --------------------------------------------------------------------------- | ||
597 | | MOVE MOUNT OPERATION | | ||
598 | |************************************************************************** | ||
599 | | source(A)->| shared | private | slave | unbindable | | ||
600 | | dest(B) | | | | | | ||
601 | | | | | | | | | ||
602 | | v | | | | | | ||
603 | |************************************************************************** | ||
604 | | shared | shared | shared |shared and slave| invalid | | ||
605 | | | | | | | | ||
606 | |non-shared| shared | private | slave | unbindable | | ||
607 | *************************************************************************** | ||
608 | NOTE: moving a mount residing under a shared mount is invalid. | ||
609 | |||
610 | Details follow: | ||
611 | |||
612 | 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is | ||
613 | mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' | ||
614 | are created and mounted at dentry 'b' on all mounts that receive | ||
615 | propagation from mount 'B'. A new propagation tree is created in the | ||
616 | exact same configuration as that of 'B'. This new propagation tree | ||
617 | contains all the new mounts 'A1', 'A2'... 'An'. And this new | ||
618 | propagation tree is appended to the already existing propagation tree | ||
619 | of 'A'. | ||
620 | |||
621 | 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is | ||
622 | mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An' | ||
623 | are created and mounted at dentry 'b' on all mounts that receive | ||
624 | propagation from mount 'B'. The mount 'A' becomes a shared mount and a | ||
625 | propagation tree is created which is identical to that of | ||
626 | 'B'. This new propagation tree contains all the new mounts 'A1', | ||
627 | 'A2'... 'An'. | ||
628 | |||
629 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The | ||
630 | mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', | ||
631 | 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that | ||
632 | receive propagation from mount 'B'. A new propagation tree is created | ||
633 | in the exact same configuration as that of 'B'. This new propagation | ||
634 | tree contains all the new mounts 'A1', 'A2'... 'An'. And this new | ||
635 | propagation tree is appended to the already existing propagation tree of | ||
636 | 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also | ||
637 | becomes 'shared'. | ||
638 | |||
639 | 4. 'A' is an unbindable mount and 'B' is a shared mount. The operation | ||
640 | is invalid. Because mounting anything on the shared mount 'B' can | ||
641 | create new mounts that get mounted on the mounts that receive | ||
642 | propagation from 'B'. And since the mount 'A' is unbindable, cloning | ||
643 | it to mount at other mountpoints is not possible. | ||
644 | |||
645 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | ||
646 | unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. | ||
647 | |||
648 | 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' | ||
649 | is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a | ||
650 | shared mount. | ||
651 | |||
652 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. | ||
653 | The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' | ||
654 | continues to be a slave mount of mount 'Z'. | ||
655 | |||
656 | 8. 'A' is an unbindable mount and 'B' is a non-shared mount. The mount | ||
657 | 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be an | ||
658 | unbindable mount. | ||
659 | |||
660 | 5e) Mount semantics | ||
661 | |||
662 | Consider the following command | ||
663 | |||
664 | mount device B/b | ||
665 | |||
666 | 'B' is the destination mount and 'b' is the dentry in the destination | ||
667 | mount. | ||
668 | |||
669 | The above operation is the same as bind operation with the exception | ||
670 | that the source mount is always a private mount. | ||
671 | |||
672 | |||
673 | 5f) Unmount semantics | ||
674 | |||
675 | Consider the following command | ||
676 | |||
677 | umount A | ||
678 | |||
679 | where 'A' is a mount mounted on mount 'B' at dentry 'b'. | ||
680 | |||
681 | If mount 'B' is shared, then all most-recently-mounted mounts at dentry | ||
682 | 'b' on mounts that receive propagation from mount 'B' and do not have | ||
683 | sub-mounts within them are unmounted. | ||
684 | |||
685 | Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to | ||
686 | each other. | ||
687 | |||
688 | lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount | ||
689 | 'B1', 'B2' and 'B3' respectively. | ||
690 | |||
691 | lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on | ||
692 | mount 'B1', 'B2' and 'B3' respectively. | ||
693 | |||
694 | if 'C1' is unmounted, all the mounts that are most-recently-mounted on | ||
695 | 'B1' and on the mounts that 'B1' propagates-to are unmounted. | ||
696 | |||
697 | 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount | ||
698 | on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'. | ||
699 | |||
700 | So all 'C1', 'C2' and 'C3' should be unmounted. | ||
701 | |||
702 | If any of 'C2' or 'C3' has some child mounts, then that mount is not | ||
703 | unmounted, but all other mounts are unmounted. However if 'C1' is told | ||
704 | to be unmounted and 'C1' has some sub-mounts, the umount operation | ||
705 | fails entirely. | ||
706 | |||
707 | 5g) Clone Namespace | ||
708 | |||
709 | A cloned namespace contains the same mounts as the parent | ||
710 | namespace. | ||
711 | |||
712 | Lets say 'A' and 'B' are the corresponding mounts in the parent and the | ||
713 | child namespace. | ||
714 | |||
715 | If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to | ||
716 | each other. | ||
717 | |||
718 | If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of | ||
719 | 'Z'. | ||
720 | |||
721 | If 'A' is a private mount, then 'B' is a private mount too. | ||
722 | |||
723 | If 'A' is an unbindable mount, then 'B' is an unbindable mount too. | ||
724 | |||
725 | |||
726 | 6) Quiz | ||
727 | |||
728 | A. What is the result of the following command sequence? | ||
729 | |||
730 | mount --bind /mnt /mnt | ||
731 | mount --make-shared /mnt | ||
732 | mount --bind /mnt /tmp | ||
733 | mount --move /tmp /mnt/1 | ||
734 | |||
735 | what should the contents of /mnt, /mnt/1 and /mnt/1/1 be? | ||
736 | Should they all be identical? or should /mnt and /mnt/1 be | ||
737 | identical only? | ||
738 | |||
739 | |||
740 | B. What is the result of the following command sequence? | ||
741 | |||
742 | mount --make-rshared / | ||
743 | mkdir -p /v/1 | ||
744 | mount --rbind / /v/1 | ||
745 | |||
746 | what should the content of /v/1/v/1 be? | ||
747 | |||
748 | |||
749 | C. What is the result of the following command sequence? | ||
750 | |||
751 | mount --bind /mnt /mnt | ||
752 | mount --make-shared /mnt | ||
753 | mkdir -p /mnt/1/2/3 /mnt/1/test | ||
754 | mount --bind /mnt/1 /tmp | ||
755 | mount --make-slave /mnt | ||
756 | mount --make-shared /mnt | ||
757 | mount --bind /mnt/1/2 /tmp1 | ||
758 | mount --make-slave /mnt | ||
759 | |||
760 | At this point we have the first mount at /tmp and | ||
761 | its root dentry is 1. Lets call this mount 'A' | ||
762 | And then we have a second mount at /tmp1 with root | ||
763 | dentry 2. Lets call this mount 'B' | ||
764 | Next we have a third mount at /mnt with root dentry | ||
765 | mnt. Lets call this mount 'C' | ||
766 | |||
767 | 'B' is the slave of 'A' and 'C' is a slave of 'B' | ||
768 | A -> B -> C | ||
769 | |||
770 | at this point if we execute the following command | ||
771 | |||
772 | mount --bind /bin /tmp/test | ||
773 | |||
774 | The mount is attempted on 'A' | ||
775 | |||
776 | will the mount propagate to 'B' and 'C' ? | ||
777 | |||
778 | what would the contents of | ||
779 | /mnt/1/test be? | ||
780 | |||
781 | 7) FAQ | ||
782 | |||
783 | Q1. Why is bind mount needed? How is it different from symbolic links? | ||
784 | symbolic links can get stale if the destination mount gets | ||
785 | unmounted or moved. Bind mounts continue to exist even if the | ||
786 | other mount is unmounted or moved. | ||
787 | |||
788 | Q2. Why can't the shared subtree be implemented using exportfs? | ||
789 | |||
790 | exportfs is a heavyweight way of accomplishing part of what | ||
791 | shared subtree can do. I cannot imagine a way to implement the | ||
792 | semantics of slave mount using exportfs. | ||
793 | |||
794 | Q3. Why is unbindable mount needed? | ||
795 | |||
796 | Lets say we want to replicate the mount tree at multiple | ||
797 | locations within the same subtree. | ||
798 | |||
799 | if one rbind mounts a tree within the same subtree 'n' times | ||
800 | the number of mounts created is an exponential function of 'n'. | ||
801 | Having unbindable mount can help prune the unneeded bind | ||
802 | mounts. Here is an example. | ||
803 | |||
804 | step 1: | ||
805 | lets say the root tree has just two directories with | ||
806 | one vfsmount. | ||
807 | root | ||
808 | / \ | ||
809 | tmp usr | ||
810 | |||
811 | And we want to replicate the tree at multiple | ||
812 | mountpoints under /root/tmp | ||
813 | |||
814 | step2: | ||
815 | mount --make-shared /root | ||
816 | |||
817 | mkdir -p /tmp/m1 | ||
818 | |||
819 | mount --rbind /root /tmp/m1 | ||
820 | |||
821 | the new tree now looks like this: | ||
822 | |||
823 | root | ||
824 | / \ | ||
825 | tmp usr | ||
826 | / | ||
827 | m1 | ||
828 | / \ | ||
829 | tmp usr | ||
830 | / | ||
831 | m1 | ||
832 | |||
833 | it has two vfsmounts | ||
834 | |||
835 | step3: | ||
836 | mkdir -p /tmp/m2 | ||
837 | mount --rbind /root /tmp/m2 | ||
838 | |||
839 | the new tree now looks like this: | ||
840 | |||
841 | root | ||
842 | / \ | ||
843 | tmp usr | ||
844 | / \ | ||
845 | m1 m2 | ||
846 | / \ / \ | ||
847 | tmp usr tmp usr | ||
848 | / \ / | ||
849 | m1 m2 m1 | ||
850 | / \ / \ | ||
851 | tmp usr tmp usr | ||
852 | / / \ | ||
853 | m1 m1 m2 | ||
854 | / \ | ||
855 | tmp usr | ||
856 | / \ | ||
857 | m1 m2 | ||
858 | |||
859 | it has 6 vfsmounts | ||
860 | |||
861 | step 4: | ||
862 | mkdir -p /tmp/m3 | ||
863 | mount --rbind /root /tmp/m3 | ||
864 | |||
865 | I won't draw the tree, but it has 24 vfsmounts | ||
866 | |||
867 | |||
868 | at step i the number of vfsmounts is V[i] = i*V[i-1]. | ||
869 | This grows faster than exponentially (V[i] = i!). And this tree has way more | ||
870 | mounts than what we really needed in the first place. | ||
871 | |||
872 | One could use a series of umount at each step to prune | ||
873 | out the unneeded mounts. But there is a better solution. | ||
874 | Unbindable (unclonable) mounts come in handy here. | ||
875 | |||
876 | step 1: | ||
877 | lets say the root tree has just two directories with | ||
878 | one vfsmount. | ||
879 | root | ||
880 | / \ | ||
881 | tmp usr | ||
882 | |||
883 | How do we set up the same tree at multiple locations under | ||
884 | /root/tmp | ||
885 | |||
886 | step2: | ||
887 | mount --bind /root/tmp /root/tmp | ||
888 | |||
889 | mount --make-rshared /root | ||
890 | mount --make-unbindable /root/tmp | ||
891 | |||
892 | mkdir -p /tmp/m1 | ||
893 | |||
894 | mount --rbind /root /tmp/m1 | ||
895 | |||
896 | the new tree now looks like this: | ||
897 | |||
898 | root | ||
899 | / \ | ||
900 | tmp usr | ||
901 | / | ||
902 | m1 | ||
903 | / \ | ||
904 | tmp usr | ||
905 | |||
906 | step3: | ||
907 | mkdir -p /tmp/m2 | ||
908 | mount --rbind /root /tmp/m2 | ||
909 | |||
910 | the new tree now looks like this: | ||
911 | |||
912 | root | ||
913 | / \ | ||
914 | tmp usr | ||
915 | / \ | ||
916 | m1 m2 | ||
917 | / \ / \ | ||
918 | tmp usr tmp usr | ||
919 | |||
920 | step4: | ||
921 | |||
922 | mkdir -p /tmp/m3 | ||
923 | mount --rbind /root /tmp/m3 | ||
924 | |||
925 | the new tree now looks like this: | ||
926 | |||
927 | root | ||
928 | / \ | ||
929 | tmp usr | ||
930 | / \ \ | ||
931 | m1 m2 m3 | ||
932 | / \ / \ / \ | ||
933 | tmp usr tmp usr tmp usr | ||
934 | |||
935 | 8) Implementation | ||
936 | |||
937 | 8A) Datastructure | ||
938 | |||
939 | 4 new fields are introduced to struct vfsmount | ||
940 | ->mnt_share | ||
941 | ->mnt_slave_list | ||
942 | ->mnt_slave | ||
943 | ->mnt_master | ||
944 | |||
945 | ->mnt_share links together all the mounts to/from which this vfsmount | ||
946 | sends/receives propagation events. | ||
947 | |||
948 | ->mnt_slave_list links all the mounts to which this vfsmount | ||
949 | propagates. | ||
950 | |||
951 | ->mnt_slave links together all the slaves that its master vfsmount | ||
952 | propagates to. | ||
953 | |||
954 | ->mnt_master points to the master vfsmount from which this vfsmount | ||
955 | receives propagation. | ||
956 | |||
957 | ->mnt_flags takes two more flags to indicate the propagation status of | ||
958 | the vfsmount. MNT_SHARE indicates that the vfsmount is a shared | ||
959 | vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be | ||
960 | replicated. | ||
961 | |||
962 | All the shared vfsmounts in a peer group form a cyclic list through | ||
963 | ->mnt_share. | ||
964 | |||
965 | All vfsmounts with the same ->mnt_master form a cyclic list anchored | ||
966 | in ->mnt_master->mnt_slave_list and going through ->mnt_slave. | ||
967 | |||
968 | ->mnt_master can point to arbitrary (and possibly different) members | ||
969 | of master peer group. To find all immediate slaves of a peer group | ||
970 | you need to go through _all_ ->mnt_slave_list of its members. | ||
971 | Conceptually it's just a single set - distribution among the | ||
972 | individual lists does not affect propagation or the way propagation | ||
973 | tree is modified by operations. | ||
974 | |||
975 | An example propagation tree looks as shown in the figure below. | ||
976 | [ NOTE: Though it looks like a forest, if we consider all the shared | ||
977 | mounts as a conceptual entity called 'pnode', it becomes a tree] | ||
978 | |||
979 | |||
980 | A <--> B <--> C <---> D | ||
981 | /|\ /| |\ | ||
982 | / F G J K H I | ||
983 | / | ||
984 | E<-->K | ||
985 | /|\ | ||
986 | M L N | ||
987 | |||
988 | In the above figure A,B,C and D all are shared and propagate to each | ||
989 | other. 'A' has got 3 slave mounts 'E', 'F' and 'G'; 'C' has got 2 slave | ||
990 | mounts 'J' and 'K'; and 'D' has got two slave mounts 'H' and 'I'. | ||
991 | 'E' is also shared with 'K' and they propagate to each other. And | ||
992 | 'K' has 3 slaves 'M', 'L' and 'N'. | ||
993 | |||
994 | A's ->mnt_share links with the ->mnt_share of 'B', 'C' and 'D' | ||
995 | |||
996 | A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' | ||
997 | |||
998 | E's ->mnt_share links with ->mnt_share of K | ||
999 | 'E', 'K', 'F', 'G' have their ->mnt_master point to struct | ||
1000 | vfsmount of 'A' | ||
1001 | 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' | ||
1002 | K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' | ||
1003 | |||
1004 | C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' | ||
1005 | J and K's ->mnt_master points to struct vfsmount of C | ||
1006 | and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' | ||
1007 | 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. | ||
1008 | |||
1009 | |||
1010 | NOTE: The propagation tree is orthogonal to the mount tree. | ||
1011 | |||
1012 | |||
1013 | 8B) Algorithm: | ||
1014 | |||
1015 | The crux of the implementation resides in rbind/move operation. | ||
1016 | |||
1017 | The overall algorithm breaks the operation into 3 phases: (look at | ||
1018 | attach_recursive_mnt() and propagate_mnt()) | ||
1019 | |||
1020 | 1. prepare phase. | ||
1021 | 2. commit phase. | ||
1022 | 3. abort phase. | ||
1023 | |||
1024 | Prepare phase: | ||
1025 | |||
1026 | for each mount in the source tree: | ||
1027 | a) Create the necessary number of mount trees to | ||
1028 | be attached to each of the mounts that receive | ||
1029 | propagation from the destination mount. | ||
1030 | b) Do not attach any of the trees to its destination. | ||
1031 | However note down its ->mnt_parent and ->mnt_mountpoint | ||
1032 | c) Link all the new mounts to form a propagation tree that | ||
1033 | is identical to the propagation tree of the destination | ||
1034 | mount. | ||
1035 | |||
1036 | If this phase is successful, there should be 'n' new | ||
1037 | propagation trees; where 'n' is the number of mounts in the | ||
1038 | source tree. Go to the commit phase. | ||
1039 | |||
1040 | Also there should be 'm' new mount trees, where 'm' is | ||
1041 | the number of mounts to which the destination mount | ||
1042 | propagates. | ||
1043 | |||
1044 | if any memory allocations fail, go to the abort phase. | ||
1045 | |||
1046 | Commit phase | ||
1047 | attach each of the mount trees to their corresponding | ||
1048 | destination mounts. | ||
1049 | |||
1050 | Abort phase | ||
1051 | delete all the newly created trees. | ||
1052 | |||
1053 | NOTE: all the propagation related functionality resides in the file | ||
1054 | pnode.c | ||
1055 | |||
1056 | |||
1057 | ------------------------------------------------------------------------ | ||
1058 | |||
1059 | version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com) | ||
1060 | version 0.2 (Incorporated comments from Al Viro) | ||