diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 10 | ||||
-rw-r--r-- | Documentation/filesystems/btrfs.txt | 91 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 85 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 27 | ||||
-rw-r--r-- | Documentation/filesystems/squashfs.txt | 225 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 8 |
6 files changed, 418 insertions, 28 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index ccec55394380..ec6a9392a173 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -97,8 +97,8 @@ prototypes: | |||
97 | void (*put_super) (struct super_block *); | 97 | void (*put_super) (struct super_block *); |
98 | void (*write_super) (struct super_block *); | 98 | void (*write_super) (struct super_block *); |
99 | int (*sync_fs)(struct super_block *sb, int wait); | 99 | int (*sync_fs)(struct super_block *sb, int wait); |
100 | void (*write_super_lockfs) (struct super_block *); | 100 | int (*freeze_fs) (struct super_block *); |
101 | void (*unlockfs) (struct super_block *); | 101 | int (*unfreeze_fs) (struct super_block *); |
102 | int (*statfs) (struct dentry *, struct kstatfs *); | 102 | int (*statfs) (struct dentry *, struct kstatfs *); |
103 | int (*remount_fs) (struct super_block *, int *, char *); | 103 | int (*remount_fs) (struct super_block *, int *, char *); |
104 | void (*clear_inode) (struct inode *); | 104 | void (*clear_inode) (struct inode *); |
@@ -119,8 +119,8 @@ delete_inode: no | |||
119 | put_super: yes yes no | 119 | put_super: yes yes no |
120 | write_super: no yes read | 120 | write_super: no yes read |
121 | sync_fs: no no read | 121 | sync_fs: no no read |
122 | write_super_lockfs: ? | 122 | freeze_fs: ? |
123 | unlockfs: ? | 123 | unfreeze_fs: ? |
124 | statfs: no no no | 124 | statfs: no no no |
125 | remount_fs: yes yes maybe (see below) | 125 | remount_fs: yes yes maybe (see below) |
126 | clear_inode: no | 126 | clear_inode: no |
@@ -397,7 +397,7 @@ prototypes: | |||
397 | }; | 397 | }; |
398 | 398 | ||
399 | locking rules: | 399 | locking rules: |
400 | All except ->poll() may block. | 400 | All may block. |
401 | BKL | 401 | BKL |
402 | llseek: no (see below) | 402 | llseek: no (see below) |
403 | read: no | 403 | read: no |
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt new file mode 100644 index 000000000000..64087c34327f --- /dev/null +++ b/Documentation/filesystems/btrfs.txt | |||
@@ -0,0 +1,91 @@ | |||
1 | |||
2 | BTRFS | ||
3 | ===== | ||
4 | |||
5 | Btrfs is a new copy on write filesystem for Linux aimed at | ||
6 | implementing advanced features while focusing on fault tolerance, | ||
7 | repair and easy administration. Initially developed by Oracle, Btrfs | ||
8 | is licensed under the GPL and open for contribution from anyone. | ||
9 | |||
10 | Linux has a wealth of filesystems to choose from, but we are facing a | ||
11 | number of challenges with scaling to the large storage subsystems that | ||
12 | are becoming common in today's data centers. Filesystems need to scale | ||
13 | in their ability to address and manage large storage, and also in | ||
14 | their ability to detect, repair and tolerate errors in the data stored | ||
15 | on disk. Btrfs is under heavy development, and is not suitable for | ||
16 | any uses other than benchmarking and review. The Btrfs disk format is | ||
17 | not yet finalized. | ||
18 | |||
19 | The main Btrfs features include: | ||
20 | |||
21 | * Extent based file storage (2^64 max file size) | ||
22 | * Space efficient packing of small files | ||
23 | * Space efficient indexed directories | ||
24 | * Dynamic inode allocation | ||
25 | * Writable snapshots | ||
26 | * Subvolumes (separate internal filesystem roots) | ||
27 | * Object level mirroring and striping | ||
28 | * Checksums on data and metadata (multiple algorithms available) | ||
29 | * Compression | ||
30 | * Integrated multiple device support, with several raid algorithms | ||
31 | * Online filesystem check (not yet implemented) | ||
32 | * Very fast offline filesystem check | ||
33 | * Efficient incremental backup and FS mirroring (not yet implemented) | ||
34 | * Online filesystem defragmentation | ||
35 | |||
36 | |||
37 | |||
38 | MAILING LIST | ||
39 | ============ | ||
40 | |||
41 | There is a Btrfs mailing list hosted on vger.kernel.org. You can | ||
42 | find details on how to subscribe here: | ||
43 | |||
44 | http://vger.kernel.org/vger-lists.html#linux-btrfs | ||
45 | |||
46 | Mailing list archives are available from gmane: | ||
47 | |||
48 | http://dir.gmane.org/gmane.comp.file-systems.btrfs | ||
49 | |||
50 | |||
51 | |||
52 | IRC | ||
53 | === | ||
54 | |||
55 | Discussion of Btrfs also occurs on the #btrfs channel of the Freenode | ||
56 | IRC network. | ||
57 | |||
58 | |||
59 | |||
60 | UTILITIES | ||
61 | ========= | ||
62 | |||
63 | Userspace tools for creating and manipulating Btrfs file systems are | ||
64 | available from the git repository at the following location: | ||
65 | |||
66 | http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git | ||
67 | git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git | ||
68 | |||
69 | These include the following tools: | ||
70 | |||
71 | mkfs.btrfs: create a filesystem | ||
72 | |||
73 | btrfsctl: control program to create snapshots and subvolumes: | ||
74 | |||
75 | mount /dev/sda2 /mnt | ||
76 | btrfsctl -s new_subvol_name /mnt | ||
77 | btrfsctl -s snapshot_of_default /mnt/default | ||
78 | btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name | ||
79 | btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol | ||
80 | ls /mnt | ||
81 | default snapshot_of_a_snapshot snapshot_of_new_subvol | ||
82 | new_subvol_name snapshot_of_default | ||
83 | |||
84 | Snapshots and subvolumes cannot be deleted right now, but you can | ||
85 | rm -rf all the files and directories inside them. | ||
86 | |||
87 | btrfsck: do a limited check of the FS extent trees. | ||
88 | |||
89 | btrfs-debug-tree: print all of the FS metadata in text form. Example: | ||
90 | |||
91 | btrfs-debug-tree /dev/sda2 >& big_output_file | ||
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 174eaff7ded9..cec829bc7291 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be | |||
58 | 58 | ||
59 | # mount -t ext4 /dev/hda1 /wherever | 59 | # mount -t ext4 /dev/hda1 /wherever |
60 | 60 | ||
61 | - When comparing performance with other filesystems, remember that | 61 | - When comparing performance with other filesystems, it's always |
62 | ext3/4 by default offers higher data integrity guarantees than most. | 62 | important to try multiple workloads; very often a subtle change in a |
63 | So when comparing with a metadata-only journalling filesystem, such | 63 | workload parameter can completely change the ranking of which |
64 | as ext3, use `mount -o data=writeback'. And you might as well use | 64 | filesystems do well compared to others. When comparing versus ext3, |
65 | `mount -o nobh' too along with it. Making the journal larger than | 65 | note that ext4 enables write barriers by default, while ext3 does |
66 | the mke2fs default often helps performance with metadata-intensive | 66 | not enable write barriers by default. So it is useful to |
67 | workloads. | 67 | explicitly specify whether barriers are enabled or not via the |
68 | '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems | ||
69 | for a fair comparison. When tuning ext3 for best benchmark numbers, | ||
70 | it is often worthwhile to try changing the data journaling mode; '-o | ||
71 | data=writeback,nobh' can be faster for some workloads. (Note | ||
72 | however that running mounted with data=writeback can potentially | ||
73 | leave stale data exposed in recently written files in case of an | ||
74 | unclean shutdown, which could be a security exposure in some | ||
75 | situations.) Configuring the filesystem with a large journal can | ||
76 | also be helpful for metadata-intensive workloads. | ||
68 | 77 | ||
69 | 2. Features | 78 | 2. Features |
70 | =========== | 79 | =========== |
@@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be | |||
74 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) | 83 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) |
75 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | 84 | * extent format reduces metadata overhead (RAM, IO for access, transactions) |
76 | * extent format more robust in face of on-disk corruption due to magics, | 85 | * extent format more robust in face of on-disk corruption due to magics, |
77 | * internal redunancy in tree | 86 | * internal redundancy in tree |
78 | * improved file allocation (multi-block alloc) | 87 | * improved file allocation (multi-block alloc) |
79 | * fix 32000 subdirectory limit | 88 | * fix 32000 subdirectory limit |
80 | * nsec timestamps for mtime, atime, ctime, create time | 89 | * nsec timestamps for mtime, atime, ctime, create time |
@@ -116,10 +125,11 @@ grouping of bitmaps and inode tables. Some test results available here: | |||
116 | When mounting an ext4 filesystem, the following options are accepted: | 125 | When mounting an ext4 filesystem, the following options are accepted: |
117 | (*) == default | 126 | (*) == default |
118 | 127 | ||
119 | extents (*) ext4 will use extents to address file data. The | 128 | ro Mount filesystem read only. Note that ext4 will |
120 | file system will no longer be mountable by ext3. | 129 | replay the journal (and thus write to the |
121 | 130 | partition) even when mounted "read only". The | |
122 | noextents ext4 will not use extents for newly created files | 131 | mount options "ro,noload" can be used to prevent |
132 | writes to the filesystem. | ||
123 | 133 | ||
124 | journal_checksum Enable checksumming of the journal transactions. | 134 | journal_checksum Enable checksumming of the journal transactions. |
125 | This will allow the recovery code in e2fsck and the | 135 | This will allow the recovery code in e2fsck and the |
@@ -134,17 +144,17 @@ journal_async_commit Commit block can be written to disk without waiting | |||
134 | journal=update Update the ext4 file system's journal to the current | 144 | journal=update Update the ext4 file system's journal to the current |
135 | format. | 145 | format. |
136 | 146 | ||
137 | journal=inum When a journal already exists, this option is ignored. | ||
138 | Otherwise, it specifies the number of the inode which | ||
139 | will represent the ext4 file system's journal file. | ||
140 | |||
141 | journal_dev=devnum When the external journal device's major/minor numbers | 147 | journal_dev=devnum When the external journal device's major/minor numbers |
142 | have changed, this option allows the user to specify | 148 | have changed, this option allows the user to specify |
143 | the new journal location. The journal device is | 149 | the new journal location. The journal device is |
144 | identified through its new major/minor numbers encoded | 150 | identified through its new major/minor numbers encoded |
145 | in devnum. | 151 | in devnum. |
146 | 152 | ||
147 | noload Don't load the journal on mounting. | 153 | noload Don't load the journal on mounting. Note that |
154 | if the filesystem was not unmounted cleanly, | ||
155 | skipping the journal replay will lead to the | ||
156 | filesystem containing inconsistencies that can | ||
157 | lead to any number of problems. | ||
148 | 158 | ||
149 | data=journal All data are committed into the journal prior to being | 159 | data=journal All data are committed into the journal prior to being |
150 | written into the main file system. | 160 | written into the main file system. |
@@ -219,9 +229,12 @@ minixdf Make 'df' act like Minix. | |||
219 | 229 | ||
220 | debug Extra debugging information is sent to syslog. | 230 | debug Extra debugging information is sent to syslog. |
221 | 231 | ||
222 | errors=remount-ro(*) Remount the filesystem read-only on an error. | 232 | errors=remount-ro Remount the filesystem read-only on an error. |
223 | errors=continue Keep going on a filesystem error. | 233 | errors=continue Keep going on a filesystem error. |
224 | errors=panic Panic and halt the machine if an error occurs. | 234 | errors=panic Panic and halt the machine if an error occurs. |
235 | (These mount options override the errors behavior | ||
236 | specified in the superblock, which can be configured | ||
237 | using tune2fs) | ||
225 | 238 | ||
226 | data_err=ignore(*) Just print an error message if an error occurs | 239 | data_err=ignore(*) Just print an error message if an error occurs |
227 | in a file data buffer in ordered mode. | 240 | in a file data buffer in ordered mode. |
@@ -261,6 +274,42 @@ delalloc (*) Deferring block allocation until write-out time. | |||
261 | nodelalloc Disable delayed allocation. Blocks are allocated | 274 | nodelalloc Disable delayed allocation. Blocks are allocated |
262 | when data is copied from user to page cache. | 275 | when data is copied from user to page cache. |
263 | 276 | ||
277 | max_batch_time=usec Maximum amount of time ext4 should wait for | ||
278 | additional filesystem operations to be batched | ||
279 | together with a synchronous write operation. | ||
280 | Since a synchronous write operation is going to | ||
281 | force a commit and then wait for the I/O to | ||
282 | complete, it doesn't cost much, and can be a | ||
283 | huge throughput win, we wait for a small amount | ||
284 | of time to see if any other transactions can | ||
285 | piggyback on the synchronous write. The | ||
286 | algorithm used is designed to automatically tune | ||
287 | for the speed of the disk, by measuring the | ||
288 | amount of time (on average) that it takes to | ||
289 | finish committing a transaction. Call this time | ||
290 | the "commit time". If the time that the | ||
291 | transaction has been running is less than the | ||
292 | commit time, ext4 will try sleeping for the | ||
293 | commit time to see if other operations will join | ||
294 | the transaction. The commit time is capped by | ||
295 | the max_batch_time, which defaults to 15000us | ||
296 | (15ms). This optimization can be turned off | ||
297 | entirely by setting max_batch_time to 0. | ||
298 | |||
299 | min_batch_time=usec This parameter sets the commit time (as | ||
300 | described above) to be at least min_batch_time. | ||
301 | It defaults to zero microseconds. Increasing | ||
302 | this parameter may improve the throughput of | ||
303 | multi-threaded, synchronous workloads on very | ||
304 | fast disks, at the cost of increasing latency. | ||
305 | |||
306 | journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the | ||
307 | highest priority) which should be used for I/O | ||
308 | operations submitted by kjournald2 during a | ||
309 | commit operation. This defaults to 3, which is | ||
310 | a slightly higher priority than the default I/O | ||
311 | priority. | ||
312 | |||
264 | Data Mode | 313 | Data Mode |
265 | ========= | 314 | ========= |
266 | There are 3 different data modes: | 315 | There are 3 different data modes: |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 71df353e367c..d105eb45282a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -140,6 +140,7 @@ Table 1-1: Process specific entries in /proc | |||
140 | statm Process memory status information | 140 | statm Process memory status information |
141 | status Process status in human readable form | 141 | status Process status in human readable form |
142 | wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan | 142 | wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan |
143 | stack Report full stack trace, enable via CONFIG_STACKTRACE | ||
143 | smaps Extension based on maps, the rss size for each mapped file | 144 | smaps Extension based on maps, the rss size for each mapped file |
144 | .............................................................................. | 145 | .............................................................................. |
145 | 146 | ||
@@ -1385,6 +1386,15 @@ swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer | |||
1385 | to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 | 1386 | to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 |
1386 | causes the kernel to prefer to reclaim dentries and inodes. | 1387 | causes the kernel to prefer to reclaim dentries and inodes. |
1387 | 1388 | ||
1389 | dirty_background_bytes | ||
1390 | ---------------------- | ||
1391 | |||
1392 | Contains the amount of dirty memory at which the pdflush background writeback | ||
1393 | daemon will start writeback. | ||
1394 | |||
1395 | If dirty_background_bytes is written, dirty_background_ratio becomes a function | ||
1396 | of its value (dirty_background_bytes / the amount of dirtyable system memory). | ||
1397 | |||
1388 | dirty_background_ratio | 1398 | dirty_background_ratio |
1389 | ---------------------- | 1399 | ---------------------- |
1390 | 1400 | ||
@@ -1393,14 +1403,29 @@ pages + file cache, not including locked pages and HugePages), the number of | |||
1393 | pages at which the pdflush background writeback daemon will start writing out | 1403 | pages at which the pdflush background writeback daemon will start writing out |
1394 | dirty data. | 1404 | dirty data. |
1395 | 1405 | ||
1406 | If dirty_background_ratio is written, dirty_background_bytes becomes a function | ||
1407 | of its value (dirty_background_ratio * the amount of dirtyable system memory). | ||
1408 | |||
1409 | dirty_bytes | ||
1410 | ----------- | ||
1411 | |||
1412 | Contains the amount of dirty memory at which a process generating disk writes | ||
1413 | will itself start writeback. | ||
1414 | |||
1415 | If dirty_bytes is written, dirty_ratio becomes a function of its value | ||
1416 | (dirty_bytes / the amount of dirtyable system memory). | ||
1417 | |||
1396 | dirty_ratio | 1418 | dirty_ratio |
1397 | ----------------- | 1419 | ----------- |
1398 | 1420 | ||
1399 | Contains, as a percentage of the dirtyable system memory (free pages + mapped | 1421 | Contains, as a percentage of the dirtyable system memory (free pages + mapped |
1400 | pages + file cache, not including locked pages and HugePages), the number of | 1422 | pages + file cache, not including locked pages and HugePages), the number of |
1401 | pages at which a process which is generating disk writes will itself start | 1423 | pages at which a process which is generating disk writes will itself start |
1402 | writing out dirty data. | 1424 | writing out dirty data. |
1403 | 1425 | ||
1426 | If dirty_ratio is written, dirty_bytes becomes a function of its value | ||
1427 | (dirty_ratio * the amount of dirtyable system memory). | ||
1428 | |||
1404 | dirty_writeback_centisecs | 1429 | dirty_writeback_centisecs |
1405 | ------------------------- | 1430 | ------------------------- |
1406 | 1431 | ||
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt new file mode 100644 index 000000000000..3e79e4a7a392 --- /dev/null +++ b/Documentation/filesystems/squashfs.txt | |||
@@ -0,0 +1,225 @@ | |||
1 | SQUASHFS 4.0 FILESYSTEM | ||
2 | ======================= | ||
3 | |||
4 | Squashfs is a compressed read-only filesystem for Linux. | ||
5 | It uses zlib compression to compress files, inodes and directories. | ||
6 | Inodes in the system are very small and all blocks are packed to minimise | ||
7 | data overhead. Block sizes greater than 4K are supported up to a maximum | ||
8 | of 1Mbytes (default block size 128K). | ||
9 | |||
10 | Squashfs is intended for general read-only filesystem use, for archival | ||
11 | use (i.e. in cases where a .tar.gz file may be used), and in constrained | ||
12 | block device/memory systems (e.g. embedded systems) where low overhead is | ||
13 | needed. | ||
14 | |||
15 | Mailing list: squashfs-devel@lists.sourceforge.net | ||
16 | Web site: www.squashfs.org | ||
17 | |||
18 | 1. FILESYSTEM FEATURES | ||
19 | ---------------------- | ||
20 | |||
21 | Squashfs filesystem features versus Cramfs: | ||
22 | |||
23 | Squashfs Cramfs | ||
24 | |||
25 | Max filesystem size: 2^64 16 MiB | ||
26 | Max file size: ~ 2 TiB 16 MiB | ||
27 | Max files: unlimited unlimited | ||
28 | Max directories: unlimited unlimited | ||
29 | Max entries per directory: unlimited unlimited | ||
30 | Max block size: 1 MiB 4 KiB | ||
31 | Metadata compression: yes no | ||
32 | Directory indexes: yes no | ||
33 | Sparse file support: yes no | ||
34 | Tail-end packing (fragments): yes no | ||
35 | Exportable (NFS etc.): yes no | ||
36 | Hard link support: yes no | ||
37 | "." and ".." in readdir: yes no | ||
38 | Real inode numbers: yes no | ||
39 | 32-bit uids/gids: yes no | ||
40 | File creation time: yes no | ||
41 | Xattr and ACL support: no no | ||
42 | |||
43 | Squashfs compresses data, inodes and directories. In addition, inode and | ||
44 | directory data are highly compacted, and packed on byte boundaries. Each | ||
45 | compressed inode is on average 8 bytes in length (the exact length varies on | ||
46 | file type, i.e. regular file, directory, symbolic link, and block/char device | ||
47 | inodes have different sizes). | ||
48 | |||
49 | 2. USING SQUASHFS | ||
50 | ----------------- | ||
51 | |||
52 | As squashfs is a read-only filesystem, the mksquashfs program must be used to | ||
53 | create populated squashfs filesystems. This and other squashfs utilities | ||
54 | can be obtained from http://www.squashfs.org. Usage instructions can be | ||
55 | obtained from this site also. | ||
56 | |||
57 | |||
58 | 3. SQUASHFS FILESYSTEM DESIGN | ||
59 | ----------------------------- | ||
60 | |||
61 | A squashfs filesystem consists of seven parts, packed together on a byte | ||
62 | alignment: | ||
63 | |||
64 | --------------- | ||
65 | | superblock | | ||
66 | |---------------| | ||
67 | | datablocks | | ||
68 | | & fragments | | ||
69 | |---------------| | ||
70 | | inode table | | ||
71 | |---------------| | ||
72 | | directory | | ||
73 | | table | | ||
74 | |---------------| | ||
75 | | fragment | | ||
76 | | table | | ||
77 | |---------------| | ||
78 | | export | | ||
79 | | table | | ||
80 | |---------------| | ||
81 | | uid/gid | | ||
82 | | lookup table | | ||
83 | --------------- | ||
84 | |||
85 | Compressed data blocks are written to the filesystem as files are read from | ||
86 | the source directory, and checked for duplicates. Once all file data has been | ||
87 | written the completed inode, directory, fragment, export and uid/gid lookup | ||
88 | tables are written. | ||
89 | |||
90 | 3.1 Inodes | ||
91 | ---------- | ||
92 | |||
93 | Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each | ||
94 | compressed block is prefixed by a two byte length, the top bit is set if the | ||
95 | block is uncompressed. A block will be uncompressed if the -noI option is set, | ||
96 | or if the compressed block was larger than the uncompressed block. | ||
97 | |||
98 | Inodes are packed into the metadata blocks, and are not aligned to block | ||
99 | boundaries, therefore inodes overlap compressed blocks. Inodes are identified | ||
100 | by a 48-bit number which encodes the location of the compressed metadata block | ||
101 | containing the inode, and the byte offset into that block where the inode is | ||
102 | placed (<block, offset>). | ||
103 | |||
104 | To maximise compression there are different inodes for each file type | ||
105 | (regular file, directory, device, etc.), the inode contents and length | ||
106 | varying with the type. | ||
107 | |||
108 | To further maximise compression, two types of regular file inode and | ||
109 | directory inode are defined: inodes optimised for frequently occurring | ||
110 | regular files and directories, and extended types where extra | ||
111 | information has to be stored. | ||
112 | |||
113 | 3.2 Directories | ||
114 | --------------- | ||
115 | |||
116 | Like inodes, directories are packed into compressed metadata blocks, stored | ||
117 | in a directory table. Directories are accessed using the start address of | ||
118 | the metablock containing the directory and the offset into the | ||
119 | decompressed block (<block, offset>). | ||
120 | |||
121 | Directories are organised in a slightly complex way, and are not simply | ||
122 | a list of file names. The organisation takes advantage of the | ||
123 | fact that (in most cases) the inodes of the files will be in the same | ||
124 | compressed metadata block, and therefore, can share the start block. | ||
125 | Directories are therefore organised in a two level list, a directory | ||
126 | header containing the shared start block value, and a sequence of directory | ||
127 | entries, each of which shares the shared start block. A new directory header | ||
128 | is written once/if the inode start block changes. The directory | ||
129 | header/directory entry list is repeated as many times as necessary. | ||
130 | |||
131 | Directories are sorted, and can contain a directory index to speed up | ||
132 | file lookup. Directory indexes store one entry per metablock, each entry | ||
133 | storing the index/filename mapping to the first directory header | ||
134 | in each metadata block. Directories are sorted in alphabetical order, | ||
135 | and at lookup the index is scanned linearly looking for the first filename | ||
136 | alphabetically larger than the filename being looked up. At this point the | ||
137 | location of the metadata block the filename is in has been found. | ||
138 | The general idea of the index is to ensure only one metadata block needs to be | ||
139 | decompressed to do a lookup irrespective of the length of the directory. | ||
140 | This scheme has the advantage that it doesn't require extra memory overhead | ||
141 | and doesn't require much extra storage on disk. | ||
142 | |||
143 | 3.3 File data | ||
144 | ------------- | ||
145 | |||
146 | Regular files consist of a sequence of contiguous compressed blocks, and/or a | ||
147 | compressed fragment block (tail-end packed block). The compressed size | ||
148 | of each datablock is stored in a block list contained within the | ||
149 | file inode. | ||
150 | |||
151 | To speed up access to datablocks when reading 'large' files (256 Mbytes or | ||
152 | larger), the code implements an index cache that caches the mapping from | ||
153 | block index to datablock location on disk. | ||
154 | |||
155 | The index cache allows Squashfs to handle large files (up to 1.75 TiB) while | ||
156 | retaining a simple and space-efficient block list on disk. The cache | ||
157 | is split into slots, caching up to eight 224 GiB files (128 KiB blocks). | ||
158 | Larger files use multiple slots, with 1.75 TiB files using all 8 slots. | ||
159 | The index cache is designed to be memory efficient, and by default uses | ||
160 | 16 KiB. | ||
161 | |||
162 | 3.4 Fragment lookup table | ||
163 | ------------------------- | ||
164 | |||
165 | Regular files can contain a fragment index which is mapped to a fragment | ||
166 | location on disk and compressed size using a fragment lookup table. This | ||
167 | fragment lookup table is itself stored compressed into metadata blocks. | ||
168 | A second index table is used to locate these. This second index table for | ||
169 | speed of access (and because it is small) is read at mount time and cached | ||
170 | in memory. | ||
171 | |||
172 | 3.5 Uid/gid lookup table | ||
173 | ------------------------ | ||
174 | |||
175 | For space efficiency regular files store uid and gid indexes, which are | ||
176 | converted to 32-bit uids/gids using an id look up table. This table is | ||
177 | stored compressed into metadata blocks. A second index table is used to | ||
178 | locate these. This second index table for speed of access (and because it | ||
179 | is small) is read at mount time and cached in memory. | ||
180 | |||
181 | 3.6 Export table | ||
182 | ---------------- | ||
183 | |||
184 | To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems | ||
185 | can optionally (disabled with the -no-exports Mksquashfs option) contain | ||
186 | an inode number to inode disk location lookup table. This is required to | ||
187 | enable Squashfs to map inode numbers passed in filehandles to the inode | ||
188 | location on disk, which is necessary when the export code reinstantiates | ||
189 | expired/flushed inodes. | ||
190 | |||
191 | This table is stored compressed into metadata blocks. A second index table is | ||
192 | used to locate these. This second index table for speed of access (and because | ||
193 | it is small) is read at mount time and cached in memory. | ||
194 | |||
195 | |||
196 | 4. TODOS AND OUTSTANDING ISSUES | ||
197 | ------------------------------- | ||
198 | |||
199 | 4.1 Todo list | ||
200 | ------------- | ||
201 | |||
202 | Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks | ||
203 | for these but the code has not been written. Once the code has been written | ||
204 | the existing layout should not require modification. | ||
205 | |||
206 | 4.2 Squashfs internal cache | ||
207 | --------------------------- | ||
208 | |||
209 | Blocks in Squashfs are compressed. To avoid repeatedly decompressing | ||
210 | recently accessed data Squashfs uses two small metadata and fragment caches. | ||
211 | |||
212 | The cache is not used for file datablocks, these are decompressed and cached in | ||
213 | the page-cache in the normal way. The cache is used to temporarily cache | ||
214 | fragment and metadata blocks which have been read as a result of a metadata | ||
215 | (i.e. inode or directory) or fragment access. Because metadata and fragments | ||
216 | are packed together into blocks (to gain greater compression) the read of a | ||
217 | particular piece of metadata or fragment will retrieve other metadata/fragments | ||
218 | which have been packed with it, these because of locality-of-reference may be | ||
219 | read in the near future. Temporarily caching them ensures they are available | ||
220 | for near future access without requiring an additional read and decompress. | ||
221 | |||
222 | In the future this internal cache may be replaced with an implementation which | ||
223 | uses the kernel page cache. Because the page cache operates on page sized | ||
224 | units this may introduce additional complexity in terms of locking and | ||
225 | associated race conditions. | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ef19afa186a9..deeeed0faa8f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -210,8 +210,8 @@ struct super_operations { | |||
210 | void (*put_super) (struct super_block *); | 210 | void (*put_super) (struct super_block *); |
211 | void (*write_super) (struct super_block *); | 211 | void (*write_super) (struct super_block *); |
212 | int (*sync_fs)(struct super_block *sb, int wait); | 212 | int (*sync_fs)(struct super_block *sb, int wait); |
213 | void (*write_super_lockfs) (struct super_block *); | 213 | int (*freeze_fs) (struct super_block *); |
214 | void (*unlockfs) (struct super_block *); | 214 | int (*unfreeze_fs) (struct super_block *); |
215 | int (*statfs) (struct dentry *, struct kstatfs *); | 215 | int (*statfs) (struct dentry *, struct kstatfs *); |
216 | int (*remount_fs) (struct super_block *, int *, char *); | 216 | int (*remount_fs) (struct super_block *, int *, char *); |
217 | void (*clear_inode) (struct inode *); | 217 | void (*clear_inode) (struct inode *); |
@@ -270,11 +270,11 @@ or bottom half). | |||
270 | a superblock. The second parameter indicates whether the method | 270 | a superblock. The second parameter indicates whether the method |
271 | should wait until the write out has been completed. Optional. | 271 | should wait until the write out has been completed. Optional. |
272 | 272 | ||
273 | write_super_lockfs: called when VFS is locking a filesystem and | 273 | freeze_fs: called when VFS is locking a filesystem and |
274 | forcing it into a consistent state. This method is currently | 274 | forcing it into a consistent state. This method is currently |
275 | used by the Logical Volume Manager (LVM). | 275 | used by the Logical Volume Manager (LVM). |
276 | 276 | ||
277 | unlockfs: called when VFS is unlocking a filesystem and making it writable | 277 | unfreeze_fs: called when VFS is unlocking a filesystem and making it writable |
278 | again. | 278 | again. |
279 | 279 | ||
280 | statfs: called when the VFS needs to get filesystem statistics. This | 280 | statfs: called when the VFS needs to get filesystem statistics. This |