aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAl Viro <viro@zeniv.linux.org.uk>2014-10-23 22:52:55 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2014-10-23 22:52:55 -0400
commit1be47b387a717a1d3edf29c80b6e7f3a72ab236e (patch)
tree5bd5f4b46b5266f5f583f601b8880211ee224c95
parent51486b900ee92856b977eacfc5bfbe6565028070 (diff)
parent69c433ed2ecd2d3264efd7afec4439524b319121 (diff)
Merge branch 'overlayfs.v25' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs into for-linus
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/overlayfs.txt198
-rw-r--r--Documentation/filesystems/vfs.txt7
-rw-r--r--MAINTAINERS7
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/btrfs/ioctl.c20
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ext4/namei.c95
-rw-r--r--fs/internal.h7
-rw-r--r--fs/namei.c41
-rw-r--r--fs/namespace.c27
-rw-r--r--fs/open.c23
-rw-r--r--fs/overlayfs/Kconfig10
-rw-r--r--fs/overlayfs/Makefile7
-rw-r--r--fs/overlayfs/copy_up.c414
-rw-r--r--fs/overlayfs/dir.c921
-rw-r--r--fs/overlayfs/inode.c425
-rw-r--r--fs/overlayfs/overlayfs.h191
-rw-r--r--fs/overlayfs/readdir.c587
-rw-r--r--fs/overlayfs/super.c796
-rw-r--r--fs/splice.c1
-rw-r--r--include/linux/fs.h39
-rw-r--r--include/linux/mount.h3
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--mm/shmem.c36
26 files changed, 3809 insertions, 58 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 94d93b1f8b53..b30753cbf431 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -67,6 +67,7 @@ prototypes:
67 struct file *, unsigned open_flag, 67 struct file *, unsigned open_flag,
68 umode_t create_mode, int *opened); 68 umode_t create_mode, int *opened);
69 int (*tmpfile) (struct inode *, struct dentry *, umode_t); 69 int (*tmpfile) (struct inode *, struct dentry *, umode_t);
70 int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
70 71
71locking rules: 72locking rules:
72 all may block 73 all may block
@@ -96,6 +97,7 @@ fiemap: no
96update_time: no 97update_time: no
97atomic_open: yes 98atomic_open: yes
98tmpfile: no 99tmpfile: no
100dentry_open: no
99 101
100 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on 102 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
101victim. 103victim.
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
new file mode 100644
index 000000000000..530850a72735
--- /dev/null
+++ b/Documentation/filesystems/overlayfs.txt
@@ -0,0 +1,198 @@
1Written by: Neil Brown <neilb@suse.de>
2
3Overlay Filesystem
4==================
5
6This document describes a prototype for a new approach to providing
7overlay-filesystem functionality in Linux (sometimes referred to as
8union-filesystems). An overlay-filesystem tries to present a
9filesystem which is the result of overlaying one filesystem on top
10of the other.
11
12The result will inevitably fail to look exactly like a normal
13filesystem for various technical reasons. The expectation is that
14many use cases will be able to ignore these differences.
15
16This approach is 'hybrid' because the objects that appear in the
17filesystem do not all appear to belong to that filesystem. In many
18cases an object accessed in the union will be indistinguishable
19from accessing the corresponding object from the original filesystem.
20This is most obvious from the 'st_dev' field returned by stat(2).
21
22While directories will report an st_dev from the overlay-filesystem,
23all non-directory objects will report an st_dev from the lower or
24upper filesystem that is providing the object. Similarly st_ino will
25only be unique when combined with st_dev, and both of these can change
26over the lifetime of a non-directory object. Many applications and
27tools ignore these values and will not be affected.
28
29Upper and Lower
30---------------
31
32An overlay filesystem combines two filesystems - an 'upper' filesystem
33and a 'lower' filesystem. When a name exists in both filesystems, the
34object in the 'upper' filesystem is visible while the object in the
35'lower' filesystem is either hidden or, in the case of directories,
36merged with the 'upper' object.
37
38It would be more correct to refer to an upper and lower 'directory
39tree' rather than 'filesystem' as it is quite possible for both
40directory trees to be in the same filesystem and there is no
41requirement that the root of a filesystem be given for either upper or
42lower.
43
44The lower filesystem can be any filesystem supported by Linux and does
45not need to be writable. The lower filesystem can even be another
46overlayfs. The upper filesystem will normally be writable and if it
47is it must support the creation of trusted.* extended attributes, and
48must provide valid d_type in readdir responses, so NFS is not suitable.
49
50A read-only overlay of two read-only filesystems may use any
51filesystem type.
52
53Directories
54-----------
55
56Overlaying mainly involves directories. If a given name appears in both
57upper and lower filesystems and refers to a non-directory in either,
58then the lower object is hidden - the name refers only to the upper
59object.
60
61Where both upper and lower objects are directories, a merged directory
62is formed.
63
64At mount time, the two directories given as mount options "lowerdir" and
65"upperdir" are combined into a merged directory:
66
67 mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\
68workdir=/work /merged
69
70The "workdir" needs to be an empty directory on the same filesystem
71as upperdir.
72
73Then whenever a lookup is requested in such a merged directory, the
74lookup is performed in each actual directory and the combined result
75is cached in the dentry belonging to the overlay filesystem. If both
76actual lookups find directories, both are stored and a merged
77directory is created, otherwise only one is stored: the upper if it
78exists, else the lower.
79
80Only the lists of names from directories are merged. Other content
81such as metadata and extended attributes are reported for the upper
82directory only. These attributes of the lower directory are hidden.
83
84Whiteouts and opaque directories
85--------------------------------
86
87In order to support rm and rmdir without changing the lower
88filesystem, an overlay filesystem needs to record in the upper filesystem
89that files have been removed. This is done using whiteouts and opaque
90directories (non-directories are always opaque).
91
92A whiteout is created as a character device with 0/0 device number.
93When a whiteout is found in the upper level of a merged directory, any
94matching name in the lower level is ignored, and the whiteout itself
95is also hidden.
96
97A directory is made opaque by setting the xattr "trusted.overlay.opaque"
98to "y". Where the upper filesystem contains an opaque directory, any
99directory in the lower filesystem with the same name is ignored.
100
101readdir
102-------
103
104When a 'readdir' request is made on a merged directory, the upper and
105lower directories are each read and the name lists merged in the
106obvious way (upper is read first, then lower - entries that already
107exist are not re-added). This merged name list is cached in the
108'struct file' and so remains as long as the file is kept open. If the
109directory is opened and read by two processes at the same time, they
110will each have separate caches. A seekdir to the start of the
111directory (offset 0) followed by a readdir will cause the cache to be
112discarded and rebuilt.
113
114This means that changes to the merged directory do not appear while a
115directory is being read. This is unlikely to be noticed by many
116programs.
117
118Seek offsets are assigned sequentially when the directories are read.
119Thus if a program does the following:
120 - read part of a directory
121 - remember an offset, and close the directory
122 - re-open the directory some time later
123 - seek to the remembered offset
124
125there may be little correlation between the old and new locations in
126the list of filenames, particularly if anything has changed in the
127directory.
128
129Readdir on directories that are not merged is simply handled by the
130underlying directory (upper or lower).
131
132
133Non-directories
134---------------
135
136Objects that are not directories (files, symlinks, device-special
137files etc.) are presented either from the upper or lower filesystem as
138appropriate. When a file in the lower filesystem is accessed in a way
139that requires write-access, such as opening for write access, changing
140some metadata etc., the file is first copied from the lower filesystem
141to the upper filesystem (copy_up). Note that creating a hard-link
142also requires copy_up, though of course creation of a symlink does
143not.
144
145The copy_up may turn out to be unnecessary, for example if the file is
146opened for read-write but the data is not modified.
147
148The copy_up process first makes sure that the containing directory
149exists in the upper filesystem - creating it and any parents as
150necessary. It then creates the object with the same metadata (owner,
151mode, mtime, symlink-target etc.) and then if the object is a file, the
152data is copied from the lower to the upper filesystem. Finally any
153extended attributes are copied up.
154
155Once the copy_up is complete, the overlay filesystem simply
156provides direct access to the newly created file in the upper
157filesystem - future operations on the file are barely noticed by the
158overlay filesystem (though an operation on the name of the file such as
159rename or unlink will of course be noticed and handled).
160
161
162Non-standard behavior
163---------------------
164
165The copy_up operation essentially creates a new, identical file and
166moves it over to the old name. The new file may be on a different
167filesystem, so both st_dev and st_ino of the file may change.
168
169Any open files referring to this inode will access the old data and
170metadata. Similarly any file locks obtained before copy_up will not
171apply to the copied up file.
172
173On a file opened with O_RDONLY, fchmod(2), fchown(2), futimesat(2) and
174fsetxattr(2) will fail with EROFS.
175
176If a file with multiple hard links is copied up, then this will
177"break" the link. Changes will not be propagated to other names
178referring to the same inode.
179
180Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
181object in overlayfs will not contain valid absolute paths, only
182relative paths leading up to the filesystem's root. This will be
183fixed in the future.
184
185Some operations are not atomic, for example a crash during copy_up or
186rename will leave the filesystem in an inconsistent state. This will
187be addressed in the future.
188
189Changes to underlying filesystems
190---------------------------------
191
192Offline changes, when the overlay is not mounted, are allowed to either
193the upper or the lower trees.
194
195Changes to the underlying filesystems while part of a mounted overlay
196filesystem are not allowed. If the underlying filesystem is changed,
197the behavior of the overlay is undefined, though it will not result in
198a crash or deadlock.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index fceff7c00a3c..20bf204426ca 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -364,6 +364,7 @@ struct inode_operations {
364 int (*atomic_open)(struct inode *, struct dentry *, struct file *, 364 int (*atomic_open)(struct inode *, struct dentry *, struct file *,
365 unsigned open_flag, umode_t create_mode, int *opened); 365 unsigned open_flag, umode_t create_mode, int *opened);
366 int (*tmpfile) (struct inode *, struct dentry *, umode_t); 366 int (*tmpfile) (struct inode *, struct dentry *, umode_t);
367 int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
367}; 368};
368 369
369Again, all methods are called without any locks being held, unless 370Again, all methods are called without any locks being held, unless
@@ -696,6 +697,12 @@ struct address_space_operations {
696 but instead uses bmap to find out where the blocks in the file 697 but instead uses bmap to find out where the blocks in the file
697 are and uses those addresses directly. 698 are and uses those addresses directly.
698 699
700 dentry_open: *WARNING: probably going away soon, do not use!* This is an
701 alternative to f_op->open(), the difference is that this method may open
702 a file not necessarily originating from the same filesystem as the one
703 i_op->dentry_open() was called on. It may be useful for stacking filesystems
704 which want to allow native I/O directly on underlying files.
705
699 706
700 invalidatepage: If a page has PagePrivate set, then invalidatepage 707 invalidatepage: If a page has PagePrivate set, then invalidatepage
701 will be called when part or all of the page is to be removed 708 will be called when part or all of the page is to be removed
diff --git a/MAINTAINERS b/MAINTAINERS
index a20df9bf8ab0..aa974d445bfd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6832,6 +6832,13 @@ F: drivers/scsi/osd/
6832F: include/scsi/osd_* 6832F: include/scsi/osd_*
6833F: fs/exofs/ 6833F: fs/exofs/
6834 6834
6835OVERLAYFS FILESYSTEM
6836M: Miklos Szeredi <miklos@szeredi.hu>
6837L: linux-fsdevel@vger.kernel.org
6838S: Supported
6839F: fs/overlayfs/*
6840F: Documentation/filesystems/overlayfs.txt
6841
6835P54 WIRELESS DRIVER 6842P54 WIRELESS DRIVER
6836M: Christian Lamparter <chunkeey@googlemail.com> 6843M: Christian Lamparter <chunkeey@googlemail.com>
6837L: linux-wireless@vger.kernel.org 6844L: linux-wireless@vger.kernel.org
diff --git a/fs/Kconfig b/fs/Kconfig
index db5dc1598716..664991afe0c0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
67 67
68source "fs/autofs4/Kconfig" 68source "fs/autofs4/Kconfig"
69source "fs/fuse/Kconfig" 69source "fs/fuse/Kconfig"
70source "fs/overlayfs/Kconfig"
70 71
71menu "Caches" 72menu "Caches"
72 73
diff --git a/fs/Makefile b/fs/Makefile
index 90c88529892b..34a1b9dea6dd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/
104obj-$(CONFIG_AUTOFS4_FS) += autofs4/ 104obj-$(CONFIG_AUTOFS4_FS) += autofs4/
105obj-$(CONFIG_ADFS_FS) += adfs/ 105obj-$(CONFIG_ADFS_FS) += adfs/
106obj-$(CONFIG_FUSE_FS) += fuse/ 106obj-$(CONFIG_FUSE_FS) += fuse/
107obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
107obj-$(CONFIG_UDF_FS) += udf/ 108obj-$(CONFIG_UDF_FS) += udf/
108obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ 109obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
109obj-$(CONFIG_OMFS_FS) += omfs/ 110obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d2b76e29d3b..4399f0c3a4ce 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -765,23 +765,6 @@ out:
765 return ret; 765 return ret;
766} 766}
767 767
768/* copy of check_sticky in fs/namei.c()
769* It's inline, so penalty for filesystems that don't use sticky bit is
770* minimal.
771*/
772static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
773{
774 kuid_t fsuid = current_fsuid();
775
776 if (!(dir->i_mode & S_ISVTX))
777 return 0;
778 if (uid_eq(inode->i_uid, fsuid))
779 return 0;
780 if (uid_eq(dir->i_uid, fsuid))
781 return 0;
782 return !capable(CAP_FOWNER);
783}
784
785/* copy of may_delete in fs/namei.c() 768/* copy of may_delete in fs/namei.c()
786 * Check whether we can remove a link victim from directory dir, check 769 * Check whether we can remove a link victim from directory dir, check
787 * whether the type of victim is right. 770 * whether the type of victim is right.
@@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
817 return error; 800 return error;
818 if (IS_APPEND(dir)) 801 if (IS_APPEND(dir))
819 return -EPERM; 802 return -EPERM;
820 if (btrfs_check_sticky(dir, victim->d_inode)|| 803 if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
821 IS_APPEND(victim->d_inode)||
822 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) 804 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
823 return -EPERM; 805 return -EPERM;
824 if (isdir) { 806 if (isdir) {
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1b119d3bf924..c4cd1fd86cc2 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
566 s->s_maxbytes = path.dentry->d_sb->s_maxbytes; 566 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
567 s->s_blocksize = path.dentry->d_sb->s_blocksize; 567 s->s_blocksize = path.dentry->d_sb->s_blocksize;
568 s->s_magic = ECRYPTFS_SUPER_MAGIC; 568 s->s_magic = ECRYPTFS_SUPER_MAGIC;
569 s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
570
571 rc = -EINVAL;
572 if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
573 pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
574 goto out_free;
575 }
569 576
570 inode = ecryptfs_get_inode(path.dentry->d_inode, s); 577 inode = ecryptfs_get_inode(path.dentry->d_inode, s);
571 rc = PTR_ERR(inode); 578 rc = PTR_ERR(inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 603e4ebbd0ac..aba86e8ef1ef 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3190,6 +3190,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
3190 } 3190 }
3191} 3191}
3192 3192
3193static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
3194 int credits, handle_t **h)
3195{
3196 struct inode *wh;
3197 handle_t *handle;
3198 int retries = 0;
3199
3200 /*
3201 * for inode block, sb block, group summaries,
3202 * and inode bitmap
3203 */
3204 credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
3205 EXT4_XATTR_TRANS_BLOCKS + 4);
3206retry:
3207 wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
3208 &ent->dentry->d_name, 0, NULL,
3209 EXT4_HT_DIR, credits);
3210
3211 handle = ext4_journal_current_handle();
3212 if (IS_ERR(wh)) {
3213 if (handle)
3214 ext4_journal_stop(handle);
3215 if (PTR_ERR(wh) == -ENOSPC &&
3216 ext4_should_retry_alloc(ent->dir->i_sb, &retries))
3217 goto retry;
3218 } else {
3219 *h = handle;
3220 init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
3221 wh->i_op = &ext4_special_inode_operations;
3222 }
3223 return wh;
3224}
3225
3193/* 3226/*
3194 * Anybody can rename anything with this: the permission checks are left to the 3227 * Anybody can rename anything with this: the permission checks are left to the
3195 * higher-level routines. 3228 * higher-level routines.
@@ -3199,7 +3232,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
3199 * This comes from rename(const char *oldpath, const char *newpath) 3232 * This comes from rename(const char *oldpath, const char *newpath)
3200 */ 3233 */
3201static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, 3234static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3202 struct inode *new_dir, struct dentry *new_dentry) 3235 struct inode *new_dir, struct dentry *new_dentry,
3236 unsigned int flags)
3203{ 3237{
3204 handle_t *handle = NULL; 3238 handle_t *handle = NULL;
3205 struct ext4_renament old = { 3239 struct ext4_renament old = {
@@ -3214,6 +3248,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3214 }; 3248 };
3215 int force_reread; 3249 int force_reread;
3216 int retval; 3250 int retval;
3251 struct inode *whiteout = NULL;
3252 int credits;
3253 u8 old_file_type;
3217 3254
3218 dquot_initialize(old.dir); 3255 dquot_initialize(old.dir);
3219 dquot_initialize(new.dir); 3256 dquot_initialize(new.dir);
@@ -3252,11 +3289,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3252 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) 3289 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
3253 ext4_alloc_da_blocks(old.inode); 3290 ext4_alloc_da_blocks(old.inode);
3254 3291
3255 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, 3292 credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3256 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 3293 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
3257 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3294 if (!(flags & RENAME_WHITEOUT)) {
3258 if (IS_ERR(handle)) 3295 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
3259 return PTR_ERR(handle); 3296 if (IS_ERR(handle))
3297 return PTR_ERR(handle);
3298 } else {
3299 whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
3300 if (IS_ERR(whiteout))
3301 return PTR_ERR(whiteout);
3302 }
3260 3303
3261 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) 3304 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3262 ext4_handle_sync(handle); 3305 ext4_handle_sync(handle);
@@ -3284,13 +3327,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3284 */ 3327 */
3285 force_reread = (new.dir->i_ino == old.dir->i_ino && 3328 force_reread = (new.dir->i_ino == old.dir->i_ino &&
3286 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); 3329 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
3330
3331 old_file_type = old.de->file_type;
3332 if (whiteout) {
3333 /*
3334 * Do this before adding a new entry, so the old entry is sure
3335 * to be still pointing to the valid old entry.
3336 */
3337 retval = ext4_setent(handle, &old, whiteout->i_ino,
3338 EXT4_FT_CHRDEV);
3339 if (retval)
3340 goto end_rename;
3341 ext4_mark_inode_dirty(handle, whiteout);
3342 }
3287 if (!new.bh) { 3343 if (!new.bh) {
3288 retval = ext4_add_entry(handle, new.dentry, old.inode); 3344 retval = ext4_add_entry(handle, new.dentry, old.inode);
3289 if (retval) 3345 if (retval)
3290 goto end_rename; 3346 goto end_rename;
3291 } else { 3347 } else {
3292 retval = ext4_setent(handle, &new, 3348 retval = ext4_setent(handle, &new,
3293 old.inode->i_ino, old.de->file_type); 3349 old.inode->i_ino, old_file_type);
3294 if (retval) 3350 if (retval)
3295 goto end_rename; 3351 goto end_rename;
3296 } 3352 }
@@ -3305,10 +3361,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3305 old.inode->i_ctime = ext4_current_time(old.inode); 3361 old.inode->i_ctime = ext4_current_time(old.inode);
3306 ext4_mark_inode_dirty(handle, old.inode); 3362 ext4_mark_inode_dirty(handle, old.inode);
3307 3363
3308 /* 3364 if (!whiteout) {
3309 * ok, that's it 3365 /*
3310 */ 3366 * ok, that's it
3311 ext4_rename_delete(handle, &old, force_reread); 3367 */
3368 ext4_rename_delete(handle, &old, force_reread);
3369 }
3312 3370
3313 if (new.inode) { 3371 if (new.inode) {
3314 ext4_dec_count(handle, new.inode); 3372 ext4_dec_count(handle, new.inode);
@@ -3344,6 +3402,12 @@ end_rename:
3344 brelse(old.dir_bh); 3402 brelse(old.dir_bh);
3345 brelse(old.bh); 3403 brelse(old.bh);
3346 brelse(new.bh); 3404 brelse(new.bh);
3405 if (whiteout) {
3406 if (retval)
3407 drop_nlink(whiteout);
3408 unlock_new_inode(whiteout);
3409 iput(whiteout);
3410 }
3347 if (handle) 3411 if (handle)
3348 ext4_journal_stop(handle); 3412 ext4_journal_stop(handle);
3349 return retval; 3413 return retval;
@@ -3476,18 +3540,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
3476 struct inode *new_dir, struct dentry *new_dentry, 3540 struct inode *new_dir, struct dentry *new_dentry,
3477 unsigned int flags) 3541 unsigned int flags)
3478{ 3542{
3479 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 3543 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3480 return -EINVAL; 3544 return -EINVAL;
3481 3545
3482 if (flags & RENAME_EXCHANGE) { 3546 if (flags & RENAME_EXCHANGE) {
3483 return ext4_cross_rename(old_dir, old_dentry, 3547 return ext4_cross_rename(old_dir, old_dentry,
3484 new_dir, new_dentry); 3548 new_dir, new_dentry);
3485 } 3549 }
3486 /* 3550
3487 * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" 3551 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
3488 * is equivalent to regular rename.
3489 */
3490 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
3491} 3552}
3492 3553
3493/* 3554/*
diff --git a/fs/internal.h b/fs/internal.h
index 9477f8f6aefc..757ba2abf21e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -47,7 +47,6 @@ extern void __init chrdev_init(void);
47/* 47/*
48 * namei.c 48 * namei.c
49 */ 49 */
50extern int __inode_permission(struct inode *, int);
51extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); 50extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
52extern int vfs_path_lookup(struct dentry *, struct vfsmount *, 51extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
53 const char *, unsigned int, struct path *); 52 const char *, unsigned int, struct path *);
@@ -139,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
139extern int rw_verify_area(int, struct file *, const loff_t *, size_t); 138extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
140 139
141/* 140/*
142 * splice.c
143 */
144extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
145 loff_t *opos, size_t len, unsigned int flags);
146
147/*
148 * pipe.c 141 * pipe.c
149 */ 142 */
150extern const struct file_operations pipefifo_fops; 143extern const struct file_operations pipefifo_fops;
diff --git a/fs/namei.c b/fs/namei.c
index 43927d14db67..42df664e95e5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask)
416 416
417 return security_inode_permission(inode, mask); 417 return security_inode_permission(inode, mask);
418} 418}
419EXPORT_SYMBOL(__inode_permission);
419 420
420/** 421/**
421 * sb_permission - Check superblock-level permissions 422 * sb_permission - Check superblock-level permissions
@@ -2383,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
2383} 2384}
2384EXPORT_SYMBOL(kern_path_mountpoint); 2385EXPORT_SYMBOL(kern_path_mountpoint);
2385 2386
2386/* 2387int __check_sticky(struct inode *dir, struct inode *inode)
2387 * It's inline, so penalty for filesystems that don't use sticky bit is
2388 * minimal.
2389 */
2390static inline int check_sticky(struct inode *dir, struct inode *inode)
2391{ 2388{
2392 kuid_t fsuid = current_fsuid(); 2389 kuid_t fsuid = current_fsuid();
2393 2390
2394 if (!(dir->i_mode & S_ISVTX))
2395 return 0;
2396 if (uid_eq(inode->i_uid, fsuid)) 2391 if (uid_eq(inode->i_uid, fsuid))
2397 return 0; 2392 return 0;
2398 if (uid_eq(dir->i_uid, fsuid)) 2393 if (uid_eq(dir->i_uid, fsuid))
2399 return 0; 2394 return 0;
2400 return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); 2395 return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
2401} 2396}
2397EXPORT_SYMBOL(__check_sticky);
2402 2398
2403/* 2399/*
2404 * Check whether we can remove a link victim from directory dir, check 2400 * Check whether we can remove a link victim from directory dir, check
@@ -3064,9 +3060,12 @@ finish_open_created:
3064 error = may_open(&nd->path, acc_mode, open_flag); 3060 error = may_open(&nd->path, acc_mode, open_flag);
3065 if (error) 3061 if (error)
3066 goto out; 3062 goto out;
3067 file->f_path.mnt = nd->path.mnt; 3063
3068 error = finish_open(file, nd->path.dentry, NULL, opened); 3064 BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
3069 if (error) { 3065 error = vfs_open(&nd->path, file, current_cred());
3066 if (!error) {
3067 *opened |= FILE_OPENED;
3068 } else {
3070 if (error == -EOPENSTALE) 3069 if (error == -EOPENSTALE)
3071 goto stale_open; 3070 goto stale_open;
3072 goto out; 3071 goto out;
@@ -4210,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4210 bool should_retry = false; 4209 bool should_retry = false;
4211 int error; 4210 int error;
4212 4211
4213 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 4212 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4214 return -EINVAL; 4213 return -EINVAL;
4215 4214
4216 if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) 4215 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
4216 (flags & RENAME_EXCHANGE))
4217 return -EINVAL; 4217 return -EINVAL;
4218 4218
4219 if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
4220 return -EPERM;
4221
4219retry: 4222retry:
4220 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); 4223 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
4221 if (IS_ERR(from)) { 4224 if (IS_ERR(from)) {
@@ -4347,6 +4350,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
4347 return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 4350 return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4348} 4351}
4349 4352
4353int vfs_whiteout(struct inode *dir, struct dentry *dentry)
4354{
4355 int error = may_create(dir, dentry);
4356 if (error)
4357 return error;
4358
4359 if (!dir->i_op->mknod)
4360 return -EPERM;
4361
4362 return dir->i_op->mknod(dir, dentry,
4363 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4364}
4365EXPORT_SYMBOL(vfs_whiteout);
4366
4350int readlink_copy(char __user *buffer, int buflen, const char *link) 4367int readlink_copy(char __user *buffer, int buflen, const char *link)
4351{ 4368{
4352 int len = PTR_ERR(link); 4369 int len = PTR_ERR(link);
diff --git a/fs/namespace.c b/fs/namespace.c
index fbba8b17330d..5b66b2b3624d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
1686 namespace_unlock(); 1686 namespace_unlock();
1687} 1687}
1688 1688
1689/**
1690 * clone_private_mount - create a private clone of a path
1691 *
1692 * This creates a new vfsmount, which will be the clone of @path. The new
1693 * mount will not be attached anywhere in the namespace and will be private
1694 * (i.e. changes to the originating mount won't be propagated into this).
1695 *
1696 * Release with mntput().
1697 */
1698struct vfsmount *clone_private_mount(struct path *path)
1699{
1700 struct mount *old_mnt = real_mount(path->mnt);
1701 struct mount *new_mnt;
1702
1703 if (IS_MNT_UNBINDABLE(old_mnt))
1704 return ERR_PTR(-EINVAL);
1705
1706 down_read(&namespace_sem);
1707 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1708 up_read(&namespace_sem);
1709 if (IS_ERR(new_mnt))
1710 return ERR_CAST(new_mnt);
1711
1712 return &new_mnt->mnt;
1713}
1714EXPORT_SYMBOL_GPL(clone_private_mount);
1715
1689int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1716int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1690 struct vfsmount *root) 1717 struct vfsmount *root)
1691{ 1718{
diff --git a/fs/open.c b/fs/open.c
index d6fd3acde134..de92c13b58be 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags,
823 f = get_empty_filp(); 823 f = get_empty_filp();
824 if (!IS_ERR(f)) { 824 if (!IS_ERR(f)) {
825 f->f_flags = flags; 825 f->f_flags = flags;
826 f->f_path = *path; 826 error = vfs_open(path, f, cred);
827 error = do_dentry_open(f, NULL, cred);
828 if (!error) { 827 if (!error) {
829 /* from now on we need fput() to dispose of f */ 828 /* from now on we need fput() to dispose of f */
830 error = open_check_o_direct(f); 829 error = open_check_o_direct(f);
@@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags,
841} 840}
842EXPORT_SYMBOL(dentry_open); 841EXPORT_SYMBOL(dentry_open);
843 842
843/**
844 * vfs_open - open the file at the given path
845 * @path: path to open
846 * @filp: newly allocated file with f_flag initialized
847 * @cred: credentials to use
848 */
849int vfs_open(const struct path *path, struct file *filp,
850 const struct cred *cred)
851{
852 struct inode *inode = path->dentry->d_inode;
853
854 if (inode->i_op->dentry_open)
855 return inode->i_op->dentry_open(path->dentry, filp, cred);
856 else {
857 filp->f_path = *path;
858 return do_dentry_open(filp, NULL, cred);
859 }
860}
861EXPORT_SYMBOL(vfs_open);
862
844static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) 863static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
845{ 864{
846 int lookup_flags = 0; 865 int lookup_flags = 0;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 000000000000..e60125976873
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
1config OVERLAYFS_FS
2 tristate "Overlay filesystem support"
3 help
4 An overlay filesystem combines two filesystems - an 'upper' filesystem
5 and a 'lower' filesystem. When a name exists in both filesystems, the
6 object in the 'upper' filesystem is visible while the object in the
7 'lower' filesystem is either hidden or, in the case of directories,
8 merged with the 'upper' object.
9
10 For more information see Documentation/filesystems/overlayfs.txt
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 000000000000..8f91889480d0
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the overlay filesystem.
3#
4
5obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
6
7overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000000..ea10a8719107
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,414 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/slab.h>
12#include <linux/file.h>
13#include <linux/splice.h>
14#include <linux/xattr.h>
15#include <linux/security.h>
16#include <linux/uaccess.h>
17#include <linux/sched.h>
18#include <linux/namei.h>
19#include "overlayfs.h"
20
21#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
22
23int ovl_copy_xattr(struct dentry *old, struct dentry *new)
24{
25 ssize_t list_size, size;
26 char *buf, *name, *value;
27 int error;
28
29 if (!old->d_inode->i_op->getxattr ||
30 !new->d_inode->i_op->getxattr)
31 return 0;
32
33 list_size = vfs_listxattr(old, NULL, 0);
34 if (list_size <= 0) {
35 if (list_size == -EOPNOTSUPP)
36 return 0;
37 return list_size;
38 }
39
40 buf = kzalloc(list_size, GFP_KERNEL);
41 if (!buf)
42 return -ENOMEM;
43
44 error = -ENOMEM;
45 value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
46 if (!value)
47 goto out;
48
49 list_size = vfs_listxattr(old, buf, list_size);
50 if (list_size <= 0) {
51 error = list_size;
52 goto out_free_value;
53 }
54
55 for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
56 size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
57 if (size <= 0) {
58 error = size;
59 goto out_free_value;
60 }
61 error = vfs_setxattr(new, name, value, size, 0);
62 if (error)
63 goto out_free_value;
64 }
65
66out_free_value:
67 kfree(value);
68out:
69 kfree(buf);
70 return error;
71}
72
73static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
74{
75 struct file *old_file;
76 struct file *new_file;
77 loff_t old_pos = 0;
78 loff_t new_pos = 0;
79 int error = 0;
80
81 if (len == 0)
82 return 0;
83
84 old_file = ovl_path_open(old, O_RDONLY);
85 if (IS_ERR(old_file))
86 return PTR_ERR(old_file);
87
88 new_file = ovl_path_open(new, O_WRONLY);
89 if (IS_ERR(new_file)) {
90 error = PTR_ERR(new_file);
91 goto out_fput;
92 }
93
94 /* FIXME: copy up sparse files efficiently */
95 while (len) {
96 size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
97 long bytes;
98
99 if (len < this_len)
100 this_len = len;
101
102 if (signal_pending_state(TASK_KILLABLE, current)) {
103 error = -EINTR;
104 break;
105 }
106
107 bytes = do_splice_direct(old_file, &old_pos,
108 new_file, &new_pos,
109 this_len, SPLICE_F_MOVE);
110 if (bytes <= 0) {
111 error = bytes;
112 break;
113 }
114 WARN_ON(old_pos != new_pos);
115
116 len -= bytes;
117 }
118
119 fput(new_file);
120out_fput:
121 fput(old_file);
122 return error;
123}
124
125static char *ovl_read_symlink(struct dentry *realdentry)
126{
127 int res;
128 char *buf;
129 struct inode *inode = realdentry->d_inode;
130 mm_segment_t old_fs;
131
132 res = -EINVAL;
133 if (!inode->i_op->readlink)
134 goto err;
135
136 res = -ENOMEM;
137 buf = (char *) __get_free_page(GFP_KERNEL);
138 if (!buf)
139 goto err;
140
141 old_fs = get_fs();
142 set_fs(get_ds());
143 /* The cast to a user pointer is valid due to the set_fs() */
144 res = inode->i_op->readlink(realdentry,
145 (char __user *)buf, PAGE_SIZE - 1);
146 set_fs(old_fs);
147 if (res < 0) {
148 free_page((unsigned long) buf);
149 goto err;
150 }
151 buf[res] = '\0';
152
153 return buf;
154
155err:
156 return ERR_PTR(res);
157}
158
159static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
160{
161 struct iattr attr = {
162 .ia_valid =
163 ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
164 .ia_atime = stat->atime,
165 .ia_mtime = stat->mtime,
166 };
167
168 return notify_change(upperdentry, &attr, NULL);
169}
170
171int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
172{
173 int err = 0;
174
175 if (!S_ISLNK(stat->mode)) {
176 struct iattr attr = {
177 .ia_valid = ATTR_MODE,
178 .ia_mode = stat->mode,
179 };
180 err = notify_change(upperdentry, &attr, NULL);
181 }
182 if (!err) {
183 struct iattr attr = {
184 .ia_valid = ATTR_UID | ATTR_GID,
185 .ia_uid = stat->uid,
186 .ia_gid = stat->gid,
187 };
188 err = notify_change(upperdentry, &attr, NULL);
189 }
190 if (!err)
191 ovl_set_timestamps(upperdentry, stat);
192
193 return err;
194
195}
196
197static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
198 struct dentry *dentry, struct path *lowerpath,
199 struct kstat *stat, struct iattr *attr,
200 const char *link)
201{
202 struct inode *wdir = workdir->d_inode;
203 struct inode *udir = upperdir->d_inode;
204 struct dentry *newdentry = NULL;
205 struct dentry *upper = NULL;
206 umode_t mode = stat->mode;
207 int err;
208
209 newdentry = ovl_lookup_temp(workdir, dentry);
210 err = PTR_ERR(newdentry);
211 if (IS_ERR(newdentry))
212 goto out;
213
214 upper = lookup_one_len(dentry->d_name.name, upperdir,
215 dentry->d_name.len);
216 err = PTR_ERR(upper);
217 if (IS_ERR(upper))
218 goto out1;
219
220 /* Can't properly set mode on creation because of the umask */
221 stat->mode &= S_IFMT;
222 err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
223 stat->mode = mode;
224 if (err)
225 goto out2;
226
227 if (S_ISREG(stat->mode)) {
228 struct path upperpath;
229 ovl_path_upper(dentry, &upperpath);
230 BUG_ON(upperpath.dentry != NULL);
231 upperpath.dentry = newdentry;
232
233 err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
234 if (err)
235 goto out_cleanup;
236 }
237
238 err = ovl_copy_xattr(lowerpath->dentry, newdentry);
239 if (err)
240 goto out_cleanup;
241
242 mutex_lock(&newdentry->d_inode->i_mutex);
243 err = ovl_set_attr(newdentry, stat);
244 if (!err && attr)
245 err = notify_change(newdentry, attr, NULL);
246 mutex_unlock(&newdentry->d_inode->i_mutex);
247 if (err)
248 goto out_cleanup;
249
250 err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
251 if (err)
252 goto out_cleanup;
253
254 ovl_dentry_update(dentry, newdentry);
255 newdentry = NULL;
256
257 /*
258 * Non-directores become opaque when copied up.
259 */
260 if (!S_ISDIR(stat->mode))
261 ovl_dentry_set_opaque(dentry, true);
262out2:
263 dput(upper);
264out1:
265 dput(newdentry);
266out:
267 return err;
268
269out_cleanup:
270 ovl_cleanup(wdir, newdentry);
271 goto out;
272}
273
274/*
275 * Copy up a single dentry
276 *
277 * Directory renames only allowed on "pure upper" (already created on
278 * upper filesystem, never copied up). Directories which are on lower or
279 * are merged may not be renamed. For these -EXDEV is returned and
280 * userspace has to deal with it. This means, when copying up a
281 * directory we can rely on it and ancestors being stable.
282 *
283 * Non-directory renames start with copy up of source if necessary. The
284 * actual rename will only proceed once the copy up was successful. Copy
285 * up uses upper parent i_mutex for exclusion. Since rename can change
286 * d_parent it is possible that the copy up will lock the old parent. At
287 * that point the file will have already been copied up anyway.
288 */
289int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
290 struct path *lowerpath, struct kstat *stat,
291 struct iattr *attr)
292{
293 struct dentry *workdir = ovl_workdir(dentry);
294 int err;
295 struct kstat pstat;
296 struct path parentpath;
297 struct dentry *upperdir;
298 struct dentry *upperdentry;
299 const struct cred *old_cred;
300 struct cred *override_cred;
301 char *link = NULL;
302
303 ovl_path_upper(parent, &parentpath);
304 upperdir = parentpath.dentry;
305
306 err = vfs_getattr(&parentpath, &pstat);
307 if (err)
308 return err;
309
310 if (S_ISLNK(stat->mode)) {
311 link = ovl_read_symlink(lowerpath->dentry);
312 if (IS_ERR(link))
313 return PTR_ERR(link);
314 }
315
316 err = -ENOMEM;
317 override_cred = prepare_creds();
318 if (!override_cred)
319 goto out_free_link;
320
321 override_cred->fsuid = stat->uid;
322 override_cred->fsgid = stat->gid;
323 /*
324 * CAP_SYS_ADMIN for copying up extended attributes
325 * CAP_DAC_OVERRIDE for create
326 * CAP_FOWNER for chmod, timestamp update
327 * CAP_FSETID for chmod
328 * CAP_CHOWN for chown
329 * CAP_MKNOD for mknod
330 */
331 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
332 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
333 cap_raise(override_cred->cap_effective, CAP_FOWNER);
334 cap_raise(override_cred->cap_effective, CAP_FSETID);
335 cap_raise(override_cred->cap_effective, CAP_CHOWN);
336 cap_raise(override_cred->cap_effective, CAP_MKNOD);
337 old_cred = override_creds(override_cred);
338
339 err = -EIO;
340 if (lock_rename(workdir, upperdir) != NULL) {
341 pr_err("overlayfs: failed to lock workdir+upperdir\n");
342 goto out_unlock;
343 }
344 upperdentry = ovl_dentry_upper(dentry);
345 if (upperdentry) {
346 unlock_rename(workdir, upperdir);
347 err = 0;
348 /* Raced with another copy-up? Do the setattr here */
349 if (attr) {
350 mutex_lock(&upperdentry->d_inode->i_mutex);
351 err = notify_change(upperdentry, attr, NULL);
352 mutex_unlock(&upperdentry->d_inode->i_mutex);
353 }
354 goto out_put_cred;
355 }
356
357 err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
358 stat, attr, link);
359 if (!err) {
360 /* Restore timestamps on parent (best effort) */
361 ovl_set_timestamps(upperdir, &pstat);
362 }
363out_unlock:
364 unlock_rename(workdir, upperdir);
365out_put_cred:
366 revert_creds(old_cred);
367 put_cred(override_cred);
368
369out_free_link:
370 if (link)
371 free_page((unsigned long) link);
372
373 return err;
374}
375
376int ovl_copy_up(struct dentry *dentry)
377{
378 int err;
379
380 err = 0;
381 while (!err) {
382 struct dentry *next;
383 struct dentry *parent;
384 struct path lowerpath;
385 struct kstat stat;
386 enum ovl_path_type type = ovl_path_type(dentry);
387
388 if (type != OVL_PATH_LOWER)
389 break;
390
391 next = dget(dentry);
392 /* find the topmost dentry not yet copied up */
393 for (;;) {
394 parent = dget_parent(next);
395
396 type = ovl_path_type(parent);
397 if (type != OVL_PATH_LOWER)
398 break;
399
400 dput(next);
401 next = parent;
402 }
403
404 ovl_path_lower(next, &lowerpath);
405 err = vfs_getattr(&lowerpath, &stat);
406 if (!err)
407 err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
408
409 dput(parent);
410 dput(next);
411 }
412
413 return err;
414}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 000000000000..15cd91ad9940
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,921 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/namei.h>
12#include <linux/xattr.h>
13#include <linux/security.h>
14#include <linux/cred.h>
15#include "overlayfs.h"
16
17void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
18{
19 int err;
20
21 dget(wdentry);
22 if (S_ISDIR(wdentry->d_inode->i_mode))
23 err = ovl_do_rmdir(wdir, wdentry);
24 else
25 err = ovl_do_unlink(wdir, wdentry);
26 dput(wdentry);
27
28 if (err) {
29 pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
30 wdentry, err);
31 }
32}
33
34struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
35{
36 struct dentry *temp;
37 char name[20];
38
39 snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
40
41 temp = lookup_one_len(name, workdir, strlen(name));
42 if (!IS_ERR(temp) && temp->d_inode) {
43 pr_err("overlayfs: workdir/%s already exists\n", name);
44 dput(temp);
45 temp = ERR_PTR(-EIO);
46 }
47
48 return temp;
49}
50
51/* caller holds i_mutex on workdir */
52static struct dentry *ovl_whiteout(struct dentry *workdir,
53 struct dentry *dentry)
54{
55 int err;
56 struct dentry *whiteout;
57 struct inode *wdir = workdir->d_inode;
58
59 whiteout = ovl_lookup_temp(workdir, dentry);
60 if (IS_ERR(whiteout))
61 return whiteout;
62
63 err = ovl_do_whiteout(wdir, whiteout);
64 if (err) {
65 dput(whiteout);
66 whiteout = ERR_PTR(err);
67 }
68
69 return whiteout;
70}
71
72int ovl_create_real(struct inode *dir, struct dentry *newdentry,
73 struct kstat *stat, const char *link,
74 struct dentry *hardlink, bool debug)
75{
76 int err;
77
78 if (newdentry->d_inode)
79 return -ESTALE;
80
81 if (hardlink) {
82 err = ovl_do_link(hardlink, dir, newdentry, debug);
83 } else {
84 switch (stat->mode & S_IFMT) {
85 case S_IFREG:
86 err = ovl_do_create(dir, newdentry, stat->mode, debug);
87 break;
88
89 case S_IFDIR:
90 err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
91 break;
92
93 case S_IFCHR:
94 case S_IFBLK:
95 case S_IFIFO:
96 case S_IFSOCK:
97 err = ovl_do_mknod(dir, newdentry,
98 stat->mode, stat->rdev, debug);
99 break;
100
101 case S_IFLNK:
102 err = ovl_do_symlink(dir, newdentry, link, debug);
103 break;
104
105 default:
106 err = -EPERM;
107 }
108 }
109 if (!err && WARN_ON(!newdentry->d_inode)) {
110 /*
111 * Not quite sure if non-instantiated dentry is legal or not.
112 * VFS doesn't seem to care so check and warn here.
113 */
114 err = -ENOENT;
115 }
116 return err;
117}
118
119static int ovl_set_opaque(struct dentry *upperdentry)
120{
121 return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
122}
123
124static void ovl_remove_opaque(struct dentry *upperdentry)
125{
126 int err;
127
128 err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
129 if (err) {
130 pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
131 upperdentry->d_name.name, err);
132 }
133}
134
135static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
136 struct kstat *stat)
137{
138 int err;
139 enum ovl_path_type type;
140 struct path realpath;
141
142 type = ovl_path_real(dentry, &realpath);
143 err = vfs_getattr(&realpath, stat);
144 if (err)
145 return err;
146
147 stat->dev = dentry->d_sb->s_dev;
148 stat->ino = dentry->d_inode->i_ino;
149
150 /*
151 * It's probably not worth it to count subdirs to get the
152 * correct link count. nlink=1 seems to pacify 'find' and
153 * other utilities.
154 */
155 if (type == OVL_PATH_MERGE)
156 stat->nlink = 1;
157
158 return 0;
159}
160
161static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
162 struct kstat *stat, const char *link,
163 struct dentry *hardlink)
164{
165 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
166 struct inode *udir = upperdir->d_inode;
167 struct dentry *newdentry;
168 int err;
169
170 mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
171 newdentry = lookup_one_len(dentry->d_name.name, upperdir,
172 dentry->d_name.len);
173 err = PTR_ERR(newdentry);
174 if (IS_ERR(newdentry))
175 goto out_unlock;
176 err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
177 if (err)
178 goto out_dput;
179
180 ovl_dentry_version_inc(dentry->d_parent);
181 ovl_dentry_update(dentry, newdentry);
182 ovl_copyattr(newdentry->d_inode, inode);
183 d_instantiate(dentry, inode);
184 newdentry = NULL;
185out_dput:
186 dput(newdentry);
187out_unlock:
188 mutex_unlock(&udir->i_mutex);
189 return err;
190}
191
192static int ovl_lock_rename_workdir(struct dentry *workdir,
193 struct dentry *upperdir)
194{
195 /* Workdir should not be the same as upperdir */
196 if (workdir == upperdir)
197 goto err;
198
199 /* Workdir should not be subdir of upperdir and vice versa */
200 if (lock_rename(workdir, upperdir) != NULL)
201 goto err_unlock;
202
203 return 0;
204
205err_unlock:
206 unlock_rename(workdir, upperdir);
207err:
208 pr_err("overlayfs: failed to lock workdir+upperdir\n");
209 return -EIO;
210}
211
212static struct dentry *ovl_clear_empty(struct dentry *dentry,
213 struct list_head *list)
214{
215 struct dentry *workdir = ovl_workdir(dentry);
216 struct inode *wdir = workdir->d_inode;
217 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
218 struct inode *udir = upperdir->d_inode;
219 struct path upperpath;
220 struct dentry *upper;
221 struct dentry *opaquedir;
222 struct kstat stat;
223 int err;
224
225 err = ovl_lock_rename_workdir(workdir, upperdir);
226 if (err)
227 goto out;
228
229 ovl_path_upper(dentry, &upperpath);
230 err = vfs_getattr(&upperpath, &stat);
231 if (err)
232 goto out_unlock;
233
234 err = -ESTALE;
235 if (!S_ISDIR(stat.mode))
236 goto out_unlock;
237 upper = upperpath.dentry;
238 if (upper->d_parent->d_inode != udir)
239 goto out_unlock;
240
241 opaquedir = ovl_lookup_temp(workdir, dentry);
242 err = PTR_ERR(opaquedir);
243 if (IS_ERR(opaquedir))
244 goto out_unlock;
245
246 err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
247 if (err)
248 goto out_dput;
249
250 err = ovl_copy_xattr(upper, opaquedir);
251 if (err)
252 goto out_cleanup;
253
254 err = ovl_set_opaque(opaquedir);
255 if (err)
256 goto out_cleanup;
257
258 mutex_lock(&opaquedir->d_inode->i_mutex);
259 err = ovl_set_attr(opaquedir, &stat);
260 mutex_unlock(&opaquedir->d_inode->i_mutex);
261 if (err)
262 goto out_cleanup;
263
264 err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
265 if (err)
266 goto out_cleanup;
267
268 ovl_cleanup_whiteouts(upper, list);
269 ovl_cleanup(wdir, upper);
270 unlock_rename(workdir, upperdir);
271
272 /* dentry's upper doesn't match now, get rid of it */
273 d_drop(dentry);
274
275 return opaquedir;
276
277out_cleanup:
278 ovl_cleanup(wdir, opaquedir);
279out_dput:
280 dput(opaquedir);
281out_unlock:
282 unlock_rename(workdir, upperdir);
283out:
284 return ERR_PTR(err);
285}
286
287static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry,
288 enum ovl_path_type type)
289{
290 int err;
291 struct dentry *ret = NULL;
292 LIST_HEAD(list);
293
294 err = ovl_check_empty_dir(dentry, &list);
295 if (err)
296 ret = ERR_PTR(err);
297 else if (type == OVL_PATH_MERGE)
298 ret = ovl_clear_empty(dentry, &list);
299
300 ovl_cache_free(&list);
301
302 return ret;
303}
304
305static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
306 struct kstat *stat, const char *link,
307 struct dentry *hardlink)
308{
309 struct dentry *workdir = ovl_workdir(dentry);
310 struct inode *wdir = workdir->d_inode;
311 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
312 struct inode *udir = upperdir->d_inode;
313 struct dentry *upper;
314 struct dentry *newdentry;
315 int err;
316
317 err = ovl_lock_rename_workdir(workdir, upperdir);
318 if (err)
319 goto out;
320
321 newdentry = ovl_lookup_temp(workdir, dentry);
322 err = PTR_ERR(newdentry);
323 if (IS_ERR(newdentry))
324 goto out_unlock;
325
326 upper = lookup_one_len(dentry->d_name.name, upperdir,
327 dentry->d_name.len);
328 err = PTR_ERR(upper);
329 if (IS_ERR(upper))
330 goto out_dput;
331
332 err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
333 if (err)
334 goto out_dput2;
335
336 if (S_ISDIR(stat->mode)) {
337 err = ovl_set_opaque(newdentry);
338 if (err)
339 goto out_cleanup;
340
341 err = ovl_do_rename(wdir, newdentry, udir, upper,
342 RENAME_EXCHANGE);
343 if (err)
344 goto out_cleanup;
345
346 ovl_cleanup(wdir, upper);
347 } else {
348 err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
349 if (err)
350 goto out_cleanup;
351 }
352 ovl_dentry_version_inc(dentry->d_parent);
353 ovl_dentry_update(dentry, newdentry);
354 ovl_copyattr(newdentry->d_inode, inode);
355 d_instantiate(dentry, inode);
356 newdentry = NULL;
357out_dput2:
358 dput(upper);
359out_dput:
360 dput(newdentry);
361out_unlock:
362 unlock_rename(workdir, upperdir);
363out:
364 return err;
365
366out_cleanup:
367 ovl_cleanup(wdir, newdentry);
368 goto out_dput2;
369}
370
371static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
372 const char *link, struct dentry *hardlink)
373{
374 int err;
375 struct inode *inode;
376 struct kstat stat = {
377 .mode = mode,
378 .rdev = rdev,
379 };
380
381 err = -ENOMEM;
382 inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
383 if (!inode)
384 goto out;
385
386 err = ovl_copy_up(dentry->d_parent);
387 if (err)
388 goto out_iput;
389
390 if (!ovl_dentry_is_opaque(dentry)) {
391 err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
392 } else {
393 const struct cred *old_cred;
394 struct cred *override_cred;
395
396 err = -ENOMEM;
397 override_cred = prepare_creds();
398 if (!override_cred)
399 goto out_iput;
400
401 /*
402 * CAP_SYS_ADMIN for setting opaque xattr
403 * CAP_DAC_OVERRIDE for create in workdir, rename
404 * CAP_FOWNER for removing whiteout from sticky dir
405 */
406 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
407 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
408 cap_raise(override_cred->cap_effective, CAP_FOWNER);
409 old_cred = override_creds(override_cred);
410
411 err = ovl_create_over_whiteout(dentry, inode, &stat, link,
412 hardlink);
413
414 revert_creds(old_cred);
415 put_cred(override_cred);
416 }
417
418 if (!err)
419 inode = NULL;
420out_iput:
421 iput(inode);
422out:
423 return err;
424}
425
426static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
427 const char *link)
428{
429 int err;
430
431 err = ovl_want_write(dentry);
432 if (!err) {
433 err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
434 ovl_drop_write(dentry);
435 }
436
437 return err;
438}
439
440static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
441 bool excl)
442{
443 return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
444}
445
446static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
447{
448 return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
449}
450
451static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
452 dev_t rdev)
453{
454 /* Don't allow creation of "whiteout" on overlay */
455 if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
456 return -EPERM;
457
458 return ovl_create_object(dentry, mode, rdev, NULL);
459}
460
461static int ovl_symlink(struct inode *dir, struct dentry *dentry,
462 const char *link)
463{
464 return ovl_create_object(dentry, S_IFLNK, 0, link);
465}
466
467static int ovl_link(struct dentry *old, struct inode *newdir,
468 struct dentry *new)
469{
470 int err;
471 struct dentry *upper;
472
473 err = ovl_want_write(old);
474 if (err)
475 goto out;
476
477 err = ovl_copy_up(old);
478 if (err)
479 goto out_drop_write;
480
481 upper = ovl_dentry_upper(old);
482 err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
483
484out_drop_write:
485 ovl_drop_write(old);
486out:
487 return err;
488}
489
490static int ovl_remove_and_whiteout(struct dentry *dentry,
491 enum ovl_path_type type, bool is_dir)
492{
493 struct dentry *workdir = ovl_workdir(dentry);
494 struct inode *wdir = workdir->d_inode;
495 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
496 struct inode *udir = upperdir->d_inode;
497 struct dentry *whiteout;
498 struct dentry *upper;
499 struct dentry *opaquedir = NULL;
500 int err;
501
502 if (is_dir) {
503 opaquedir = ovl_check_empty_and_clear(dentry, type);
504 err = PTR_ERR(opaquedir);
505 if (IS_ERR(opaquedir))
506 goto out;
507 }
508
509 err = ovl_lock_rename_workdir(workdir, upperdir);
510 if (err)
511 goto out_dput;
512
513 whiteout = ovl_whiteout(workdir, dentry);
514 err = PTR_ERR(whiteout);
515 if (IS_ERR(whiteout))
516 goto out_unlock;
517
518 if (type == OVL_PATH_LOWER) {
519 upper = lookup_one_len(dentry->d_name.name, upperdir,
520 dentry->d_name.len);
521 err = PTR_ERR(upper);
522 if (IS_ERR(upper))
523 goto kill_whiteout;
524
525 err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
526 dput(upper);
527 if (err)
528 goto kill_whiteout;
529 } else {
530 int flags = 0;
531
532 upper = ovl_dentry_upper(dentry);
533 if (opaquedir)
534 upper = opaquedir;
535 err = -ESTALE;
536 if (upper->d_parent != upperdir)
537 goto kill_whiteout;
538
539 if (is_dir)
540 flags |= RENAME_EXCHANGE;
541
542 err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
543 if (err)
544 goto kill_whiteout;
545
546 if (is_dir)
547 ovl_cleanup(wdir, upper);
548 }
549 ovl_dentry_version_inc(dentry->d_parent);
550out_d_drop:
551 d_drop(dentry);
552 dput(whiteout);
553out_unlock:
554 unlock_rename(workdir, upperdir);
555out_dput:
556 dput(opaquedir);
557out:
558 return err;
559
560kill_whiteout:
561 ovl_cleanup(wdir, whiteout);
562 goto out_d_drop;
563}
564
565static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
566{
567 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
568 struct inode *dir = upperdir->d_inode;
569 struct dentry *upper = ovl_dentry_upper(dentry);
570 int err;
571
572 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
573 err = -ESTALE;
574 if (upper->d_parent == upperdir) {
575 /* Don't let d_delete() think it can reset d_inode */
576 dget(upper);
577 if (is_dir)
578 err = vfs_rmdir(dir, upper);
579 else
580 err = vfs_unlink(dir, upper, NULL);
581 dput(upper);
582 ovl_dentry_version_inc(dentry->d_parent);
583 }
584
585 /*
586 * Keeping this dentry hashed would mean having to release
587 * upperpath/lowerpath, which could only be done if we are the
588 * sole user of this dentry. Too tricky... Just unhash for
589 * now.
590 */
591 d_drop(dentry);
592 mutex_unlock(&dir->i_mutex);
593
594 return err;
595}
596
597static inline int ovl_check_sticky(struct dentry *dentry)
598{
599 struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
600 struct inode *inode = ovl_dentry_real(dentry)->d_inode;
601
602 if (check_sticky(dir, inode))
603 return -EPERM;
604
605 return 0;
606}
607
608static int ovl_do_remove(struct dentry *dentry, bool is_dir)
609{
610 enum ovl_path_type type;
611 int err;
612
613 err = ovl_check_sticky(dentry);
614 if (err)
615 goto out;
616
617 err = ovl_want_write(dentry);
618 if (err)
619 goto out;
620
621 err = ovl_copy_up(dentry->d_parent);
622 if (err)
623 goto out_drop_write;
624
625 type = ovl_path_type(dentry);
626 if (type == OVL_PATH_PURE_UPPER) {
627 err = ovl_remove_upper(dentry, is_dir);
628 } else {
629 const struct cred *old_cred;
630 struct cred *override_cred;
631
632 err = -ENOMEM;
633 override_cred = prepare_creds();
634 if (!override_cred)
635 goto out_drop_write;
636
637 /*
638 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
639 * CAP_DAC_OVERRIDE for create in workdir, rename
640 * CAP_FOWNER for removing whiteout from sticky dir
641 * CAP_FSETID for chmod of opaque dir
642 * CAP_CHOWN for chown of opaque dir
643 */
644 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
645 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
646 cap_raise(override_cred->cap_effective, CAP_FOWNER);
647 cap_raise(override_cred->cap_effective, CAP_FSETID);
648 cap_raise(override_cred->cap_effective, CAP_CHOWN);
649 old_cred = override_creds(override_cred);
650
651 err = ovl_remove_and_whiteout(dentry, type, is_dir);
652
653 revert_creds(old_cred);
654 put_cred(override_cred);
655 }
656out_drop_write:
657 ovl_drop_write(dentry);
658out:
659 return err;
660}
661
662static int ovl_unlink(struct inode *dir, struct dentry *dentry)
663{
664 return ovl_do_remove(dentry, false);
665}
666
667static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
668{
669 return ovl_do_remove(dentry, true);
670}
671
672static int ovl_rename2(struct inode *olddir, struct dentry *old,
673 struct inode *newdir, struct dentry *new,
674 unsigned int flags)
675{
676 int err;
677 enum ovl_path_type old_type;
678 enum ovl_path_type new_type;
679 struct dentry *old_upperdir;
680 struct dentry *new_upperdir;
681 struct dentry *olddentry;
682 struct dentry *newdentry;
683 struct dentry *trap;
684 bool old_opaque;
685 bool new_opaque;
686 bool new_create = false;
687 bool cleanup_whiteout = false;
688 bool overwrite = !(flags & RENAME_EXCHANGE);
689 bool is_dir = S_ISDIR(old->d_inode->i_mode);
690 bool new_is_dir = false;
691 struct dentry *opaquedir = NULL;
692 const struct cred *old_cred = NULL;
693 struct cred *override_cred = NULL;
694
695 err = -EINVAL;
696 if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
697 goto out;
698
699 flags &= ~RENAME_NOREPLACE;
700
701 err = ovl_check_sticky(old);
702 if (err)
703 goto out;
704
705 /* Don't copy up directory trees */
706 old_type = ovl_path_type(old);
707 err = -EXDEV;
708 if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
709 goto out;
710
711 if (new->d_inode) {
712 err = ovl_check_sticky(new);
713 if (err)
714 goto out;
715
716 if (S_ISDIR(new->d_inode->i_mode))
717 new_is_dir = true;
718
719 new_type = ovl_path_type(new);
720 err = -EXDEV;
721 if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
722 goto out;
723
724 err = 0;
725 if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
726 if (ovl_dentry_lower(old)->d_inode ==
727 ovl_dentry_lower(new)->d_inode)
728 goto out;
729 }
730 if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
731 if (ovl_dentry_upper(old)->d_inode ==
732 ovl_dentry_upper(new)->d_inode)
733 goto out;
734 }
735 } else {
736 if (ovl_dentry_is_opaque(new))
737 new_type = OVL_PATH_UPPER;
738 else
739 new_type = OVL_PATH_PURE_UPPER;
740 }
741
742 err = ovl_want_write(old);
743 if (err)
744 goto out;
745
746 err = ovl_copy_up(old);
747 if (err)
748 goto out_drop_write;
749
750 err = ovl_copy_up(new->d_parent);
751 if (err)
752 goto out_drop_write;
753 if (!overwrite) {
754 err = ovl_copy_up(new);
755 if (err)
756 goto out_drop_write;
757 }
758
759 old_opaque = old_type != OVL_PATH_PURE_UPPER;
760 new_opaque = new_type != OVL_PATH_PURE_UPPER;
761
762 if (old_opaque || new_opaque) {
763 err = -ENOMEM;
764 override_cred = prepare_creds();
765 if (!override_cred)
766 goto out_drop_write;
767
768 /*
769 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
770 * CAP_DAC_OVERRIDE for create in workdir
771 * CAP_FOWNER for removing whiteout from sticky dir
772 * CAP_FSETID for chmod of opaque dir
773 * CAP_CHOWN for chown of opaque dir
774 */
775 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
776 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
777 cap_raise(override_cred->cap_effective, CAP_FOWNER);
778 cap_raise(override_cred->cap_effective, CAP_FSETID);
779 cap_raise(override_cred->cap_effective, CAP_CHOWN);
780 old_cred = override_creds(override_cred);
781 }
782
783 if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
784 opaquedir = ovl_check_empty_and_clear(new, new_type);
785 err = PTR_ERR(opaquedir);
786 if (IS_ERR(opaquedir)) {
787 opaquedir = NULL;
788 goto out_revert_creds;
789 }
790 }
791
792 if (overwrite) {
793 if (old_opaque) {
794 if (new->d_inode || !new_opaque) {
795 /* Whiteout source */
796 flags |= RENAME_WHITEOUT;
797 } else {
798 /* Switch whiteouts */
799 flags |= RENAME_EXCHANGE;
800 }
801 } else if (is_dir && !new->d_inode && new_opaque) {
802 flags |= RENAME_EXCHANGE;
803 cleanup_whiteout = true;
804 }
805 }
806
807 old_upperdir = ovl_dentry_upper(old->d_parent);
808 new_upperdir = ovl_dentry_upper(new->d_parent);
809
810 trap = lock_rename(new_upperdir, old_upperdir);
811
812 olddentry = ovl_dentry_upper(old);
813 newdentry = ovl_dentry_upper(new);
814 if (newdentry) {
815 if (opaquedir) {
816 newdentry = opaquedir;
817 opaquedir = NULL;
818 } else {
819 dget(newdentry);
820 }
821 } else {
822 new_create = true;
823 newdentry = lookup_one_len(new->d_name.name, new_upperdir,
824 new->d_name.len);
825 err = PTR_ERR(newdentry);
826 if (IS_ERR(newdentry))
827 goto out_unlock;
828 }
829
830 err = -ESTALE;
831 if (olddentry->d_parent != old_upperdir)
832 goto out_dput;
833 if (newdentry->d_parent != new_upperdir)
834 goto out_dput;
835 if (olddentry == trap)
836 goto out_dput;
837 if (newdentry == trap)
838 goto out_dput;
839
840 if (is_dir && !old_opaque && new_opaque) {
841 err = ovl_set_opaque(olddentry);
842 if (err)
843 goto out_dput;
844 }
845 if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
846 err = ovl_set_opaque(newdentry);
847 if (err)
848 goto out_dput;
849 }
850
851 if (old_opaque || new_opaque) {
852 err = ovl_do_rename(old_upperdir->d_inode, olddentry,
853 new_upperdir->d_inode, newdentry,
854 flags);
855 } else {
856 /* No debug for the plain case */
857 BUG_ON(flags & ~RENAME_EXCHANGE);
858 err = vfs_rename(old_upperdir->d_inode, olddentry,
859 new_upperdir->d_inode, newdentry,
860 NULL, flags);
861 }
862
863 if (err) {
864 if (is_dir && !old_opaque && new_opaque)
865 ovl_remove_opaque(olddentry);
866 if (!overwrite && new_is_dir && old_opaque && !new_opaque)
867 ovl_remove_opaque(newdentry);
868 goto out_dput;
869 }
870
871 if (is_dir && old_opaque && !new_opaque)
872 ovl_remove_opaque(olddentry);
873 if (!overwrite && new_is_dir && !old_opaque && new_opaque)
874 ovl_remove_opaque(newdentry);
875
876 if (old_opaque != new_opaque) {
877 ovl_dentry_set_opaque(old, new_opaque);
878 if (!overwrite)
879 ovl_dentry_set_opaque(new, old_opaque);
880 }
881
882 if (cleanup_whiteout)
883 ovl_cleanup(old_upperdir->d_inode, newdentry);
884
885 ovl_dentry_version_inc(old->d_parent);
886 ovl_dentry_version_inc(new->d_parent);
887
888out_dput:
889 dput(newdentry);
890out_unlock:
891 unlock_rename(new_upperdir, old_upperdir);
892out_revert_creds:
893 if (old_opaque || new_opaque) {
894 revert_creds(old_cred);
895 put_cred(override_cred);
896 }
897out_drop_write:
898 ovl_drop_write(old);
899out:
900 dput(opaquedir);
901 return err;
902}
903
/*
 * Inode operations for overlay directories; ovl_new_inode() installs this
 * table on S_IFDIR inodes.
 */
const struct inode_operations ovl_dir_inode_operations = {
	.lookup		= ovl_lookup,
	.mkdir		= ovl_mkdir,
	.symlink	= ovl_symlink,
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,
	.link		= ovl_link,
	.setattr	= ovl_setattr,
	.create		= ovl_create,
	.mknod		= ovl_mknod,
	.permission	= ovl_permission,
	.getattr	= ovl_dir_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 000000000000..af2d18c9fcee
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,425 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/slab.h>
12#include <linux/xattr.h>
13#include "overlayfs.h"
14
15static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
16 bool no_data)
17{
18 int err;
19 struct dentry *parent;
20 struct kstat stat;
21 struct path lowerpath;
22
23 parent = dget_parent(dentry);
24 err = ovl_copy_up(parent);
25 if (err)
26 goto out_dput_parent;
27
28 ovl_path_lower(dentry, &lowerpath);
29 err = vfs_getattr(&lowerpath, &stat);
30 if (err)
31 goto out_dput_parent;
32
33 if (no_data)
34 stat.size = 0;
35
36 err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
37
38out_dput_parent:
39 dput(parent);
40 return err;
41}
42
43int ovl_setattr(struct dentry *dentry, struct iattr *attr)
44{
45 int err;
46 struct dentry *upperdentry;
47
48 err = ovl_want_write(dentry);
49 if (err)
50 goto out;
51
52 upperdentry = ovl_dentry_upper(dentry);
53 if (upperdentry) {
54 mutex_lock(&upperdentry->d_inode->i_mutex);
55 err = notify_change(upperdentry, attr, NULL);
56 mutex_unlock(&upperdentry->d_inode->i_mutex);
57 } else {
58 err = ovl_copy_up_last(dentry, attr, false);
59 }
60 ovl_drop_write(dentry);
61out:
62 return err;
63}
64
65static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
66 struct kstat *stat)
67{
68 struct path realpath;
69
70 ovl_path_real(dentry, &realpath);
71 return vfs_getattr(&realpath, stat);
72}
73
/*
 * Permission check for an overlay inode, delegated to the real inode.
 *
 * Directories carry their ovl_entry in inode->i_private.  For other file
 * types the entry is taken from an alias dentry, which cannot be done when
 * MAY_NOT_BLOCK is set, so -ECHILD is returned in that case.
 *
 * Returns the result of __inode_permission() on the real inode, -EROFS for
 * a write to an upper object on a read-only fs (see below), or -ENOENT when
 * no alias or no real inode is found.
 */
int ovl_permission(struct inode *inode, int mask)
{
	struct ovl_entry *oe;
	struct dentry *alias = NULL;
	struct inode *realinode;
	struct dentry *realdentry;
	bool is_upper;
	int err;

	if (S_ISDIR(inode->i_mode)) {
		oe = inode->i_private;
	} else if (mask & MAY_NOT_BLOCK) {
		return -ECHILD;
	} else {
		/*
		 * For non-directories find an alias and get the info
		 * from there.
		 */
		alias = d_find_any_alias(inode);
		if (WARN_ON(!alias))
			return -ENOENT;

		oe = alias->d_fsdata;
	}

	realdentry = ovl_entry_real(oe, &is_upper);

	/* Careful in RCU walk mode */
	realinode = ACCESS_ONCE(realdentry->d_inode);
	if (!realinode) {
		/* A missing real inode is only expected with MAY_NOT_BLOCK */
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		err = -ENOENT;
		goto out_dput;
	}

	if (mask & MAY_WRITE) {
		umode_t mode = realinode->i_mode;

		/*
		 * Writes will always be redirected to upper layer, so
		 * ignore lower layer being read-only.
		 *
		 * If the overlay itself is read-only then proceed
		 * with the permission check, don't return EROFS.
		 * This will only happen if this is the lower layer of
		 * another overlayfs.
		 *
		 * If upper fs becomes read-only after the overlay was
		 * constructed return EROFS to prevent modification of
		 * upper layer.
		 */
		err = -EROFS;
		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			goto out_dput;
	}

	err = __inode_permission(realinode, mask);
out_dput:
	/* alias is NULL on the directory path */
	dput(alias);
	return err;
}
136
137
/*
 * State handed from ovl_follow_link() to ovl_put_link() so the real
 * filesystem's ->put_link() can be called with its own arguments.
 */
struct ovl_link_data {
	struct dentry *realdentry;	/* dentry ->follow_link() was called on */
	void *cookie;			/* value returned by the real ->follow_link() */
};
142
143static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
144{
145 void *ret;
146 struct dentry *realdentry;
147 struct inode *realinode;
148
149 realdentry = ovl_dentry_real(dentry);
150 realinode = realdentry->d_inode;
151
152 if (WARN_ON(!realinode->i_op->follow_link))
153 return ERR_PTR(-EPERM);
154
155 ret = realinode->i_op->follow_link(realdentry, nd);
156 if (IS_ERR(ret))
157 return ret;
158
159 if (realinode->i_op->put_link) {
160 struct ovl_link_data *data;
161
162 data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
163 if (!data) {
164 realinode->i_op->put_link(realdentry, nd, ret);
165 return ERR_PTR(-ENOMEM);
166 }
167 data->realdentry = realdentry;
168 data->cookie = ret;
169
170 return data;
171 } else {
172 return NULL;
173 }
174}
175
176static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
177{
178 struct inode *realinode;
179 struct ovl_link_data *data = c;
180
181 if (!data)
182 return;
183
184 realinode = data->realdentry->d_inode;
185 realinode->i_op->put_link(data->realdentry, nd, data->cookie);
186 kfree(data);
187}
188
189static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
190{
191 struct path realpath;
192 struct inode *realinode;
193
194 ovl_path_real(dentry, &realpath);
195 realinode = realpath.dentry->d_inode;
196
197 if (!realinode->i_op->readlink)
198 return -EINVAL;
199
200 touch_atime(&realpath);
201
202 return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
203}
204
205
/*
 * Return true if @name lies in overlayfs's private xattr namespace, i.e.
 * starts with "trusted.overlay.".
 *
 * The whole prefix, including the trailing dot, is compared; the length is
 * derived from the literal so the two cannot drift apart.
 */
static bool ovl_is_private_xattr(const char *name)
{
	static const char prefix[] = "trusted.overlay.";

	return strncmp(name, prefix, sizeof(prefix) - 1) == 0;
}
210
211int ovl_setxattr(struct dentry *dentry, const char *name,
212 const void *value, size_t size, int flags)
213{
214 int err;
215 struct dentry *upperdentry;
216
217 err = ovl_want_write(dentry);
218 if (err)
219 goto out;
220
221 err = -EPERM;
222 if (ovl_is_private_xattr(name))
223 goto out_drop_write;
224
225 err = ovl_copy_up(dentry);
226 if (err)
227 goto out_drop_write;
228
229 upperdentry = ovl_dentry_upper(dentry);
230 err = vfs_setxattr(upperdentry, name, value, size, flags);
231
232out_drop_write:
233 ovl_drop_write(dentry);
234out:
235 return err;
236}
237
238ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
239 void *value, size_t size)
240{
241 if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
242 ovl_is_private_xattr(name))
243 return -ENODATA;
244
245 return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
246}
247
/*
 * List the xattrs of the real dentry.  When the parent is a merged
 * directory, names accepted by ovl_is_private_xattr() are removed from the
 * list in place.
 *
 * Returns the (possibly reduced) list length in bytes.  The value from
 * vfs_listxattr() is returned unchanged when it is <= 0, when @size is 0,
 * or when the parent is not a merged directory.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	ssize_t res;
	int off;

	res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
	if (res <= 0 || size == 0)
		return res;

	if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;	/* name plus its NUL */

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			/* Close the gap; off stays put so the next name is examined */
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}
277
278int ovl_removexattr(struct dentry *dentry, const char *name)
279{
280 int err;
281 struct path realpath;
282 enum ovl_path_type type;
283
284 err = ovl_want_write(dentry);
285 if (err)
286 goto out;
287
288 if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
289 ovl_is_private_xattr(name))
290 goto out_drop_write;
291
292 type = ovl_path_real(dentry, &realpath);
293 if (type == OVL_PATH_LOWER) {
294 err = vfs_getxattr(realpath.dentry, name, NULL, 0);
295 if (err < 0)
296 goto out_drop_write;
297
298 err = ovl_copy_up(dentry);
299 if (err)
300 goto out_drop_write;
301
302 ovl_path_upper(dentry, &realpath);
303 }
304
305 err = vfs_removexattr(realpath.dentry, name);
306out_drop_write:
307 ovl_drop_write(dentry);
308out:
309 return err;
310}
311
312static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
313 struct dentry *realdentry)
314{
315 if (type != OVL_PATH_LOWER)
316 return false;
317
318 if (special_file(realdentry->d_inode->i_mode))
319 return false;
320
321 if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
322 return false;
323
324 return true;
325}
326
327static int ovl_dentry_open(struct dentry *dentry, struct file *file,
328 const struct cred *cred)
329{
330 int err;
331 struct path realpath;
332 enum ovl_path_type type;
333 bool want_write = false;
334
335 type = ovl_path_real(dentry, &realpath);
336 if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
337 want_write = true;
338 err = ovl_want_write(dentry);
339 if (err)
340 goto out;
341
342 if (file->f_flags & O_TRUNC)
343 err = ovl_copy_up_last(dentry, NULL, true);
344 else
345 err = ovl_copy_up(dentry);
346 if (err)
347 goto out_drop_write;
348
349 ovl_path_upper(dentry, &realpath);
350 }
351
352 err = vfs_open(&realpath, file, cred);
353out_drop_write:
354 if (want_write)
355 ovl_drop_write(dentry);
356out:
357 return err;
358}
359
/*
 * Inode operations for non-directory, non-symlink overlay inodes;
 * ovl_new_inode() installs this table for regular files, sockets, block and
 * character devices and FIFOs.
 */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
	.dentry_open	= ovl_dentry_open,
};
370
/* Inode operations for overlay symlinks; installed by ovl_new_inode() for S_IFLNK. */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.follow_link	= ovl_follow_link,
	.put_link	= ovl_put_link,
	.readlink	= ovl_readlink,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
382
/*
 * Allocate an overlay inode on @sb.
 *
 * Only the file type bits of @mode are kept in i_mode.  The inode operations
 * are chosen by file type; directories additionally get @oe stored in
 * i_private and ovl_dir_operations as file operations.
 *
 * Returns the new inode, or NULL if allocation fails or the file type is
 * not one of the handled ones.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return NULL;

	/* Keep the file type only, drop the permission bits */
	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		/* Drop the inode allocated above */
		iput(inode);
		inode = NULL;
	}

	return inode;

}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000000..814bed33dd07
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,191 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11
12struct ovl_entry;
13
/* Classification of an overlay dentry, as computed by ovl_path_type(). */
enum ovl_path_type {
	OVL_PATH_PURE_UPPER,	/* upper dentry only, not marked opaque */
	OVL_PATH_UPPER,		/* upper only and opaque, or upper + lower non-directory */
	OVL_PATH_MERGE,		/* directory with both upper and lower dentries */
	OVL_PATH_LOWER,		/* no upper dentry */
};
20
21extern const char *ovl_opaque_xattr;
22
/* vfs_rmdir() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, ret);

	return ret;
}
29
30static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
31{
32 int err = vfs_unlink(dir, dentry, NULL);
33 pr_debug("unlink(%pd2) = %i\n", dentry, err);
34 return err;
35}
36
37static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
38 struct dentry *new_dentry, bool debug)
39{
40 int err = vfs_link(old_dentry, dir, new_dentry, NULL);
41 if (debug) {
42 pr_debug("link(%pd2, %pd2) = %i\n",
43 old_dentry, new_dentry, err);
44 }
45 return err;
46}
47
48static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
49 umode_t mode, bool debug)
50{
51 int err = vfs_create(dir, dentry, mode, true);
52 if (debug)
53 pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
54 return err;
55}
56
57static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
58 umode_t mode, bool debug)
59{
60 int err = vfs_mkdir(dir, dentry, mode);
61 if (debug)
62 pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
63 return err;
64}
65
66static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
67 umode_t mode, dev_t dev, bool debug)
68{
69 int err = vfs_mknod(dir, dentry, mode, dev);
70 if (debug) {
71 pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
72 dentry, mode, dev, err);
73 }
74 return err;
75}
76
77static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
78 const char *oldname, bool debug)
79{
80 int err = vfs_symlink(dir, dentry, oldname);
81 if (debug)
82 pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
83 return err;
84}
85
86static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
87 const void *value, size_t size, int flags)
88{
89 int err = vfs_setxattr(dentry, name, value, size, flags);
90 pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
91 dentry, name, (int) size, (char *) value, flags, err);
92 return err;
93}
94
/* vfs_removexattr() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int ret;

	ret = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, ret);

	return ret;
}
101
102static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
103 struct inode *newdir, struct dentry *newdentry,
104 unsigned int flags)
105{
106 int err;
107
108 pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
109 olddentry, newdentry, flags);
110
111 err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
112
113 if (err) {
114 pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
115 olddentry, newdentry, err);
116 }
117 return err;
118}
119
/* vfs_whiteout() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, ret);

	return ret;
}
126
127enum ovl_path_type ovl_path_type(struct dentry *dentry);
128u64 ovl_dentry_version_get(struct dentry *dentry);
129void ovl_dentry_version_inc(struct dentry *dentry);
130void ovl_path_upper(struct dentry *dentry, struct path *path);
131void ovl_path_lower(struct dentry *dentry, struct path *path);
132enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
133struct dentry *ovl_dentry_upper(struct dentry *dentry);
134struct dentry *ovl_dentry_lower(struct dentry *dentry);
135struct dentry *ovl_dentry_real(struct dentry *dentry);
136struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
137struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
138void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
139struct dentry *ovl_workdir(struct dentry *dentry);
140int ovl_want_write(struct dentry *dentry);
141void ovl_drop_write(struct dentry *dentry);
142bool ovl_dentry_is_opaque(struct dentry *dentry);
143void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
144bool ovl_is_whiteout(struct dentry *dentry);
145void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
146struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
147 unsigned int flags);
148struct file *ovl_path_open(struct path *path, int flags);
149
150struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
151 struct kstat *stat, const char *link);
152
153/* readdir.c */
154extern const struct file_operations ovl_dir_operations;
155int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
156void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
157void ovl_cache_free(struct list_head *list);
158
159/* inode.c */
160int ovl_setattr(struct dentry *dentry, struct iattr *attr);
161int ovl_permission(struct inode *inode, int mask);
162int ovl_setxattr(struct dentry *dentry, const char *name,
163 const void *value, size_t size, int flags);
164ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
165 void *value, size_t size);
166ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
167int ovl_removexattr(struct dentry *dentry, const char *name);
168
169struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
170 struct ovl_entry *oe);
171static inline void ovl_copyattr(struct inode *from, struct inode *to)
172{
173 to->i_uid = from->i_uid;
174 to->i_gid = from->i_gid;
175}
176
177/* dir.c */
178extern const struct inode_operations ovl_dir_inode_operations;
179struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
180int ovl_create_real(struct inode *dir, struct dentry *newdentry,
181 struct kstat *stat, const char *link,
182 struct dentry *hardlink, bool debug);
183void ovl_cleanup(struct inode *dir, struct dentry *dentry);
184
185/* copy_up.c */
186int ovl_copy_up(struct dentry *dentry);
187int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
188 struct path *lowerpath, struct kstat *stat,
189 struct iattr *attr);
190int ovl_copy_xattr(struct dentry *old, struct dentry *new);
191int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 000000000000..c6787f84ece9
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,587 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/slab.h>
12#include <linux/namei.h>
13#include <linux/file.h>
14#include <linux/xattr.h>
15#include <linux/rbtree.h>
16#include <linux/security.h>
17#include <linux/cred.h>
18#include "overlayfs.h"
19
/*
 * One directory entry in a readdir cache.  ovl_cache_entry_new() allocates
 * the name copy in the same allocation, directly after the struct.
 */
struct ovl_cache_entry {
	const char *name;		/* NUL-terminated name; NULL marks a cursor entry */
	unsigned int len;		/* name length, not counting the NUL */
	unsigned int type;		/* d_type passed to the fill callback */
	u64 ino;			/* inode number passed to the fill callback */
	bool is_whiteout;		/* set by ovl_dir_mark_whiteouts() */
	struct list_head l_node;	/* position in the ordered entry list */
	struct rb_node node;		/* position in the by-name rb-tree */
};
29
/* Cached listing of a directory, shared between open files of that directory. */
struct ovl_dir_cache {
	long refcount;			/* taken in ovl_cache_get(), dropped in ovl_cache_put() */
	u64 version;			/* ovl_dentry_version_get() value at fill time */
	struct list_head entries;	/* list of ovl_cache_entry (and cursors) */
};
35
/* Context handed to iterate_dir() while a directory listing is collected. */
struct ovl_readdir_data {
	struct dir_context ctx;		/* actor is ovl_fill_merge() */
	bool is_merge;			/* false: add via rb-tree; true: ovl_fill_lower() */
	struct rb_root *root;		/* by-name index of the entries collected so far */
	struct list_head *list;		/* output list of entries */
	struct list_head *middle;	/* insertion point used by ovl_fill_lower() */
	int count;			/* entries seen during the current iterate_dir() call */
	int err;			/* error recorded by the fill callback */
};
45
/* Per-open-file state of an overlay directory (file->private_data). */
struct ovl_dir_file {
	bool is_real;			/* iterate realfile directly instead of the cache */
	bool is_upper;			/* path type was not OVL_PATH_LOWER at open time */
	struct ovl_dir_cache *cache;	/* listing in use, or NULL */
	struct ovl_cache_entry cursor;	/* nameless entry marking the read position */
	struct file *realfile;		/* real directory opened in ovl_dir_open() */
	struct file *upperfile;		/* upper directory opened on demand by ovl_dir_fsync() */
};
54
55static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
56{
57 return container_of(n, struct ovl_cache_entry, node);
58}
59
60static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
61 const char *name, int len)
62{
63 struct rb_node *node = root->rb_node;
64 int cmp;
65
66 while (node) {
67 struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
68
69 cmp = strncmp(name, p->name, len);
70 if (cmp > 0)
71 node = p->node.rb_right;
72 else if (cmp < 0 || len < p->len)
73 node = p->node.rb_left;
74 else
75 return p;
76 }
77
78 return NULL;
79}
80
81static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
82 u64 ino, unsigned int d_type)
83{
84 struct ovl_cache_entry *p;
85
86 p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL);
87 if (p) {
88 char *name_copy = (char *) (p + 1);
89 memcpy(name_copy, name, len);
90 name_copy[len] = '\0';
91 p->name = name_copy;
92 p->len = len;
93 p->type = d_type;
94 p->ino = ino;
95 p->is_whiteout = false;
96 }
97
98 return p;
99}
100
/*
 * Add a name to both the by-name tree and the tail of the entry list.
 *
 * The tree walk uses the same ordering as ovl_cache_entry_find().  If an
 * entry with this name is already present nothing is added.
 *
 * Returns 0 on success (including the already-present case) or -ENOMEM.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root->rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	/* Find the link where the new node belongs, or an existing match */
	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;
	}

	p = ovl_cache_entry_new(name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, rdd->root);

	return 0;
}
134
135static int ovl_fill_lower(struct ovl_readdir_data *rdd,
136 const char *name, int namelen,
137 loff_t offset, u64 ino, unsigned int d_type)
138{
139 struct ovl_cache_entry *p;
140
141 p = ovl_cache_entry_find(rdd->root, name, namelen);
142 if (p) {
143 list_move_tail(&p->l_node, rdd->middle);
144 } else {
145 p = ovl_cache_entry_new(name, namelen, ino, d_type);
146 if (p == NULL)
147 rdd->err = -ENOMEM;
148 else
149 list_add_tail(&p->l_node, rdd->middle);
150 }
151
152 return rdd->err;
153}
154
155void ovl_cache_free(struct list_head *list)
156{
157 struct ovl_cache_entry *p;
158 struct ovl_cache_entry *n;
159
160 list_for_each_entry_safe(p, n, list, l_node)
161 kfree(p);
162
163 INIT_LIST_HEAD(list);
164}
165
166static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
167{
168 struct ovl_dir_cache *cache = od->cache;
169
170 list_del(&od->cursor.l_node);
171 WARN_ON(cache->refcount <= 0);
172 cache->refcount--;
173 if (!cache->refcount) {
174 if (ovl_dir_cache(dentry) == cache)
175 ovl_set_dir_cache(dentry, NULL);
176
177 ovl_cache_free(&cache->entries);
178 kfree(cache);
179 }
180}
181
182static int ovl_fill_merge(void *buf, const char *name, int namelen,
183 loff_t offset, u64 ino, unsigned int d_type)
184{
185 struct ovl_readdir_data *rdd = buf;
186
187 rdd->count++;
188 if (!rdd->is_merge)
189 return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
190 else
191 return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
192}
193
194static inline int ovl_dir_read(struct path *realpath,
195 struct ovl_readdir_data *rdd)
196{
197 struct file *realfile;
198 int err;
199
200 realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
201 if (IS_ERR(realfile))
202 return PTR_ERR(realfile);
203
204 rdd->ctx.pos = 0;
205 do {
206 rdd->count = 0;
207 rdd->err = 0;
208 err = iterate_dir(realfile, &rdd->ctx);
209 if (err >= 0)
210 err = rdd->err;
211 } while (!err && rdd->count);
212 fput(realfile);
213
214 return err;
215}
216
217static void ovl_dir_reset(struct file *file)
218{
219 struct ovl_dir_file *od = file->private_data;
220 struct ovl_dir_cache *cache = od->cache;
221 struct dentry *dentry = file->f_path.dentry;
222 enum ovl_path_type type = ovl_path_type(dentry);
223
224 if (cache && ovl_dentry_version_get(dentry) != cache->version) {
225 ovl_cache_put(od, dentry);
226 od->cache = NULL;
227 }
228 WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
229 if (od->is_real && type == OVL_PATH_MERGE)
230 od->is_real = false;
231}
232
/*
 * Set is_whiteout on the collected entries that are whiteouts in @dir.
 *
 * Only named entries of type DT_CHR are looked up; entries whose lookup
 * fails are left unmarked.  The lookups run with CAP_DAC_OVERRIDE raised in
 * temporary credentials and with @dir's i_mutex held.
 *
 * Returns 0, or -ENOMEM if the credentials cannot be prepared, in which case
 * the list collected so far is freed.
 */
static int ovl_dir_mark_whiteouts(struct dentry *dir,
				  struct ovl_readdir_data *rdd)
{
	struct ovl_cache_entry *p;
	struct dentry *dentry;
	const struct cred *old_cred;
	struct cred *override_cred;

	override_cred = prepare_creds();
	if (!override_cred) {
		ovl_cache_free(rdd->list);
		return -ENOMEM;
	}

	/*
	 * CAP_DAC_OVERRIDE for lookup
	 */
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	old_cred = override_creds(override_cred);

	mutex_lock(&dir->d_inode->i_mutex);
	list_for_each_entry(p, rdd->list, l_node) {
		/* Entries without a name are cursors */
		if (!p->name)
			continue;

		if (p->type != DT_CHR)
			continue;

		dentry = lookup_one_len(p->name, dir, p->len);
		if (IS_ERR(dentry))
			continue;

		p->is_whiteout = ovl_is_whiteout(dentry);
		dput(dentry);
	}
	mutex_unlock(&dir->d_inode->i_mutex);

	revert_creds(old_cred);
	put_cred(override_cred);

	return 0;
}
275
276static inline int ovl_dir_read_merged(struct path *upperpath,
277 struct path *lowerpath,
278 struct list_head *list)
279{
280 int err;
281 struct rb_root root = RB_ROOT;
282 struct list_head middle;
283 struct ovl_readdir_data rdd = {
284 .ctx.actor = ovl_fill_merge,
285 .list = list,
286 .root = &root,
287 .is_merge = false,
288 };
289
290 if (upperpath->dentry) {
291 err = ovl_dir_read(upperpath, &rdd);
292 if (err)
293 goto out;
294
295 if (lowerpath->dentry) {
296 err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd);
297 if (err)
298 goto out;
299 }
300 }
301 if (lowerpath->dentry) {
302 /*
303 * Insert lowerpath entries before upperpath ones, this allows
304 * offsets to be reasonably constant
305 */
306 list_add(&middle, rdd.list);
307 rdd.middle = &middle;
308 rdd.is_merge = true;
309 err = ovl_dir_read(lowerpath, &rdd);
310 list_del(&middle);
311 }
312out:
313 return err;
314
315}
316
/*
 * Place @od's cursor so that reading resumes at entry number @pos.
 *
 * Positions count named entries only; nameless entries (cursors) are
 * skipped.  If @pos is beyond the last entry the loop runs off the end and
 * the cursor is placed at the tail of the list.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct ovl_cache_entry *p;
	loff_t off = 0;

	list_for_each_entry(p, &od->cache->entries, l_node) {
		if (!p->name)
			continue;
		if (off >= pos)
			break;
		off++;
	}
	/* Insert the cursor right before the entry the walk stopped at */
	list_move_tail(&od->cursor.l_node, &p->l_node);
}
331
332static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
333{
334 int res;
335 struct path lowerpath;
336 struct path upperpath;
337 struct ovl_dir_cache *cache;
338
339 cache = ovl_dir_cache(dentry);
340 if (cache && ovl_dentry_version_get(dentry) == cache->version) {
341 cache->refcount++;
342 return cache;
343 }
344 ovl_set_dir_cache(dentry, NULL);
345
346 cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
347 if (!cache)
348 return ERR_PTR(-ENOMEM);
349
350 cache->refcount = 1;
351 INIT_LIST_HEAD(&cache->entries);
352
353 ovl_path_lower(dentry, &lowerpath);
354 ovl_path_upper(dentry, &upperpath);
355
356 res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries);
357 if (res) {
358 ovl_cache_free(&cache->entries);
359 kfree(cache);
360 return ERR_PTR(res);
361 }
362
363 cache->version = ovl_dentry_version_get(dentry);
364 ovl_set_dir_cache(dentry, cache);
365
366 return cache;
367}
368
/*
 * ->iterate() for overlay directories.
 *
 * Directories in "real" mode are passed straight to iterate_dir() on the
 * real file.  Otherwise entries are emitted from the cached listing,
 * starting after this file's cursor; whiteouts are not emitted but still
 * advance ctx->pos.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;

	/* Reading from the start: re-validate cache and mode */
	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor.l_node.next != &od->cache->entries) {
		struct ovl_cache_entry *p;

		p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
		/* Skip cursors */
		if (p->name) {
			if (!p->is_whiteout) {
				/* Stop without advancing if the entry was not accepted */
				if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
					break;
			}
			ctx->pos++;
		}
		/* Step the cursor past the entry just handled */
		list_move(&od->cursor.l_node, &p->l_node);
	}
	return 0;
}
407
/*
 * ->llseek() for overlay directories, run under the inode's i_mutex.
 *
 * In "real" mode the seek is forwarded to the real file and its position is
 * mirrored into @file.  In merged mode only SEEK_SET and SEEK_CUR with a
 * non-negative result are accepted; the cursor is moved if a cache is
 * attached.
 *
 * Returns the new position or -EINVAL (or the real file's result).
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	/* At position 0: re-validate cache and mode */
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}
447
/*
 * ->fsync() for overlay directories: sync the real directory file.
 *
 * If the file was not opened on an upper directory but the dentry is now of
 * type OVL_PATH_MERGE, the upper directory is opened (once, cached in
 * od->upperfile under i_mutex) and synced instead.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
		struct inode *inode = file_inode(file);

		mutex_lock(&inode->i_mutex);
		realfile = od->upperfile;
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_path_open(&upperpath, O_RDONLY);
			if (IS_ERR(realfile)) {
				mutex_unlock(&inode->i_mutex);
				return PTR_ERR(realfile);
			}
			/* Kept until ovl_dir_release() */
			od->upperfile = realfile;
		}
		mutex_unlock(&inode->i_mutex);
	}

	return vfs_fsync_range(realfile, start, end, datasync);
}
479
480static int ovl_dir_release(struct inode *inode, struct file *file)
481{
482 struct ovl_dir_file *od = file->private_data;
483
484 if (od->cache) {
485 mutex_lock(&inode->i_mutex);
486 ovl_cache_put(od, file->f_path.dentry);
487 mutex_unlock(&inode->i_mutex);
488 }
489 fput(od->realfile);
490 if (od->upperfile)
491 fput(od->upperfile);
492 kfree(od);
493
494 return 0;
495}
496
497static int ovl_dir_open(struct inode *inode, struct file *file)
498{
499 struct path realpath;
500 struct file *realfile;
501 struct ovl_dir_file *od;
502 enum ovl_path_type type;
503
504 od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
505 if (!od)
506 return -ENOMEM;
507
508 type = ovl_path_real(file->f_path.dentry, &realpath);
509 realfile = ovl_path_open(&realpath, file->f_flags);
510 if (IS_ERR(realfile)) {
511 kfree(od);
512 return PTR_ERR(realfile);
513 }
514 INIT_LIST_HEAD(&od->cursor.l_node);
515 od->realfile = realfile;
516 od->is_real = (type != OVL_PATH_MERGE);
517 od->is_upper = (type != OVL_PATH_LOWER);
518 file->private_data = od;
519
520 return 0;
521}
522
/* File operations for overlay directories; installed by ovl_new_inode() for S_IFDIR. */
const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.iterate	= ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};
531
532int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
533{
534 int err;
535 struct path lowerpath;
536 struct path upperpath;
537 struct ovl_cache_entry *p;
538
539 ovl_path_upper(dentry, &upperpath);
540 ovl_path_lower(dentry, &lowerpath);
541
542 err = ovl_dir_read_merged(&upperpath, &lowerpath, list);
543 if (err)
544 return err;
545
546 err = 0;
547
548 list_for_each_entry(p, list, l_node) {
549 if (p->is_whiteout)
550 continue;
551
552 if (p->name[0] == '.') {
553 if (p->len == 1)
554 continue;
555 if (p->len == 2 && p->name[1] == '.')
556 continue;
557 }
558 err = -ENOTEMPTY;
559 break;
560 }
561
562 return err;
563}
564
565void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
566{
567 struct ovl_cache_entry *p;
568
569 mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT);
570 list_for_each_entry(p, list, l_node) {
571 struct dentry *dentry;
572
573 if (!p->is_whiteout)
574 continue;
575
576 dentry = lookup_one_len(p->name, upper, p->len);
577 if (IS_ERR(dentry)) {
578 pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
579 upper->d_name.name, p->len, p->name,
580 (int) PTR_ERR(dentry));
581 continue;
582 }
583 ovl_cleanup(upper->d_inode, dentry);
584 dput(dentry);
585 }
586 mutex_unlock(&upper->d_inode->i_mutex);
587}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 000000000000..08b704cebfc4
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,796 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/namei.h>
12#include <linux/xattr.h>
13#include <linux/security.h>
14#include <linux/mount.h>
15#include <linux/slab.h>
16#include <linux/parser.h>
17#include <linux/module.h>
18#include <linux/sched.h>
19#include <linux/statfs.h>
20#include <linux/seq_file.h>
21#include "overlayfs.h"
22
23MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
24MODULE_DESCRIPTION("Overlay filesystem");
25MODULE_LICENSE("GPL");
26
27#define OVERLAYFS_SUPER_MAGIC 0x794c764f
28
/*
 * Mount options as given by the user.  Each string is allocated by
 * ovl_parse_opt() (via match_strdup()) and stays NULL if the option was not
 * supplied.
 */
struct ovl_config {
	char *lowerdir;		/* value of "lowerdir=" */
	char *upperdir;		/* value of "upperdir=" */
	char *workdir;		/* value of "workdir=" */
};
34
35/* private information held for overlayfs's superblock */
36struct ovl_fs {
37 struct vfsmount *upper_mnt;
38 struct vfsmount *lower_mnt;
39 struct dentry *workdir;
40 long lower_namelen;
41 /* pathnames of lower and upper dirs, for show_options */
42 struct ovl_config config;
43};
44
45struct ovl_dir_cache;
46
47/* private information held for every overlayfs dentry */
48struct ovl_entry {
49 struct dentry *__upperdentry;
50 struct dentry *lowerdentry;
51 struct ovl_dir_cache *cache;
52 union {
53 struct {
54 u64 version;
55 bool opaque;
56 };
57 struct rcu_head rcu;
58 };
59};
60
/* xattr whose value "y" marks an upper directory as opaque (see ovl_is_opaquedir()) */
const char *ovl_opaque_xattr = "trusted.overlay.opaque";
62
63
64enum ovl_path_type ovl_path_type(struct dentry *dentry)
65{
66 struct ovl_entry *oe = dentry->d_fsdata;
67
68 if (oe->__upperdentry) {
69 if (oe->lowerdentry) {
70 if (S_ISDIR(dentry->d_inode->i_mode))
71 return OVL_PATH_MERGE;
72 else
73 return OVL_PATH_UPPER;
74 } else {
75 if (oe->opaque)
76 return OVL_PATH_UPPER;
77 else
78 return OVL_PATH_PURE_UPPER;
79 }
80 } else {
81 return OVL_PATH_LOWER;
82 }
83}
84
85static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
86{
87 struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
88 /*
89 * Make sure to order reads to upperdentry wrt ovl_dentry_update()
90 */
91 smp_read_barrier_depends();
92 return upperdentry;
93}
94
95void ovl_path_upper(struct dentry *dentry, struct path *path)
96{
97 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
98 struct ovl_entry *oe = dentry->d_fsdata;
99
100 path->mnt = ofs->upper_mnt;
101 path->dentry = ovl_upperdentry_dereference(oe);
102}
103
104enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
105{
106
107 enum ovl_path_type type = ovl_path_type(dentry);
108
109 if (type == OVL_PATH_LOWER)
110 ovl_path_lower(dentry, path);
111 else
112 ovl_path_upper(dentry, path);
113
114 return type;
115}
116
117struct dentry *ovl_dentry_upper(struct dentry *dentry)
118{
119 struct ovl_entry *oe = dentry->d_fsdata;
120
121 return ovl_upperdentry_dereference(oe);
122}
123
124struct dentry *ovl_dentry_lower(struct dentry *dentry)
125{
126 struct ovl_entry *oe = dentry->d_fsdata;
127
128 return oe->lowerdentry;
129}
130
131struct dentry *ovl_dentry_real(struct dentry *dentry)
132{
133 struct ovl_entry *oe = dentry->d_fsdata;
134 struct dentry *realdentry;
135
136 realdentry = ovl_upperdentry_dereference(oe);
137 if (!realdentry)
138 realdentry = oe->lowerdentry;
139
140 return realdentry;
141}
142
143struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
144{
145 struct dentry *realdentry;
146
147 realdentry = ovl_upperdentry_dereference(oe);
148 if (realdentry) {
149 *is_upper = true;
150 } else {
151 realdentry = oe->lowerdentry;
152 *is_upper = false;
153 }
154 return realdentry;
155}
156
157struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
158{
159 struct ovl_entry *oe = dentry->d_fsdata;
160
161 return oe->cache;
162}
163
164void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
165{
166 struct ovl_entry *oe = dentry->d_fsdata;
167
168 oe->cache = cache;
169}
170
171void ovl_path_lower(struct dentry *dentry, struct path *path)
172{
173 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
174 struct ovl_entry *oe = dentry->d_fsdata;
175
176 path->mnt = ofs->lower_mnt;
177 path->dentry = oe->lowerdentry;
178}
179
180int ovl_want_write(struct dentry *dentry)
181{
182 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
183 return mnt_want_write(ofs->upper_mnt);
184}
185
186void ovl_drop_write(struct dentry *dentry)
187{
188 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
189 mnt_drop_write(ofs->upper_mnt);
190}
191
192struct dentry *ovl_workdir(struct dentry *dentry)
193{
194 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
195 return ofs->workdir;
196}
197
198bool ovl_dentry_is_opaque(struct dentry *dentry)
199{
200 struct ovl_entry *oe = dentry->d_fsdata;
201 return oe->opaque;
202}
203
204void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
205{
206 struct ovl_entry *oe = dentry->d_fsdata;
207 oe->opaque = opaque;
208}
209
210void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
211{
212 struct ovl_entry *oe = dentry->d_fsdata;
213
214 WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
215 WARN_ON(oe->__upperdentry);
216 BUG_ON(!upperdentry->d_inode);
217 /*
218 * Make sure upperdentry is consistent before making it visible to
219 * ovl_upperdentry_dereference().
220 */
221 smp_wmb();
222 oe->__upperdentry = upperdentry;
223}
224
225void ovl_dentry_version_inc(struct dentry *dentry)
226{
227 struct ovl_entry *oe = dentry->d_fsdata;
228
229 WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
230 oe->version++;
231}
232
233u64 ovl_dentry_version_get(struct dentry *dentry)
234{
235 struct ovl_entry *oe = dentry->d_fsdata;
236
237 WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
238 return oe->version;
239}
240
241bool ovl_is_whiteout(struct dentry *dentry)
242{
243 struct inode *inode = dentry->d_inode;
244
245 return inode && IS_WHITEOUT(inode);
246}
247
248static bool ovl_is_opaquedir(struct dentry *dentry)
249{
250 int res;
251 char val;
252 struct inode *inode = dentry->d_inode;
253
254 if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
255 return false;
256
257 res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
258 if (res == 1 && val == 'y')
259 return true;
260
261 return false;
262}
263
264static void ovl_dentry_release(struct dentry *dentry)
265{
266 struct ovl_entry *oe = dentry->d_fsdata;
267
268 if (oe) {
269 dput(oe->__upperdentry);
270 dput(oe->lowerdentry);
271 kfree_rcu(oe, rcu);
272 }
273}
274
275static const struct dentry_operations ovl_dentry_operations = {
276 .d_release = ovl_dentry_release,
277};
278
279static struct ovl_entry *ovl_alloc_entry(void)
280{
281 return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
282}
283
284static inline struct dentry *ovl_lookup_real(struct dentry *dir,
285 struct qstr *name)
286{
287 struct dentry *dentry;
288
289 mutex_lock(&dir->d_inode->i_mutex);
290 dentry = lookup_one_len(name->name, dir, name->len);
291 mutex_unlock(&dir->d_inode->i_mutex);
292
293 if (IS_ERR(dentry)) {
294 if (PTR_ERR(dentry) == -ENOENT)
295 dentry = NULL;
296 } else if (!dentry->d_inode) {
297 dput(dentry);
298 dentry = NULL;
299 }
300 return dentry;
301}
302
303struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
304 unsigned int flags)
305{
306 struct ovl_entry *oe;
307 struct dentry *upperdir;
308 struct dentry *lowerdir;
309 struct dentry *upperdentry = NULL;
310 struct dentry *lowerdentry = NULL;
311 struct inode *inode = NULL;
312 int err;
313
314 err = -ENOMEM;
315 oe = ovl_alloc_entry();
316 if (!oe)
317 goto out;
318
319 upperdir = ovl_dentry_upper(dentry->d_parent);
320 lowerdir = ovl_dentry_lower(dentry->d_parent);
321
322 if (upperdir) {
323 upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
324 err = PTR_ERR(upperdentry);
325 if (IS_ERR(upperdentry))
326 goto out_put_dir;
327
328 if (lowerdir && upperdentry) {
329 if (ovl_is_whiteout(upperdentry)) {
330 dput(upperdentry);
331 upperdentry = NULL;
332 oe->opaque = true;
333 } else if (ovl_is_opaquedir(upperdentry)) {
334 oe->opaque = true;
335 }
336 }
337 }
338 if (lowerdir && !oe->opaque) {
339 lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
340 err = PTR_ERR(lowerdentry);
341 if (IS_ERR(lowerdentry))
342 goto out_dput_upper;
343 }
344
345 if (lowerdentry && upperdentry &&
346 (!S_ISDIR(upperdentry->d_inode->i_mode) ||
347 !S_ISDIR(lowerdentry->d_inode->i_mode))) {
348 dput(lowerdentry);
349 lowerdentry = NULL;
350 oe->opaque = true;
351 }
352
353 if (lowerdentry || upperdentry) {
354 struct dentry *realdentry;
355
356 realdentry = upperdentry ? upperdentry : lowerdentry;
357 err = -ENOMEM;
358 inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
359 oe);
360 if (!inode)
361 goto out_dput;
362 ovl_copyattr(realdentry->d_inode, inode);
363 }
364
365 oe->__upperdentry = upperdentry;
366 oe->lowerdentry = lowerdentry;
367
368 dentry->d_fsdata = oe;
369 d_add(dentry, inode);
370
371 return NULL;
372
373out_dput:
374 dput(lowerdentry);
375out_dput_upper:
376 dput(upperdentry);
377out_put_dir:
378 kfree(oe);
379out:
380 return ERR_PTR(err);
381}
382
/*
 * Open the real @path with @flags using the credentials of the current
 * task.  Returns whatever dentry_open() returns.
 */
struct file *ovl_path_open(struct path *path, int flags)
{
	const struct cred *cred = current_cred();

	return dentry_open(path, flags, cred);
}
387
388static void ovl_put_super(struct super_block *sb)
389{
390 struct ovl_fs *ufs = sb->s_fs_info;
391
392 dput(ufs->workdir);
393 mntput(ufs->upper_mnt);
394 mntput(ufs->lower_mnt);
395
396 kfree(ufs->config.lowerdir);
397 kfree(ufs->config.upperdir);
398 kfree(ufs->config.workdir);
399 kfree(ufs);
400}
401
402/**
403 * ovl_statfs
404 * @sb: The overlayfs super block
405 * @buf: The struct kstatfs to fill in with stats
406 *
407 * Get the filesystem statistics. As writes always target the upper layer
408 * filesystem pass the statfs to the same filesystem.
409 */
410static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
411{
412 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
413 struct dentry *root_dentry = dentry->d_sb->s_root;
414 struct path path;
415 int err;
416
417 ovl_path_upper(root_dentry, &path);
418
419 err = vfs_statfs(&path, buf);
420 if (!err) {
421 buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
422 buf->f_type = OVERLAYFS_SUPER_MAGIC;
423 }
424
425 return err;
426}
427
428/**
429 * ovl_show_options
430 *
431 * Prints the mount options for a given superblock.
432 * Returns zero; does not fail.
433 */
434static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
435{
436 struct super_block *sb = dentry->d_sb;
437 struct ovl_fs *ufs = sb->s_fs_info;
438
439 seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
440 seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
441 seq_printf(m, ",workdir=%s", ufs->config.workdir);
442 return 0;
443}
444
445static const struct super_operations ovl_super_operations = {
446 .put_super = ovl_put_super,
447 .statfs = ovl_statfs,
448 .show_options = ovl_show_options,
449};
450
/* Token identifiers returned by match_token() for the table in ovl_tokens. */
enum {
	OPT_LOWERDIR,	/* "lowerdir=<path>" */
	OPT_UPPERDIR,	/* "upperdir=<path>" */
	OPT_WORKDIR,	/* "workdir=<path>" */
	OPT_ERR,	/* table terminator / unrecognised option */
};
457
458static const match_table_t ovl_tokens = {
459 {OPT_LOWERDIR, "lowerdir=%s"},
460 {OPT_UPPERDIR, "upperdir=%s"},
461 {OPT_WORKDIR, "workdir=%s"},
462 {OPT_ERR, NULL}
463};
464
465static int ovl_parse_opt(char *opt, struct ovl_config *config)
466{
467 char *p;
468
469 while ((p = strsep(&opt, ",")) != NULL) {
470 int token;
471 substring_t args[MAX_OPT_ARGS];
472
473 if (!*p)
474 continue;
475
476 token = match_token(p, ovl_tokens, args);
477 switch (token) {
478 case OPT_UPPERDIR:
479 kfree(config->upperdir);
480 config->upperdir = match_strdup(&args[0]);
481 if (!config->upperdir)
482 return -ENOMEM;
483 break;
484
485 case OPT_LOWERDIR:
486 kfree(config->lowerdir);
487 config->lowerdir = match_strdup(&args[0]);
488 if (!config->lowerdir)
489 return -ENOMEM;
490 break;
491
492 case OPT_WORKDIR:
493 kfree(config->workdir);
494 config->workdir = match_strdup(&args[0]);
495 if (!config->workdir)
496 return -ENOMEM;
497 break;
498
499 default:
500 return -EINVAL;
501 }
502 }
503 return 0;
504}
505
506#define OVL_WORKDIR_NAME "work"
507
508static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
509 struct dentry *dentry)
510{
511 struct inode *dir = dentry->d_inode;
512 struct dentry *work;
513 int err;
514 bool retried = false;
515
516 err = mnt_want_write(mnt);
517 if (err)
518 return ERR_PTR(err);
519
520 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
521retry:
522 work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
523 strlen(OVL_WORKDIR_NAME));
524
525 if (!IS_ERR(work)) {
526 struct kstat stat = {
527 .mode = S_IFDIR | 0,
528 };
529
530 if (work->d_inode) {
531 err = -EEXIST;
532 if (retried)
533 goto out_dput;
534
535 retried = true;
536 ovl_cleanup(dir, work);
537 dput(work);
538 goto retry;
539 }
540
541 err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
542 if (err)
543 goto out_dput;
544 }
545out_unlock:
546 mutex_unlock(&dir->i_mutex);
547 mnt_drop_write(mnt);
548
549 return work;
550
551out_dput:
552 dput(work);
553 work = ERR_PTR(err);
554 goto out_unlock;
555}
556
557static int ovl_mount_dir(const char *name, struct path *path)
558{
559 int err;
560
561 err = kern_path(name, LOOKUP_FOLLOW, path);
562 if (err) {
563 pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
564 err = -EINVAL;
565 }
566 return err;
567}
568
569static bool ovl_is_allowed_fs_type(struct dentry *root)
570{
571 const struct dentry_operations *dop = root->d_op;
572
573 /*
574 * We don't support:
575 * - automount filesystems
576 * - filesystems with revalidate (FIXME for lower layer)
577 * - filesystems with case insensitive names
578 */
579 if (dop &&
580 (dop->d_manage || dop->d_automount ||
581 dop->d_revalidate || dop->d_weak_revalidate ||
582 dop->d_compare || dop->d_hash)) {
583 return false;
584 }
585 return true;
586}
587
588/* Workdir should not be subdir of upperdir and vice versa */
589static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
590{
591 bool ok = false;
592
593 if (workdir != upperdir) {
594 ok = (lock_rename(workdir, upperdir) == NULL);
595 unlock_rename(workdir, upperdir);
596 }
597 return ok;
598}
599
600static int ovl_fill_super(struct super_block *sb, void *data, int silent)
601{
602 struct path lowerpath;
603 struct path upperpath;
604 struct path workpath;
605 struct inode *root_inode;
606 struct dentry *root_dentry;
607 struct ovl_entry *oe;
608 struct ovl_fs *ufs;
609 struct kstatfs statfs;
610 int err;
611
612 err = -ENOMEM;
613 ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
614 if (!ufs)
615 goto out;
616
617 err = ovl_parse_opt((char *) data, &ufs->config);
618 if (err)
619 goto out_free_config;
620
621 /* FIXME: workdir is not needed for a R/O mount */
622 err = -EINVAL;
623 if (!ufs->config.upperdir || !ufs->config.lowerdir ||
624 !ufs->config.workdir) {
625 pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
626 goto out_free_config;
627 }
628
629 err = -ENOMEM;
630 oe = ovl_alloc_entry();
631 if (oe == NULL)
632 goto out_free_config;
633
634 err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
635 if (err)
636 goto out_free_oe;
637
638 err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
639 if (err)
640 goto out_put_upperpath;
641
642 err = ovl_mount_dir(ufs->config.workdir, &workpath);
643 if (err)
644 goto out_put_lowerpath;
645
646 err = -EINVAL;
647 if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
648 !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
649 !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
650 pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
651 goto out_put_workpath;
652 }
653
654 if (upperpath.mnt != workpath.mnt) {
655 pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
656 goto out_put_workpath;
657 }
658 if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
659 pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
660 goto out_put_workpath;
661 }
662
663 if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
664 pr_err("overlayfs: filesystem of upperdir is not supported\n");
665 goto out_put_workpath;
666 }
667
668 if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
669 pr_err("overlayfs: filesystem of lowerdir is not supported\n");
670 goto out_put_workpath;
671 }
672
673 err = vfs_statfs(&lowerpath, &statfs);
674 if (err) {
675 pr_err("overlayfs: statfs failed on lowerpath\n");
676 goto out_put_workpath;
677 }
678 ufs->lower_namelen = statfs.f_namelen;
679
680 sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
681 lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
682
683 err = -EINVAL;
684 if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
685 pr_err("overlayfs: maximum fs stacking depth exceeded\n");
686 goto out_put_workpath;
687 }
688
689 ufs->upper_mnt = clone_private_mount(&upperpath);
690 err = PTR_ERR(ufs->upper_mnt);
691 if (IS_ERR(ufs->upper_mnt)) {
692 pr_err("overlayfs: failed to clone upperpath\n");
693 goto out_put_workpath;
694 }
695
696 ufs->lower_mnt = clone_private_mount(&lowerpath);
697 err = PTR_ERR(ufs->lower_mnt);
698 if (IS_ERR(ufs->lower_mnt)) {
699 pr_err("overlayfs: failed to clone lowerpath\n");
700 goto out_put_upper_mnt;
701 }
702
703 ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
704 err = PTR_ERR(ufs->workdir);
705 if (IS_ERR(ufs->workdir)) {
706 pr_err("overlayfs: failed to create directory %s/%s\n",
707 ufs->config.workdir, OVL_WORKDIR_NAME);
708 goto out_put_lower_mnt;
709 }
710
711 /*
712 * Make lower_mnt R/O. That way fchmod/fchown on lower file
713 * will fail instead of modifying lower fs.
714 */
715 ufs->lower_mnt->mnt_flags |= MNT_READONLY;
716
717 /* If the upper fs is r/o, we mark overlayfs r/o too */
718 if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
719 sb->s_flags |= MS_RDONLY;
720
721 sb->s_d_op = &ovl_dentry_operations;
722
723 err = -ENOMEM;
724 root_inode = ovl_new_inode(sb, S_IFDIR, oe);
725 if (!root_inode)
726 goto out_put_workdir;
727
728 root_dentry = d_make_root(root_inode);
729 if (!root_dentry)
730 goto out_put_workdir;
731
732 mntput(upperpath.mnt);
733 mntput(lowerpath.mnt);
734 path_put(&workpath);
735
736 oe->__upperdentry = upperpath.dentry;
737 oe->lowerdentry = lowerpath.dentry;
738
739 root_dentry->d_fsdata = oe;
740
741 sb->s_magic = OVERLAYFS_SUPER_MAGIC;
742 sb->s_op = &ovl_super_operations;
743 sb->s_root = root_dentry;
744 sb->s_fs_info = ufs;
745
746 return 0;
747
748out_put_workdir:
749 dput(ufs->workdir);
750out_put_lower_mnt:
751 mntput(ufs->lower_mnt);
752out_put_upper_mnt:
753 mntput(ufs->upper_mnt);
754out_put_workpath:
755 path_put(&workpath);
756out_put_lowerpath:
757 path_put(&lowerpath);
758out_put_upperpath:
759 path_put(&upperpath);
760out_free_oe:
761 kfree(oe);
762out_free_config:
763 kfree(ufs->config.lowerdir);
764 kfree(ufs->config.upperdir);
765 kfree(ufs->config.workdir);
766 kfree(ufs);
767out:
768 return err;
769}
770
771static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
772 const char *dev_name, void *raw_data)
773{
774 return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
775}
776
777static struct file_system_type ovl_fs_type = {
778 .owner = THIS_MODULE,
779 .name = "overlayfs",
780 .mount = ovl_mount,
781 .kill_sb = kill_anon_super,
782};
783MODULE_ALIAS_FS("overlayfs");
784
785static int __init ovl_init(void)
786{
787 return register_filesystem(&ovl_fs_type);
788}
789
790static void __exit ovl_exit(void)
791{
792 unregister_filesystem(&ovl_fs_type);
793}
794
795module_init(ovl_init);
796module_exit(ovl_exit);
diff --git a/fs/splice.c b/fs/splice.c
index f5cb9ba84510..75c6058eabf2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1330 1330
1331 return ret; 1331 return ret;
1332} 1332}
1333EXPORT_SYMBOL(do_splice_direct);
1333 1334
1334static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1335static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1335 struct pipe_inode_info *opipe, 1336 struct pipe_inode_info *opipe,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a957d4366c24..4e41a4a331bb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -223,6 +223,13 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
223#define ATTR_TIMES_SET (1 << 16) 223#define ATTR_TIMES_SET (1 << 16)
224 224
225/* 225/*
226 * Whiteout is represented by a char device. The following constants define the
227 * mode and device number to use.
228 */
229#define WHITEOUT_MODE 0
230#define WHITEOUT_DEV 0
231
232/*
226 * This is the Inode Attributes structure, used for notify_change(). It 233 * This is the Inode Attributes structure, used for notify_change(). It
227 * uses the above definitions as flags, to know which values have changed. 234 * uses the above definitions as flags, to know which values have changed.
228 * Also, in this manner, a Filesystem can look at only the values it cares 235 * Also, in this manner, a Filesystem can look at only the values it cares
@@ -254,6 +261,12 @@ struct iattr {
254 */ 261 */
255#include <linux/quota.h> 262#include <linux/quota.h>
256 263
264/*
265 * Maximum number of layers of fs stack. Needs to be limited to
266 * prevent kernel stack overflow
267 */
268#define FILESYSTEM_MAX_STACK_DEPTH 2
269
257/** 270/**
258 * enum positive_aop_returns - aop return codes with specific semantics 271 * enum positive_aop_returns - aop return codes with specific semantics
259 * 272 *
@@ -1266,6 +1279,11 @@ struct super_block {
1266 struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; 1279 struct list_lru s_dentry_lru ____cacheline_aligned_in_smp;
1267 struct list_lru s_inode_lru ____cacheline_aligned_in_smp; 1280 struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
1268 struct rcu_head rcu; 1281 struct rcu_head rcu;
1282
1283 /*
1284 * Indicates how deep in a filesystem stack this SB is
1285 */
1286 int s_stack_depth;
1269}; 1287};
1270 1288
1271extern struct timespec current_fs_time(struct super_block *sb); 1289extern struct timespec current_fs_time(struct super_block *sb);
@@ -1398,6 +1416,7 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino
1398extern int vfs_rmdir(struct inode *, struct dentry *); 1416extern int vfs_rmdir(struct inode *, struct dentry *);
1399extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); 1417extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
1400extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); 1418extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
1419extern int vfs_whiteout(struct inode *, struct dentry *);
1401 1420
1402/* 1421/*
1403 * VFS dentry helper functions. 1422 * VFS dentry helper functions.
@@ -1528,6 +1547,9 @@ struct inode_operations {
1528 umode_t create_mode, int *opened); 1547 umode_t create_mode, int *opened);
1529 int (*tmpfile) (struct inode *, struct dentry *, umode_t); 1548 int (*tmpfile) (struct inode *, struct dentry *, umode_t);
1530 int (*set_acl)(struct inode *, struct posix_acl *, int); 1549 int (*set_acl)(struct inode *, struct posix_acl *, int);
1550
1551 /* WARNING: probably going away soon, do not use! */
1552 int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
1531} ____cacheline_aligned; 1553} ____cacheline_aligned;
1532 1554
1533ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 1555ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
@@ -1625,6 +1647,9 @@ struct super_operations {
1625#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) 1647#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT)
1626#define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) 1648#define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC)
1627 1649
/*
 * True if @inode is a whiteout: a character device whose device number is
 * WHITEOUT_DEV.  The argument is parenthesized at every use so that an
 * expression such as "p + 1" expands correctly.
 */
#define IS_WHITEOUT(inode)	(S_ISCHR((inode)->i_mode) && \
				 (inode)->i_rdev == WHITEOUT_DEV)
1652
1628/* 1653/*
1629 * Inode state bits. Protected by inode->i_lock 1654 * Inode state bits. Protected by inode->i_lock
1630 * 1655 *
@@ -2040,6 +2065,7 @@ extern struct file *file_open_name(struct filename *, int, umode_t);
2040extern struct file *filp_open(const char *, int, umode_t); 2065extern struct file *filp_open(const char *, int, umode_t);
2041extern struct file *file_open_root(struct dentry *, struct vfsmount *, 2066extern struct file *file_open_root(struct dentry *, struct vfsmount *,
2042 const char *, int); 2067 const char *, int);
2068extern int vfs_open(const struct path *, struct file *, const struct cred *);
2043extern struct file * dentry_open(const struct path *, int, const struct cred *); 2069extern struct file * dentry_open(const struct path *, int, const struct cred *);
2044extern int filp_close(struct file *, fl_owner_t id); 2070extern int filp_close(struct file *, fl_owner_t id);
2045 2071
@@ -2253,7 +2279,9 @@ extern sector_t bmap(struct inode *, sector_t);
2253#endif 2279#endif
2254extern int notify_change(struct dentry *, struct iattr *, struct inode **); 2280extern int notify_change(struct dentry *, struct iattr *, struct inode **);
2255extern int inode_permission(struct inode *, int); 2281extern int inode_permission(struct inode *, int);
2282extern int __inode_permission(struct inode *, int);
2256extern int generic_permission(struct inode *, int); 2283extern int generic_permission(struct inode *, int);
2284extern int __check_sticky(struct inode *dir, struct inode *inode);
2257 2285
2258static inline bool execute_ok(struct inode *inode) 2286static inline bool execute_ok(struct inode *inode)
2259{ 2287{
@@ -2452,6 +2480,9 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
2452 struct file *, loff_t *, size_t, unsigned int); 2480 struct file *, loff_t *, size_t, unsigned int);
2453extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2481extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
2454 struct file *out, loff_t *, size_t len, unsigned int flags); 2482 struct file *out, loff_t *, size_t len, unsigned int flags);
2483extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
2484 loff_t *opos, size_t len, unsigned int flags);
2485
2455 2486
2456extern void 2487extern void
2457file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); 2488file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
@@ -2737,6 +2768,14 @@ static inline int is_sxid(umode_t mode)
2737 return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); 2768 return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
2738} 2769}
2739 2770
2771static inline int check_sticky(struct inode *dir, struct inode *inode)
2772{
2773 if (!(dir->i_mode & S_ISVTX))
2774 return 0;
2775
2776 return __check_sticky(dir, inode);
2777}
2778
2740static inline void inode_has_no_xattr(struct inode *inode) 2779static inline void inode_has_no_xattr(struct inode *inode)
2741{ 2780{
2742 if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) 2781 if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC))
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 9262e4bf0cc3..c2c561dc0114 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -81,6 +81,9 @@ extern struct vfsmount *mntget(struct vfsmount *mnt);
81extern struct vfsmount *mnt_clone_internal(struct path *path); 81extern struct vfsmount *mnt_clone_internal(struct path *path);
82extern int __mnt_is_readonly(struct vfsmount *mnt); 82extern int __mnt_is_readonly(struct vfsmount *mnt);
83 83
84struct path;
85extern struct vfsmount *clone_private_mount(struct path *path);
86
84struct file_system_type; 87struct file_system_type;
85extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, 88extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
86 int flags, const char *name, 89 int flags, const char *name,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index ca1a11bb4443..3735fa0a6784 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -37,6 +37,7 @@
37 37
38#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ 38#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */
39#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ 39#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */
40#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */
40 41
41struct fstrim_range { 42struct fstrim_range {
42 __u64 start; 43 __u64 start;
diff --git a/mm/shmem.c b/mm/shmem.c
index cd6fc7590e54..185836ba53ef 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2345,6 +2345,32 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
2345 return 0; 2345 return 0;
2346} 2346}
2347 2347
2348static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
2349{
2350 struct dentry *whiteout;
2351 int error;
2352
2353 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
2354 if (!whiteout)
2355 return -ENOMEM;
2356
2357 error = shmem_mknod(old_dir, whiteout,
2358 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
2359 dput(whiteout);
2360 if (error)
2361 return error;
2362
2363 /*
2364 * Cheat and hash the whiteout while the old dentry is still in
2365 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
2366 *
2367 * d_lookup() will consistently find one of them at this point,
2368 * not sure which one, but that isn't even important.
2369 */
2370 d_rehash(whiteout);
2371 return 0;
2372}
2373
2348/* 2374/*
2349 * The VFS layer already does all the dentry stuff for rename, 2375 * The VFS layer already does all the dentry stuff for rename,
2350 * we just have to decrement the usage count for the target if 2376 * we just have to decrement the usage count for the target if
@@ -2356,7 +2382,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
2356 struct inode *inode = old_dentry->d_inode; 2382 struct inode *inode = old_dentry->d_inode;
2357 int they_are_dirs = S_ISDIR(inode->i_mode); 2383 int they_are_dirs = S_ISDIR(inode->i_mode);
2358 2384
2359 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 2385 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2360 return -EINVAL; 2386 return -EINVAL;
2361 2387
2362 if (flags & RENAME_EXCHANGE) 2388 if (flags & RENAME_EXCHANGE)
@@ -2365,6 +2391,14 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
2365 if (!simple_empty(new_dentry)) 2391 if (!simple_empty(new_dentry))
2366 return -ENOTEMPTY; 2392 return -ENOTEMPTY;
2367 2393
2394 if (flags & RENAME_WHITEOUT) {
2395 int error;
2396
2397 error = shmem_whiteout(old_dir, old_dentry);
2398 if (error)
2399 return error;
2400 }
2401
2368 if (new_dentry->d_inode) { 2402 if (new_dentry->d_inode) {
2369 (void) shmem_unlink(new_dir, new_dentry); 2403 (void) shmem_unlink(new_dir, new_dentry);
2370 if (they_are_dirs) { 2404 if (they_are_dirs) {