diff options
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/ceph/Kconfig | 13 | ||||
| -rw-r--r-- | fs/ceph/Makefile | 1 | ||||
| -rw-r--r-- | fs/ceph/acl.c | 230 | ||||
| -rw-r--r-- | fs/ceph/addr.c | 93 | ||||
| -rw-r--r-- | fs/ceph/cache.h | 13 | ||||
| -rw-r--r-- | fs/ceph/caps.c | 338 | ||||
| -rw-r--r-- | fs/ceph/dir.c | 17 | ||||
| -rw-r--r-- | fs/ceph/file.c | 437 | ||||
| -rw-r--r-- | fs/ceph/inode.c | 36 | ||||
| -rw-r--r-- | fs/ceph/ioctl.c | 8 | ||||
| -rw-r--r-- | fs/ceph/mds_client.c | 132 | ||||
| -rw-r--r-- | fs/ceph/mds_client.h | 2 | ||||
| -rw-r--r-- | fs/ceph/strings.c | 2 | ||||
| -rw-r--r-- | fs/ceph/super.c | 9 | ||||
| -rw-r--r-- | fs/ceph/super.h | 45 | ||||
| -rw-r--r-- | fs/ceph/xattr.c | 61 | ||||
| -rw-r--r-- | fs/dcookies.c | 2 | ||||
| -rw-r--r-- | fs/exofs/inode.c | 31 | ||||
| -rw-r--r-- | fs/exofs/ore.c | 45 | ||||
| -rw-r--r-- | fs/jffs2/malloc.c | 4 | ||||
| -rw-r--r-- | fs/notify/fanotify/fanotify.c | 40 | ||||
| -rw-r--r-- | fs/notify/fanotify/fanotify.h | 7 | ||||
| -rw-r--r-- | fs/notify/fanotify/fanotify_user.c | 7 | ||||
| -rw-r--r-- | fs/notify/inotify/inotify_fsnotify.c | 19 | ||||
| -rw-r--r-- | fs/notify/notification.c | 24 | ||||
| -rw-r--r-- | fs/read_write.c | 16 | ||||
| -rw-r--r-- | fs/xfs/xfs_buf.c | 14 | ||||
| -rw-r--r-- | fs/xfs/xfs_buf.h | 20 | ||||
| -rw-r--r-- | fs/xfs/xfs_file.c | 7 | ||||
| -rw-r--r-- | fs/xfs/xfs_ioctl.c | 2 |
30 files changed, 1252 insertions, 423 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index ac9a2ef5bb9b..264e9bf83ff3 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig | |||
| @@ -25,3 +25,16 @@ config CEPH_FSCACHE | |||
| 25 | caching support for Ceph clients using FS-Cache | 25 | caching support for Ceph clients using FS-Cache |
| 26 | 26 | ||
| 27 | endif | 27 | endif |
| 28 | |||
| 29 | config CEPH_FS_POSIX_ACL | ||
| 30 | bool "Ceph POSIX Access Control Lists" | ||
| 31 | depends on CEPH_FS | ||
| 32 | select FS_POSIX_ACL | ||
| 33 | help | ||
| 34 | POSIX Access Control Lists (ACLs) support permissions for users and | ||
| 35 | groups beyond the owner/group/world scheme. | ||
| 36 | |||
| 37 | To learn more about Access Control Lists, visit the POSIX ACLs for | ||
| 38 | Linux website <http://acl.bestbits.at/>. | ||
| 39 | |||
| 40 | If you don't know what Access Control Lists are, say N | ||
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 32e30106a2f0..85a4230b9bff 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile | |||
| @@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ | |||
| 10 | debugfs.o | 10 | debugfs.o |
| 11 | 11 | ||
| 12 | ceph-$(CONFIG_CEPH_FSCACHE) += cache.o | 12 | ceph-$(CONFIG_CEPH_FSCACHE) += cache.o |
| 13 | ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o | ||
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 000000000000..66d377a12f7c --- /dev/null +++ b/fs/ceph/acl.c | |||
| @@ -0,0 +1,230 @@ | |||
| 1 | /* | ||
| 2 | * linux/fs/ceph/acl.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public | ||
| 8 | * License v2 as published by the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public | ||
| 16 | * License along with this program; if not, write to the | ||
| 17 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 18 | * Boston, MA 02111-1307, USA. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <linux/ceph/ceph_debug.h> | ||
| 22 | #include <linux/fs.h> | ||
| 23 | #include <linux/string.h> | ||
| 24 | #include <linux/xattr.h> | ||
| 25 | #include <linux/posix_acl_xattr.h> | ||
| 26 | #include <linux/posix_acl.h> | ||
| 27 | #include <linux/sched.h> | ||
| 28 | #include <linux/slab.h> | ||
| 29 | |||
| 30 | #include "super.h" | ||
| 31 | |||
| 32 | static inline void ceph_set_cached_acl(struct inode *inode, | ||
| 33 | int type, struct posix_acl *acl) | ||
| 34 | { | ||
| 35 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 36 | |||
| 37 | spin_lock(&ci->i_ceph_lock); | ||
| 38 | if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) | ||
| 39 | set_cached_acl(inode, type, acl); | ||
| 40 | spin_unlock(&ci->i_ceph_lock); | ||
| 41 | } | ||
| 42 | |||
| 43 | static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, | ||
| 44 | int type) | ||
| 45 | { | ||
| 46 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 47 | struct posix_acl *acl = ACL_NOT_CACHED; | ||
| 48 | |||
| 49 | spin_lock(&ci->i_ceph_lock); | ||
| 50 | if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) | ||
| 51 | acl = get_cached_acl(inode, type); | ||
| 52 | spin_unlock(&ci->i_ceph_lock); | ||
| 53 | |||
| 54 | return acl; | ||
| 55 | } | ||
| 56 | |||
| 57 | void ceph_forget_all_cached_acls(struct inode *inode) | ||
| 58 | { | ||
| 59 | forget_all_cached_acls(inode); | ||
| 60 | } | ||
| 61 | |||
| 62 | struct posix_acl *ceph_get_acl(struct inode *inode, int type) | ||
| 63 | { | ||
| 64 | int size; | ||
| 65 | const char *name; | ||
| 66 | char *value = NULL; | ||
| 67 | struct posix_acl *acl; | ||
| 68 | |||
| 69 | if (!IS_POSIXACL(inode)) | ||
| 70 | return NULL; | ||
| 71 | |||
| 72 | acl = ceph_get_cached_acl(inode, type); | ||
| 73 | if (acl != ACL_NOT_CACHED) | ||
| 74 | return acl; | ||
| 75 | |||
| 76 | switch (type) { | ||
| 77 | case ACL_TYPE_ACCESS: | ||
| 78 | name = POSIX_ACL_XATTR_ACCESS; | ||
| 79 | break; | ||
| 80 | case ACL_TYPE_DEFAULT: | ||
| 81 | name = POSIX_ACL_XATTR_DEFAULT; | ||
| 82 | break; | ||
| 83 | default: | ||
| 84 | BUG(); | ||
| 85 | } | ||
| 86 | |||
| 87 | size = __ceph_getxattr(inode, name, "", 0); | ||
| 88 | if (size > 0) { | ||
| 89 | value = kzalloc(size, GFP_NOFS); | ||
| 90 | if (!value) | ||
| 91 | return ERR_PTR(-ENOMEM); | ||
| 92 | size = __ceph_getxattr(inode, name, value, size); | ||
| 93 | } | ||
| 94 | |||
| 95 | if (size > 0) | ||
| 96 | acl = posix_acl_from_xattr(&init_user_ns, value, size); | ||
| 97 | else if (size == -ERANGE || size == -ENODATA || size == 0) | ||
| 98 | acl = NULL; | ||
| 99 | else | ||
| 100 | acl = ERR_PTR(-EIO); | ||
| 101 | |||
| 102 | kfree(value); | ||
| 103 | |||
| 104 | if (!IS_ERR(acl)) | ||
| 105 | ceph_set_cached_acl(inode, type, acl); | ||
| 106 | |||
| 107 | return acl; | ||
| 108 | } | ||
| 109 | |||
| 110 | int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) | ||
| 111 | { | ||
| 112 | int ret = 0, size = 0; | ||
| 113 | const char *name = NULL; | ||
| 114 | char *value = NULL; | ||
| 115 | struct iattr newattrs; | ||
| 116 | umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; | ||
| 117 | struct dentry *dentry = d_find_alias(inode); | ||
| 118 | |||
| 119 | if (acl) { | ||
| 120 | ret = posix_acl_valid(acl); | ||
| 121 | if (ret < 0) | ||
| 122 | goto out; | ||
| 123 | } | ||
| 124 | |||
| 125 | switch (type) { | ||
| 126 | case ACL_TYPE_ACCESS: | ||
| 127 | name = POSIX_ACL_XATTR_ACCESS; | ||
| 128 | if (acl) { | ||
| 129 | ret = posix_acl_equiv_mode(acl, &new_mode); | ||
| 130 | if (ret < 0) | ||
| 131 | goto out; | ||
| 132 | if (ret == 0) | ||
| 133 | acl = NULL; | ||
| 134 | } | ||
| 135 | break; | ||
| 136 | case ACL_TYPE_DEFAULT: | ||
| 137 | if (!S_ISDIR(inode->i_mode)) { | ||
| 138 | ret = acl ? -EINVAL : 0; | ||
| 139 | goto out; | ||
| 140 | } | ||
| 141 | name = POSIX_ACL_XATTR_DEFAULT; | ||
| 142 | break; | ||
| 143 | default: | ||
| 144 | ret = -EINVAL; | ||
| 145 | goto out; | ||
| 146 | } | ||
| 147 | |||
| 148 | if (acl) { | ||
| 149 | size = posix_acl_xattr_size(acl->a_count); | ||
| 150 | value = kmalloc(size, GFP_NOFS); | ||
| 151 | if (!value) { | ||
| 152 | ret = -ENOMEM; | ||
| 153 | goto out; | ||
| 154 | } | ||
| 155 | |||
| 156 | ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); | ||
| 157 | if (ret < 0) | ||
| 158 | goto out_free; | ||
| 159 | } | ||
| 160 | |||
| 161 | if (new_mode != old_mode) { | ||
| 162 | newattrs.ia_mode = new_mode; | ||
| 163 | newattrs.ia_valid = ATTR_MODE; | ||
| 164 | ret = ceph_setattr(dentry, &newattrs); | ||
| 165 | if (ret) | ||
| 166 | goto out_free; | ||
| 167 | } | ||
| 168 | |||
| 169 | if (value) | ||
| 170 | ret = __ceph_setxattr(dentry, name, value, size, 0); | ||
| 171 | else | ||
| 172 | ret = __ceph_removexattr(dentry, name); | ||
| 173 | |||
| 174 | if (ret) { | ||
| 175 | if (new_mode != old_mode) { | ||
| 176 | newattrs.ia_mode = old_mode; | ||
| 177 | newattrs.ia_valid = ATTR_MODE; | ||
| 178 | ceph_setattr(dentry, &newattrs); | ||
| 179 | } | ||
| 180 | goto out_free; | ||
| 181 | } | ||
| 182 | |||
| 183 | ceph_set_cached_acl(inode, type, acl); | ||
| 184 | |||
| 185 | out_free: | ||
| 186 | kfree(value); | ||
| 187 | out: | ||
| 188 | return ret; | ||
| 189 | } | ||
| 190 | |||
| 191 | int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) | ||
| 192 | { | ||
| 193 | struct posix_acl *acl = NULL; | ||
| 194 | int ret = 0; | ||
| 195 | |||
| 196 | if (!S_ISLNK(inode->i_mode)) { | ||
| 197 | if (IS_POSIXACL(dir)) { | ||
| 198 | acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); | ||
| 199 | if (IS_ERR(acl)) { | ||
| 200 | ret = PTR_ERR(acl); | ||
| 201 | goto out; | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | if (!acl) | ||
| 206 | inode->i_mode &= ~current_umask(); | ||
| 207 | } | ||
| 208 | |||
| 209 | if (IS_POSIXACL(dir) && acl) { | ||
| 210 | if (S_ISDIR(inode->i_mode)) { | ||
| 211 | ret = ceph_set_acl(inode, acl, ACL_TYPE_DEFAULT); | ||
| 212 | if (ret) | ||
| 213 | goto out_release; | ||
| 214 | } | ||
| 215 | ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); | ||
| 216 | if (ret < 0) | ||
| 217 | goto out; | ||
| 218 | else if (ret > 0) | ||
| 219 | ret = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); | ||
| 220 | else | ||
| 221 | cache_no_acl(inode); | ||
| 222 | } else { | ||
| 223 | cache_no_acl(inode); | ||
| 224 | } | ||
| 225 | |||
| 226 | out_release: | ||
| 227 | posix_acl_release(acl); | ||
| 228 | out: | ||
| 229 | return ret; | ||
| 230 | } | ||
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ec3ba43b9faa..b53278c9fd97 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
| @@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) | |||
| 209 | err = 0; | 209 | err = 0; |
| 210 | if (err < 0) { | 210 | if (err < 0) { |
| 211 | SetPageError(page); | 211 | SetPageError(page); |
| 212 | ceph_fscache_readpage_cancel(inode, page); | ||
| 212 | goto out; | 213 | goto out; |
| 213 | } else { | 214 | } else { |
| 214 | if (err < PAGE_CACHE_SIZE) { | 215 | if (err < PAGE_CACHE_SIZE) { |
| @@ -256,6 +257,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) | |||
| 256 | for (i = 0; i < num_pages; i++) { | 257 | for (i = 0; i < num_pages; i++) { |
| 257 | struct page *page = osd_data->pages[i]; | 258 | struct page *page = osd_data->pages[i]; |
| 258 | 259 | ||
| 260 | if (rc < 0) | ||
| 261 | goto unlock; | ||
| 259 | if (bytes < (int)PAGE_CACHE_SIZE) { | 262 | if (bytes < (int)PAGE_CACHE_SIZE) { |
| 260 | /* zero (remainder of) page */ | 263 | /* zero (remainder of) page */ |
| 261 | int s = bytes < 0 ? 0 : bytes; | 264 | int s = bytes < 0 ? 0 : bytes; |
| @@ -266,6 +269,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) | |||
| 266 | flush_dcache_page(page); | 269 | flush_dcache_page(page); |
| 267 | SetPageUptodate(page); | 270 | SetPageUptodate(page); |
| 268 | ceph_readpage_to_fscache(inode, page); | 271 | ceph_readpage_to_fscache(inode, page); |
| 272 | unlock: | ||
| 269 | unlock_page(page); | 273 | unlock_page(page); |
| 270 | page_cache_release(page); | 274 | page_cache_release(page); |
| 271 | bytes -= PAGE_CACHE_SIZE; | 275 | bytes -= PAGE_CACHE_SIZE; |
| @@ -1207,6 +1211,41 @@ const struct address_space_operations ceph_aops = { | |||
| 1207 | /* | 1211 | /* |
| 1208 | * vm ops | 1212 | * vm ops |
| 1209 | */ | 1213 | */ |
| 1214 | static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
| 1215 | { | ||
| 1216 | struct inode *inode = file_inode(vma->vm_file); | ||
| 1217 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 1218 | struct ceph_file_info *fi = vma->vm_file->private_data; | ||
| 1219 | loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; | ||
| 1220 | int want, got, ret; | ||
| 1221 | |||
| 1222 | dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", | ||
| 1223 | inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); | ||
| 1224 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||
| 1225 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; | ||
| 1226 | else | ||
| 1227 | want = CEPH_CAP_FILE_CACHE; | ||
| 1228 | while (1) { | ||
| 1229 | got = 0; | ||
| 1230 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); | ||
| 1231 | if (ret == 0) | ||
| 1232 | break; | ||
| 1233 | if (ret != -ERESTARTSYS) { | ||
| 1234 | WARN_ON(1); | ||
| 1235 | return VM_FAULT_SIGBUS; | ||
| 1236 | } | ||
| 1237 | } | ||
| 1238 | dout("filemap_fault %p %llu~%zd got cap refs on %s\n", | ||
| 1239 | inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); | ||
| 1240 | |||
| 1241 | ret = filemap_fault(vma, vmf); | ||
| 1242 | |||
| 1243 | dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", | ||
| 1244 | inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); | ||
| 1245 | ceph_put_cap_refs(ci, got); | ||
| 1246 | |||
| 1247 | return ret; | ||
| 1248 | } | ||
| 1210 | 1249 | ||
| 1211 | /* | 1250 | /* |
| 1212 | * Reuse write_begin here for simplicity. | 1251 | * Reuse write_begin here for simplicity. |
| @@ -1214,23 +1253,41 @@ const struct address_space_operations ceph_aops = { | |||
| 1214 | static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 1253 | static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 1215 | { | 1254 | { |
| 1216 | struct inode *inode = file_inode(vma->vm_file); | 1255 | struct inode *inode = file_inode(vma->vm_file); |
| 1217 | struct page *page = vmf->page; | 1256 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1257 | struct ceph_file_info *fi = vma->vm_file->private_data; | ||
| 1218 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 1258 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
| 1259 | struct page *page = vmf->page; | ||
| 1219 | loff_t off = page_offset(page); | 1260 | loff_t off = page_offset(page); |
| 1220 | loff_t size, len; | 1261 | loff_t size = i_size_read(inode); |
| 1221 | int ret; | 1262 | size_t len; |
| 1222 | 1263 | int want, got, ret; | |
| 1223 | /* Update time before taking page lock */ | ||
| 1224 | file_update_time(vma->vm_file); | ||
| 1225 | 1264 | ||
| 1226 | size = i_size_read(inode); | ||
| 1227 | if (off + PAGE_CACHE_SIZE <= size) | 1265 | if (off + PAGE_CACHE_SIZE <= size) |
| 1228 | len = PAGE_CACHE_SIZE; | 1266 | len = PAGE_CACHE_SIZE; |
| 1229 | else | 1267 | else |
| 1230 | len = size & ~PAGE_CACHE_MASK; | 1268 | len = size & ~PAGE_CACHE_MASK; |
| 1231 | 1269 | ||
| 1232 | dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, | 1270 | dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", |
| 1233 | off, len, page, page->index); | 1271 | inode, ceph_vinop(inode), off, len, size); |
| 1272 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||
| 1273 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | ||
| 1274 | else | ||
| 1275 | want = CEPH_CAP_FILE_BUFFER; | ||
| 1276 | while (1) { | ||
| 1277 | got = 0; | ||
| 1278 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); | ||
| 1279 | if (ret == 0) | ||
| 1280 | break; | ||
| 1281 | if (ret != -ERESTARTSYS) { | ||
| 1282 | WARN_ON(1); | ||
| 1283 | return VM_FAULT_SIGBUS; | ||
| 1284 | } | ||
| 1285 | } | ||
| 1286 | dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", | ||
| 1287 | inode, off, len, ceph_cap_string(got)); | ||
| 1288 | |||
| 1289 | /* Update time before taking page lock */ | ||
| 1290 | file_update_time(vma->vm_file); | ||
| 1234 | 1291 | ||
| 1235 | lock_page(page); | 1292 | lock_page(page); |
| 1236 | 1293 | ||
| @@ -1252,14 +1309,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1252 | ret = VM_FAULT_SIGBUS; | 1309 | ret = VM_FAULT_SIGBUS; |
| 1253 | } | 1310 | } |
| 1254 | out: | 1311 | out: |
| 1255 | dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); | 1312 | if (ret != VM_FAULT_LOCKED) { |
| 1256 | if (ret != VM_FAULT_LOCKED) | ||
| 1257 | unlock_page(page); | 1313 | unlock_page(page); |
| 1314 | } else { | ||
| 1315 | int dirty; | ||
| 1316 | spin_lock(&ci->i_ceph_lock); | ||
| 1317 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | ||
| 1318 | spin_unlock(&ci->i_ceph_lock); | ||
| 1319 | if (dirty) | ||
| 1320 | __mark_inode_dirty(inode, dirty); | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", | ||
| 1324 | inode, off, len, ceph_cap_string(got), ret); | ||
| 1325 | ceph_put_cap_refs(ci, got); | ||
| 1326 | |||
| 1258 | return ret; | 1327 | return ret; |
| 1259 | } | 1328 | } |
| 1260 | 1329 | ||
| 1261 | static struct vm_operations_struct ceph_vmops = { | 1330 | static struct vm_operations_struct ceph_vmops = { |
| 1262 | .fault = filemap_fault, | 1331 | .fault = ceph_filemap_fault, |
| 1263 | .page_mkwrite = ceph_page_mkwrite, | 1332 | .page_mkwrite = ceph_page_mkwrite, |
| 1264 | .remap_pages = generic_file_remap_pages, | 1333 | .remap_pages = generic_file_remap_pages, |
| 1265 | }; | 1334 | }; |
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba949408a336..da95f61b7a09 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h | |||
| @@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) | |||
| 67 | return fscache_maybe_release_page(ci->fscache, page, gfp); | 67 | return fscache_maybe_release_page(ci->fscache, page, gfp); |
| 68 | } | 68 | } |
| 69 | 69 | ||
| 70 | static inline void ceph_fscache_readpage_cancel(struct inode *inode, | ||
| 71 | struct page *page) | ||
| 72 | { | ||
| 73 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 74 | if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) | ||
| 75 | __fscache_uncache_page(ci->fscache, page); | ||
| 76 | } | ||
| 77 | |||
| 70 | static inline void ceph_fscache_readpages_cancel(struct inode *inode, | 78 | static inline void ceph_fscache_readpages_cancel(struct inode *inode, |
| 71 | struct list_head *pages) | 79 | struct list_head *pages) |
| 72 | { | 80 | { |
| @@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) | |||
| 145 | return 1; | 153 | return 1; |
| 146 | } | 154 | } |
| 147 | 155 | ||
| 156 | static inline void ceph_fscache_readpage_cancel(struct inode *inode, | ||
| 157 | struct page *page) | ||
| 158 | { | ||
| 159 | } | ||
| 160 | |||
| 148 | static inline void ceph_fscache_readpages_cancel(struct inode *inode, | 161 | static inline void ceph_fscache_readpages_cancel(struct inode *inode, |
| 149 | struct list_head *pages) | 162 | struct list_head *pages) |
| 150 | { | 163 | { |
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3c0a4bd74996..17543383545c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
| @@ -555,21 +555,34 @@ retry: | |||
| 555 | cap->ci = ci; | 555 | cap->ci = ci; |
| 556 | __insert_cap_node(ci, cap); | 556 | __insert_cap_node(ci, cap); |
| 557 | 557 | ||
| 558 | /* clear out old exporting info? (i.e. on cap import) */ | ||
| 559 | if (ci->i_cap_exporting_mds == mds) { | ||
| 560 | ci->i_cap_exporting_issued = 0; | ||
| 561 | ci->i_cap_exporting_mseq = 0; | ||
| 562 | ci->i_cap_exporting_mds = -1; | ||
| 563 | } | ||
| 564 | |||
| 565 | /* add to session cap list */ | 558 | /* add to session cap list */ |
| 566 | cap->session = session; | 559 | cap->session = session; |
| 567 | spin_lock(&session->s_cap_lock); | 560 | spin_lock(&session->s_cap_lock); |
| 568 | list_add_tail(&cap->session_caps, &session->s_caps); | 561 | list_add_tail(&cap->session_caps, &session->s_caps); |
| 569 | session->s_nr_caps++; | 562 | session->s_nr_caps++; |
| 570 | spin_unlock(&session->s_cap_lock); | 563 | spin_unlock(&session->s_cap_lock); |
| 571 | } else if (new_cap) | 564 | } else { |
| 572 | ceph_put_cap(mdsc, new_cap); | 565 | if (new_cap) |
| 566 | ceph_put_cap(mdsc, new_cap); | ||
| 567 | |||
| 568 | /* | ||
| 569 | * auth mds of the inode changed. we received the cap export | ||
| 570 | * message, but still haven't received the cap import message. | ||
| 571 | * handle_cap_export() updated the new auth MDS' cap. | ||
| 572 | * | ||
| 573 | * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing | ||
| 574 | * a message that was send before the cap import message. So | ||
| 575 | * don't remove caps. | ||
| 576 | */ | ||
| 577 | if (ceph_seq_cmp(seq, cap->seq) <= 0) { | ||
| 578 | WARN_ON(cap != ci->i_auth_cap); | ||
| 579 | WARN_ON(cap->cap_id != cap_id); | ||
| 580 | seq = cap->seq; | ||
| 581 | mseq = cap->mseq; | ||
| 582 | issued |= cap->issued; | ||
| 583 | flags |= CEPH_CAP_FLAG_AUTH; | ||
| 584 | } | ||
| 585 | } | ||
| 573 | 586 | ||
| 574 | if (!ci->i_snap_realm) { | 587 | if (!ci->i_snap_realm) { |
| 575 | /* | 588 | /* |
| @@ -611,15 +624,9 @@ retry: | |||
| 611 | if (ci->i_auth_cap == NULL || | 624 | if (ci->i_auth_cap == NULL || |
| 612 | ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) | 625 | ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) |
| 613 | ci->i_auth_cap = cap; | 626 | ci->i_auth_cap = cap; |
| 614 | } else if (ci->i_auth_cap == cap) { | 627 | ci->i_cap_exporting_issued = 0; |
| 615 | ci->i_auth_cap = NULL; | 628 | } else { |
| 616 | spin_lock(&mdsc->cap_dirty_lock); | 629 | WARN_ON(ci->i_auth_cap == cap); |
| 617 | if (!list_empty(&ci->i_dirty_item)) { | ||
| 618 | dout(" moving %p to cap_dirty_migrating\n", inode); | ||
| 619 | list_move(&ci->i_dirty_item, | ||
| 620 | &mdsc->cap_dirty_migrating); | ||
| 621 | } | ||
| 622 | spin_unlock(&mdsc->cap_dirty_lock); | ||
| 623 | } | 630 | } |
| 624 | 631 | ||
| 625 | dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", | 632 | dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", |
| @@ -628,7 +635,7 @@ retry: | |||
| 628 | cap->cap_id = cap_id; | 635 | cap->cap_id = cap_id; |
| 629 | cap->issued = issued; | 636 | cap->issued = issued; |
| 630 | cap->implemented |= issued; | 637 | cap->implemented |= issued; |
| 631 | if (mseq > cap->mseq) | 638 | if (ceph_seq_cmp(mseq, cap->mseq) > 0) |
| 632 | cap->mds_wanted = wanted; | 639 | cap->mds_wanted = wanted; |
| 633 | else | 640 | else |
| 634 | cap->mds_wanted |= wanted; | 641 | cap->mds_wanted |= wanted; |
| @@ -816,7 +823,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, | |||
| 816 | 823 | ||
| 817 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 824 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
| 818 | cap = rb_entry(p, struct ceph_cap, ci_node); | 825 | cap = rb_entry(p, struct ceph_cap, ci_node); |
| 819 | if (cap != ocap && __cap_is_valid(cap) && | 826 | if (cap != ocap && |
| 820 | (cap->implemented & ~cap->issued & mask)) | 827 | (cap->implemented & ~cap->issued & mask)) |
| 821 | return 1; | 828 | return 1; |
| 822 | } | 829 | } |
| @@ -888,7 +895,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) | |||
| 888 | */ | 895 | */ |
| 889 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) | 896 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) |
| 890 | { | 897 | { |
| 891 | return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; | 898 | return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; |
| 899 | } | ||
| 900 | |||
| 901 | int ceph_is_any_caps(struct inode *inode) | ||
| 902 | { | ||
| 903 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 904 | int ret; | ||
| 905 | |||
| 906 | spin_lock(&ci->i_ceph_lock); | ||
| 907 | ret = __ceph_is_any_caps(ci); | ||
| 908 | spin_unlock(&ci->i_ceph_lock); | ||
| 909 | |||
| 910 | return ret; | ||
| 892 | } | 911 | } |
| 893 | 912 | ||
| 894 | /* | 913 | /* |
| @@ -1383,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
| 1383 | ci->i_snap_realm->cached_context); | 1402 | ci->i_snap_realm->cached_context); |
| 1384 | dout(" inode %p now dirty snapc %p auth cap %p\n", | 1403 | dout(" inode %p now dirty snapc %p auth cap %p\n", |
| 1385 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); | 1404 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); |
| 1405 | WARN_ON(!ci->i_auth_cap); | ||
| 1386 | BUG_ON(!list_empty(&ci->i_dirty_item)); | 1406 | BUG_ON(!list_empty(&ci->i_dirty_item)); |
| 1387 | spin_lock(&mdsc->cap_dirty_lock); | 1407 | spin_lock(&mdsc->cap_dirty_lock); |
| 1388 | if (ci->i_auth_cap) | 1408 | list_add(&ci->i_dirty_item, &mdsc->cap_dirty); |
| 1389 | list_add(&ci->i_dirty_item, &mdsc->cap_dirty); | ||
| 1390 | else | ||
| 1391 | list_add(&ci->i_dirty_item, | ||
| 1392 | &mdsc->cap_dirty_migrating); | ||
| 1393 | spin_unlock(&mdsc->cap_dirty_lock); | 1409 | spin_unlock(&mdsc->cap_dirty_lock); |
| 1394 | if (ci->i_flushing_caps == 0) { | 1410 | if (ci->i_flushing_caps == 0) { |
| 1395 | ihold(inode); | 1411 | ihold(inode); |
| @@ -1735,13 +1751,12 @@ ack: | |||
| 1735 | /* | 1751 | /* |
| 1736 | * Try to flush dirty caps back to the auth mds. | 1752 | * Try to flush dirty caps back to the auth mds. |
| 1737 | */ | 1753 | */ |
| 1738 | static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, | 1754 | static int try_flush_caps(struct inode *inode, unsigned *flush_tid) |
| 1739 | unsigned *flush_tid) | ||
| 1740 | { | 1755 | { |
| 1741 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 1756 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
| 1742 | struct ceph_inode_info *ci = ceph_inode(inode); | 1757 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1743 | int unlock_session = session ? 0 : 1; | ||
| 1744 | int flushing = 0; | 1758 | int flushing = 0; |
| 1759 | struct ceph_mds_session *session = NULL; | ||
| 1745 | 1760 | ||
| 1746 | retry: | 1761 | retry: |
| 1747 | spin_lock(&ci->i_ceph_lock); | 1762 | spin_lock(&ci->i_ceph_lock); |
| @@ -1755,13 +1770,14 @@ retry: | |||
| 1755 | int want = __ceph_caps_wanted(ci); | 1770 | int want = __ceph_caps_wanted(ci); |
| 1756 | int delayed; | 1771 | int delayed; |
| 1757 | 1772 | ||
| 1758 | if (!session) { | 1773 | if (!session || session != cap->session) { |
| 1759 | spin_unlock(&ci->i_ceph_lock); | 1774 | spin_unlock(&ci->i_ceph_lock); |
| 1775 | if (session) | ||
| 1776 | mutex_unlock(&session->s_mutex); | ||
| 1760 | session = cap->session; | 1777 | session = cap->session; |
| 1761 | mutex_lock(&session->s_mutex); | 1778 | mutex_lock(&session->s_mutex); |
| 1762 | goto retry; | 1779 | goto retry; |
| 1763 | } | 1780 | } |
| 1764 | BUG_ON(session != cap->session); | ||
| 1765 | if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) | 1781 | if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) |
| 1766 | goto out; | 1782 | goto out; |
| 1767 | 1783 | ||
| @@ -1780,7 +1796,7 @@ retry: | |||
| 1780 | out: | 1796 | out: |
| 1781 | spin_unlock(&ci->i_ceph_lock); | 1797 | spin_unlock(&ci->i_ceph_lock); |
| 1782 | out_unlocked: | 1798 | out_unlocked: |
| 1783 | if (session && unlock_session) | 1799 | if (session) |
| 1784 | mutex_unlock(&session->s_mutex); | 1800 | mutex_unlock(&session->s_mutex); |
| 1785 | return flushing; | 1801 | return flushing; |
| 1786 | } | 1802 | } |
| @@ -1865,7 +1881,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1865 | return ret; | 1881 | return ret; |
| 1866 | mutex_lock(&inode->i_mutex); | 1882 | mutex_lock(&inode->i_mutex); |
| 1867 | 1883 | ||
| 1868 | dirty = try_flush_caps(inode, NULL, &flush_tid); | 1884 | dirty = try_flush_caps(inode, &flush_tid); |
| 1869 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); | 1885 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); |
| 1870 | 1886 | ||
| 1871 | /* | 1887 | /* |
| @@ -1900,7 +1916,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 1900 | 1916 | ||
| 1901 | dout("write_inode %p wait=%d\n", inode, wait); | 1917 | dout("write_inode %p wait=%d\n", inode, wait); |
| 1902 | if (wait) { | 1918 | if (wait) { |
| 1903 | dirty = try_flush_caps(inode, NULL, &flush_tid); | 1919 | dirty = try_flush_caps(inode, &flush_tid); |
| 1904 | if (dirty) | 1920 | if (dirty) |
| 1905 | err = wait_event_interruptible(ci->i_cap_wq, | 1921 | err = wait_event_interruptible(ci->i_cap_wq, |
| 1906 | caps_are_flushed(inode, flush_tid)); | 1922 | caps_are_flushed(inode, flush_tid)); |
| @@ -2350,11 +2366,11 @@ static void invalidate_aliases(struct inode *inode) | |||
| 2350 | d_prune_aliases(inode); | 2366 | d_prune_aliases(inode); |
| 2351 | /* | 2367 | /* |
| 2352 | * For non-directory inode, d_find_alias() only returns | 2368 | * For non-directory inode, d_find_alias() only returns |
| 2353 | * connected dentry. After calling d_invalidate(), the | 2369 | * hashed dentry. After calling d_invalidate(), the |
| 2354 | * dentry become disconnected. | 2370 | * dentry becomes unhashed. |
| 2355 | * | 2371 | * |
| 2356 | * For directory inode, d_find_alias() can return | 2372 | * For directory inode, d_find_alias() can return |
| 2357 | * disconnected dentry. But directory inode should have | 2373 | * unhashed dentry. But directory inode should have |
| 2358 | * one alias at most. | 2374 | * one alias at most. |
| 2359 | */ | 2375 | */ |
| 2360 | while ((dn = d_find_alias(inode))) { | 2376 | while ((dn = d_find_alias(inode))) { |
| @@ -2408,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2408 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, | 2424 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, |
| 2409 | inode->i_size); | 2425 | inode->i_size); |
| 2410 | 2426 | ||
| 2427 | |||
| 2428 | /* | ||
| 2429 | * auth mds of the inode changed. we received the cap export message, | ||
| 2430 | * but still haven't received the cap import message. handle_cap_export | ||
| 2431 | * updated the new auth MDS' cap. | ||
| 2432 | * | ||
| 2433 | * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message | ||
| 2434 | * that was sent before the cap import message. So don't remove caps. | ||
| 2435 | */ | ||
| 2436 | if (ceph_seq_cmp(seq, cap->seq) <= 0) { | ||
| 2437 | WARN_ON(cap != ci->i_auth_cap); | ||
| 2438 | WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); | ||
| 2439 | seq = cap->seq; | ||
| 2440 | newcaps |= cap->issued; | ||
| 2441 | } | ||
| 2442 | |||
| 2411 | /* | 2443 | /* |
| 2412 | * If CACHE is being revoked, and we have no dirty buffers, | 2444 | * If CACHE is being revoked, and we have no dirty buffers, |
| 2413 | * try to invalidate (once). (If there are dirty buffers, we | 2445 | * try to invalidate (once). (If there are dirty buffers, we |
| @@ -2434,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2434 | issued |= implemented | __ceph_caps_dirty(ci); | 2466 | issued |= implemented | __ceph_caps_dirty(ci); |
| 2435 | 2467 | ||
| 2436 | cap->cap_gen = session->s_cap_gen; | 2468 | cap->cap_gen = session->s_cap_gen; |
| 2469 | cap->seq = seq; | ||
| 2437 | 2470 | ||
| 2438 | __check_cap_issue(ci, cap, newcaps); | 2471 | __check_cap_issue(ci, cap, newcaps); |
| 2439 | 2472 | ||
| @@ -2464,6 +2497,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2464 | ceph_buffer_put(ci->i_xattrs.blob); | 2497 | ceph_buffer_put(ci->i_xattrs.blob); |
| 2465 | ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); | 2498 | ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); |
| 2466 | ci->i_xattrs.version = version; | 2499 | ci->i_xattrs.version = version; |
| 2500 | ceph_forget_all_cached_acls(inode); | ||
| 2467 | } | 2501 | } |
| 2468 | } | 2502 | } |
| 2469 | 2503 | ||
| @@ -2483,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2483 | le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, | 2517 | le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, |
| 2484 | &atime); | 2518 | &atime); |
| 2485 | 2519 | ||
| 2520 | |||
| 2521 | /* file layout may have changed */ | ||
| 2522 | ci->i_layout = grant->layout; | ||
| 2523 | |||
| 2486 | /* max size increase? */ | 2524 | /* max size increase? */ |
| 2487 | if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { | 2525 | if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { |
| 2488 | dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); | 2526 | dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); |
| @@ -2511,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2511 | check_caps = 1; | 2549 | check_caps = 1; |
| 2512 | } | 2550 | } |
| 2513 | 2551 | ||
| 2514 | cap->seq = seq; | ||
| 2515 | |||
| 2516 | /* file layout may have changed */ | ||
| 2517 | ci->i_layout = grant->layout; | ||
| 2518 | |||
| 2519 | /* revocation, grant, or no-op? */ | 2552 | /* revocation, grant, or no-op? */ |
| 2520 | if (cap->issued & ~newcaps) { | 2553 | if (cap->issued & ~newcaps) { |
| 2521 | int revoking = cap->issued & ~newcaps; | 2554 | int revoking = cap->issued & ~newcaps; |
| @@ -2741,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode, | |||
| 2741 | * caller holds s_mutex | 2774 | * caller holds s_mutex |
| 2742 | */ | 2775 | */ |
| 2743 | static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, | 2776 | static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, |
| 2744 | struct ceph_mds_session *session, | 2777 | struct ceph_mds_cap_peer *ph, |
| 2745 | int *open_target_sessions) | 2778 | struct ceph_mds_session *session) |
| 2746 | { | 2779 | { |
| 2747 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 2780 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
| 2781 | struct ceph_mds_session *tsession = NULL; | ||
| 2782 | struct ceph_cap *cap, *tcap; | ||
| 2748 | struct ceph_inode_info *ci = ceph_inode(inode); | 2783 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 2749 | int mds = session->s_mds; | 2784 | u64 t_cap_id; |
| 2750 | unsigned mseq = le32_to_cpu(ex->migrate_seq); | 2785 | unsigned mseq = le32_to_cpu(ex->migrate_seq); |
| 2751 | struct ceph_cap *cap = NULL, *t; | 2786 | unsigned t_seq, t_mseq; |
| 2752 | struct rb_node *p; | 2787 | int target, issued; |
| 2753 | int remember = 1; | 2788 | int mds = session->s_mds; |
| 2754 | 2789 | ||
| 2755 | dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", | 2790 | if (ph) { |
| 2756 | inode, ci, mds, mseq); | 2791 | t_cap_id = le64_to_cpu(ph->cap_id); |
| 2792 | t_seq = le32_to_cpu(ph->seq); | ||
| 2793 | t_mseq = le32_to_cpu(ph->mseq); | ||
| 2794 | target = le32_to_cpu(ph->mds); | ||
| 2795 | } else { | ||
| 2796 | t_cap_id = t_seq = t_mseq = 0; | ||
| 2797 | target = -1; | ||
| 2798 | } | ||
| 2757 | 2799 | ||
| 2800 | dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", | ||
| 2801 | inode, ci, mds, mseq, target); | ||
| 2802 | retry: | ||
| 2758 | spin_lock(&ci->i_ceph_lock); | 2803 | spin_lock(&ci->i_ceph_lock); |
| 2804 | cap = __get_cap_for_mds(ci, mds); | ||
| 2805 | if (!cap) | ||
| 2806 | goto out_unlock; | ||
| 2759 | 2807 | ||
| 2760 | /* make sure we haven't seen a higher mseq */ | 2808 | if (target < 0) { |
| 2761 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 2809 | __ceph_remove_cap(cap, false); |
| 2762 | t = rb_entry(p, struct ceph_cap, ci_node); | 2810 | goto out_unlock; |
| 2763 | if (ceph_seq_cmp(t->mseq, mseq) > 0) { | ||
| 2764 | dout(" higher mseq on cap from mds%d\n", | ||
| 2765 | t->session->s_mds); | ||
| 2766 | remember = 0; | ||
| 2767 | } | ||
| 2768 | if (t->session->s_mds == mds) | ||
| 2769 | cap = t; | ||
| 2770 | } | 2811 | } |
| 2771 | 2812 | ||
| 2772 | if (cap) { | 2813 | /* |
| 2773 | if (remember) { | 2814 | * now we know we haven't received the cap import message yet |
| 2774 | /* make note */ | 2815 | * because the exported cap still exist. |
| 2775 | ci->i_cap_exporting_mds = mds; | 2816 | */ |
| 2776 | ci->i_cap_exporting_mseq = mseq; | ||
| 2777 | ci->i_cap_exporting_issued = cap->issued; | ||
| 2778 | |||
| 2779 | /* | ||
| 2780 | * make sure we have open sessions with all possible | ||
| 2781 | * export targets, so that we get the matching IMPORT | ||
| 2782 | */ | ||
| 2783 | *open_target_sessions = 1; | ||
| 2784 | 2817 | ||
| 2785 | /* | 2818 | issued = cap->issued; |
| 2786 | * we can't flush dirty caps that we've seen the | 2819 | WARN_ON(issued != cap->implemented); |
| 2787 | * EXPORT but no IMPORT for | 2820 | |
| 2788 | */ | 2821 | tcap = __get_cap_for_mds(ci, target); |
| 2789 | spin_lock(&mdsc->cap_dirty_lock); | 2822 | if (tcap) { |
| 2790 | if (!list_empty(&ci->i_dirty_item)) { | 2823 | /* already have caps from the target */ |
| 2791 | dout(" moving %p to cap_dirty_migrating\n", | 2824 | if (tcap->cap_id != t_cap_id || |
| 2792 | inode); | 2825 | ceph_seq_cmp(tcap->seq, t_seq) < 0) { |
| 2793 | list_move(&ci->i_dirty_item, | 2826 | dout(" updating import cap %p mds%d\n", tcap, target); |
| 2794 | &mdsc->cap_dirty_migrating); | 2827 | tcap->cap_id = t_cap_id; |
| 2828 | tcap->seq = t_seq - 1; | ||
| 2829 | tcap->issue_seq = t_seq - 1; | ||
| 2830 | tcap->mseq = t_mseq; | ||
| 2831 | tcap->issued |= issued; | ||
| 2832 | tcap->implemented |= issued; | ||
| 2833 | if (cap == ci->i_auth_cap) | ||
| 2834 | ci->i_auth_cap = tcap; | ||
| 2835 | if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { | ||
| 2836 | spin_lock(&mdsc->cap_dirty_lock); | ||
| 2837 | list_move_tail(&ci->i_flushing_item, | ||
| 2838 | &tcap->session->s_cap_flushing); | ||
| 2839 | spin_unlock(&mdsc->cap_dirty_lock); | ||
| 2795 | } | 2840 | } |
| 2796 | spin_unlock(&mdsc->cap_dirty_lock); | ||
| 2797 | } | 2841 | } |
| 2798 | __ceph_remove_cap(cap, false); | 2842 | __ceph_remove_cap(cap, false); |
| 2843 | goto out_unlock; | ||
| 2799 | } | 2844 | } |
| 2800 | /* else, we already released it */ | ||
| 2801 | 2845 | ||
| 2846 | if (tsession) { | ||
| 2847 | int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; | ||
| 2848 | spin_unlock(&ci->i_ceph_lock); | ||
| 2849 | /* add placeholder for the export tagert */ | ||
| 2850 | ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, | ||
| 2851 | t_seq - 1, t_mseq, (u64)-1, flag, NULL); | ||
| 2852 | goto retry; | ||
| 2853 | } | ||
| 2854 | |||
| 2855 | spin_unlock(&ci->i_ceph_lock); | ||
| 2856 | mutex_unlock(&session->s_mutex); | ||
| 2857 | |||
| 2858 | /* open target session */ | ||
| 2859 | tsession = ceph_mdsc_open_export_target_session(mdsc, target); | ||
| 2860 | if (!IS_ERR(tsession)) { | ||
| 2861 | if (mds > target) { | ||
| 2862 | mutex_lock(&session->s_mutex); | ||
| 2863 | mutex_lock_nested(&tsession->s_mutex, | ||
| 2864 | SINGLE_DEPTH_NESTING); | ||
| 2865 | } else { | ||
| 2866 | mutex_lock(&tsession->s_mutex); | ||
| 2867 | mutex_lock_nested(&session->s_mutex, | ||
| 2868 | SINGLE_DEPTH_NESTING); | ||
| 2869 | } | ||
| 2870 | ceph_add_cap_releases(mdsc, tsession); | ||
| 2871 | } else { | ||
| 2872 | WARN_ON(1); | ||
| 2873 | tsession = NULL; | ||
| 2874 | target = -1; | ||
| 2875 | } | ||
| 2876 | goto retry; | ||
| 2877 | |||
| 2878 | out_unlock: | ||
| 2802 | spin_unlock(&ci->i_ceph_lock); | 2879 | spin_unlock(&ci->i_ceph_lock); |
| 2880 | mutex_unlock(&session->s_mutex); | ||
| 2881 | if (tsession) { | ||
| 2882 | mutex_unlock(&tsession->s_mutex); | ||
| 2883 | ceph_put_mds_session(tsession); | ||
| 2884 | } | ||
| 2803 | } | 2885 | } |
| 2804 | 2886 | ||
| 2805 | /* | 2887 | /* |
| @@ -2810,10 +2892,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, | |||
| 2810 | */ | 2892 | */ |
| 2811 | static void handle_cap_import(struct ceph_mds_client *mdsc, | 2893 | static void handle_cap_import(struct ceph_mds_client *mdsc, |
| 2812 | struct inode *inode, struct ceph_mds_caps *im, | 2894 | struct inode *inode, struct ceph_mds_caps *im, |
| 2895 | struct ceph_mds_cap_peer *ph, | ||
| 2813 | struct ceph_mds_session *session, | 2896 | struct ceph_mds_session *session, |
| 2814 | void *snaptrace, int snaptrace_len) | 2897 | void *snaptrace, int snaptrace_len) |
| 2815 | { | 2898 | { |
| 2816 | struct ceph_inode_info *ci = ceph_inode(inode); | 2899 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 2900 | struct ceph_cap *cap; | ||
| 2817 | int mds = session->s_mds; | 2901 | int mds = session->s_mds; |
| 2818 | unsigned issued = le32_to_cpu(im->caps); | 2902 | unsigned issued = le32_to_cpu(im->caps); |
| 2819 | unsigned wanted = le32_to_cpu(im->wanted); | 2903 | unsigned wanted = le32_to_cpu(im->wanted); |
| @@ -2821,28 +2905,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, | |||
| 2821 | unsigned mseq = le32_to_cpu(im->migrate_seq); | 2905 | unsigned mseq = le32_to_cpu(im->migrate_seq); |
| 2822 | u64 realmino = le64_to_cpu(im->realm); | 2906 | u64 realmino = le64_to_cpu(im->realm); |
| 2823 | u64 cap_id = le64_to_cpu(im->cap_id); | 2907 | u64 cap_id = le64_to_cpu(im->cap_id); |
| 2908 | u64 p_cap_id; | ||
| 2909 | int peer; | ||
| 2824 | 2910 | ||
| 2825 | if (ci->i_cap_exporting_mds >= 0 && | 2911 | if (ph) { |
| 2826 | ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { | 2912 | p_cap_id = le64_to_cpu(ph->cap_id); |
| 2827 | dout("handle_cap_import inode %p ci %p mds%d mseq %d" | 2913 | peer = le32_to_cpu(ph->mds); |
| 2828 | " - cleared exporting from mds%d\n", | 2914 | } else { |
| 2829 | inode, ci, mds, mseq, | 2915 | p_cap_id = 0; |
| 2830 | ci->i_cap_exporting_mds); | 2916 | peer = -1; |
| 2831 | ci->i_cap_exporting_issued = 0; | 2917 | } |
| 2832 | ci->i_cap_exporting_mseq = 0; | ||
| 2833 | ci->i_cap_exporting_mds = -1; | ||
| 2834 | 2918 | ||
| 2835 | spin_lock(&mdsc->cap_dirty_lock); | 2919 | dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", |
| 2836 | if (!list_empty(&ci->i_dirty_item)) { | 2920 | inode, ci, mds, mseq, peer); |
| 2837 | dout(" moving %p back to cap_dirty\n", inode); | 2921 | |
| 2838 | list_move(&ci->i_dirty_item, &mdsc->cap_dirty); | 2922 | spin_lock(&ci->i_ceph_lock); |
| 2923 | cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; | ||
| 2924 | if (cap && cap->cap_id == p_cap_id) { | ||
| 2925 | dout(" remove export cap %p mds%d flags %d\n", | ||
| 2926 | cap, peer, ph->flags); | ||
| 2927 | if ((ph->flags & CEPH_CAP_FLAG_AUTH) && | ||
| 2928 | (cap->seq != le32_to_cpu(ph->seq) || | ||
| 2929 | cap->mseq != le32_to_cpu(ph->mseq))) { | ||
| 2930 | pr_err("handle_cap_import: mismatched seq/mseq: " | ||
| 2931 | "ino (%llx.%llx) mds%d seq %d mseq %d " | ||
| 2932 | "importer mds%d has peer seq %d mseq %d\n", | ||
| 2933 | ceph_vinop(inode), peer, cap->seq, | ||
| 2934 | cap->mseq, mds, le32_to_cpu(ph->seq), | ||
| 2935 | le32_to_cpu(ph->mseq)); | ||
| 2839 | } | 2936 | } |
| 2840 | spin_unlock(&mdsc->cap_dirty_lock); | 2937 | ci->i_cap_exporting_issued = cap->issued; |
| 2841 | } else { | 2938 | __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); |
| 2842 | dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", | ||
| 2843 | inode, ci, mds, mseq); | ||
| 2844 | } | 2939 | } |
| 2845 | 2940 | ||
| 2941 | /* make sure we re-request max_size, if necessary */ | ||
| 2942 | ci->i_wanted_max_size = 0; | ||
| 2943 | ci->i_requested_max_size = 0; | ||
| 2944 | spin_unlock(&ci->i_ceph_lock); | ||
| 2945 | |||
| 2846 | down_write(&mdsc->snap_rwsem); | 2946 | down_write(&mdsc->snap_rwsem); |
| 2847 | ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, | 2947 | ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, |
| 2848 | false); | 2948 | false); |
| @@ -2853,11 +2953,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, | |||
| 2853 | kick_flushing_inode_caps(mdsc, session, inode); | 2953 | kick_flushing_inode_caps(mdsc, session, inode); |
| 2854 | up_read(&mdsc->snap_rwsem); | 2954 | up_read(&mdsc->snap_rwsem); |
| 2855 | 2955 | ||
| 2856 | /* make sure we re-request max_size, if necessary */ | ||
| 2857 | spin_lock(&ci->i_ceph_lock); | ||
| 2858 | ci->i_wanted_max_size = 0; /* reset */ | ||
| 2859 | ci->i_requested_max_size = 0; | ||
| 2860 | spin_unlock(&ci->i_ceph_lock); | ||
| 2861 | } | 2956 | } |
| 2862 | 2957 | ||
| 2863 | /* | 2958 | /* |
| @@ -2875,6 +2970,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2875 | struct ceph_inode_info *ci; | 2970 | struct ceph_inode_info *ci; |
| 2876 | struct ceph_cap *cap; | 2971 | struct ceph_cap *cap; |
| 2877 | struct ceph_mds_caps *h; | 2972 | struct ceph_mds_caps *h; |
| 2973 | struct ceph_mds_cap_peer *peer = NULL; | ||
| 2878 | int mds = session->s_mds; | 2974 | int mds = session->s_mds; |
| 2879 | int op; | 2975 | int op; |
| 2880 | u32 seq, mseq; | 2976 | u32 seq, mseq; |
| @@ -2885,12 +2981,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2885 | void *snaptrace; | 2981 | void *snaptrace; |
| 2886 | size_t snaptrace_len; | 2982 | size_t snaptrace_len; |
| 2887 | void *flock; | 2983 | void *flock; |
| 2984 | void *end; | ||
| 2888 | u32 flock_len; | 2985 | u32 flock_len; |
| 2889 | int open_target_sessions = 0; | ||
| 2890 | 2986 | ||
| 2891 | dout("handle_caps from mds%d\n", mds); | 2987 | dout("handle_caps from mds%d\n", mds); |
| 2892 | 2988 | ||
| 2893 | /* decode */ | 2989 | /* decode */ |
| 2990 | end = msg->front.iov_base + msg->front.iov_len; | ||
| 2894 | tid = le64_to_cpu(msg->hdr.tid); | 2991 | tid = le64_to_cpu(msg->hdr.tid); |
| 2895 | if (msg->front.iov_len < sizeof(*h)) | 2992 | if (msg->front.iov_len < sizeof(*h)) |
| 2896 | goto bad; | 2993 | goto bad; |
| @@ -2908,17 +3005,28 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2908 | snaptrace_len = le32_to_cpu(h->snap_trace_len); | 3005 | snaptrace_len = le32_to_cpu(h->snap_trace_len); |
| 2909 | 3006 | ||
| 2910 | if (le16_to_cpu(msg->hdr.version) >= 2) { | 3007 | if (le16_to_cpu(msg->hdr.version) >= 2) { |
| 2911 | void *p, *end; | 3008 | void *p = snaptrace + snaptrace_len; |
| 2912 | |||
| 2913 | p = snaptrace + snaptrace_len; | ||
| 2914 | end = msg->front.iov_base + msg->front.iov_len; | ||
| 2915 | ceph_decode_32_safe(&p, end, flock_len, bad); | 3009 | ceph_decode_32_safe(&p, end, flock_len, bad); |
| 3010 | if (p + flock_len > end) | ||
| 3011 | goto bad; | ||
| 2916 | flock = p; | 3012 | flock = p; |
| 2917 | } else { | 3013 | } else { |
| 2918 | flock = NULL; | 3014 | flock = NULL; |
| 2919 | flock_len = 0; | 3015 | flock_len = 0; |
| 2920 | } | 3016 | } |
| 2921 | 3017 | ||
| 3018 | if (le16_to_cpu(msg->hdr.version) >= 3) { | ||
| 3019 | if (op == CEPH_CAP_OP_IMPORT) { | ||
| 3020 | void *p = flock + flock_len; | ||
| 3021 | if (p + sizeof(*peer) > end) | ||
| 3022 | goto bad; | ||
| 3023 | peer = p; | ||
| 3024 | } else if (op == CEPH_CAP_OP_EXPORT) { | ||
| 3025 | /* recorded in unused fields */ | ||
| 3026 | peer = (void *)&h->size; | ||
| 3027 | } | ||
| 3028 | } | ||
| 3029 | |||
| 2922 | mutex_lock(&session->s_mutex); | 3030 | mutex_lock(&session->s_mutex); |
| 2923 | session->s_seq++; | 3031 | session->s_seq++; |
| 2924 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, | 3032 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, |
| @@ -2951,11 +3059,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2951 | goto done; | 3059 | goto done; |
| 2952 | 3060 | ||
| 2953 | case CEPH_CAP_OP_EXPORT: | 3061 | case CEPH_CAP_OP_EXPORT: |
| 2954 | handle_cap_export(inode, h, session, &open_target_sessions); | 3062 | handle_cap_export(inode, h, peer, session); |
| 2955 | goto done; | 3063 | goto done_unlocked; |
| 2956 | 3064 | ||
| 2957 | case CEPH_CAP_OP_IMPORT: | 3065 | case CEPH_CAP_OP_IMPORT: |
| 2958 | handle_cap_import(mdsc, inode, h, session, | 3066 | handle_cap_import(mdsc, inode, h, peer, session, |
| 2959 | snaptrace, snaptrace_len); | 3067 | snaptrace, snaptrace_len); |
| 2960 | } | 3068 | } |
| 2961 | 3069 | ||
| @@ -3007,8 +3115,6 @@ done: | |||
| 3007 | done_unlocked: | 3115 | done_unlocked: |
| 3008 | if (inode) | 3116 | if (inode) |
| 3009 | iput(inode); | 3117 | iput(inode); |
| 3010 | if (open_target_sessions) | ||
| 3011 | ceph_mdsc_open_export_target_sessions(mdsc, session); | ||
| 3012 | return; | 3118 | return; |
| 3013 | 3119 | ||
| 3014 | bad: | 3120 | bad: |
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2a0bcaeb189a..6da4df84ba30 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
| @@ -693,6 +693,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, | |||
| 693 | if (!err && !req->r_reply_info.head->is_dentry) | 693 | if (!err && !req->r_reply_info.head->is_dentry) |
| 694 | err = ceph_handle_notrace_create(dir, dentry); | 694 | err = ceph_handle_notrace_create(dir, dentry); |
| 695 | ceph_mdsc_put_request(req); | 695 | ceph_mdsc_put_request(req); |
| 696 | |||
| 697 | if (!err) | ||
| 698 | err = ceph_init_acl(dentry, dentry->d_inode, dir); | ||
| 699 | |||
| 696 | if (err) | 700 | if (err) |
| 697 | d_drop(dentry); | 701 | d_drop(dentry); |
| 698 | return err; | 702 | return err; |
| @@ -1037,14 +1041,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) | |||
| 1037 | valid = 1; | 1041 | valid = 1; |
| 1038 | } else if (dentry_lease_is_valid(dentry) || | 1042 | } else if (dentry_lease_is_valid(dentry) || |
| 1039 | dir_lease_is_valid(dir, dentry)) { | 1043 | dir_lease_is_valid(dir, dentry)) { |
| 1040 | valid = 1; | 1044 | if (dentry->d_inode) |
| 1045 | valid = ceph_is_any_caps(dentry->d_inode); | ||
| 1046 | else | ||
| 1047 | valid = 1; | ||
| 1041 | } | 1048 | } |
| 1042 | 1049 | ||
| 1043 | dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); | 1050 | dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); |
| 1044 | if (valid) | 1051 | if (valid) { |
| 1045 | ceph_dentry_lru_touch(dentry); | 1052 | ceph_dentry_lru_touch(dentry); |
| 1046 | else | 1053 | } else { |
| 1054 | ceph_dir_clear_complete(dir); | ||
| 1047 | d_drop(dentry); | 1055 | d_drop(dentry); |
| 1056 | } | ||
| 1048 | iput(dir); | 1057 | iput(dir); |
| 1049 | return valid; | 1058 | return valid; |
| 1050 | } | 1059 | } |
| @@ -1293,6 +1302,8 @@ const struct inode_operations ceph_dir_iops = { | |||
| 1293 | .getxattr = ceph_getxattr, | 1302 | .getxattr = ceph_getxattr, |
| 1294 | .listxattr = ceph_listxattr, | 1303 | .listxattr = ceph_listxattr, |
| 1295 | .removexattr = ceph_removexattr, | 1304 | .removexattr = ceph_removexattr, |
| 1305 | .get_acl = ceph_get_acl, | ||
| 1306 | .set_acl = ceph_set_acl, | ||
| 1296 | .mknod = ceph_mknod, | 1307 | .mknod = ceph_mknod, |
| 1297 | .symlink = ceph_symlink, | 1308 | .symlink = ceph_symlink, |
| 1298 | .mkdir = ceph_mkdir, | 1309 | .mkdir = ceph_mkdir, |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de89829e2a1..dfd2ce3419f8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
| @@ -408,51 +408,92 @@ more: | |||
| 408 | * | 408 | * |
| 409 | * If the read spans object boundary, just do multiple reads. | 409 | * If the read spans object boundary, just do multiple reads. |
| 410 | */ | 410 | */ |
| 411 | static ssize_t ceph_sync_read(struct file *file, char __user *data, | 411 | static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, |
| 412 | unsigned len, loff_t *poff, int *checkeof) | 412 | int *checkeof) |
| 413 | { | 413 | { |
| 414 | struct file *file = iocb->ki_filp; | ||
| 414 | struct inode *inode = file_inode(file); | 415 | struct inode *inode = file_inode(file); |
| 415 | struct page **pages; | 416 | struct page **pages; |
| 416 | u64 off = *poff; | 417 | u64 off = iocb->ki_pos; |
| 417 | int num_pages, ret; | 418 | int num_pages, ret; |
| 419 | size_t len = i->count; | ||
| 418 | 420 | ||
| 419 | dout("sync_read on file %p %llu~%u %s\n", file, off, len, | 421 | dout("sync_read on file %p %llu~%u %s\n", file, off, |
| 422 | (unsigned)len, | ||
| 420 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); | 423 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); |
| 421 | |||
| 422 | if (file->f_flags & O_DIRECT) { | ||
| 423 | num_pages = calc_pages_for((unsigned long)data, len); | ||
| 424 | pages = ceph_get_direct_page_vector(data, num_pages, true); | ||
| 425 | } else { | ||
| 426 | num_pages = calc_pages_for(off, len); | ||
| 427 | pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); | ||
| 428 | } | ||
| 429 | if (IS_ERR(pages)) | ||
| 430 | return PTR_ERR(pages); | ||
| 431 | |||
| 432 | /* | 424 | /* |
| 433 | * flush any page cache pages in this range. this | 425 | * flush any page cache pages in this range. this |
| 434 | * will make concurrent normal and sync io slow, | 426 | * will make concurrent normal and sync io slow, |
| 435 | * but it will at least behave sensibly when they are | 427 | * but it will at least behave sensibly when they are |
| 436 | * in sequence. | 428 | * in sequence. |
| 437 | */ | 429 | */ |
| 438 | ret = filemap_write_and_wait(inode->i_mapping); | 430 | ret = filemap_write_and_wait_range(inode->i_mapping, off, |
| 431 | off + len); | ||
| 439 | if (ret < 0) | 432 | if (ret < 0) |
| 440 | goto done; | 433 | return ret; |
| 441 | 434 | ||
| 442 | ret = striped_read(inode, off, len, pages, num_pages, checkeof, | 435 | if (file->f_flags & O_DIRECT) { |
| 443 | file->f_flags & O_DIRECT, | 436 | while (iov_iter_count(i)) { |
| 444 | (unsigned long)data & ~PAGE_MASK); | 437 | void __user *data = i->iov[0].iov_base + i->iov_offset; |
| 438 | size_t len = i->iov[0].iov_len - i->iov_offset; | ||
| 439 | |||
| 440 | num_pages = calc_pages_for((unsigned long)data, len); | ||
| 441 | pages = ceph_get_direct_page_vector(data, | ||
| 442 | num_pages, true); | ||
| 443 | if (IS_ERR(pages)) | ||
| 444 | return PTR_ERR(pages); | ||
| 445 | |||
| 446 | ret = striped_read(inode, off, len, | ||
| 447 | pages, num_pages, checkeof, | ||
| 448 | 1, (unsigned long)data & ~PAGE_MASK); | ||
| 449 | ceph_put_page_vector(pages, num_pages, true); | ||
| 450 | |||
| 451 | if (ret <= 0) | ||
| 452 | break; | ||
| 453 | off += ret; | ||
| 454 | iov_iter_advance(i, ret); | ||
| 455 | if (ret < len) | ||
| 456 | break; | ||
| 457 | } | ||
| 458 | } else { | ||
| 459 | num_pages = calc_pages_for(off, len); | ||
| 460 | pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); | ||
| 461 | if (IS_ERR(pages)) | ||
| 462 | return PTR_ERR(pages); | ||
| 463 | ret = striped_read(inode, off, len, pages, | ||
| 464 | num_pages, checkeof, 0, 0); | ||
| 465 | if (ret > 0) { | ||
| 466 | int l, k = 0; | ||
| 467 | size_t left = len = ret; | ||
| 468 | |||
| 469 | while (left) { | ||
| 470 | void __user *data = i->iov[0].iov_base | ||
| 471 | + i->iov_offset; | ||
| 472 | l = min(i->iov[0].iov_len - i->iov_offset, | ||
| 473 | left); | ||
| 474 | |||
| 475 | ret = ceph_copy_page_vector_to_user(&pages[k], | ||
| 476 | data, off, | ||
| 477 | l); | ||
| 478 | if (ret > 0) { | ||
| 479 | iov_iter_advance(i, ret); | ||
| 480 | left -= ret; | ||
| 481 | off += ret; | ||
| 482 | k = calc_pages_for(iocb->ki_pos, | ||
| 483 | len - left + 1) - 1; | ||
| 484 | BUG_ON(k >= num_pages && left); | ||
| 485 | } else | ||
| 486 | break; | ||
| 487 | } | ||
| 488 | } | ||
| 489 | ceph_release_page_vector(pages, num_pages); | ||
| 490 | } | ||
| 445 | 491 | ||
| 446 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) | 492 | if (off > iocb->ki_pos) { |
| 447 | ret = ceph_copy_page_vector_to_user(pages, data, off, ret); | 493 | ret = off - iocb->ki_pos; |
| 448 | if (ret >= 0) | 494 | iocb->ki_pos = off; |
| 449 | *poff = off + ret; | 495 | } |
| 450 | 496 | ||
| 451 | done: | ||
| 452 | if (file->f_flags & O_DIRECT) | ||
| 453 | ceph_put_page_vector(pages, num_pages, true); | ||
| 454 | else | ||
| 455 | ceph_release_page_vector(pages, num_pages); | ||
| 456 | dout("sync_read result %d\n", ret); | 497 | dout("sync_read result %d\n", ret); |
| 457 | return ret; | 498 | return ret; |
| 458 | } | 499 | } |
| @@ -489,83 +530,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) | |||
| 489 | } | 530 | } |
| 490 | } | 531 | } |
| 491 | 532 | ||
| 533 | |||
| 492 | /* | 534 | /* |
| 493 | * Synchronous write, straight from __user pointer or user pages (if | 535 | * Synchronous write, straight from __user pointer or user pages. |
| 494 | * O_DIRECT). | ||
| 495 | * | 536 | * |
| 496 | * If write spans object boundary, just do multiple writes. (For a | 537 | * If write spans object boundary, just do multiple writes. (For a |
| 497 | * correct atomic write, we should e.g. take write locks on all | 538 | * correct atomic write, we should e.g. take write locks on all |
| 498 | * objects, rollback on failure, etc.) | 539 | * objects, rollback on failure, etc.) |
| 499 | */ | 540 | */ |
| 500 | static ssize_t ceph_sync_write(struct file *file, const char __user *data, | 541 | static ssize_t |
| 501 | size_t left, loff_t pos, loff_t *ppos) | 542 | ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, |
| 543 | unsigned long nr_segs, size_t count) | ||
| 502 | { | 544 | { |
| 545 | struct file *file = iocb->ki_filp; | ||
| 503 | struct inode *inode = file_inode(file); | 546 | struct inode *inode = file_inode(file); |
| 504 | struct ceph_inode_info *ci = ceph_inode(inode); | 547 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 505 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 548 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
| 506 | struct ceph_snap_context *snapc; | 549 | struct ceph_snap_context *snapc; |
| 507 | struct ceph_vino vino; | 550 | struct ceph_vino vino; |
| 508 | struct ceph_osd_request *req; | 551 | struct ceph_osd_request *req; |
| 509 | int num_ops = 1; | ||
| 510 | struct page **pages; | 552 | struct page **pages; |
| 511 | int num_pages; | 553 | int num_pages; |
| 512 | u64 len; | ||
| 513 | int written = 0; | 554 | int written = 0; |
| 514 | int flags; | 555 | int flags; |
| 515 | int check_caps = 0; | 556 | int check_caps = 0; |
| 516 | int page_align, io_align; | 557 | int page_align; |
| 517 | unsigned long buf_align; | ||
| 518 | int ret; | 558 | int ret; |
| 519 | struct timespec mtime = CURRENT_TIME; | 559 | struct timespec mtime = CURRENT_TIME; |
| 520 | bool own_pages = false; | 560 | loff_t pos = iocb->ki_pos; |
| 561 | struct iov_iter i; | ||
| 521 | 562 | ||
| 522 | if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) | 563 | if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) |
| 523 | return -EROFS; | 564 | return -EROFS; |
| 524 | 565 | ||
| 525 | dout("sync_write on file %p %lld~%u %s\n", file, pos, | 566 | dout("sync_direct_write on file %p %lld~%u\n", file, pos, |
| 526 | (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); | 567 | (unsigned)count); |
| 527 | 568 | ||
| 528 | ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); | 569 | ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); |
| 529 | if (ret < 0) | 570 | if (ret < 0) |
| 530 | return ret; | 571 | return ret; |
| 531 | 572 | ||
| 532 | ret = invalidate_inode_pages2_range(inode->i_mapping, | 573 | ret = invalidate_inode_pages2_range(inode->i_mapping, |
| 533 | pos >> PAGE_CACHE_SHIFT, | 574 | pos >> PAGE_CACHE_SHIFT, |
| 534 | (pos + left) >> PAGE_CACHE_SHIFT); | 575 | (pos + count) >> PAGE_CACHE_SHIFT); |
| 535 | if (ret < 0) | 576 | if (ret < 0) |
| 536 | dout("invalidate_inode_pages2_range returned %d\n", ret); | 577 | dout("invalidate_inode_pages2_range returned %d\n", ret); |
| 537 | 578 | ||
| 538 | flags = CEPH_OSD_FLAG_ORDERSNAP | | 579 | flags = CEPH_OSD_FLAG_ORDERSNAP | |
| 539 | CEPH_OSD_FLAG_ONDISK | | 580 | CEPH_OSD_FLAG_ONDISK | |
| 540 | CEPH_OSD_FLAG_WRITE; | 581 | CEPH_OSD_FLAG_WRITE; |
| 541 | if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) | ||
| 542 | flags |= CEPH_OSD_FLAG_ACK; | ||
| 543 | else | ||
| 544 | num_ops++; /* Also include a 'startsync' command. */ | ||
| 545 | 582 | ||
| 546 | /* | 583 | iov_iter_init(&i, iov, nr_segs, count, 0); |
| 547 | * we may need to do multiple writes here if we span an object | 584 | |
| 548 | * boundary. this isn't atomic, unfortunately. :( | 585 | while (iov_iter_count(&i) > 0) { |
| 549 | */ | 586 | void __user *data = i.iov->iov_base + i.iov_offset; |
| 550 | more: | 587 | u64 len = i.iov->iov_len - i.iov_offset; |
| 551 | io_align = pos & ~PAGE_MASK; | 588 | |
| 552 | buf_align = (unsigned long)data & ~PAGE_MASK; | 589 | page_align = (unsigned long)data & ~PAGE_MASK; |
| 553 | len = left; | 590 | |
| 554 | 591 | snapc = ci->i_snap_realm->cached_context; | |
| 555 | snapc = ci->i_snap_realm->cached_context; | 592 | vino = ceph_vino(inode); |
| 556 | vino = ceph_vino(inode); | 593 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
| 557 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | 594 | vino, pos, &len, |
| 558 | vino, pos, &len, num_ops, | 595 | 2,/*include a 'startsync' command*/ |
| 559 | CEPH_OSD_OP_WRITE, flags, snapc, | 596 | CEPH_OSD_OP_WRITE, flags, snapc, |
| 560 | ci->i_truncate_seq, ci->i_truncate_size, | 597 | ci->i_truncate_seq, |
| 561 | false); | 598 | ci->i_truncate_size, |
| 562 | if (IS_ERR(req)) | 599 | false); |
| 563 | return PTR_ERR(req); | 600 | if (IS_ERR(req)) { |
| 601 | ret = PTR_ERR(req); | ||
| 602 | goto out; | ||
| 603 | } | ||
| 564 | 604 | ||
| 565 | /* write from beginning of first page, regardless of io alignment */ | 605 | num_pages = calc_pages_for(page_align, len); |
| 566 | page_align = file->f_flags & O_DIRECT ? buf_align : io_align; | ||
| 567 | num_pages = calc_pages_for(page_align, len); | ||
| 568 | if (file->f_flags & O_DIRECT) { | ||
| 569 | pages = ceph_get_direct_page_vector(data, num_pages, false); | 606 | pages = ceph_get_direct_page_vector(data, num_pages, false); |
| 570 | if (IS_ERR(pages)) { | 607 | if (IS_ERR(pages)) { |
| 571 | ret = PTR_ERR(pages); | 608 | ret = PTR_ERR(pages); |
| @@ -577,60 +614,175 @@ more: | |||
| 577 | * may block. | 614 | * may block. |
| 578 | */ | 615 | */ |
| 579 | truncate_inode_pages_range(inode->i_mapping, pos, | 616 | truncate_inode_pages_range(inode->i_mapping, pos, |
| 580 | (pos+len) | (PAGE_CACHE_SIZE-1)); | 617 | (pos+len) | (PAGE_CACHE_SIZE-1)); |
| 581 | } else { | 618 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, |
| 619 | false, false); | ||
| 620 | |||
| 621 | /* BUG_ON(vino.snap != CEPH_NOSNAP); */ | ||
| 622 | ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); | ||
| 623 | |||
| 624 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); | ||
| 625 | if (!ret) | ||
| 626 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); | ||
| 627 | |||
| 628 | ceph_put_page_vector(pages, num_pages, false); | ||
| 629 | |||
| 630 | out: | ||
| 631 | ceph_osdc_put_request(req); | ||
| 632 | if (ret == 0) { | ||
| 633 | pos += len; | ||
| 634 | written += len; | ||
| 635 | iov_iter_advance(&i, (size_t)len); | ||
| 636 | |||
| 637 | if (pos > i_size_read(inode)) { | ||
| 638 | check_caps = ceph_inode_set_size(inode, pos); | ||
| 639 | if (check_caps) | ||
| 640 | ceph_check_caps(ceph_inode(inode), | ||
| 641 | CHECK_CAPS_AUTHONLY, | ||
| 642 | NULL); | ||
| 643 | } | ||
| 644 | } else | ||
| 645 | break; | ||
| 646 | } | ||
| 647 | |||
| 648 | if (ret != -EOLDSNAPC && written > 0) { | ||
| 649 | iocb->ki_pos = pos; | ||
| 650 | ret = written; | ||
| 651 | } | ||
| 652 | return ret; | ||
| 653 | } | ||
| 654 | |||
| 655 | |||
| 656 | /* | ||
| 657 | * Synchronous write, straight from __user pointer or user pages. | ||
| 658 | * | ||
| 659 | * If write spans object boundary, just do multiple writes. (For a | ||
| 660 | * correct atomic write, we should e.g. take write locks on all | ||
| 661 | * objects, rollback on failure, etc.) | ||
| 662 | */ | ||
| 663 | static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 664 | unsigned long nr_segs, size_t count) | ||
| 665 | { | ||
| 666 | struct file *file = iocb->ki_filp; | ||
| 667 | struct inode *inode = file_inode(file); | ||
| 668 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 669 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
| 670 | struct ceph_snap_context *snapc; | ||
| 671 | struct ceph_vino vino; | ||
| 672 | struct ceph_osd_request *req; | ||
| 673 | struct page **pages; | ||
| 674 | u64 len; | ||
| 675 | int num_pages; | ||
| 676 | int written = 0; | ||
| 677 | int flags; | ||
| 678 | int check_caps = 0; | ||
| 679 | int ret; | ||
| 680 | struct timespec mtime = CURRENT_TIME; | ||
| 681 | loff_t pos = iocb->ki_pos; | ||
| 682 | struct iov_iter i; | ||
| 683 | |||
| 684 | if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) | ||
| 685 | return -EROFS; | ||
| 686 | |||
| 687 | dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); | ||
| 688 | |||
| 689 | ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); | ||
| 690 | if (ret < 0) | ||
| 691 | return ret; | ||
| 692 | |||
| 693 | ret = invalidate_inode_pages2_range(inode->i_mapping, | ||
| 694 | pos >> PAGE_CACHE_SHIFT, | ||
| 695 | (pos + count) >> PAGE_CACHE_SHIFT); | ||
| 696 | if (ret < 0) | ||
| 697 | dout("invalidate_inode_pages2_range returned %d\n", ret); | ||
| 698 | |||
| 699 | flags = CEPH_OSD_FLAG_ORDERSNAP | | ||
| 700 | CEPH_OSD_FLAG_ONDISK | | ||
| 701 | CEPH_OSD_FLAG_WRITE | | ||
| 702 | CEPH_OSD_FLAG_ACK; | ||
| 703 | |||
| 704 | iov_iter_init(&i, iov, nr_segs, count, 0); | ||
| 705 | |||
| 706 | while ((len = iov_iter_count(&i)) > 0) { | ||
| 707 | size_t left; | ||
| 708 | int n; | ||
| 709 | |||
| 710 | snapc = ci->i_snap_realm->cached_context; | ||
| 711 | vino = ceph_vino(inode); | ||
| 712 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | ||
| 713 | vino, pos, &len, 1, | ||
| 714 | CEPH_OSD_OP_WRITE, flags, snapc, | ||
| 715 | ci->i_truncate_seq, | ||
| 716 | ci->i_truncate_size, | ||
| 717 | false); | ||
| 718 | if (IS_ERR(req)) { | ||
| 719 | ret = PTR_ERR(req); | ||
| 720 | goto out; | ||
| 721 | } | ||
| 722 | |||
| 723 | /* | ||
| 724 | * write from beginning of first page, | ||
| 725 | * regardless of io alignment | ||
| 726 | */ | ||
| 727 | num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 728 | |||
| 582 | pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); | 729 | pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); |
| 583 | if (IS_ERR(pages)) { | 730 | if (IS_ERR(pages)) { |
| 584 | ret = PTR_ERR(pages); | 731 | ret = PTR_ERR(pages); |
| 585 | goto out; | 732 | goto out; |
| 586 | } | 733 | } |
| 587 | ret = ceph_copy_user_to_page_vector(pages, data, pos, len); | 734 | |
| 735 | left = len; | ||
| 736 | for (n = 0; n < num_pages; n++) { | ||
| 737 | size_t plen = min_t(size_t, left, PAGE_SIZE); | ||
| 738 | ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); | ||
| 739 | if (ret != plen) { | ||
| 740 | ret = -EFAULT; | ||
| 741 | break; | ||
| 742 | } | ||
| 743 | left -= ret; | ||
| 744 | iov_iter_advance(&i, ret); | ||
| 745 | } | ||
| 746 | |||
| 588 | if (ret < 0) { | 747 | if (ret < 0) { |
| 589 | ceph_release_page_vector(pages, num_pages); | 748 | ceph_release_page_vector(pages, num_pages); |
| 590 | goto out; | 749 | goto out; |
| 591 | } | 750 | } |
| 592 | 751 | ||
| 593 | if ((file->f_flags & O_SYNC) == 0) { | 752 | /* get a second commit callback */ |
| 594 | /* get a second commit callback */ | 753 | req->r_unsafe_callback = ceph_sync_write_unsafe; |
| 595 | req->r_unsafe_callback = ceph_sync_write_unsafe; | 754 | req->r_inode = inode; |
| 596 | req->r_inode = inode; | ||
| 597 | own_pages = true; | ||
| 598 | } | ||
| 599 | } | ||
| 600 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, | ||
| 601 | false, own_pages); | ||
| 602 | 755 | ||
| 603 | /* BUG_ON(vino.snap != CEPH_NOSNAP); */ | 756 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, |
| 604 | ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); | 757 | false, true); |
| 605 | 758 | ||
| 606 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); | 759 | /* BUG_ON(vino.snap != CEPH_NOSNAP); */ |
| 607 | if (!ret) | 760 | ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); |
| 608 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); | ||
| 609 | 761 | ||
| 610 | if (file->f_flags & O_DIRECT) | 762 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
| 611 | ceph_put_page_vector(pages, num_pages, false); | 763 | if (!ret) |
| 612 | else if (file->f_flags & O_SYNC) | 764 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
| 613 | ceph_release_page_vector(pages, num_pages); | ||
| 614 | 765 | ||
| 615 | out: | 766 | out: |
| 616 | ceph_osdc_put_request(req); | 767 | ceph_osdc_put_request(req); |
| 617 | if (ret == 0) { | 768 | if (ret == 0) { |
| 618 | pos += len; | 769 | pos += len; |
| 619 | written += len; | 770 | written += len; |
| 620 | left -= len; | 771 | |
| 621 | data += len; | 772 | if (pos > i_size_read(inode)) { |
| 622 | if (left) | 773 | check_caps = ceph_inode_set_size(inode, pos); |
| 623 | goto more; | 774 | if (check_caps) |
| 775 | ceph_check_caps(ceph_inode(inode), | ||
| 776 | CHECK_CAPS_AUTHONLY, | ||
| 777 | NULL); | ||
| 778 | } | ||
| 779 | } else | ||
| 780 | break; | ||
| 781 | } | ||
| 624 | 782 | ||
| 783 | if (ret != -EOLDSNAPC && written > 0) { | ||
| 625 | ret = written; | 784 | ret = written; |
| 626 | *ppos = pos; | 785 | iocb->ki_pos = pos; |
| 627 | if (pos > i_size_read(inode)) | ||
| 628 | check_caps = ceph_inode_set_size(inode, pos); | ||
| 629 | if (check_caps) | ||
| 630 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, | ||
| 631 | NULL); | ||
| 632 | } else if (ret != -EOLDSNAPC && written > 0) { | ||
| 633 | ret = written; | ||
| 634 | } | 786 | } |
| 635 | return ret; | 787 | return ret; |
| 636 | } | 788 | } |
| @@ -647,55 +799,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 647 | { | 799 | { |
| 648 | struct file *filp = iocb->ki_filp; | 800 | struct file *filp = iocb->ki_filp; |
| 649 | struct ceph_file_info *fi = filp->private_data; | 801 | struct ceph_file_info *fi = filp->private_data; |
| 650 | loff_t *ppos = &iocb->ki_pos; | 802 | size_t len = iocb->ki_nbytes; |
| 651 | size_t len = iov->iov_len; | ||
| 652 | struct inode *inode = file_inode(filp); | 803 | struct inode *inode = file_inode(filp); |
| 653 | struct ceph_inode_info *ci = ceph_inode(inode); | 804 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 654 | void __user *base = iov->iov_base; | ||
| 655 | ssize_t ret; | 805 | ssize_t ret; |
| 656 | int want, got = 0; | 806 | int want, got = 0; |
| 657 | int checkeof = 0, read = 0; | 807 | int checkeof = 0, read = 0; |
| 658 | 808 | ||
| 659 | dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", | ||
| 660 | inode, ceph_vinop(inode), pos, (unsigned)len, inode); | ||
| 661 | again: | 809 | again: |
| 810 | dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", | ||
| 811 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); | ||
| 812 | |||
| 662 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | 813 | if (fi->fmode & CEPH_FILE_MODE_LAZY) |
| 663 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; | 814 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; |
| 664 | else | 815 | else |
| 665 | want = CEPH_CAP_FILE_CACHE; | 816 | want = CEPH_CAP_FILE_CACHE; |
| 666 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); | 817 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); |
| 667 | if (ret < 0) | 818 | if (ret < 0) |
| 668 | goto out; | 819 | return ret; |
| 669 | dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
| 670 | inode, ceph_vinop(inode), pos, (unsigned)len, | ||
| 671 | ceph_cap_string(got)); | ||
| 672 | 820 | ||
| 673 | if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || | 821 | if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || |
| 674 | (iocb->ki_filp->f_flags & O_DIRECT) || | 822 | (iocb->ki_filp->f_flags & O_DIRECT) || |
| 675 | (fi->flags & CEPH_F_SYNC)) | 823 | (fi->flags & CEPH_F_SYNC)) { |
| 824 | struct iov_iter i; | ||
| 825 | |||
| 826 | dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
| 827 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, | ||
| 828 | ceph_cap_string(got)); | ||
| 829 | |||
| 830 | if (!read) { | ||
| 831 | ret = generic_segment_checks(iov, &nr_segs, | ||
| 832 | &len, VERIFY_WRITE); | ||
| 833 | if (ret) | ||
| 834 | goto out; | ||
| 835 | } | ||
| 836 | |||
| 837 | iov_iter_init(&i, iov, nr_segs, len, read); | ||
| 838 | |||
| 676 | /* hmm, this isn't really async... */ | 839 | /* hmm, this isn't really async... */ |
| 677 | ret = ceph_sync_read(filp, base, len, ppos, &checkeof); | 840 | ret = ceph_sync_read(iocb, &i, &checkeof); |
| 678 | else | 841 | } else { |
| 679 | ret = generic_file_aio_read(iocb, iov, nr_segs, pos); | 842 | /* |
| 843 | * We can't modify the content of iov, | ||
| 844 | * so we only read from beginning. | ||
| 845 | */ | ||
| 846 | if (read) { | ||
| 847 | iocb->ki_pos = pos; | ||
| 848 | len = iocb->ki_nbytes; | ||
| 849 | read = 0; | ||
| 850 | } | ||
| 851 | dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
| 852 | inode, ceph_vinop(inode), pos, (unsigned)len, | ||
| 853 | ceph_cap_string(got)); | ||
| 680 | 854 | ||
| 855 | ret = generic_file_aio_read(iocb, iov, nr_segs, pos); | ||
| 856 | } | ||
| 681 | out: | 857 | out: |
| 682 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", | 858 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", |
| 683 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); | 859 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); |
| 684 | ceph_put_cap_refs(ci, got); | 860 | ceph_put_cap_refs(ci, got); |
| 685 | 861 | ||
| 686 | if (checkeof && ret >= 0) { | 862 | if (checkeof && ret >= 0) { |
| 687 | int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); | 863 | int statret = ceph_do_getattr(inode, |
| 864 | CEPH_STAT_CAP_SIZE); | ||
| 688 | 865 | ||
| 689 | /* hit EOF or hole? */ | 866 | /* hit EOF or hole? */ |
| 690 | if (statret == 0 && *ppos < inode->i_size) { | 867 | if (statret == 0 && iocb->ki_pos < inode->i_size && |
| 691 | dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); | 868 | ret < len) { |
| 869 | dout("sync_read hit hole, ppos %lld < size %lld" | ||
| 870 | ", reading more\n", iocb->ki_pos, | ||
| 871 | inode->i_size); | ||
| 872 | |||
| 692 | read += ret; | 873 | read += ret; |
| 693 | base += ret; | ||
| 694 | len -= ret; | 874 | len -= ret; |
| 695 | checkeof = 0; | 875 | checkeof = 0; |
| 696 | goto again; | 876 | goto again; |
| 697 | } | 877 | } |
| 698 | } | 878 | } |
| 879 | |||
| 699 | if (ret >= 0) | 880 | if (ret >= 0) |
| 700 | ret += read; | 881 | ret += read; |
| 701 | 882 | ||
| @@ -772,11 +953,13 @@ retry_snap: | |||
| 772 | inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); | 953 | inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); |
| 773 | 954 | ||
| 774 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || | 955 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || |
| 775 | (iocb->ki_filp->f_flags & O_DIRECT) || | 956 | (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { |
| 776 | (fi->flags & CEPH_F_SYNC)) { | ||
| 777 | mutex_unlock(&inode->i_mutex); | 957 | mutex_unlock(&inode->i_mutex); |
| 778 | written = ceph_sync_write(file, iov->iov_base, count, | 958 | if (file->f_flags & O_DIRECT) |
| 779 | pos, &iocb->ki_pos); | 959 | written = ceph_sync_direct_write(iocb, iov, |
| 960 | nr_segs, count); | ||
| 961 | else | ||
| 962 | written = ceph_sync_write(iocb, iov, nr_segs, count); | ||
| 780 | if (written == -EOLDSNAPC) { | 963 | if (written == -EOLDSNAPC) { |
| 781 | dout("aio_write %p %llx.%llx %llu~%u" | 964 | dout("aio_write %p %llx.%llx %llu~%u" |
| 782 | "got EOLDSNAPC, retrying\n", | 965 | "got EOLDSNAPC, retrying\n", |
| @@ -1018,7 +1201,7 @@ static long ceph_fallocate(struct file *file, int mode, | |||
| 1018 | loff_t offset, loff_t length) | 1201 | loff_t offset, loff_t length) |
| 1019 | { | 1202 | { |
| 1020 | struct ceph_file_info *fi = file->private_data; | 1203 | struct ceph_file_info *fi = file->private_data; |
| 1021 | struct inode *inode = file->f_dentry->d_inode; | 1204 | struct inode *inode = file_inode(file); |
| 1022 | struct ceph_inode_info *ci = ceph_inode(inode); | 1205 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1023 | struct ceph_osd_client *osdc = | 1206 | struct ceph_osd_client *osdc = |
| 1024 | &ceph_inode_to_client(inode)->client->osdc; | 1207 | &ceph_inode_to_client(inode)->client->osdc; |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 278fd2891288..32d519d8a2e2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/namei.h> | 9 | #include <linux/namei.h> |
| 10 | #include <linux/writeback.h> | 10 | #include <linux/writeback.h> |
| 11 | #include <linux/vmalloc.h> | 11 | #include <linux/vmalloc.h> |
| 12 | #include <linux/posix_acl.h> | ||
| 12 | 13 | ||
| 13 | #include "super.h" | 14 | #include "super.h" |
| 14 | #include "mds_client.h" | 15 | #include "mds_client.h" |
| @@ -95,6 +96,8 @@ const struct inode_operations ceph_file_iops = { | |||
| 95 | .getxattr = ceph_getxattr, | 96 | .getxattr = ceph_getxattr, |
| 96 | .listxattr = ceph_listxattr, | 97 | .listxattr = ceph_listxattr, |
| 97 | .removexattr = ceph_removexattr, | 98 | .removexattr = ceph_removexattr, |
| 99 | .get_acl = ceph_get_acl, | ||
| 100 | .set_acl = ceph_set_acl, | ||
| 98 | }; | 101 | }; |
| 99 | 102 | ||
| 100 | 103 | ||
| @@ -335,12 +338,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
| 335 | ci->i_hold_caps_min = 0; | 338 | ci->i_hold_caps_min = 0; |
| 336 | ci->i_hold_caps_max = 0; | 339 | ci->i_hold_caps_max = 0; |
| 337 | INIT_LIST_HEAD(&ci->i_cap_delay_list); | 340 | INIT_LIST_HEAD(&ci->i_cap_delay_list); |
| 338 | ci->i_cap_exporting_mds = 0; | ||
| 339 | ci->i_cap_exporting_mseq = 0; | ||
| 340 | ci->i_cap_exporting_issued = 0; | ||
| 341 | INIT_LIST_HEAD(&ci->i_cap_snaps); | 341 | INIT_LIST_HEAD(&ci->i_cap_snaps); |
| 342 | ci->i_head_snapc = NULL; | 342 | ci->i_head_snapc = NULL; |
| 343 | ci->i_snap_caps = 0; | 343 | ci->i_snap_caps = 0; |
| 344 | ci->i_cap_exporting_issued = 0; | ||
| 344 | 345 | ||
| 345 | for (i = 0; i < CEPH_FILE_MODE_NUM; i++) | 346 | for (i = 0; i < CEPH_FILE_MODE_NUM; i++) |
| 346 | ci->i_nr_by_mode[i] = 0; | 347 | ci->i_nr_by_mode[i] = 0; |
| @@ -436,6 +437,16 @@ void ceph_destroy_inode(struct inode *inode) | |||
| 436 | call_rcu(&inode->i_rcu, ceph_i_callback); | 437 | call_rcu(&inode->i_rcu, ceph_i_callback); |
| 437 | } | 438 | } |
| 438 | 439 | ||
| 440 | int ceph_drop_inode(struct inode *inode) | ||
| 441 | { | ||
| 442 | /* | ||
| 443 | * Positive dentry and corresponding inode are always accompanied | ||
| 444 | * in MDS reply. So no need to keep inode in the cache after | ||
| 445 | * dropping all its aliases. | ||
| 446 | */ | ||
| 447 | return 1; | ||
| 448 | } | ||
| 449 | |||
| 439 | /* | 450 | /* |
| 440 | * Helpers to fill in size, ctime, mtime, and atime. We have to be | 451 | * Helpers to fill in size, ctime, mtime, and atime. We have to be |
| 441 | * careful because either the client or MDS may have more up to date | 452 | * careful because either the client or MDS may have more up to date |
| @@ -670,6 +681,7 @@ static int fill_inode(struct inode *inode, | |||
| 670 | memcpy(ci->i_xattrs.blob->vec.iov_base, | 681 | memcpy(ci->i_xattrs.blob->vec.iov_base, |
| 671 | iinfo->xattr_data, iinfo->xattr_len); | 682 | iinfo->xattr_data, iinfo->xattr_len); |
| 672 | ci->i_xattrs.version = le64_to_cpu(info->xattr_version); | 683 | ci->i_xattrs.version = le64_to_cpu(info->xattr_version); |
| 684 | ceph_forget_all_cached_acls(inode); | ||
| 673 | xattr_blob = NULL; | 685 | xattr_blob = NULL; |
| 674 | } | 686 | } |
| 675 | 687 | ||
| @@ -1454,7 +1466,8 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
| 1454 | dout("invalidate_pages %p gen %d revoking %d\n", inode, | 1466 | dout("invalidate_pages %p gen %d revoking %d\n", inode, |
| 1455 | ci->i_rdcache_gen, ci->i_rdcache_revoking); | 1467 | ci->i_rdcache_gen, ci->i_rdcache_revoking); |
| 1456 | if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { | 1468 | if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
| 1457 | /* nevermind! */ | 1469 | if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) |
| 1470 | check = 1; | ||
| 1458 | spin_unlock(&ci->i_ceph_lock); | 1471 | spin_unlock(&ci->i_ceph_lock); |
| 1459 | mutex_unlock(&ci->i_truncate_mutex); | 1472 | mutex_unlock(&ci->i_truncate_mutex); |
| 1460 | goto out; | 1473 | goto out; |
| @@ -1475,13 +1488,14 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
| 1475 | dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", | 1488 | dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", |
| 1476 | inode, orig_gen, ci->i_rdcache_gen, | 1489 | inode, orig_gen, ci->i_rdcache_gen, |
| 1477 | ci->i_rdcache_revoking); | 1490 | ci->i_rdcache_revoking); |
| 1491 | if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) | ||
| 1492 | check = 1; | ||
| 1478 | } | 1493 | } |
| 1479 | spin_unlock(&ci->i_ceph_lock); | 1494 | spin_unlock(&ci->i_ceph_lock); |
| 1480 | mutex_unlock(&ci->i_truncate_mutex); | 1495 | mutex_unlock(&ci->i_truncate_mutex); |
| 1481 | 1496 | out: | |
| 1482 | if (check) | 1497 | if (check) |
| 1483 | ceph_check_caps(ci, 0, NULL); | 1498 | ceph_check_caps(ci, 0, NULL); |
| 1484 | out: | ||
| 1485 | iput(inode); | 1499 | iput(inode); |
| 1486 | } | 1500 | } |
| 1487 | 1501 | ||
| @@ -1602,6 +1616,8 @@ static const struct inode_operations ceph_symlink_iops = { | |||
| 1602 | .getxattr = ceph_getxattr, | 1616 | .getxattr = ceph_getxattr, |
| 1603 | .listxattr = ceph_listxattr, | 1617 | .listxattr = ceph_listxattr, |
| 1604 | .removexattr = ceph_removexattr, | 1618 | .removexattr = ceph_removexattr, |
| 1619 | .get_acl = ceph_get_acl, | ||
| 1620 | .set_acl = ceph_set_acl, | ||
| 1605 | }; | 1621 | }; |
| 1606 | 1622 | ||
| 1607 | /* | 1623 | /* |
| @@ -1675,6 +1691,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1675 | dirtied |= CEPH_CAP_AUTH_EXCL; | 1691 | dirtied |= CEPH_CAP_AUTH_EXCL; |
| 1676 | } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || | 1692 | } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || |
| 1677 | attr->ia_mode != inode->i_mode) { | 1693 | attr->ia_mode != inode->i_mode) { |
| 1694 | inode->i_mode = attr->ia_mode; | ||
| 1678 | req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); | 1695 | req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); |
| 1679 | mask |= CEPH_SETATTR_MODE; | 1696 | mask |= CEPH_SETATTR_MODE; |
| 1680 | release |= CEPH_CAP_AUTH_SHARED; | 1697 | release |= CEPH_CAP_AUTH_SHARED; |
| @@ -1790,6 +1807,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1790 | if (inode_dirty_flags) | 1807 | if (inode_dirty_flags) |
| 1791 | __mark_inode_dirty(inode, inode_dirty_flags); | 1808 | __mark_inode_dirty(inode, inode_dirty_flags); |
| 1792 | 1809 | ||
| 1810 | if (ia_valid & ATTR_MODE) { | ||
| 1811 | err = posix_acl_chmod(inode, attr->ia_mode); | ||
| 1812 | if (err) | ||
| 1813 | goto out_put; | ||
| 1814 | } | ||
| 1815 | |||
| 1793 | if (mask) { | 1816 | if (mask) { |
| 1794 | req->r_inode = inode; | 1817 | req->r_inode = inode; |
| 1795 | ihold(inode); | 1818 | ihold(inode); |
| @@ -1809,6 +1832,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1809 | return err; | 1832 | return err; |
| 1810 | out: | 1833 | out: |
| 1811 | spin_unlock(&ci->i_ceph_lock); | 1834 | spin_unlock(&ci->i_ceph_lock); |
| 1835 | out_put: | ||
| 1812 | ceph_mdsc_put_request(req); | 1836 | ceph_mdsc_put_request(req); |
| 1813 | return err; | 1837 | return err; |
| 1814 | } | 1838 | } |
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622fd1ae3..dc66c9e023e4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
| @@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
| 183 | struct ceph_inode_info *ci = ceph_inode(inode); | 183 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 184 | struct ceph_osd_client *osdc = | 184 | struct ceph_osd_client *osdc = |
| 185 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | 185 | &ceph_sb_to_client(inode->i_sb)->client->osdc; |
| 186 | struct ceph_object_locator oloc; | ||
| 187 | struct ceph_object_id oid; | ||
| 186 | u64 len = 1, olen; | 188 | u64 len = 1, olen; |
| 187 | u64 tmp; | 189 | u64 tmp; |
| 188 | struct ceph_pg pgid; | 190 | struct ceph_pg pgid; |
| @@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
| 211 | snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", | 213 | snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", |
| 212 | ceph_ino(inode), dl.object_no); | 214 | ceph_ino(inode), dl.object_no); |
| 213 | 215 | ||
| 214 | r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, | 216 | oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); |
| 215 | ceph_file_layout_pg_pool(ci->i_layout)); | 217 | ceph_oid_set_name(&oid, dl.object_name); |
| 218 | |||
| 219 | r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); | ||
| 216 | if (r < 0) { | 220 | if (r < 0) { |
| 217 | up_read(&osdc->map_sem); | 221 | up_read(&osdc->map_sem); |
| 218 | return r; | 222 | return r; |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d90861f45210..f4f050a69a48 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
| @@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops; | |||
| 63 | */ | 63 | */ |
| 64 | static int parse_reply_info_in(void **p, void *end, | 64 | static int parse_reply_info_in(void **p, void *end, |
| 65 | struct ceph_mds_reply_info_in *info, | 65 | struct ceph_mds_reply_info_in *info, |
| 66 | int features) | 66 | u64 features) |
| 67 | { | 67 | { |
| 68 | int err = -EIO; | 68 | int err = -EIO; |
| 69 | 69 | ||
| @@ -98,7 +98,7 @@ bad: | |||
| 98 | */ | 98 | */ |
| 99 | static int parse_reply_info_trace(void **p, void *end, | 99 | static int parse_reply_info_trace(void **p, void *end, |
| 100 | struct ceph_mds_reply_info_parsed *info, | 100 | struct ceph_mds_reply_info_parsed *info, |
| 101 | int features) | 101 | u64 features) |
| 102 | { | 102 | { |
| 103 | int err; | 103 | int err; |
| 104 | 104 | ||
| @@ -145,7 +145,7 @@ out_bad: | |||
| 145 | */ | 145 | */ |
| 146 | static int parse_reply_info_dir(void **p, void *end, | 146 | static int parse_reply_info_dir(void **p, void *end, |
| 147 | struct ceph_mds_reply_info_parsed *info, | 147 | struct ceph_mds_reply_info_parsed *info, |
| 148 | int features) | 148 | u64 features) |
| 149 | { | 149 | { |
| 150 | u32 num, i = 0; | 150 | u32 num, i = 0; |
| 151 | int err; | 151 | int err; |
| @@ -217,7 +217,7 @@ out_bad: | |||
| 217 | */ | 217 | */ |
| 218 | static int parse_reply_info_filelock(void **p, void *end, | 218 | static int parse_reply_info_filelock(void **p, void *end, |
| 219 | struct ceph_mds_reply_info_parsed *info, | 219 | struct ceph_mds_reply_info_parsed *info, |
| 220 | int features) | 220 | u64 features) |
| 221 | { | 221 | { |
| 222 | if (*p + sizeof(*info->filelock_reply) > end) | 222 | if (*p + sizeof(*info->filelock_reply) > end) |
| 223 | goto bad; | 223 | goto bad; |
| @@ -238,7 +238,7 @@ bad: | |||
| 238 | */ | 238 | */ |
| 239 | static int parse_reply_info_create(void **p, void *end, | 239 | static int parse_reply_info_create(void **p, void *end, |
| 240 | struct ceph_mds_reply_info_parsed *info, | 240 | struct ceph_mds_reply_info_parsed *info, |
| 241 | int features) | 241 | u64 features) |
| 242 | { | 242 | { |
| 243 | if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { | 243 | if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { |
| 244 | if (*p == end) { | 244 | if (*p == end) { |
| @@ -262,7 +262,7 @@ bad: | |||
| 262 | */ | 262 | */ |
| 263 | static int parse_reply_info_extra(void **p, void *end, | 263 | static int parse_reply_info_extra(void **p, void *end, |
| 264 | struct ceph_mds_reply_info_parsed *info, | 264 | struct ceph_mds_reply_info_parsed *info, |
| 265 | int features) | 265 | u64 features) |
| 266 | { | 266 | { |
| 267 | if (info->head->op == CEPH_MDS_OP_GETFILELOCK) | 267 | if (info->head->op == CEPH_MDS_OP_GETFILELOCK) |
| 268 | return parse_reply_info_filelock(p, end, info, features); | 268 | return parse_reply_info_filelock(p, end, info, features); |
| @@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end, | |||
| 280 | */ | 280 | */ |
| 281 | static int parse_reply_info(struct ceph_msg *msg, | 281 | static int parse_reply_info(struct ceph_msg *msg, |
| 282 | struct ceph_mds_reply_info_parsed *info, | 282 | struct ceph_mds_reply_info_parsed *info, |
| 283 | int features) | 283 | u64 features) |
| 284 | { | 284 | { |
| 285 | void *p, *end; | 285 | void *p, *end; |
| 286 | u32 len; | 286 | u32 len; |
| @@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 713 | struct dentry *dn = get_nonsnap_parent(parent); | 713 | struct dentry *dn = get_nonsnap_parent(parent); |
| 714 | inode = dn->d_inode; | 714 | inode = dn->d_inode; |
| 715 | dout("__choose_mds using nonsnap parent %p\n", inode); | 715 | dout("__choose_mds using nonsnap parent %p\n", inode); |
| 716 | } else if (req->r_dentry->d_inode) { | 716 | } else { |
| 717 | /* dentry target */ | 717 | /* dentry target */ |
| 718 | inode = req->r_dentry->d_inode; | 718 | inode = req->r_dentry->d_inode; |
| 719 | } else { | 719 | if (!inode || mode == USE_AUTH_MDS) { |
| 720 | /* dir + name */ | 720 | /* dir + name */ |
| 721 | inode = dir; | 721 | inode = dir; |
| 722 | hash = ceph_dentry_hash(dir, req->r_dentry); | 722 | hash = ceph_dentry_hash(dir, req->r_dentry); |
| 723 | is_hash = true; | 723 | is_hash = true; |
| 724 | } | ||
| 724 | } | 725 | } |
| 725 | } | 726 | } |
| 726 | 727 | ||
| @@ -846,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc, | |||
| 846 | * | 847 | * |
| 847 | * called under mdsc->mutex | 848 | * called under mdsc->mutex |
| 848 | */ | 849 | */ |
| 850 | static struct ceph_mds_session * | ||
| 851 | __open_export_target_session(struct ceph_mds_client *mdsc, int target) | ||
| 852 | { | ||
| 853 | struct ceph_mds_session *session; | ||
| 854 | |||
| 855 | session = __ceph_lookup_mds_session(mdsc, target); | ||
| 856 | if (!session) { | ||
| 857 | session = register_session(mdsc, target); | ||
| 858 | if (IS_ERR(session)) | ||
| 859 | return session; | ||
| 860 | } | ||
| 861 | if (session->s_state == CEPH_MDS_SESSION_NEW || | ||
| 862 | session->s_state == CEPH_MDS_SESSION_CLOSING) | ||
| 863 | __open_session(mdsc, session); | ||
| 864 | |||
| 865 | return session; | ||
| 866 | } | ||
| 867 | |||
| 868 | struct ceph_mds_session * | ||
| 869 | ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) | ||
| 870 | { | ||
| 871 | struct ceph_mds_session *session; | ||
| 872 | |||
| 873 | dout("open_export_target_session to mds%d\n", target); | ||
| 874 | |||
| 875 | mutex_lock(&mdsc->mutex); | ||
| 876 | session = __open_export_target_session(mdsc, target); | ||
| 877 | mutex_unlock(&mdsc->mutex); | ||
| 878 | |||
| 879 | return session; | ||
| 880 | } | ||
| 881 | |||
| 849 | static void __open_export_target_sessions(struct ceph_mds_client *mdsc, | 882 | static void __open_export_target_sessions(struct ceph_mds_client *mdsc, |
| 850 | struct ceph_mds_session *session) | 883 | struct ceph_mds_session *session) |
| 851 | { | 884 | { |
| 852 | struct ceph_mds_info *mi; | 885 | struct ceph_mds_info *mi; |
| 853 | struct ceph_mds_session *ts; | 886 | struct ceph_mds_session *ts; |
| 854 | int i, mds = session->s_mds; | 887 | int i, mds = session->s_mds; |
| 855 | int target; | ||
| 856 | 888 | ||
| 857 | if (mds >= mdsc->mdsmap->m_max_mds) | 889 | if (mds >= mdsc->mdsmap->m_max_mds) |
| 858 | return; | 890 | return; |
| 891 | |||
| 859 | mi = &mdsc->mdsmap->m_info[mds]; | 892 | mi = &mdsc->mdsmap->m_info[mds]; |
| 860 | dout("open_export_target_sessions for mds%d (%d targets)\n", | 893 | dout("open_export_target_sessions for mds%d (%d targets)\n", |
| 861 | session->s_mds, mi->num_export_targets); | 894 | session->s_mds, mi->num_export_targets); |
| 862 | 895 | ||
| 863 | for (i = 0; i < mi->num_export_targets; i++) { | 896 | for (i = 0; i < mi->num_export_targets; i++) { |
| 864 | target = mi->export_targets[i]; | 897 | ts = __open_export_target_session(mdsc, mi->export_targets[i]); |
| 865 | ts = __ceph_lookup_mds_session(mdsc, target); | 898 | if (!IS_ERR(ts)) |
| 866 | if (!ts) { | 899 | ceph_put_mds_session(ts); |
| 867 | ts = register_session(mdsc, target); | ||
| 868 | if (IS_ERR(ts)) | ||
| 869 | return; | ||
| 870 | } | ||
| 871 | if (session->s_state == CEPH_MDS_SESSION_NEW || | ||
| 872 | session->s_state == CEPH_MDS_SESSION_CLOSING) | ||
| 873 | __open_session(mdsc, session); | ||
| 874 | else | ||
| 875 | dout(" mds%d target mds%d %p is %s\n", session->s_mds, | ||
| 876 | i, ts, session_state_name(ts->s_state)); | ||
| 877 | ceph_put_mds_session(ts); | ||
| 878 | } | 900 | } |
| 879 | } | 901 | } |
| 880 | 902 | ||
| @@ -1136,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, | |||
| 1136 | return 0; | 1158 | return 0; |
| 1137 | } | 1159 | } |
| 1138 | 1160 | ||
| 1161 | static int send_flushmsg_ack(struct ceph_mds_client *mdsc, | ||
| 1162 | struct ceph_mds_session *session, u64 seq) | ||
| 1163 | { | ||
| 1164 | struct ceph_msg *msg; | ||
| 1165 | |||
| 1166 | dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", | ||
| 1167 | session->s_mds, session_state_name(session->s_state), seq); | ||
| 1168 | msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); | ||
| 1169 | if (!msg) | ||
| 1170 | return -ENOMEM; | ||
| 1171 | ceph_con_send(&session->s_con, msg); | ||
| 1172 | return 0; | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | |||
| 1139 | /* | 1176 | /* |
| 1140 | * Note new cap ttl, and any transition from stale -> not stale (fresh?). | 1177 | * Note new cap ttl, and any transition from stale -> not stale (fresh?). |
| 1141 | * | 1178 | * |
| @@ -1214,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
| 1214 | { | 1251 | { |
| 1215 | struct ceph_mds_session *session = arg; | 1252 | struct ceph_mds_session *session = arg; |
| 1216 | struct ceph_inode_info *ci = ceph_inode(inode); | 1253 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1217 | int used, oissued, mine; | 1254 | int used, wanted, oissued, mine; |
| 1218 | 1255 | ||
| 1219 | if (session->s_trim_caps <= 0) | 1256 | if (session->s_trim_caps <= 0) |
| 1220 | return -1; | 1257 | return -1; |
| @@ -1222,14 +1259,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
| 1222 | spin_lock(&ci->i_ceph_lock); | 1259 | spin_lock(&ci->i_ceph_lock); |
| 1223 | mine = cap->issued | cap->implemented; | 1260 | mine = cap->issued | cap->implemented; |
| 1224 | used = __ceph_caps_used(ci); | 1261 | used = __ceph_caps_used(ci); |
| 1262 | wanted = __ceph_caps_file_wanted(ci); | ||
| 1225 | oissued = __ceph_caps_issued_other(ci, cap); | 1263 | oissued = __ceph_caps_issued_other(ci, cap); |
| 1226 | 1264 | ||
| 1227 | dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", | 1265 | dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", |
| 1228 | inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), | 1266 | inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), |
| 1229 | ceph_cap_string(used)); | 1267 | ceph_cap_string(used), ceph_cap_string(wanted)); |
| 1230 | if (ci->i_dirty_caps) | 1268 | if (cap == ci->i_auth_cap) { |
| 1231 | goto out; /* dirty caps */ | 1269 | if (ci->i_dirty_caps | ci->i_flushing_caps) |
| 1232 | if ((used & ~oissued) & mine) | 1270 | goto out; |
| 1271 | if ((used | wanted) & CEPH_CAP_ANY_WR) | ||
| 1272 | goto out; | ||
| 1273 | } | ||
| 1274 | if ((used | wanted) & ~oissued & mine) | ||
| 1233 | goto out; /* we need these caps */ | 1275 | goto out; /* we need these caps */ |
| 1234 | 1276 | ||
| 1235 | session->s_trim_caps--; | 1277 | session->s_trim_caps--; |
| @@ -2156,26 +2198,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2156 | */ | 2198 | */ |
| 2157 | if (result == -ESTALE) { | 2199 | if (result == -ESTALE) { |
| 2158 | dout("got ESTALE on request %llu", req->r_tid); | 2200 | dout("got ESTALE on request %llu", req->r_tid); |
| 2159 | if (!req->r_inode) { | 2201 | if (req->r_direct_mode != USE_AUTH_MDS) { |
| 2160 | /* do nothing; not an authority problem */ | ||
| 2161 | } else if (req->r_direct_mode != USE_AUTH_MDS) { | ||
| 2162 | dout("not using auth, setting for that now"); | 2202 | dout("not using auth, setting for that now"); |
| 2163 | req->r_direct_mode = USE_AUTH_MDS; | 2203 | req->r_direct_mode = USE_AUTH_MDS; |
| 2164 | __do_request(mdsc, req); | 2204 | __do_request(mdsc, req); |
| 2165 | mutex_unlock(&mdsc->mutex); | 2205 | mutex_unlock(&mdsc->mutex); |
| 2166 | goto out; | 2206 | goto out; |
| 2167 | } else { | 2207 | } else { |
| 2168 | struct ceph_inode_info *ci = ceph_inode(req->r_inode); | 2208 | int mds = __choose_mds(mdsc, req); |
| 2169 | struct ceph_cap *cap = NULL; | 2209 | if (mds >= 0 && mds != req->r_session->s_mds) { |
| 2170 | 2210 | dout("but auth changed, so resending"); | |
| 2171 | if (req->r_session) | ||
| 2172 | cap = ceph_get_cap_for_mds(ci, | ||
| 2173 | req->r_session->s_mds); | ||
| 2174 | |||
| 2175 | dout("already using auth"); | ||
| 2176 | if ((!cap || cap != ci->i_auth_cap) || | ||
| 2177 | (cap->mseq != req->r_sent_on_mseq)) { | ||
| 2178 | dout("but cap changed, so resending"); | ||
| 2179 | __do_request(mdsc, req); | 2211 | __do_request(mdsc, req); |
| 2180 | mutex_unlock(&mdsc->mutex); | 2212 | mutex_unlock(&mdsc->mutex); |
| 2181 | goto out; | 2213 | goto out; |
| @@ -2400,6 +2432,10 @@ static void handle_session(struct ceph_mds_session *session, | |||
| 2400 | trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); | 2432 | trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); |
| 2401 | break; | 2433 | break; |
| 2402 | 2434 | ||
| 2435 | case CEPH_SESSION_FLUSHMSG: | ||
| 2436 | send_flushmsg_ack(mdsc, session, seq); | ||
| 2437 | break; | ||
| 2438 | |||
| 2403 | default: | 2439 | default: |
| 2404 | pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); | 2440 | pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); |
| 2405 | WARN_ON(1); | 2441 | WARN_ON(1); |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4c053d099ae4..68288917c737 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
| @@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, | |||
| 383 | extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, | 383 | extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, |
| 384 | struct ceph_msg *msg); | 384 | struct ceph_msg *msg); |
| 385 | 385 | ||
| 386 | extern struct ceph_mds_session * | ||
| 387 | ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); | ||
| 386 | extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, | 388 | extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, |
| 387 | struct ceph_mds_session *session); | 389 | struct ceph_mds_session *session); |
| 388 | 390 | ||
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a940a0f..4440f447fd3f 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c | |||
| @@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op) | |||
| 41 | case CEPH_SESSION_RENEWCAPS: return "renewcaps"; | 41 | case CEPH_SESSION_RENEWCAPS: return "renewcaps"; |
| 42 | case CEPH_SESSION_STALE: return "stale"; | 42 | case CEPH_SESSION_STALE: return "stale"; |
| 43 | case CEPH_SESSION_RECALL_STATE: return "recall_state"; | 43 | case CEPH_SESSION_RECALL_STATE: return "recall_state"; |
| 44 | case CEPH_SESSION_FLUSHMSG: return "flushmsg"; | ||
| 45 | case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; | ||
| 44 | } | 46 | } |
| 45 | return "???"; | 47 | return "???"; |
| 46 | } | 48 | } |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6a0951e43044..2df963f1cf5a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
| @@ -490,10 +490,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, | |||
| 490 | struct ceph_options *opt) | 490 | struct ceph_options *opt) |
| 491 | { | 491 | { |
| 492 | struct ceph_fs_client *fsc; | 492 | struct ceph_fs_client *fsc; |
| 493 | const unsigned supported_features = | 493 | const u64 supported_features = |
| 494 | CEPH_FEATURE_FLOCK | | 494 | CEPH_FEATURE_FLOCK | |
| 495 | CEPH_FEATURE_DIRLAYOUTHASH; | 495 | CEPH_FEATURE_DIRLAYOUTHASH; |
| 496 | const unsigned required_features = 0; | 496 | const u64 required_features = 0; |
| 497 | int page_count; | 497 | int page_count; |
| 498 | size_t size; | 498 | size_t size; |
| 499 | int err = -ENOMEM; | 499 | int err = -ENOMEM; |
| @@ -686,6 +686,7 @@ static const struct super_operations ceph_super_ops = { | |||
| 686 | .alloc_inode = ceph_alloc_inode, | 686 | .alloc_inode = ceph_alloc_inode, |
| 687 | .destroy_inode = ceph_destroy_inode, | 687 | .destroy_inode = ceph_destroy_inode, |
| 688 | .write_inode = ceph_write_inode, | 688 | .write_inode = ceph_write_inode, |
| 689 | .drop_inode = ceph_drop_inode, | ||
| 689 | .sync_fs = ceph_sync_fs, | 690 | .sync_fs = ceph_sync_fs, |
| 690 | .put_super = ceph_put_super, | 691 | .put_super = ceph_put_super, |
| 691 | .show_options = ceph_show_options, | 692 | .show_options = ceph_show_options, |
| @@ -818,7 +819,11 @@ static int ceph_set_super(struct super_block *s, void *data) | |||
| 818 | 819 | ||
| 819 | s->s_flags = fsc->mount_options->sb_flags; | 820 | s->s_flags = fsc->mount_options->sb_flags; |
| 820 | s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ | 821 | s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ |
| 822 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | ||
| 823 | s->s_flags |= MS_POSIXACL; | ||
| 824 | #endif | ||
| 821 | 825 | ||
| 826 | s->s_xattr = ceph_xattr_handlers; | ||
| 822 | s->s_fs_info = fsc; | 827 | s->s_fs_info = fsc; |
| 823 | fsc->sb = s; | 828 | fsc->sb = s; |
| 824 | 829 | ||
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ef4ac38bb614..aa260590f615 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
| @@ -287,14 +287,12 @@ struct ceph_inode_info { | |||
| 287 | unsigned long i_hold_caps_min; /* jiffies */ | 287 | unsigned long i_hold_caps_min; /* jiffies */ |
| 288 | unsigned long i_hold_caps_max; /* jiffies */ | 288 | unsigned long i_hold_caps_max; /* jiffies */ |
| 289 | struct list_head i_cap_delay_list; /* for delayed cap release to mds */ | 289 | struct list_head i_cap_delay_list; /* for delayed cap release to mds */ |
| 290 | int i_cap_exporting_mds; /* to handle cap migration between */ | ||
| 291 | unsigned i_cap_exporting_mseq; /* mds's. */ | ||
| 292 | unsigned i_cap_exporting_issued; | ||
| 293 | struct ceph_cap_reservation i_cap_migration_resv; | 290 | struct ceph_cap_reservation i_cap_migration_resv; |
| 294 | struct list_head i_cap_snaps; /* snapped state pending flush to mds */ | 291 | struct list_head i_cap_snaps; /* snapped state pending flush to mds */ |
| 295 | struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or | 292 | struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or |
| 296 | dirty|flushing caps */ | 293 | dirty|flushing caps */ |
| 297 | unsigned i_snap_caps; /* cap bits for snapped files */ | 294 | unsigned i_snap_caps; /* cap bits for snapped files */ |
| 295 | unsigned i_cap_exporting_issued; | ||
| 298 | 296 | ||
| 299 | int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ | 297 | int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ |
| 300 | 298 | ||
| @@ -335,7 +333,6 @@ struct ceph_inode_info { | |||
| 335 | u32 i_fscache_gen; /* sequence, for delayed fscache validate */ | 333 | u32 i_fscache_gen; /* sequence, for delayed fscache validate */ |
| 336 | struct work_struct i_revalidate_work; | 334 | struct work_struct i_revalidate_work; |
| 337 | #endif | 335 | #endif |
| 338 | |||
| 339 | struct inode vfs_inode; /* at end */ | 336 | struct inode vfs_inode; /* at end */ |
| 340 | }; | 337 | }; |
| 341 | 338 | ||
| @@ -529,6 +526,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) | |||
| 529 | } | 526 | } |
| 530 | extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); | 527 | extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); |
| 531 | 528 | ||
| 529 | extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, | ||
| 530 | struct ceph_cap *ocap, int mask); | ||
| 532 | extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); | 531 | extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); |
| 533 | extern int __ceph_caps_used(struct ceph_inode_info *ci); | 532 | extern int __ceph_caps_used(struct ceph_inode_info *ci); |
| 534 | 533 | ||
| @@ -691,6 +690,7 @@ extern const struct inode_operations ceph_file_iops; | |||
| 691 | 690 | ||
| 692 | extern struct inode *ceph_alloc_inode(struct super_block *sb); | 691 | extern struct inode *ceph_alloc_inode(struct super_block *sb); |
| 693 | extern void ceph_destroy_inode(struct inode *inode); | 692 | extern void ceph_destroy_inode(struct inode *inode); |
| 693 | extern int ceph_drop_inode(struct inode *inode); | ||
| 694 | 694 | ||
| 695 | extern struct inode *ceph_get_inode(struct super_block *sb, | 695 | extern struct inode *ceph_get_inode(struct super_block *sb, |
| 696 | struct ceph_vino vino); | 696 | struct ceph_vino vino); |
| @@ -718,12 +718,16 @@ extern void ceph_queue_writeback(struct inode *inode); | |||
| 718 | extern int ceph_do_getattr(struct inode *inode, int mask); | 718 | extern int ceph_do_getattr(struct inode *inode, int mask); |
| 719 | extern int ceph_permission(struct inode *inode, int mask); | 719 | extern int ceph_permission(struct inode *inode, int mask); |
| 720 | extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); | 720 | extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); |
| 721 | extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); | ||
| 721 | extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, | 722 | extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, |
| 722 | struct kstat *stat); | 723 | struct kstat *stat); |
| 723 | 724 | ||
| 724 | /* xattr.c */ | 725 | /* xattr.c */ |
| 725 | extern int ceph_setxattr(struct dentry *, const char *, const void *, | 726 | extern int ceph_setxattr(struct dentry *, const char *, const void *, |
| 726 | size_t, int); | 727 | size_t, int); |
| 728 | int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); | ||
| 729 | ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); | ||
| 730 | int __ceph_removexattr(struct dentry *, const char *); | ||
| 727 | extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); | 731 | extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); |
| 728 | extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); | 732 | extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); |
| 729 | extern int ceph_removexattr(struct dentry *, const char *); | 733 | extern int ceph_removexattr(struct dentry *, const char *); |
| @@ -732,6 +736,38 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); | |||
| 732 | extern void __init ceph_xattr_init(void); | 736 | extern void __init ceph_xattr_init(void); |
| 733 | extern void ceph_xattr_exit(void); | 737 | extern void ceph_xattr_exit(void); |
| 734 | 738 | ||
| 739 | /* acl.c */ | ||
| 740 | extern const struct xattr_handler *ceph_xattr_handlers[]; | ||
| 741 | |||
| 742 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | ||
| 743 | |||
| 744 | struct posix_acl *ceph_get_acl(struct inode *, int); | ||
| 745 | int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); | ||
| 746 | int ceph_init_acl(struct dentry *, struct inode *, struct inode *); | ||
| 747 | void ceph_forget_all_cached_acls(struct inode *inode); | ||
| 748 | |||
| 749 | #else | ||
| 750 | |||
| 751 | #define ceph_get_acl NULL | ||
| 752 | #define ceph_set_acl NULL | ||
| 753 | |||
| 754 | static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, | ||
| 755 | struct inode *dir) | ||
| 756 | { | ||
| 757 | return 0; | ||
| 758 | } | ||
| 759 | |||
| 760 | static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) | ||
| 761 | { | ||
| 762 | return 0; | ||
| 763 | } | ||
| 764 | |||
| 765 | static inline void ceph_forget_all_cached_acls(struct inode *inode) | ||
| 766 | { | ||
| 767 | } | ||
| 768 | |||
| 769 | #endif | ||
| 770 | |||
| 735 | /* caps.c */ | 771 | /* caps.c */ |
| 736 | extern const char *ceph_cap_string(int c); | 772 | extern const char *ceph_cap_string(int c); |
| 737 | extern void ceph_handle_caps(struct ceph_mds_session *session, | 773 | extern void ceph_handle_caps(struct ceph_mds_session *session, |
| @@ -744,6 +780,7 @@ extern int ceph_add_cap(struct inode *inode, | |||
| 744 | extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); | 780 | extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); |
| 745 | extern void ceph_put_cap(struct ceph_mds_client *mdsc, | 781 | extern void ceph_put_cap(struct ceph_mds_client *mdsc, |
| 746 | struct ceph_cap *cap); | 782 | struct ceph_cap *cap); |
| 783 | extern int ceph_is_any_caps(struct inode *inode); | ||
| 747 | 784 | ||
| 748 | extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, | 785 | extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, |
| 749 | u64 cap_id, u32 migrate_seq, u32 issue_seq); | 786 | u64 cap_id, u32 migrate_seq, u32 issue_seq); |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index be661d8f532a..898b6565ad3e 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
| @@ -6,16 +6,30 @@ | |||
| 6 | #include <linux/ceph/decode.h> | 6 | #include <linux/ceph/decode.h> |
| 7 | 7 | ||
| 8 | #include <linux/xattr.h> | 8 | #include <linux/xattr.h> |
| 9 | #include <linux/posix_acl_xattr.h> | ||
| 9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
| 10 | 11 | ||
| 11 | #define XATTR_CEPH_PREFIX "ceph." | 12 | #define XATTR_CEPH_PREFIX "ceph." |
| 12 | #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) | 13 | #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) |
| 13 | 14 | ||
| 15 | /* | ||
| 16 | * List of handlers for synthetic system.* attributes. Other | ||
| 17 | * attributes are handled directly. | ||
| 18 | */ | ||
| 19 | const struct xattr_handler *ceph_xattr_handlers[] = { | ||
| 20 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | ||
| 21 | &posix_acl_access_xattr_handler, | ||
| 22 | &posix_acl_default_xattr_handler, | ||
| 23 | #endif | ||
| 24 | NULL, | ||
| 25 | }; | ||
| 26 | |||
| 14 | static bool ceph_is_valid_xattr(const char *name) | 27 | static bool ceph_is_valid_xattr(const char *name) |
| 15 | { | 28 | { |
| 16 | return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || | 29 | return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || |
| 17 | !strncmp(name, XATTR_SECURITY_PREFIX, | 30 | !strncmp(name, XATTR_SECURITY_PREFIX, |
| 18 | XATTR_SECURITY_PREFIX_LEN) || | 31 | XATTR_SECURITY_PREFIX_LEN) || |
| 32 | !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || | ||
| 19 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || | 33 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || |
| 20 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); | 34 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); |
| 21 | } | 35 | } |
| @@ -663,10 +677,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) | |||
| 663 | } | 677 | } |
| 664 | } | 678 | } |
| 665 | 679 | ||
| 666 | ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | 680 | ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, |
| 667 | size_t size) | 681 | size_t size) |
| 668 | { | 682 | { |
| 669 | struct inode *inode = dentry->d_inode; | ||
| 670 | struct ceph_inode_info *ci = ceph_inode(inode); | 683 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 671 | int err; | 684 | int err; |
| 672 | struct ceph_inode_xattr *xattr; | 685 | struct ceph_inode_xattr *xattr; |
| @@ -675,7 +688,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
| 675 | if (!ceph_is_valid_xattr(name)) | 688 | if (!ceph_is_valid_xattr(name)) |
| 676 | return -ENODATA; | 689 | return -ENODATA; |
| 677 | 690 | ||
| 678 | |||
| 679 | /* let's see if a virtual xattr was requested */ | 691 | /* let's see if a virtual xattr was requested */ |
| 680 | vxattr = ceph_match_vxattr(inode, name); | 692 | vxattr = ceph_match_vxattr(inode, name); |
| 681 | if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { | 693 | if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { |
| @@ -725,6 +737,15 @@ out: | |||
| 725 | return err; | 737 | return err; |
| 726 | } | 738 | } |
| 727 | 739 | ||
| 740 | ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | ||
| 741 | size_t size) | ||
| 742 | { | ||
| 743 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
| 744 | return generic_getxattr(dentry, name, value, size); | ||
| 745 | |||
| 746 | return __ceph_getxattr(dentry->d_inode, name, value, size); | ||
| 747 | } | ||
| 748 | |||
| 728 | ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | 749 | ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) |
| 729 | { | 750 | { |
| 730 | struct inode *inode = dentry->d_inode; | 751 | struct inode *inode = dentry->d_inode; |
| @@ -863,8 +884,8 @@ out: | |||
| 863 | return err; | 884 | return err; |
| 864 | } | 885 | } |
| 865 | 886 | ||
| 866 | int ceph_setxattr(struct dentry *dentry, const char *name, | 887 | int __ceph_setxattr(struct dentry *dentry, const char *name, |
| 867 | const void *value, size_t size, int flags) | 888 | const void *value, size_t size, int flags) |
| 868 | { | 889 | { |
| 869 | struct inode *inode = dentry->d_inode; | 890 | struct inode *inode = dentry->d_inode; |
| 870 | struct ceph_vxattr *vxattr; | 891 | struct ceph_vxattr *vxattr; |
| @@ -879,9 +900,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name, | |||
| 879 | struct ceph_inode_xattr *xattr = NULL; | 900 | struct ceph_inode_xattr *xattr = NULL; |
| 880 | int required_blob_size; | 901 | int required_blob_size; |
| 881 | 902 | ||
| 882 | if (ceph_snap(inode) != CEPH_NOSNAP) | ||
| 883 | return -EROFS; | ||
| 884 | |||
| 885 | if (!ceph_is_valid_xattr(name)) | 903 | if (!ceph_is_valid_xattr(name)) |
| 886 | return -EOPNOTSUPP; | 904 | return -EOPNOTSUPP; |
| 887 | 905 | ||
| @@ -958,6 +976,18 @@ out: | |||
| 958 | return err; | 976 | return err; |
| 959 | } | 977 | } |
| 960 | 978 | ||
| 979 | int ceph_setxattr(struct dentry *dentry, const char *name, | ||
| 980 | const void *value, size_t size, int flags) | ||
| 981 | { | ||
| 982 | if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) | ||
| 983 | return -EROFS; | ||
| 984 | |||
| 985 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
| 986 | return generic_setxattr(dentry, name, value, size, flags); | ||
| 987 | |||
| 988 | return __ceph_setxattr(dentry, name, value, size, flags); | ||
| 989 | } | ||
| 990 | |||
| 961 | static int ceph_send_removexattr(struct dentry *dentry, const char *name) | 991 | static int ceph_send_removexattr(struct dentry *dentry, const char *name) |
| 962 | { | 992 | { |
| 963 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); | 993 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
| @@ -984,7 +1014,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) | |||
| 984 | return err; | 1014 | return err; |
| 985 | } | 1015 | } |
| 986 | 1016 | ||
| 987 | int ceph_removexattr(struct dentry *dentry, const char *name) | 1017 | int __ceph_removexattr(struct dentry *dentry, const char *name) |
| 988 | { | 1018 | { |
| 989 | struct inode *inode = dentry->d_inode; | 1019 | struct inode *inode = dentry->d_inode; |
| 990 | struct ceph_vxattr *vxattr; | 1020 | struct ceph_vxattr *vxattr; |
| @@ -994,9 +1024,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name) | |||
| 994 | int required_blob_size; | 1024 | int required_blob_size; |
| 995 | int dirty; | 1025 | int dirty; |
| 996 | 1026 | ||
| 997 | if (ceph_snap(inode) != CEPH_NOSNAP) | ||
| 998 | return -EROFS; | ||
| 999 | |||
| 1000 | if (!ceph_is_valid_xattr(name)) | 1027 | if (!ceph_is_valid_xattr(name)) |
| 1001 | return -EOPNOTSUPP; | 1028 | return -EOPNOTSUPP; |
| 1002 | 1029 | ||
| @@ -1053,3 +1080,13 @@ out: | |||
| 1053 | return err; | 1080 | return err; |
| 1054 | } | 1081 | } |
| 1055 | 1082 | ||
| 1083 | int ceph_removexattr(struct dentry *dentry, const char *name) | ||
| 1084 | { | ||
| 1085 | if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) | ||
| 1086 | return -EROFS; | ||
| 1087 | |||
| 1088 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
| 1089 | return generic_removexattr(dentry, name); | ||
| 1090 | |||
| 1091 | return __ceph_removexattr(dentry, name); | ||
| 1092 | } | ||
diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b50267..ac44a69fbea9 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c | |||
| @@ -204,7 +204,7 @@ out: | |||
| 204 | } | 204 | } |
| 205 | 205 | ||
| 206 | #ifdef CONFIG_COMPAT | 206 | #ifdef CONFIG_COMPAT |
| 207 | COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) | 207 | COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) |
| 208 | { | 208 | { |
| 209 | #ifdef __BIG_ENDIAN | 209 | #ifdef __BIG_ENDIAN |
| 210 | return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); | 210 | return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a52a5d23c30b..ee4317faccb1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
| @@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) | |||
| 577 | 577 | ||
| 578 | if (offset >= i_size) { | 578 | if (offset >= i_size) { |
| 579 | *uptodate = true; | 579 | *uptodate = true; |
| 580 | EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); | 580 | EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); |
| 581 | return ZERO_PAGE(0); | 581 | return ZERO_PAGE(0); |
| 582 | } | 582 | } |
| 583 | 583 | ||
| @@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) | |||
| 596 | *uptodate = true; | 596 | *uptodate = true; |
| 597 | else | 597 | else |
| 598 | *uptodate = PageUptodate(page); | 598 | *uptodate = PageUptodate(page); |
| 599 | EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); | 599 | EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); |
| 600 | return page; | 600 | return page; |
| 601 | } else { | 601 | } else { |
| 602 | EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", | 602 | EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", |
| 603 | pcol->that_locked_page->index); | 603 | pcol->that_locked_page->index); |
| 604 | *uptodate = true; | 604 | *uptodate = true; |
| 605 | return pcol->that_locked_page; | 605 | return pcol->that_locked_page; |
| @@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page) | |||
| 611 | struct page_collect *pcol = priv; | 611 | struct page_collect *pcol = priv; |
| 612 | 612 | ||
| 613 | if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { | 613 | if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { |
| 614 | EXOFS_DBGMSG("index=0x%lx\n", page->index); | 614 | EXOFS_DBGMSG2("index=0x%lx\n", page->index); |
| 615 | page_cache_release(page); | 615 | page_cache_release(page); |
| 616 | return; | 616 | return; |
| 617 | } | 617 | } |
| 618 | EXOFS_DBGMSG("that_locked_page index=0x%lx\n", | 618 | EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", |
| 619 | ZERO_PAGE(0) == page ? -1 : page->index); | 619 | ZERO_PAGE(0) == page ? -1 : page->index); |
| 620 | } | 620 | } |
| 621 | 621 | ||
| @@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, | |||
| 961 | WARN_ON(1); | 961 | WARN_ON(1); |
| 962 | } | 962 | } |
| 963 | 963 | ||
| 964 | |||
| 965 | /* TODO: Should be easy enough to do properly */ | ||
| 966 | static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, | ||
| 967 | const struct iovec *iov, loff_t offset, unsigned long nr_segs) | ||
| 968 | { | ||
| 969 | return 0; | ||
| 970 | } | ||
| 971 | |||
| 964 | const struct address_space_operations exofs_aops = { | 972 | const struct address_space_operations exofs_aops = { |
| 965 | .readpage = exofs_readpage, | 973 | .readpage = exofs_readpage, |
| 966 | .readpages = exofs_readpages, | 974 | .readpages = exofs_readpages, |
| @@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = { | |||
| 974 | 982 | ||
| 975 | /* Not implemented Yet */ | 983 | /* Not implemented Yet */ |
| 976 | .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ | 984 | .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ |
| 977 | .direct_IO = NULL, /* TODO: Should be trivial to do */ | 985 | .direct_IO = exofs_direct_IO, |
| 978 | 986 | ||
| 979 | /* With these NULL has special meaning or default is not exported */ | 987 | /* With these NULL has special meaning or default is not exported */ |
| 980 | .get_xip_mem = NULL, | 988 | .get_xip_mem = NULL, |
| @@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) | |||
| 1010 | if (likely(!ret)) | 1018 | if (likely(!ret)) |
| 1011 | truncate_setsize(inode, newsize); | 1019 | truncate_setsize(inode, newsize); |
| 1012 | 1020 | ||
| 1013 | EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", | 1021 | EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", |
| 1014 | inode->i_ino, newsize, ret); | 1022 | inode->i_ino, newsize, ret); |
| 1015 | return ret; | 1023 | return ret; |
| 1016 | } | 1024 | } |
| @@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
| 1094 | /* If object is lost on target we might as well enable its | 1102 | /* If object is lost on target we might as well enable its |
| 1095 | * delete. | 1103 | * delete. |
| 1096 | */ | 1104 | */ |
| 1097 | if ((ret == -ENOENT) || (ret == -EINVAL)) | 1105 | ret = 0; |
| 1098 | ret = 0; | ||
| 1099 | goto out; | 1106 | goto out; |
| 1100 | } | 1107 | } |
| 1101 | 1108 | ||
| 1102 | ret = extract_attr_from_ios(ios, &attrs[0]); | 1109 | ret = extract_attr_from_ios(ios, &attrs[0]); |
| 1103 | if (ret) { | 1110 | if (ret) { |
| 1104 | EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); | 1111 | EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); |
| 1105 | goto out; | 1112 | goto out; |
| 1106 | } | 1113 | } |
| 1107 | WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); | 1114 | WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); |
| @@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
| 1109 | 1116 | ||
| 1110 | ret = extract_attr_from_ios(ios, &attrs[1]); | 1117 | ret = extract_attr_from_ios(ios, &attrs[1]); |
| 1111 | if (ret) { | 1118 | if (ret) { |
| 1112 | EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); | 1119 | EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); |
| 1113 | goto out; | 1120 | goto out; |
| 1114 | } | 1121 | } |
| 1115 | if (attrs[1].len) { | 1122 | if (attrs[1].len) { |
| @@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
| 1124 | 1131 | ||
| 1125 | ret = extract_attr_from_ios(ios, &attrs[2]); | 1132 | ret = extract_attr_from_ios(ios, &attrs[2]); |
| 1126 | if (ret) { | 1133 | if (ret) { |
| 1127 | EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); | 1134 | EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); |
| 1128 | goto out; | 1135 | goto out; |
| 1129 | } | 1136 | } |
| 1130 | if (attrs[2].len) { | 1137 | if (attrs[2].len) { |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index b74422888604..dae884694bd9 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
| @@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) | |||
| 103 | 103 | ||
| 104 | layout->max_io_length = | 104 | layout->max_io_length = |
| 105 | (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * | 105 | (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * |
| 106 | layout->group_width; | 106 | (layout->group_width - layout->parity); |
| 107 | if (layout->parity) { | 107 | if (layout->parity) { |
| 108 | unsigned stripe_length = | 108 | unsigned stripe_length = |
| 109 | (layout->group_width - layout->parity) * | 109 | (layout->group_width - layout->parity) * |
| @@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | |||
| 286 | if (length) { | 286 | if (length) { |
| 287 | ore_calc_stripe_info(layout, offset, length, &ios->si); | 287 | ore_calc_stripe_info(layout, offset, length, &ios->si); |
| 288 | ios->length = ios->si.length; | 288 | ios->length = ios->si.length; |
| 289 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; | 289 | ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + |
| 290 | ios->length + PAGE_SIZE - 1) / PAGE_SIZE; | ||
| 290 | if (layout->parity) | 291 | if (layout->parity) |
| 291 | _ore_post_alloc_raid_stuff(ios); | 292 | _ore_post_alloc_raid_stuff(ios); |
| 292 | } | 293 | } |
| @@ -430,8 +431,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) | |||
| 430 | if (likely(!ret)) | 431 | if (likely(!ret)) |
| 431 | continue; | 432 | continue; |
| 432 | 433 | ||
| 433 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | 434 | if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && |
| 434 | /* start read offset passed endof file */ | 435 | per_dev->bio) { |
| 436 | /* start read offset passed endof file. | ||
| 437 | * Note: if we do not have bio it means read-attributes | ||
| 438 | * In this case we should return error to caller. | ||
| 439 | */ | ||
| 435 | _clear_bio(per_dev->bio); | 440 | _clear_bio(per_dev->bio); |
| 436 | ORE_DBGMSG("start read offset passed end of file " | 441 | ORE_DBGMSG("start read offset passed end of file " |
| 437 | "offset=0x%llx, length=0x%llx\n", | 442 | "offset=0x%llx, length=0x%llx\n", |
| @@ -536,6 +541,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
| 536 | u64 H = LmodS - G * T; | 541 | u64 H = LmodS - G * T; |
| 537 | 542 | ||
| 538 | u32 N = div_u64(H, U); | 543 | u32 N = div_u64(H, U); |
| 544 | u32 Nlast; | ||
| 539 | 545 | ||
| 540 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | 546 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ |
| 541 | u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; | 547 | u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; |
| @@ -568,6 +574,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
| 568 | si->length = T - H; | 574 | si->length = T - H; |
| 569 | if (si->length > length) | 575 | if (si->length > length) |
| 570 | si->length = length; | 576 | si->length = length; |
| 577 | |||
| 578 | Nlast = div_u64(H + si->length + U - 1, U); | ||
| 579 | si->maxdevUnits = Nlast - N; | ||
| 580 | |||
| 571 | si->M = M; | 581 | si->M = M; |
| 572 | } | 582 | } |
| 573 | EXPORT_SYMBOL(ore_calc_stripe_info); | 583 | EXPORT_SYMBOL(ore_calc_stripe_info); |
| @@ -583,13 +593,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
| 583 | int ret; | 593 | int ret; |
| 584 | 594 | ||
| 585 | if (per_dev->bio == NULL) { | 595 | if (per_dev->bio == NULL) { |
| 586 | unsigned pages_in_stripe = ios->layout->group_width * | 596 | unsigned bio_size; |
| 587 | (ios->layout->stripe_unit / PAGE_SIZE); | 597 | |
| 588 | unsigned nr_pages = ios->nr_pages * ios->layout->group_width / | 598 | if (!ios->reading) { |
| 589 | (ios->layout->group_width - | 599 | bio_size = ios->si.maxdevUnits; |
| 590 | ios->layout->parity); | 600 | } else { |
| 591 | unsigned bio_size = (nr_pages + pages_in_stripe) / | 601 | bio_size = (ios->si.maxdevUnits + 1) * |
| 592 | ios->layout->group_width; | 602 | (ios->layout->group_width - ios->layout->parity) / |
| 603 | ios->layout->group_width; | ||
| 604 | } | ||
| 605 | bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); | ||
| 593 | 606 | ||
| 594 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | 607 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); |
| 595 | if (unlikely(!per_dev->bio)) { | 608 | if (unlikely(!per_dev->bio)) { |
| @@ -609,8 +622,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
| 609 | added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], | 622 | added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], |
| 610 | pglen, pgbase); | 623 | pglen, pgbase); |
| 611 | if (unlikely(pglen != added_len)) { | 624 | if (unlikely(pglen != added_len)) { |
| 612 | ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", | 625 | /* If bi_vcnt == bi_max then this is a SW BUG */ |
| 613 | per_dev->bio->bi_vcnt); | 626 | ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " |
| 627 | "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", | ||
| 628 | per_dev->bio->bi_vcnt, | ||
| 629 | per_dev->bio->bi_max_vecs, | ||
| 630 | BIO_MAX_PAGES_KMALLOC, cur_len); | ||
| 614 | ret = -ENOMEM; | 631 | ret = -ENOMEM; |
| 615 | goto out; | 632 | goto out; |
| 616 | } | 633 | } |
| @@ -1098,7 +1115,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, | |||
| 1098 | size_attr->attr = g_attr_logical_length; | 1115 | size_attr->attr = g_attr_logical_length; |
| 1099 | size_attr->attr.val_ptr = &size_attr->newsize; | 1116 | size_attr->attr.val_ptr = &size_attr->newsize; |
| 1100 | 1117 | ||
| 1101 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", | 1118 | ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", |
| 1102 | _LLU(oc->comps->obj.id), _LLU(obj_size), i); | 1119 | _LLU(oc->comps->obj.id), _LLU(obj_size), i); |
| 1103 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | 1120 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, |
| 1104 | &size_attr->attr); | 1121 | &size_attr->attr); |
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 4f47aa24b556..b8fd651307a4 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c | |||
| @@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) | |||
| 288 | struct jffs2_xattr_datum *xd; | 288 | struct jffs2_xattr_datum *xd; |
| 289 | xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); | 289 | xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); |
| 290 | dbg_memalloc("%p\n", xd); | 290 | dbg_memalloc("%p\n", xd); |
| 291 | if (!xd) | ||
| 292 | return NULL; | ||
| 291 | 293 | ||
| 292 | xd->class = RAWNODE_CLASS_XATTR_DATUM; | 294 | xd->class = RAWNODE_CLASS_XATTR_DATUM; |
| 293 | xd->node = (void *)xd; | 295 | xd->node = (void *)xd; |
| @@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) | |||
| 306 | struct jffs2_xattr_ref *ref; | 308 | struct jffs2_xattr_ref *ref; |
| 307 | ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); | 309 | ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); |
| 308 | dbg_memalloc("%p\n", ref); | 310 | dbg_memalloc("%p\n", ref); |
| 311 | if (!ref) | ||
| 312 | return NULL; | ||
| 309 | 313 | ||
| 310 | ref->class = RAWNODE_CLASS_XATTR_REF; | 314 | ref->class = RAWNODE_CLASS_XATTR_REF; |
| 311 | ref->node = (void *)ref; | 315 | ref->node = (void *)ref; |
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 58772623f02a..0e792f5e3147 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c | |||
| @@ -16,12 +16,6 @@ static bool should_merge(struct fsnotify_event *old_fsn, | |||
| 16 | { | 16 | { |
| 17 | struct fanotify_event_info *old, *new; | 17 | struct fanotify_event_info *old, *new; |
| 18 | 18 | ||
| 19 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
| 20 | /* dont merge two permission events */ | ||
| 21 | if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && | ||
| 22 | (new_fsn->mask & FAN_ALL_PERM_EVENTS)) | ||
| 23 | return false; | ||
| 24 | #endif | ||
| 25 | pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); | 19 | pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); |
| 26 | old = FANOTIFY_E(old_fsn); | 20 | old = FANOTIFY_E(old_fsn); |
| 27 | new = FANOTIFY_E(new_fsn); | 21 | new = FANOTIFY_E(new_fsn); |
| @@ -34,14 +28,23 @@ static bool should_merge(struct fsnotify_event *old_fsn, | |||
| 34 | } | 28 | } |
| 35 | 29 | ||
| 36 | /* and the list better be locked by something too! */ | 30 | /* and the list better be locked by something too! */ |
| 37 | static struct fsnotify_event *fanotify_merge(struct list_head *list, | 31 | static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) |
| 38 | struct fsnotify_event *event) | ||
| 39 | { | 32 | { |
| 40 | struct fsnotify_event *test_event; | 33 | struct fsnotify_event *test_event; |
| 41 | bool do_merge = false; | 34 | bool do_merge = false; |
| 42 | 35 | ||
| 43 | pr_debug("%s: list=%p event=%p\n", __func__, list, event); | 36 | pr_debug("%s: list=%p event=%p\n", __func__, list, event); |
| 44 | 37 | ||
| 38 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
| 39 | /* | ||
| 40 | * Don't merge a permission event with any other event so that we know | ||
| 41 | * the event structure we have created in fanotify_handle_event() is the | ||
| 42 | * one we should check for permission response. | ||
| 43 | */ | ||
| 44 | if (event->mask & FAN_ALL_PERM_EVENTS) | ||
| 45 | return 0; | ||
| 46 | #endif | ||
| 47 | |||
| 45 | list_for_each_entry_reverse(test_event, list, list) { | 48 | list_for_each_entry_reverse(test_event, list, list) { |
| 46 | if (should_merge(test_event, event)) { | 49 | if (should_merge(test_event, event)) { |
| 47 | do_merge = true; | 50 | do_merge = true; |
| @@ -50,10 +53,10 @@ static struct fsnotify_event *fanotify_merge(struct list_head *list, | |||
| 50 | } | 53 | } |
| 51 | 54 | ||
| 52 | if (!do_merge) | 55 | if (!do_merge) |
| 53 | return NULL; | 56 | return 0; |
| 54 | 57 | ||
| 55 | test_event->mask |= event->mask; | 58 | test_event->mask |= event->mask; |
| 56 | return test_event; | 59 | return 1; |
| 57 | } | 60 | } |
| 58 | 61 | ||
| 59 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | 62 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS |
| @@ -149,7 +152,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, | |||
| 149 | int ret = 0; | 152 | int ret = 0; |
| 150 | struct fanotify_event_info *event; | 153 | struct fanotify_event_info *event; |
| 151 | struct fsnotify_event *fsn_event; | 154 | struct fsnotify_event *fsn_event; |
| 152 | struct fsnotify_event *notify_fsn_event; | ||
| 153 | 155 | ||
| 154 | BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); | 156 | BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); |
| 155 | BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); | 157 | BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); |
| @@ -188,21 +190,19 @@ static int fanotify_handle_event(struct fsnotify_group *group, | |||
| 188 | event->response = 0; | 190 | event->response = 0; |
| 189 | #endif | 191 | #endif |
| 190 | 192 | ||
| 191 | notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, | 193 | ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); |
| 192 | fanotify_merge); | 194 | if (ret) { |
| 193 | if (notify_fsn_event) { | 195 | BUG_ON(mask & FAN_ALL_PERM_EVENTS); |
| 194 | /* Our event wasn't used in the end. Free it. */ | 196 | /* Our event wasn't used in the end. Free it. */ |
| 195 | fsnotify_destroy_event(group, fsn_event); | 197 | fsnotify_destroy_event(group, fsn_event); |
| 196 | if (IS_ERR(notify_fsn_event)) | 198 | ret = 0; |
| 197 | return PTR_ERR(notify_fsn_event); | ||
| 198 | /* We need to ask about a different events after a merge... */ | ||
| 199 | event = FANOTIFY_E(notify_fsn_event); | ||
| 200 | fsn_event = notify_fsn_event; | ||
| 201 | } | 199 | } |
| 202 | 200 | ||
| 203 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | 201 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS |
| 204 | if (fsn_event->mask & FAN_ALL_PERM_EVENTS) | 202 | if (mask & FAN_ALL_PERM_EVENTS) { |
| 205 | ret = fanotify_get_response_from_access(group, event); | 203 | ret = fanotify_get_response_from_access(group, event); |
| 204 | fsnotify_destroy_event(group, fsn_event); | ||
| 205 | } | ||
| 206 | #endif | 206 | #endif |
| 207 | return ret; | 207 | return ret; |
| 208 | } | 208 | } |
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 0e90174a116a..32a2f034fb94 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h | |||
| @@ -4,6 +4,13 @@ | |||
| 4 | 4 | ||
| 5 | extern struct kmem_cache *fanotify_event_cachep; | 5 | extern struct kmem_cache *fanotify_event_cachep; |
| 6 | 6 | ||
| 7 | /* | ||
| 8 | * Lifetime of the structure differs for normal and permission events. In both | ||
| 9 | * cases the structure is allocated in fanotify_handle_event(). For normal | ||
| 10 | * events the structure is freed immediately after reporting it to userspace. | ||
| 11 | * For permission events we free it only after we receive response from | ||
| 12 | * userspace. | ||
| 13 | */ | ||
| 7 | struct fanotify_event_info { | 14 | struct fanotify_event_info { |
| 8 | struct fsnotify_event fse; | 15 | struct fsnotify_event fse; |
| 9 | /* | 16 | /* |
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1fd66abe5740..b6175fa11bf8 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c | |||
| @@ -319,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, | |||
| 319 | if (IS_ERR(kevent)) | 319 | if (IS_ERR(kevent)) |
| 320 | break; | 320 | break; |
| 321 | ret = copy_event_to_user(group, kevent, buf); | 321 | ret = copy_event_to_user(group, kevent, buf); |
| 322 | fsnotify_destroy_event(group, kevent); | 322 | /* |
| 323 | * Permission events get destroyed after we | ||
| 324 | * receive response | ||
| 325 | */ | ||
| 326 | if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) | ||
| 327 | fsnotify_destroy_event(group, kevent); | ||
| 323 | if (ret < 0) | 328 | if (ret < 0) |
| 324 | break; | 329 | break; |
| 325 | buf += ret; | 330 | buf += ret; |
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index aad1a35e9af1..d5ee56348bb8 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
| @@ -53,15 +53,13 @@ static bool event_compare(struct fsnotify_event *old_fsn, | |||
| 53 | return false; | 53 | return false; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | static struct fsnotify_event *inotify_merge(struct list_head *list, | 56 | static int inotify_merge(struct list_head *list, |
| 57 | struct fsnotify_event *event) | 57 | struct fsnotify_event *event) |
| 58 | { | 58 | { |
| 59 | struct fsnotify_event *last_event; | 59 | struct fsnotify_event *last_event; |
| 60 | 60 | ||
| 61 | last_event = list_entry(list->prev, struct fsnotify_event, list); | 61 | last_event = list_entry(list->prev, struct fsnotify_event, list); |
| 62 | if (!event_compare(last_event, event)) | 62 | return event_compare(last_event, event); |
| 63 | return NULL; | ||
| 64 | return last_event; | ||
| 65 | } | 63 | } |
| 66 | 64 | ||
| 67 | int inotify_handle_event(struct fsnotify_group *group, | 65 | int inotify_handle_event(struct fsnotify_group *group, |
| @@ -73,9 +71,8 @@ int inotify_handle_event(struct fsnotify_group *group, | |||
| 73 | { | 71 | { |
| 74 | struct inotify_inode_mark *i_mark; | 72 | struct inotify_inode_mark *i_mark; |
| 75 | struct inotify_event_info *event; | 73 | struct inotify_event_info *event; |
| 76 | struct fsnotify_event *added_event; | ||
| 77 | struct fsnotify_event *fsn_event; | 74 | struct fsnotify_event *fsn_event; |
| 78 | int ret = 0; | 75 | int ret; |
| 79 | int len = 0; | 76 | int len = 0; |
| 80 | int alloc_len = sizeof(struct inotify_event_info); | 77 | int alloc_len = sizeof(struct inotify_event_info); |
| 81 | 78 | ||
| @@ -110,18 +107,16 @@ int inotify_handle_event(struct fsnotify_group *group, | |||
| 110 | if (len) | 107 | if (len) |
| 111 | strcpy(event->name, file_name); | 108 | strcpy(event->name, file_name); |
| 112 | 109 | ||
| 113 | added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); | 110 | ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); |
| 114 | if (added_event) { | 111 | if (ret) { |
| 115 | /* Our event wasn't used in the end. Free it. */ | 112 | /* Our event wasn't used in the end. Free it. */ |
| 116 | fsnotify_destroy_event(group, fsn_event); | 113 | fsnotify_destroy_event(group, fsn_event); |
| 117 | if (IS_ERR(added_event)) | ||
| 118 | ret = PTR_ERR(added_event); | ||
| 119 | } | 114 | } |
| 120 | 115 | ||
| 121 | if (inode_mark->mask & IN_ONESHOT) | 116 | if (inode_mark->mask & IN_ONESHOT) |
| 122 | fsnotify_destroy_mark(inode_mark, group); | 117 | fsnotify_destroy_mark(inode_mark, group); |
| 123 | 118 | ||
| 124 | return ret; | 119 | return 0; |
| 125 | } | 120 | } |
| 126 | 121 | ||
| 127 | static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) | 122 | static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) |
diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 952237b8e2d2..18b3c4427dca 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c | |||
| @@ -79,15 +79,15 @@ void fsnotify_destroy_event(struct fsnotify_group *group, | |||
| 79 | 79 | ||
| 80 | /* | 80 | /* |
| 81 | * Add an event to the group notification queue. The group can later pull this | 81 | * Add an event to the group notification queue. The group can later pull this |
| 82 | * event off the queue to deal with. If the event is successfully added to the | 82 | * event off the queue to deal with. The function returns 0 if the event was |
| 83 | * group's notification queue, a reference is taken on event. | 83 | * added to the queue, 1 if the event was merged with some other queued event. |
| 84 | */ | 84 | */ |
| 85 | struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, | 85 | int fsnotify_add_notify_event(struct fsnotify_group *group, |
| 86 | struct fsnotify_event *event, | 86 | struct fsnotify_event *event, |
| 87 | struct fsnotify_event *(*merge)(struct list_head *, | 87 | int (*merge)(struct list_head *, |
| 88 | struct fsnotify_event *)) | 88 | struct fsnotify_event *)) |
| 89 | { | 89 | { |
| 90 | struct fsnotify_event *return_event = NULL; | 90 | int ret = 0; |
| 91 | struct list_head *list = &group->notification_list; | 91 | struct list_head *list = &group->notification_list; |
| 92 | 92 | ||
| 93 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); | 93 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); |
| @@ -98,14 +98,14 @@ struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, | |||
| 98 | /* Queue overflow event only if it isn't already queued */ | 98 | /* Queue overflow event only if it isn't already queued */ |
| 99 | if (list_empty(&group->overflow_event.list)) | 99 | if (list_empty(&group->overflow_event.list)) |
| 100 | event = &group->overflow_event; | 100 | event = &group->overflow_event; |
| 101 | return_event = event; | 101 | ret = 1; |
| 102 | } | 102 | } |
| 103 | 103 | ||
| 104 | if (!list_empty(list) && merge) { | 104 | if (!list_empty(list) && merge) { |
| 105 | return_event = merge(list, event); | 105 | ret = merge(list, event); |
| 106 | if (return_event) { | 106 | if (ret) { |
| 107 | mutex_unlock(&group->notification_mutex); | 107 | mutex_unlock(&group->notification_mutex); |
| 108 | return return_event; | 108 | return ret; |
| 109 | } | 109 | } |
| 110 | } | 110 | } |
| 111 | 111 | ||
| @@ -115,7 +115,7 @@ struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, | |||
| 115 | 115 | ||
| 116 | wake_up(&group->notification_waitq); | 116 | wake_up(&group->notification_waitq); |
| 117 | kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); | 117 | kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); |
| 118 | return return_event; | 118 | return ret; |
| 119 | } | 119 | } |
| 120 | 120 | ||
| 121 | /* | 121 | /* |
diff --git a/fs/read_write.c b/fs/read_write.c index 1193ffd03565..edc5746a902a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
| @@ -964,9 +964,9 @@ out: | |||
| 964 | return ret; | 964 | return ret; |
| 965 | } | 965 | } |
| 966 | 966 | ||
| 967 | COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, | 967 | COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, |
| 968 | const struct compat_iovec __user *,vec, | 968 | const struct compat_iovec __user *,vec, |
| 969 | unsigned long, vlen) | 969 | compat_ulong_t, vlen) |
| 970 | { | 970 | { |
| 971 | struct fd f = fdget(fd); | 971 | struct fd f = fdget(fd); |
| 972 | ssize_t ret; | 972 | ssize_t ret; |
| @@ -1001,9 +1001,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, | |||
| 1001 | return ret; | 1001 | return ret; |
| 1002 | } | 1002 | } |
| 1003 | 1003 | ||
| 1004 | COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, | 1004 | COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, |
| 1005 | const struct compat_iovec __user *,vec, | 1005 | const struct compat_iovec __user *,vec, |
| 1006 | unsigned long, vlen, u32, pos_low, u32, pos_high) | 1006 | compat_ulong_t, vlen, u32, pos_low, u32, pos_high) |
| 1007 | { | 1007 | { |
| 1008 | loff_t pos = ((loff_t)pos_high << 32) | pos_low; | 1008 | loff_t pos = ((loff_t)pos_high << 32) | pos_low; |
| 1009 | return compat_sys_preadv64(fd, vec, vlen, pos); | 1009 | return compat_sys_preadv64(fd, vec, vlen, pos); |
| @@ -1031,9 +1031,9 @@ out: | |||
| 1031 | return ret; | 1031 | return ret; |
| 1032 | } | 1032 | } |
| 1033 | 1033 | ||
| 1034 | COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, | 1034 | COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, |
| 1035 | const struct compat_iovec __user *, vec, | 1035 | const struct compat_iovec __user *, vec, |
| 1036 | unsigned long, vlen) | 1036 | compat_ulong_t, vlen) |
| 1037 | { | 1037 | { |
| 1038 | struct fd f = fdget(fd); | 1038 | struct fd f = fdget(fd); |
| 1039 | ssize_t ret; | 1039 | ssize_t ret; |
| @@ -1068,9 +1068,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, | |||
| 1068 | return ret; | 1068 | return ret; |
| 1069 | } | 1069 | } |
| 1070 | 1070 | ||
| 1071 | COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, | 1071 | COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, |
| 1072 | const struct compat_iovec __user *,vec, | 1072 | const struct compat_iovec __user *,vec, |
| 1073 | unsigned long, vlen, u32, pos_low, u32, pos_high) | 1073 | compat_ulong_t, vlen, u32, pos_low, u32, pos_high) |
| 1074 | { | 1074 | { |
| 1075 | loff_t pos = ((loff_t)pos_high << 32) | pos_low; | 1075 | loff_t pos = ((loff_t)pos_high << 32) | pos_low; |
| 1076 | return compat_sys_pwritev64(fd, vec, vlen, pos); | 1076 | return compat_sys_pwritev64(fd, vec, vlen, pos); |
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9fccfb594291..51757113a822 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c | |||
| @@ -445,8 +445,8 @@ _xfs_buf_find( | |||
| 445 | numbytes = BBTOB(numblks); | 445 | numbytes = BBTOB(numblks); |
| 446 | 446 | ||
| 447 | /* Check for IOs smaller than the sector size / not sector aligned */ | 447 | /* Check for IOs smaller than the sector size / not sector aligned */ |
| 448 | ASSERT(!(numbytes < (1 << btp->bt_sshift))); | 448 | ASSERT(!(numbytes < btp->bt_meta_sectorsize)); |
| 449 | ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); | 449 | ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); |
| 450 | 450 | ||
| 451 | /* | 451 | /* |
| 452 | * Corrupted block numbers can get through to here, unfortunately, so we | 452 | * Corrupted block numbers can get through to here, unfortunately, so we |
| @@ -1599,9 +1599,9 @@ xfs_setsize_buftarg( | |||
| 1599 | unsigned int blocksize, | 1599 | unsigned int blocksize, |
| 1600 | unsigned int sectorsize) | 1600 | unsigned int sectorsize) |
| 1601 | { | 1601 | { |
| 1602 | btp->bt_bsize = blocksize; | 1602 | /* Set up metadata sector size info */ |
| 1603 | btp->bt_sshift = ffs(sectorsize) - 1; | 1603 | btp->bt_meta_sectorsize = sectorsize; |
| 1604 | btp->bt_smask = sectorsize - 1; | 1604 | btp->bt_meta_sectormask = sectorsize - 1; |
| 1605 | 1605 | ||
| 1606 | if (set_blocksize(btp->bt_bdev, sectorsize)) { | 1606 | if (set_blocksize(btp->bt_bdev, sectorsize)) { |
| 1607 | char name[BDEVNAME_SIZE]; | 1607 | char name[BDEVNAME_SIZE]; |
| @@ -1614,6 +1614,10 @@ xfs_setsize_buftarg( | |||
| 1614 | return EINVAL; | 1614 | return EINVAL; |
| 1615 | } | 1615 | } |
| 1616 | 1616 | ||
| 1617 | /* Set up device logical sector size mask */ | ||
| 1618 | btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); | ||
| 1619 | btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; | ||
| 1620 | |||
| 1617 | return 0; | 1621 | return 0; |
| 1618 | } | 1622 | } |
| 1619 | 1623 | ||
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1cf21a4a9f22..995339534db6 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h | |||
| @@ -88,14 +88,28 @@ typedef unsigned int xfs_buf_flags_t; | |||
| 88 | */ | 88 | */ |
| 89 | #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ | 89 | #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ |
| 90 | 90 | ||
| 91 | /* | ||
| 92 | * The xfs_buftarg contains 2 notions of "sector size" - | ||
| 93 | * | ||
| 94 | * 1) The metadata sector size, which is the minimum unit and | ||
| 95 | * alignment of IO which will be performed by metadata operations. | ||
| 96 | * 2) The device logical sector size | ||
| 97 | * | ||
| 98 | * The first is specified at mkfs time, and is stored on-disk in the | ||
| 99 | * superblock's sb_sectsize. | ||
| 100 | * | ||
| 101 | * The latter is derived from the underlying device, and controls direct IO | ||
| 102 | * alignment constraints. | ||
| 103 | */ | ||
| 91 | typedef struct xfs_buftarg { | 104 | typedef struct xfs_buftarg { |
| 92 | dev_t bt_dev; | 105 | dev_t bt_dev; |
| 93 | struct block_device *bt_bdev; | 106 | struct block_device *bt_bdev; |
| 94 | struct backing_dev_info *bt_bdi; | 107 | struct backing_dev_info *bt_bdi; |
| 95 | struct xfs_mount *bt_mount; | 108 | struct xfs_mount *bt_mount; |
| 96 | unsigned int bt_bsize; | 109 | unsigned int bt_meta_sectorsize; |
| 97 | unsigned int bt_sshift; | 110 | size_t bt_meta_sectormask; |
| 98 | size_t bt_smask; | 111 | size_t bt_logical_sectorsize; |
| 112 | size_t bt_logical_sectormask; | ||
| 99 | 113 | ||
| 100 | /* LRU control structures */ | 114 | /* LRU control structures */ |
| 101 | struct shrinker bt_shrinker; | 115 | struct shrinker bt_shrinker; |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e00121592632..2e7989e3a2d6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
| @@ -261,7 +261,8 @@ xfs_file_aio_read( | |||
| 261 | xfs_buftarg_t *target = | 261 | xfs_buftarg_t *target = |
| 262 | XFS_IS_REALTIME_INODE(ip) ? | 262 | XFS_IS_REALTIME_INODE(ip) ? |
| 263 | mp->m_rtdev_targp : mp->m_ddev_targp; | 263 | mp->m_rtdev_targp : mp->m_ddev_targp; |
| 264 | if ((pos & target->bt_smask) || (size & target->bt_smask)) { | 264 | /* DIO must be aligned to device logical sector size */ |
| 265 | if ((pos | size) & target->bt_logical_sectormask) { | ||
| 265 | if (pos == i_size_read(inode)) | 266 | if (pos == i_size_read(inode)) |
| 266 | return 0; | 267 | return 0; |
| 267 | return -XFS_ERROR(EINVAL); | 268 | return -XFS_ERROR(EINVAL); |
| @@ -641,9 +642,11 @@ xfs_file_dio_aio_write( | |||
| 641 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? | 642 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? |
| 642 | mp->m_rtdev_targp : mp->m_ddev_targp; | 643 | mp->m_rtdev_targp : mp->m_ddev_targp; |
| 643 | 644 | ||
| 644 | if ((pos & target->bt_smask) || (count & target->bt_smask)) | 645 | /* DIO must be aligned to device logical sector size */ |
| 646 | if ((pos | count) & target->bt_logical_sectormask) | ||
| 645 | return -XFS_ERROR(EINVAL); | 647 | return -XFS_ERROR(EINVAL); |
| 646 | 648 | ||
| 649 | /* "unaligned" here means not aligned to a filesystem block */ | ||
| 647 | if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) | 650 | if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) |
| 648 | unaligned_io = 1; | 651 | unaligned_io = 1; |
| 649 | 652 | ||
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 518aa56b8f2e..bcfe61202115 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c | |||
| @@ -1583,7 +1583,7 @@ xfs_file_ioctl( | |||
| 1583 | XFS_IS_REALTIME_INODE(ip) ? | 1583 | XFS_IS_REALTIME_INODE(ip) ? |
| 1584 | mp->m_rtdev_targp : mp->m_ddev_targp; | 1584 | mp->m_rtdev_targp : mp->m_ddev_targp; |
| 1585 | 1585 | ||
| 1586 | da.d_mem = da.d_miniosz = 1 << target->bt_sshift; | 1586 | da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; |
| 1587 | da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); | 1587 | da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); |
| 1588 | 1588 | ||
| 1589 | if (copy_to_user(arg, &da, sizeof(da))) | 1589 | if (copy_to_user(arg, &da, sizeof(da))) |
