diff options
author | Mike Marshall <hubcap@omnibond.com> | 2015-07-17 10:38:12 -0400 |
---|---|---|
committer | Mike Marshall <hubcap@omnibond.com> | 2015-10-03 11:39:54 -0400 |
commit | 5db11c21a929cd9d8c0484006efb1014fc723c93 (patch) | |
tree | 014b77a693c2b33c4558903e325ec0a2e9c0ebf6 | |
parent | f7ab093f74bf638ed98fd1115f3efa17e308bb7f (diff) |
Orangefs: kernel client part 2
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
-rw-r--r-- | fs/orangefs/acl.c | 175 | ||||
-rw-r--r-- | fs/orangefs/dcache.c | 142 | ||||
-rw-r--r-- | fs/orangefs/devpvfs2-req.c | 997 | ||||
-rw-r--r-- | fs/orangefs/dir.c | 394 | ||||
-rw-r--r-- | fs/orangefs/file.c | 1019 | ||||
-rw-r--r-- | fs/orangefs/inode.c | 469 |
6 files changed, 3196 insertions, 0 deletions
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c new file mode 100644 index 000000000000..e462b81a3ba1 --- /dev/null +++ b/fs/orangefs/acl.c | |||
@@ -0,0 +1,175 @@ | |||
1 | /* | ||
2 | * (C) 2001 Clemson University and The University of Chicago | ||
3 | * | ||
4 | * See COPYING in top-level directory. | ||
5 | */ | ||
6 | |||
7 | #include "protocol.h" | ||
8 | #include "pvfs2-kernel.h" | ||
9 | #include "pvfs2-bufmap.h" | ||
10 | #include <linux/posix_acl_xattr.h> | ||
11 | #include <linux/fs_struct.h> | ||
12 | |||
13 | struct posix_acl *pvfs2_get_acl(struct inode *inode, int type) | ||
14 | { | ||
15 | struct posix_acl *acl; | ||
16 | int ret; | ||
17 | char *key = NULL, *value = NULL; | ||
18 | |||
19 | switch (type) { | ||
20 | case ACL_TYPE_ACCESS: | ||
21 | key = PVFS2_XATTR_NAME_ACL_ACCESS; | ||
22 | break; | ||
23 | case ACL_TYPE_DEFAULT: | ||
24 | key = PVFS2_XATTR_NAME_ACL_DEFAULT; | ||
25 | break; | ||
26 | default: | ||
27 | gossip_err("pvfs2_get_acl: bogus value of type %d\n", type); | ||
28 | return ERR_PTR(-EINVAL); | ||
29 | } | ||
30 | /* | ||
31 | * Rather than incurring a network call just to determine the exact | ||
32 | * length of the attribute, I just allocate a max length to save on | ||
33 | * the network call. Conceivably, we could pass NULL to | ||
34 | * pvfs2_inode_getxattr() to probe the length of the value, but | ||
35 | * I don't do that for now. | ||
36 | */ | ||
37 | value = kmalloc(PVFS_MAX_XATTR_VALUELEN, GFP_KERNEL); | ||
38 | if (value == NULL) | ||
39 | return ERR_PTR(-ENOMEM); | ||
40 | |||
41 | gossip_debug(GOSSIP_ACL_DEBUG, | ||
42 | "inode %pU, key %s, type %d\n", | ||
43 | get_khandle_from_ino(inode), | ||
44 | key, | ||
45 | type); | ||
46 | ret = pvfs2_inode_getxattr(inode, | ||
47 | "", | ||
48 | key, | ||
49 | value, | ||
50 | PVFS_MAX_XATTR_VALUELEN); | ||
51 | /* if the key exists, convert it to an in-memory rep */ | ||
52 | if (ret > 0) { | ||
53 | acl = posix_acl_from_xattr(&init_user_ns, value, ret); | ||
54 | } else if (ret == -ENODATA || ret == -ENOSYS) { | ||
55 | acl = NULL; | ||
56 | } else { | ||
57 | gossip_err("inode %pU retrieving acl's failed with error %d\n", | ||
58 | get_khandle_from_ino(inode), | ||
59 | ret); | ||
60 | acl = ERR_PTR(ret); | ||
61 | } | ||
62 | /* kfree(NULL) is safe, so don't worry if value ever got used */ | ||
63 | kfree(value); | ||
64 | return acl; | ||
65 | } | ||
66 | |||
/*
 * Store the ACL of @type for @inode as a PVFS2 extended attribute.
 *
 * For ACL_TYPE_ACCESS, the ACL may be folded into the traditional mode
 * bits; when posix_acl_equiv_mode() reports full equivalence (returns 0)
 * the xattr is removed instead of written.  A NULL @acl always maps to
 * a removexattr.  On success the new ACL is also placed in the VFS acl
 * cache.  Returns 0 or a negative errno.
 */
int pvfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
	int error = 0;
	void *value = NULL;
	size_t size = 0;
	const char *name = NULL;

	switch (type) {
	case ACL_TYPE_ACCESS:
		name = PVFS2_XATTR_NAME_ACL_ACCESS;
		if (acl) {
			umode_t mode = inode->i_mode;
			/*
			 * can we represent this with the traditional file
			 * mode permission bits?
			 */
			error = posix_acl_equiv_mode(acl, &mode);
			if (error < 0) {
				gossip_err("%s: posix_acl_equiv_mode err: %d\n",
					   __func__,
					   error);
				return error;
			}

			/* remember that the mode must be pushed to the server */
			if (inode->i_mode != mode)
				SetModeFlag(pvfs2_inode);
			inode->i_mode = mode;
			mark_inode_dirty_sync(inode);
			/*
			 * error == 0 means the ACL is fully expressible as
			 * mode bits; drop the acl so the xattr is removed
			 * below.
			 */
			if (error == 0)
				acl = NULL;
		}
		break;
	case ACL_TYPE_DEFAULT:
		name = PVFS2_XATTR_NAME_ACL_DEFAULT;
		break;
	default:
		gossip_err("%s: invalid type %d!\n", __func__, type);
		return -EINVAL;
	}

	gossip_debug(GOSSIP_ACL_DEBUG,
		     "%s: inode %pU, key %s type %d\n",
		     __func__, get_khandle_from_ino(inode),
		     name,
		     type);

	/* serialize the (possibly reduced) acl into xattr wire format */
	if (acl) {
		size = posix_acl_xattr_size(acl->a_count);
		value = kmalloc(size, GFP_KERNEL);
		if (!value)
			return -ENOMEM;

		error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		if (error < 0)
			goto out;
	}

	gossip_debug(GOSSIP_ACL_DEBUG,
		     "%s: name %s, value %p, size %zd, acl %p\n",
		     __func__, name, value, size, acl);
	/*
	 * Go ahead and set the extended attribute now.  NOTE: suppose acl
	 * was NULL, then value will be NULL and size will be 0 and that
	 * will xlate to a removexattr.  However, we don't want removexattr
	 * to complain if the attribute does not exist.
	 */
	error = pvfs2_inode_setxattr(inode, "", name, value, size, 0);

out:
	kfree(value);
	if (!error)
		set_cached_acl(inode, type, acl);
	return error;
}
142 | |||
/*
 * Initialize the ACLs of freshly created @inode from its parent
 * directory @dir, per posix_acl_create() semantics: inherit the
 * default ACL (for directories) and derive the access ACL plus the
 * effective creation mode.  Returns 0 or a negative errno.
 */
int pvfs2_init_acl(struct inode *inode, struct inode *dir)
{
	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
	struct posix_acl *default_acl, *acl;
	umode_t mode = inode->i_mode;
	int error = 0;

	/* start from a clean "mode changed" state for this inode */
	ClearModeFlag(pvfs2_inode);

	error = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (error)
		return error;

	if (default_acl) {
		error = pvfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}

	/* only attempt the access ACL if the default ACL succeeded */
	if (acl) {
		if (!error)
			error = pvfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}

	/* If mode of the inode was changed, then do a forcible ->setattr */
	if (mode != inode->i_mode) {
		SetModeFlag(pvfs2_inode);
		inode->i_mode = mode;
		pvfs2_flush_inode(inode);
	}

	return error;
}
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c new file mode 100644 index 000000000000..9466b179bf24 --- /dev/null +++ b/fs/orangefs/dcache.c | |||
@@ -0,0 +1,142 @@ | |||
1 | /* | ||
2 | * (C) 2001 Clemson University and The University of Chicago | ||
3 | * | ||
4 | * See COPYING in top-level directory. | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * Implementation of dentry (directory cache) functions. | ||
9 | */ | ||
10 | |||
11 | #include "protocol.h" | ||
12 | #include "pvfs2-kernel.h" | ||
13 | |||
14 | /* Returns 1 if dentry can still be trusted, else 0. */ | ||
15 | static int pvfs2_revalidate_lookup(struct dentry *dentry) | ||
16 | { | ||
17 | struct dentry *parent_dentry = dget_parent(dentry); | ||
18 | struct inode *parent_inode = parent_dentry->d_inode; | ||
19 | struct pvfs2_inode_s *parent = PVFS2_I(parent_inode); | ||
20 | struct inode *inode = dentry->d_inode; | ||
21 | struct pvfs2_kernel_op_s *new_op; | ||
22 | int ret = 0; | ||
23 | int err = 0; | ||
24 | |||
25 | gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); | ||
26 | |||
27 | new_op = op_alloc(PVFS2_VFS_OP_LOOKUP); | ||
28 | if (!new_op) | ||
29 | goto out_put_parent; | ||
30 | |||
31 | new_op->upcall.req.lookup.sym_follow = PVFS2_LOOKUP_LINK_NO_FOLLOW; | ||
32 | new_op->upcall.req.lookup.parent_refn = parent->refn; | ||
33 | strncpy(new_op->upcall.req.lookup.d_name, | ||
34 | dentry->d_name.name, | ||
35 | PVFS2_NAME_LEN); | ||
36 | |||
37 | gossip_debug(GOSSIP_DCACHE_DEBUG, | ||
38 | "%s:%s:%d interrupt flag [%d]\n", | ||
39 | __FILE__, | ||
40 | __func__, | ||
41 | __LINE__, | ||
42 | get_interruptible_flag(parent_inode)); | ||
43 | |||
44 | err = service_operation(new_op, "pvfs2_lookup", | ||
45 | get_interruptible_flag(parent_inode)); | ||
46 | if (err) | ||
47 | goto out_drop; | ||
48 | |||
49 | if (new_op->downcall.status != 0 || | ||
50 | !match_handle(new_op->downcall.resp.lookup.refn.khandle, inode)) { | ||
51 | gossip_debug(GOSSIP_DCACHE_DEBUG, | ||
52 | "%s:%s:%d " | ||
53 | "lookup failure |%s| or no match |%s|.\n", | ||
54 | __FILE__, | ||
55 | __func__, | ||
56 | __LINE__, | ||
57 | new_op->downcall.status ? "true" : "false", | ||
58 | match_handle(new_op->downcall.resp.lookup.refn.khandle, | ||
59 | inode) ? "false" : "true"); | ||
60 | gossip_debug(GOSSIP_DCACHE_DEBUG, | ||
61 | "%s:%s:%d revalidate failed\n", | ||
62 | __FILE__, __func__, __LINE__); | ||
63 | goto out_drop; | ||
64 | } | ||
65 | |||
66 | ret = 1; | ||
67 | out_release_op: | ||
68 | op_release(new_op); | ||
69 | out_put_parent: | ||
70 | dput(parent_dentry); | ||
71 | return ret; | ||
72 | out_drop: | ||
73 | d_drop(dentry); | ||
74 | goto out_release_op; | ||
75 | } | ||
76 | |||
/*
 * Verify that dentry is valid.
 *
 * Should return 1 if dentry can still be trusted, else 0.
 * Refuses RCU-walk mode (-ECHILD) since revalidation may block on a
 * server round trip.
 */
static int pvfs2_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	struct inode *inode;
	int ret = 0;

	/* we cannot sleep under LOOKUP_RCU; ask VFS to retry in ref-walk */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
		     __func__, dentry);

	/* find inode from dentry; negative dentries are never trusted */
	if (!dentry->d_inode) {
		gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
			     __func__);
		goto invalid_exit;
	}

	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: inode valid.\n", __func__);
	inode = dentry->d_inode;

	/*
	 * first perform a lookup to make sure that the object not only
	 * exists, but is still in the expected place in the name space
	 * (the root handle can be skipped -- it cannot move)
	 */
	if (!is_root_handle(inode)) {
		if (!pvfs2_revalidate_lookup(dentry))
			goto invalid_exit;
	} else {
		gossip_debug(GOSSIP_DCACHE_DEBUG,
			     "%s: root handle, lookup skipped.\n",
			     __func__);
	}

	/* now perform getattr to refresh the cached attributes */
	gossip_debug(GOSSIP_DCACHE_DEBUG,
		     "%s: doing getattr: inode: %p, handle: %pU\n",
		     __func__,
		     inode,
		     get_khandle_from_ino(inode));
	ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
	gossip_debug(GOSSIP_DCACHE_DEBUG,
		     "%s: getattr %s (ret = %d), returning %s for dentry i_count=%d\n",
		     __func__,
		     (ret == 0 ? "succeeded" : "failed"),
		     ret,
		     (ret == 0 ? "valid" : "INVALID"),
		     atomic_read(&inode->i_count));
	if (ret != 0)
		goto invalid_exit;

	/* dentry is valid! */
	return 1;

invalid_exit:
	return 0;
}
139 | |||
/* dentry operations installed on pvfs2 dentries */
const struct dentry_operations pvfs2_dentry_operations = {
	.d_revalidate = pvfs2_d_revalidate,
};
diff --git a/fs/orangefs/devpvfs2-req.c b/fs/orangefs/devpvfs2-req.c new file mode 100644 index 000000000000..3e450228f3dc --- /dev/null +++ b/fs/orangefs/devpvfs2-req.c | |||
@@ -0,0 +1,997 @@ | |||
1 | /* | ||
2 | * (C) 2001 Clemson University and The University of Chicago | ||
3 | * | ||
4 | * Changes by Acxiom Corporation to add protocol version to kernel | ||
5 | * communication, Copyright Acxiom Corporation, 2005. | ||
6 | * | ||
7 | * See COPYING in top-level directory. | ||
8 | */ | ||
9 | |||
10 | #include "protocol.h" | ||
11 | #include "pvfs2-kernel.h" | ||
12 | #include "pvfs2-dev-proto.h" | ||
13 | #include "pvfs2-bufmap.h" | ||
14 | |||
15 | #include <linux/debugfs.h> | ||
16 | #include <linux/slab.h> | ||
17 | |||
/* this file implements the /dev/pvfs2-req device node */

/* number of current opens of the device; protected by devreq_mutex */
static int open_access_count;

/*
 * Loudly explain that /dev/pvfs2-req is already held open by another
 * process; the device enforces a single opener (see pvfs2_devreq_open).
 */
#define DUMP_DEVICE_ERROR()                                                   \
do {                                                                          \
	gossip_err("*****************************************************\n");\
	gossip_err("PVFS2 Device Error:  You cannot open the device file ");  \
	gossip_err("\n/dev/%s more than once.  Please make sure that\nthere " \
		   "are no ", PVFS2_REQDEVICE_NAME);                          \
	gossip_err("instances of a program using this device\ncurrently "     \
		   "running. (You must verify this!)\n");                     \
	gossip_err("For example, you can use the lsof program as follows:\n");\
	gossip_err("'lsof | grep %s' (run this as root)\n",                   \
		   PVFS2_REQDEVICE_NAME);                                     \
	gossip_err("  open_access_count = %d\n", open_access_count);          \
	gossip_err("*****************************************************\n");\
} while (0)
36 | |||
37 | static int hash_func(__u64 tag, int table_size) | ||
38 | { | ||
39 | return tag % ((unsigned int)table_size); | ||
40 | } | ||
41 | |||
42 | static void pvfs2_devreq_add_op(struct pvfs2_kernel_op_s *op) | ||
43 | { | ||
44 | int index = hash_func(op->tag, hash_table_size); | ||
45 | |||
46 | spin_lock(&htable_ops_in_progress_lock); | ||
47 | list_add_tail(&op->list, &htable_ops_in_progress[index]); | ||
48 | spin_unlock(&htable_ops_in_progress_lock); | ||
49 | } | ||
50 | |||
51 | static struct pvfs2_kernel_op_s *pvfs2_devreq_remove_op(__u64 tag) | ||
52 | { | ||
53 | struct pvfs2_kernel_op_s *op, *next; | ||
54 | int index; | ||
55 | |||
56 | index = hash_func(tag, hash_table_size); | ||
57 | |||
58 | spin_lock(&htable_ops_in_progress_lock); | ||
59 | list_for_each_entry_safe(op, | ||
60 | next, | ||
61 | &htable_ops_in_progress[index], | ||
62 | list) { | ||
63 | if (op->tag == tag) { | ||
64 | list_del(&op->list); | ||
65 | spin_unlock(&htable_ops_in_progress_lock); | ||
66 | return op; | ||
67 | } | ||
68 | } | ||
69 | |||
70 | spin_unlock(&htable_ops_in_progress_lock); | ||
71 | return NULL; | ||
72 | } | ||
73 | |||
/*
 * Open /dev/pvfs2-req.  Only non-blocking opens are allowed, and only
 * one process (the client-core daemon) may hold the device open at a
 * time -- open_access_count enforces this under devreq_mutex.
 */
static int pvfs2_devreq_open(struct inode *inode, struct file *file)
{
	int ret = -EINVAL;

	if (!(file->f_flags & O_NONBLOCK)) {
		gossip_err("pvfs2: device cannot be opened in blocking mode\n");
		goto out;
	}
	ret = -EACCES;
	gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: opening device\n");
	mutex_lock(&devreq_mutex);

	if (open_access_count == 0) {
		ret = generic_file_open(inode, file);
		if (ret == 0)
			open_access_count++;
	} else {
		/* a second opener is a client-core misconfiguration */
		DUMP_DEVICE_ERROR();
	}
	mutex_unlock(&devreq_mutex);

out:

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "pvfs2-client-core: open device complete (ret = %d)\n",
		     ret);
	return ret;
}
102 | |||
/*
 * Read the next upcall off the request list into the client-core
 * daemon's buffer.
 *
 * Ops carry a "linger" protocol: op_linger == 1 means a single read
 * pushes the whole upcall; op_linger == 2 means two reads are needed
 * (stage 1: the upcall itself, stage 2: a trailer buffer), tracked via
 * op_linger_tmp.  Blocking reads are rejected outright.  Returns the
 * number of bytes produced, 0, or a negative errno.
 */
static ssize_t pvfs2_devreq_read(struct file *file,
				 char __user *buf,
				 size_t count, loff_t *offset)
{
	int ret = 0;
	ssize_t len = 0;
	struct pvfs2_kernel_op_s *cur_op = NULL;
	static __s32 magic = PVFS2_DEVREQ_MAGIC;
	__s32 proto_ver = PVFS_KERNEL_PROTO_VERSION;

	if (!(file->f_flags & O_NONBLOCK)) {
		/* We do not support blocking reads/opens any more */
		gossip_err("pvfs2: blocking reads are not supported! (pvfs2-client-core bug)\n");
		return -EINVAL;
	} else {
		struct pvfs2_kernel_op_s *op = NULL, *temp = NULL;
		/* get next op (if any) from top of list */
		spin_lock(&pvfs2_request_list_lock);
		list_for_each_entry_safe(op, temp, &pvfs2_request_list, list) {
			__s32 fsid = fsid_of_op(op);
			/*
			 * Check if this op's fsid is known and needs
			 * remounting -- such ops must wait until the
			 * remount completes.
			 */
			if (fsid != PVFS_FS_ID_NULL &&
			    fs_mount_pending(fsid) == 1) {
				gossip_debug(GOSSIP_DEV_DEBUG,
					     "Skipping op tag %llu %s\n",
					     llu(op->tag),
					     get_opname_string(op));
				continue;
			} else {
				/*
				 * op does not belong to any particular fsid
				 * or already mounted.. let it through
				 */
				cur_op = op;
				spin_lock(&cur_op->lock);
				list_del(&cur_op->list);
				cur_op->op_linger_tmp--;
				/*
				 * if there is a trailer, re-add it to
				 * the request list so stage 2 can be
				 * read later.
				 */
				if (cur_op->op_linger == 2 &&
				    cur_op->op_linger_tmp == 1) {
					if (cur_op->upcall.trailer_size <= 0 ||
					    cur_op->upcall.trailer_buf == NULL)
						gossip_err("BUG:trailer_size is %ld and trailer buf is %p\n", (long)cur_op->upcall.trailer_size, cur_op->upcall.trailer_buf);
					/* re-add it to the head of the list */
					list_add(&cur_op->list,
						 &pvfs2_request_list);
				}
				spin_unlock(&cur_op->lock);
				break;
			}
		}
		spin_unlock(&pvfs2_request_list_lock);
	}

	if (cur_op) {
		spin_lock(&cur_op->lock);

		gossip_debug(GOSSIP_DEV_DEBUG,
			     "client-core: reading op tag %llu %s\n",
			     llu(cur_op->tag), get_opname_string(cur_op));
		if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
			if (cur_op->op_linger == 1)
				gossip_err("WARNING: Current op already queued...skipping\n");
		} else if (cur_op->op_linger == 1 ||
			   (cur_op->op_linger == 2 &&
			    cur_op->op_linger_tmp == 0)) {
			/*
			 * atomically move the operation to the
			 * htable_ops_in_progress
			 */
			set_op_state_inprogress(cur_op);
			pvfs2_devreq_add_op(cur_op);
		}

		spin_unlock(&cur_op->lock);

		/* 2 cases
		 * a) OPs with no trailers
		 * b) OPs with trailers, Stage 1
		 * Either way push the upcall out
		 */
		if (cur_op->op_linger == 1 ||
		    (cur_op->op_linger == 2 && cur_op->op_linger_tmp == 1)) {
			len = MAX_ALIGNED_DEV_REQ_UPSIZE;
			if ((size_t) len <= count) {
				/*
				 * wire layout: proto version, magic, tag,
				 * then the upcall struct itself
				 */
				ret = copy_to_user(buf,
						   &proto_ver,
						   sizeof(__s32));
				if (ret == 0) {
					ret = copy_to_user(buf + sizeof(__s32),
							   &magic,
							   sizeof(__s32));
					if (ret == 0) {
						ret = copy_to_user(buf+2 * sizeof(__s32),
								   &cur_op->tag,
								   sizeof(__u64));
						if (ret == 0) {
							ret = copy_to_user(
								buf +
								2 *
								sizeof(__s32) +
								sizeof(__u64),
								&cur_op->upcall,
								sizeof(struct pvfs2_upcall_s));
						}
					}
				}

				if (ret) {
					gossip_err("Failed to copy data to user space\n");
					len = -EFAULT;
				}
			} else {
				/* caller's buffer is too small for an upcall */
				gossip_err
				    ("Failed to copy data to user space\n");
				len = -EIO;
			}
		}
		/* Stage 2: Push the trailer out */
		else if (cur_op->op_linger == 2 && cur_op->op_linger_tmp == 0) {
			len = cur_op->upcall.trailer_size;
			if ((size_t) len <= count) {
				ret = copy_to_user(buf,
						   cur_op->upcall.trailer_buf,
						   len);
				if (ret) {
					gossip_err("Failed to copy trailer to user space\n");
					len = -EFAULT;
				}
			} else {
				gossip_err("Read buffer for trailer is too small (%ld as opposed to %ld)\n",
					   (long)count,
					   (long)len);
				len = -EIO;
			}
		} else {
			/* inconsistent linger state -- should not happen */
			gossip_err("cur_op: %p (op_linger %d), (op_linger_tmp %d), erroneous request list?\n",
				   cur_op,
				   cur_op->op_linger,
				   cur_op->op_linger_tmp);
			len = 0;
		}
	} else if (file->f_flags & O_NONBLOCK) {
		/*
		 * if in non-blocking mode, return EAGAIN since no requests are
		 * ready yet
		 */
		len = -EAGAIN;
	}
	return len;
}
260 | |||
/*
 * Function for writev() callers into the device: accept a downcall from
 * the client-core daemon and complete the matching in-progress op.
 *
 * The iovec must have exactly 4 elements (version, magic, tag, downcall)
 * or 5 when a trailer is attached.  After copying the downcall into the
 * op found by tag, the op is marked serviced and its waiters are woken;
 * for synchronous file I/O ops this thread additionally blocks until
 * the VFS side signals io_completed, to keep the shared buffers alive.
 * Returns the payload size consumed or a negative errno.
 */
static ssize_t pvfs2_devreq_writev(struct file *file,
				   const struct iovec *iov,
				   size_t count,
				   loff_t *offset)
{
	struct pvfs2_kernel_op_s *op = NULL;
	void *buffer = NULL;
	void *ptr = NULL;
	unsigned long i = 0;
	static int max_downsize = MAX_ALIGNED_DEV_REQ_DOWNSIZE;
	int ret = 0, num_remaining = max_downsize;
	int notrailer_count = 4; /* num elements in iovec without trailer */
	int payload_size = 0;
	__s32 magic = 0;
	__s32 proto_ver = 0;
	__u64 tag = 0;
	ssize_t total_returned_size = 0;

	/* Either there is a trailer or there isn't */
	if (count != notrailer_count && count != (notrailer_count + 1)) {
		gossip_err("Error: Number of iov vectors is (%ld) and notrailer count is %d\n",
			count,
			notrailer_count);
		return -EPROTO;
	}
	buffer = dev_req_alloc();
	if (!buffer)
		return -ENOMEM;
	ptr = buffer;

	/* gather the fixed (non-trailer) iovec elements into one buffer */
	for (i = 0; i < notrailer_count; i++) {
		if (iov[i].iov_len > num_remaining) {
			gossip_err
			    ("writev error: Freeing buffer and returning\n");
			dev_req_release(buffer);
			return -EMSGSIZE;
		}
		ret = copy_from_user(ptr, iov[i].iov_base, iov[i].iov_len);
		if (ret) {
			gossip_err("Failed to copy data from user space\n");
			dev_req_release(buffer);
			return -EIO;
		}
		num_remaining -= iov[i].iov_len;
		ptr += iov[i].iov_len;
		payload_size += iov[i].iov_len;
	}
	total_returned_size = payload_size;

	/* these elements are currently 8 byte aligned (8 bytes for (version +
	 * magic) 8 bytes for tag).  If you add another element, either
	 * make it 8 bytes big, or use get_unaligned when assigning.
	 */
	ptr = buffer;
	proto_ver = *((__s32 *) ptr);
	ptr += sizeof(__s32);

	magic = *((__s32 *) ptr);
	ptr += sizeof(__s32);

	tag = *((__u64 *) ptr);
	ptr += sizeof(__u64);

	if (magic != PVFS2_DEVREQ_MAGIC) {
		gossip_err("Error: Device magic number does not match.\n");
		dev_req_release(buffer);
		return -EPROTO;
	}

	/*
	 * proto_ver = 20902 for 2.9.2
	 */

	/* look up the in-progress op this downcall answers */
	op = pvfs2_devreq_remove_op(tag);
	if (op) {
		/* Increase ref count! */
		get_op(op);
		/* cut off magic and tag from payload size */
		payload_size -= (2 * sizeof(__s32) + sizeof(__u64));
		if (payload_size <= sizeof(struct pvfs2_downcall_s))
			/* copy the passed in downcall into the op */
			memcpy(&op->downcall,
			       ptr,
			       sizeof(struct pvfs2_downcall_s));
		else
			gossip_debug(GOSSIP_DEV_DEBUG,
				     "writev: Ignoring %d bytes\n",
				     payload_size);

		/* Do not allocate needlessly if client-core forgets
		 * to reset trailer size on op errors.
		 */
		if (op->downcall.status == 0 && op->downcall.trailer_size > 0) {
			gossip_debug(GOSSIP_DEV_DEBUG,
				     "writev: trailer size %ld\n",
				     (unsigned long)op->downcall.trailer_size);
			if (count != (notrailer_count + 1)) {
				gossip_err("Error: trailer size (%ld) is non-zero, no trailer elements though? (%ld)\n", (unsigned long)op->downcall.trailer_size, count);
				dev_req_release(buffer);
				put_op(op);
				return -EPROTO;
			}
			if (iov[notrailer_count].iov_len >
			    op->downcall.trailer_size) {
				gossip_err("writev error: trailer size (%ld) != iov_len (%ld)\n", (unsigned long)op->downcall.trailer_size, (unsigned long)iov[notrailer_count].iov_len);
				dev_req_release(buffer);
				put_op(op);
				return -EMSGSIZE;
			}
			/* Allocate a buffer large enough to hold the
			 * trailer bytes.
			 */
			op->downcall.trailer_buf =
			    vmalloc(op->downcall.trailer_size);
			if (op->downcall.trailer_buf != NULL) {
				gossip_debug(GOSSIP_DEV_DEBUG, "vmalloc: %p\n",
					     op->downcall.trailer_buf);
				ret = copy_from_user(op->downcall.trailer_buf,
						     iov[notrailer_count].
						     iov_base,
						     iov[notrailer_count].
						     iov_len);
				if (ret) {
					gossip_err("Failed to copy trailer data from user space\n");
					dev_req_release(buffer);
					gossip_debug(GOSSIP_DEV_DEBUG,
						     "vfree: %p\n",
						     op->downcall.trailer_buf);
					vfree(op->downcall.trailer_buf);
					op->downcall.trailer_buf = NULL;
					put_op(op);
					return -EIO;
				}
			} else {
				/* Change downcall status */
				op->downcall.status = -ENOMEM;
				gossip_err("writev: could not vmalloc for trailer!\n");
			}
		}

		/* if this operation is an I/O operation and if it was
		 * initiated on behalf of a *synchronous* VFS I/O operation,
		 * only then we need to wait
		 * for all data to be copied before we can return to avoid
		 * buffer corruption and races that can pull the buffers
		 * out from under us.
		 *
		 * Essentially we're synchronizing with other parts of the
		 * vfs implicitly by not allowing the user space
		 * application reading/writing this device to return until
		 * the buffers are done being used.
		 */
		if ((op->upcall.type == PVFS2_VFS_OP_FILE_IO &&
		     op->upcall.req.io.async_vfs_io == PVFS_VFS_SYNC_IO) ||
		     op->upcall.type == PVFS2_VFS_OP_FILE_IOX) {
			int timed_out = 0;
			DECLARE_WAITQUEUE(wait_entry, current);

			/* tell the vfs op waiting on a waitqueue
			 * that this op is done
			 */
			spin_lock(&op->lock);
			set_op_state_serviced(op);
			spin_unlock(&op->lock);

			add_wait_queue_exclusive(&op->io_completion_waitq,
						 &wait_entry);
			wake_up_interruptible(&op->waitq);

			/* sleep until the VFS side flags io_completed,
			 * a timeout elapses, or a signal arrives
			 */
			while (1) {
				set_current_state(TASK_INTERRUPTIBLE);

				spin_lock(&op->lock);
				if (op->io_completed) {
					spin_unlock(&op->lock);
					break;
				}
				spin_unlock(&op->lock);

				if (!signal_pending(current)) {
					int timeout =
					    MSECS_TO_JIFFIES(1000 *
							     op_timeout_secs);
					if (!schedule_timeout(timeout)) {
						gossip_debug(GOSSIP_DEV_DEBUG, "*** I/O wait time is up\n");
						timed_out = 1;
						break;
					}
					continue;
				}

				gossip_debug(GOSSIP_DEV_DEBUG, "*** signal on I/O wait -- aborting\n");
				break;
			}

			set_current_state(TASK_RUNNING);
			remove_wait_queue(&op->io_completion_waitq,
					  &wait_entry);

			/* NOTE: for I/O operations we handle releasing the op
			 * object except in the case of timeout.  the reason we
			 * can't free the op in timeout cases is that the op
			 * service logic in the vfs retries operations using
			 * the same op ptr, thus it can't be freed.
			 */
			if (!timed_out)
				op_release(op);
		} else {

			/*
			 * tell the vfs op waiting on a waitqueue that
			 * this op is done
			 */
			spin_lock(&op->lock);
			set_op_state_serviced(op);
			spin_unlock(&op->lock);
			/*
			 * for every other operation (i.e. non-I/O), we need to
			 * wake up the callers for downcall completion
			 * notification
			 */
			wake_up_interruptible(&op->waitq);
		}
	} else {
		/* ignore downcalls that we're not interested in */
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "WARNING: No one's waiting for tag %llu\n",
			     llu(tag));
	}
	dev_req_release(buffer);

	return total_returned_size;
}
495 | |||
496 | static ssize_t pvfs2_devreq_write_iter(struct kiocb *iocb, | ||
497 | struct iov_iter *iter) | ||
498 | { | ||
499 | return pvfs2_devreq_writev(iocb->ki_filp, | ||
500 | iter->iov, | ||
501 | iter->nr_segs, | ||
502 | &iocb->ki_pos); | ||
503 | } | ||
504 | |||
505 | /* Returns whether any FS are still pending remounted */ | ||
506 | static int mark_all_pending_mounts(void) | ||
507 | { | ||
508 | int unmounted = 1; | ||
509 | struct pvfs2_sb_info_s *pvfs2_sb = NULL; | ||
510 | |||
511 | spin_lock(&pvfs2_superblocks_lock); | ||
512 | list_for_each_entry(pvfs2_sb, &pvfs2_superblocks, list) { | ||
513 | /* All of these file system require a remount */ | ||
514 | pvfs2_sb->mount_pending = 1; | ||
515 | unmounted = 0; | ||
516 | } | ||
517 | spin_unlock(&pvfs2_superblocks_lock); | ||
518 | return unmounted; | ||
519 | } | ||
520 | |||
521 | /* | ||
522 | * Determine if a given file system needs to be remounted or not | ||
523 | * Returns -1 on error | ||
524 | * 0 if already mounted | ||
525 | * 1 if needs remount | ||
526 | */ | ||
527 | int fs_mount_pending(__s32 fsid) | ||
528 | { | ||
529 | int mount_pending = -1; | ||
530 | struct pvfs2_sb_info_s *pvfs2_sb = NULL; | ||
531 | |||
532 | spin_lock(&pvfs2_superblocks_lock); | ||
533 | list_for_each_entry(pvfs2_sb, &pvfs2_superblocks, list) { | ||
534 | if (pvfs2_sb->fs_id == fsid) { | ||
535 | mount_pending = pvfs2_sb->mount_pending; | ||
536 | break; | ||
537 | } | ||
538 | } | ||
539 | spin_unlock(&pvfs2_superblocks_lock); | ||
540 | return mount_pending; | ||
541 | } | ||
542 | |||
/*
 * NOTE: gets called when the last reference to this device is dropped.
 * Using the open_access_count variable, we enforce a reference count
 * on this file so that it can be opened by only one process at a time.
 * the devreq_mutex is used to make sure all i/o has completed
 * before we call pvfs_bufmap_finalize, and similar such tricky
 * situations
 */
static int pvfs2_devreq_release(struct inode *inode, struct file *file)
{
	int unmounted = 0;

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "%s:pvfs2-client-core: exiting, closing device\n",
		     __func__);

	mutex_lock(&devreq_mutex);
	/* tear down the shared I/O buffer mapping first */
	pvfs_bufmap_finalize();

	open_access_count--;

	/* every mounted fs now needs a remount once a daemon returns */
	unmounted = mark_all_pending_mounts();
	gossip_debug(GOSSIP_DEV_DEBUG, "PVFS2 Device Close: Filesystem(s) %s\n",
		     (unmounted ? "UNMOUNTED" : "MOUNTED"));
	mutex_unlock(&devreq_mutex);

	/*
	 * Walk through the list of ops in the request list, mark them
	 * as purged and wake them up.
	 */
	purge_waiting_ops();
	/*
	 * Walk through the hash table of in progress operations; mark
	 * them as purged and wake them up
	 */
	purge_inprogress_ops();
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "pvfs2-client-core: device close complete\n");
	return 0;
}
583 | |||
584 | int is_daemon_in_service(void) | ||
585 | { | ||
586 | int in_service; | ||
587 | |||
588 | /* | ||
589 | * What this function does is checks if client-core is alive | ||
590 | * based on the access count we maintain on the device. | ||
591 | */ | ||
592 | mutex_lock(&devreq_mutex); | ||
593 | in_service = open_access_count == 1 ? 0 : -EIO; | ||
594 | mutex_unlock(&devreq_mutex); | ||
595 | return in_service; | ||
596 | } | ||
597 | |||
598 | static inline long check_ioctl_command(unsigned int command) | ||
599 | { | ||
600 | /* Check for valid ioctl codes */ | ||
601 | if (_IOC_TYPE(command) != PVFS_DEV_MAGIC) { | ||
602 | gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n", | ||
603 | command, | ||
604 | _IOC_TYPE(command), | ||
605 | PVFS_DEV_MAGIC); | ||
606 | return -EINVAL; | ||
607 | } | ||
608 | /* and valid ioctl commands */ | ||
609 | if (_IOC_NR(command) >= PVFS_DEV_MAXNR || _IOC_NR(command) <= 0) { | ||
610 | gossip_err("Invalid ioctl command number [%d >= %d]\n", | ||
611 | _IOC_NR(command), PVFS_DEV_MAXNR); | ||
612 | return -ENOIOCTLCMD; | ||
613 | } | ||
614 | return 0; | ||
615 | } | ||
616 | |||
617 | static long dispatch_ioctl_command(unsigned int command, unsigned long arg) | ||
618 | { | ||
619 | static __s32 magic = PVFS2_DEVREQ_MAGIC; | ||
620 | static __s32 max_up_size = MAX_ALIGNED_DEV_REQ_UPSIZE; | ||
621 | static __s32 max_down_size = MAX_ALIGNED_DEV_REQ_DOWNSIZE; | ||
622 | struct PVFS_dev_map_desc user_desc; | ||
623 | int ret = 0; | ||
624 | struct dev_mask_info_s mask_info = { 0 }; | ||
625 | struct dev_mask2_info_s mask2_info = { 0, 0 }; | ||
626 | int upstream_kmod = 1; | ||
627 | struct list_head *tmp = NULL; | ||
628 | struct pvfs2_sb_info_s *pvfs2_sb = NULL; | ||
629 | |||
630 | /* mtmoore: add locking here */ | ||
631 | |||
632 | switch (command) { | ||
633 | case PVFS_DEV_GET_MAGIC: | ||
634 | return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ? | ||
635 | -EIO : | ||
636 | 0); | ||
637 | case PVFS_DEV_GET_MAX_UPSIZE: | ||
638 | return ((put_user(max_up_size, | ||
639 | (__s32 __user *) arg) == -EFAULT) ? | ||
640 | -EIO : | ||
641 | 0); | ||
642 | case PVFS_DEV_GET_MAX_DOWNSIZE: | ||
643 | return ((put_user(max_down_size, | ||
644 | (__s32 __user *) arg) == -EFAULT) ? | ||
645 | -EIO : | ||
646 | 0); | ||
647 | case PVFS_DEV_MAP: | ||
648 | ret = copy_from_user(&user_desc, | ||
649 | (struct PVFS_dev_map_desc __user *) | ||
650 | arg, | ||
651 | sizeof(struct PVFS_dev_map_desc)); | ||
652 | return ret ? -EIO : pvfs_bufmap_initialize(&user_desc); | ||
653 | case PVFS_DEV_REMOUNT_ALL: | ||
654 | gossip_debug(GOSSIP_DEV_DEBUG, | ||
655 | "pvfs2_devreq_ioctl: got PVFS_DEV_REMOUNT_ALL\n"); | ||
656 | |||
657 | /* | ||
658 | * remount all mounted pvfs2 volumes to regain the lost | ||
659 | * dynamic mount tables (if any) -- NOTE: this is done | ||
660 | * without keeping the superblock list locked due to the | ||
661 | * upcall/downcall waiting. also, the request semaphore is | ||
662 | * used to ensure that no operations will be serviced until | ||
663 | * all of the remounts are serviced (to avoid ops between | ||
664 | * mounts to fail) | ||
665 | */ | ||
666 | ret = mutex_lock_interruptible(&request_mutex); | ||
667 | if (ret < 0) | ||
668 | return ret; | ||
669 | gossip_debug(GOSSIP_DEV_DEBUG, | ||
670 | "pvfs2_devreq_ioctl: priority remount in progress\n"); | ||
671 | list_for_each(tmp, &pvfs2_superblocks) { | ||
672 | pvfs2_sb = | ||
673 | list_entry(tmp, struct pvfs2_sb_info_s, list); | ||
674 | if (pvfs2_sb && (pvfs2_sb->sb)) { | ||
675 | gossip_debug(GOSSIP_DEV_DEBUG, | ||
676 | "Remounting SB %p\n", | ||
677 | pvfs2_sb); | ||
678 | |||
679 | ret = pvfs2_remount(pvfs2_sb->sb); | ||
680 | if (ret) { | ||
681 | gossip_debug(GOSSIP_DEV_DEBUG, | ||
682 | "SB %p remount failed\n", | ||
683 | pvfs2_sb); | ||
684 | break; | ||
685 | } | ||
686 | } | ||
687 | } | ||
688 | gossip_debug(GOSSIP_DEV_DEBUG, | ||
689 | "pvfs2_devreq_ioctl: priority remount complete\n"); | ||
690 | mutex_unlock(&request_mutex); | ||
691 | return ret; | ||
692 | |||
693 | case PVFS_DEV_UPSTREAM: | ||
694 | ret = copy_to_user((void __user *)arg, | ||
695 | &upstream_kmod, | ||
696 | sizeof(upstream_kmod)); | ||
697 | |||
698 | if (ret != 0) | ||
699 | return -EIO; | ||
700 | else | ||
701 | return ret; | ||
702 | |||
703 | case PVFS_DEV_CLIENT_MASK: | ||
704 | ret = copy_from_user(&mask2_info, | ||
705 | (void __user *)arg, | ||
706 | sizeof(struct dev_mask2_info_s)); | ||
707 | |||
708 | if (ret != 0) | ||
709 | return -EIO; | ||
710 | |||
711 | client_debug_mask.mask1 = mask2_info.mask1_value; | ||
712 | client_debug_mask.mask2 = mask2_info.mask2_value; | ||
713 | |||
714 | pr_info("%s: client debug mask has been been received " | ||
715 | ":%llx: :%llx:\n", | ||
716 | __func__, | ||
717 | (unsigned long long)client_debug_mask.mask1, | ||
718 | (unsigned long long)client_debug_mask.mask2); | ||
719 | |||
720 | return ret; | ||
721 | |||
722 | case PVFS_DEV_CLIENT_STRING: | ||
723 | ret = copy_from_user(&client_debug_array_string, | ||
724 | (void __user *)arg, | ||
725 | PVFS2_MAX_DEBUG_STRING_LEN); | ||
726 | if (ret != 0) { | ||
727 | pr_info("%s: " | ||
728 | "PVFS_DEV_CLIENT_STRING: copy_from_user failed" | ||
729 | "\n", | ||
730 | __func__); | ||
731 | return -EIO; | ||
732 | } | ||
733 | |||
734 | pr_info("%s: client debug array string has been been received." | ||
735 | "\n", | ||
736 | __func__); | ||
737 | |||
738 | if (!help_string_initialized) { | ||
739 | |||
740 | /* Free the "we don't know yet" default string... */ | ||
741 | kfree(debug_help_string); | ||
742 | |||
743 | /* build a proper debug help string */ | ||
744 | if (orangefs_prepare_debugfs_help_string(0)) { | ||
745 | gossip_err("%s: " | ||
746 | "prepare_debugfs_help_string failed" | ||
747 | "\n", | ||
748 | __func__); | ||
749 | return -EIO; | ||
750 | } | ||
751 | |||
752 | /* Replace the boilerplate boot-time debug-help file. */ | ||
753 | debugfs_remove(help_file_dentry); | ||
754 | |||
755 | help_file_dentry = | ||
756 | debugfs_create_file( | ||
757 | ORANGEFS_KMOD_DEBUG_HELP_FILE, | ||
758 | 0444, | ||
759 | debug_dir, | ||
760 | debug_help_string, | ||
761 | &debug_help_fops); | ||
762 | |||
763 | if (!help_file_dentry) { | ||
764 | gossip_err("%s: debugfs_create_file failed for" | ||
765 | " :%s:!\n", | ||
766 | __func__, | ||
767 | ORANGEFS_KMOD_DEBUG_HELP_FILE); | ||
768 | return -EIO; | ||
769 | } | ||
770 | } | ||
771 | |||
772 | debug_mask_to_string(&client_debug_mask, 1); | ||
773 | |||
774 | debugfs_remove(client_debug_dentry); | ||
775 | |||
776 | pvfs2_client_debug_init(); | ||
777 | |||
778 | help_string_initialized++; | ||
779 | |||
780 | return ret; | ||
781 | |||
782 | case PVFS_DEV_DEBUG: | ||
783 | ret = copy_from_user(&mask_info, | ||
784 | (void __user *)arg, | ||
785 | sizeof(mask_info)); | ||
786 | |||
787 | if (ret != 0) | ||
788 | return -EIO; | ||
789 | |||
790 | if (mask_info.mask_type == KERNEL_MASK) { | ||
791 | if ((mask_info.mask_value == 0) | ||
792 | && (kernel_mask_set_mod_init)) { | ||
793 | /* | ||
794 | * the kernel debug mask was set when the | ||
795 | * kernel module was loaded; don't override | ||
796 | * it if the client-core was started without | ||
797 | * a value for PVFS2_KMODMASK. | ||
798 | */ | ||
799 | return 0; | ||
800 | } | ||
801 | debug_mask_to_string(&mask_info.mask_value, | ||
802 | mask_info.mask_type); | ||
803 | gossip_debug_mask = mask_info.mask_value; | ||
804 | pr_info("PVFS: kernel debug mask has been modified to " | ||
805 | ":%s: :%llx:\n", | ||
806 | kernel_debug_string, | ||
807 | (unsigned long long)gossip_debug_mask); | ||
808 | } else if (mask_info.mask_type == CLIENT_MASK) { | ||
809 | debug_mask_to_string(&mask_info.mask_value, | ||
810 | mask_info.mask_type); | ||
811 | pr_info("PVFS: client debug mask has been modified to" | ||
812 | ":%s: :%llx:\n", | ||
813 | client_debug_string, | ||
814 | llu(mask_info.mask_value)); | ||
815 | } else { | ||
816 | gossip_lerr("Invalid mask type....\n"); | ||
817 | return -EINVAL; | ||
818 | } | ||
819 | |||
820 | return ret; | ||
821 | |||
822 | default: | ||
823 | return -ENOIOCTLCMD; | ||
824 | } | ||
825 | return -ENOIOCTLCMD; | ||
826 | } | ||
827 | |||
/*
 * Native ioctl entry point for the request device: validate the
 * command, then dispatch it.
 *
 * Fix: dropped the needless (int) narrowing casts on the long return
 * values; both callees already return long.
 */
static long pvfs2_devreq_ioctl(struct file *file,
			       unsigned int command, unsigned long arg)
{
	long ret;

	/* Check for properly constructed commands */
	ret = check_ioctl_command(command);
	if (ret < 0)
		return ret;

	return dispatch_ioctl_command(command, arg);
}
840 | |||
841 | #ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ | ||
842 | |||
/*
 * Compat structure for the PVFS_DEV_MAP ioctl: the 32-bit user-space
 * layout of struct PVFS_dev_map_desc, with the buffer pointer carried
 * as a compat_uptr_t instead of a native pointer.
 */
struct PVFS_dev_map_desc32 {
	compat_uptr_t ptr;	/* 32-bit user address of the shared buffer */
	__s32 total_size;
	__s32 size;
	__s32 count;
};
850 | |||
/*
 * Rebuild a native (64-bit) struct PVFS_dev_map_desc on the user
 * call stack, via compat_alloc_user_space(), from the 32-bit layout
 * found at @args.
 *
 * Returns the user address of the native copy (with *error = 0), or
 * 0 with *error = -EFAULT if any user-space access faults.
 */
static unsigned long translate_dev_map26(unsigned long args, long *error)
{
	struct PVFS_dev_map_desc32 __user *p32 = (void __user *)args;
	/*
	 * Depending on the architecture, allocate some space on the
	 * user-call-stack based on our expected layout.
	 */
	struct PVFS_dev_map_desc __user *p =
	    compat_alloc_user_space(sizeof(*p));
	u32 addr;

	*error = 0;
	/* get the ptr from the 32 bit user-space */
	if (get_user(addr, &p32->ptr))
		goto err;
	/* try to put that into a 64-bit layout */
	if (put_user(compat_ptr(addr), &p->ptr))
		goto err;
	/* copy the remaining fields */
	if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32)))
		goto err;
	if (copy_in_user(&p->size, &p32->size, sizeof(__s32)))
		goto err;
	if (copy_in_user(&p->count, &p32->count, sizeof(__s32)))
		goto err;
	return (unsigned long)p;
err:
	*error = -EFAULT;
	return 0;
}
881 | |||
882 | /* | ||
883 | * 32 bit user-space apps' ioctl handlers when kernel modules | ||
884 | * is compiled as a 64 bit one | ||
885 | */ | ||
886 | static long pvfs2_devreq_compat_ioctl(struct file *filp, unsigned int cmd, | ||
887 | unsigned long args) | ||
888 | { | ||
889 | long ret; | ||
890 | unsigned long arg = args; | ||
891 | |||
892 | /* Check for properly constructed commands */ | ||
893 | ret = check_ioctl_command(cmd); | ||
894 | if (ret < 0) | ||
895 | return ret; | ||
896 | if (cmd == PVFS_DEV_MAP) { | ||
897 | /* | ||
898 | * convert the arguments to what we expect internally | ||
899 | * in kernel space | ||
900 | */ | ||
901 | arg = translate_dev_map26(args, &ret); | ||
902 | if (ret < 0) { | ||
903 | gossip_err("Could not translate dev map\n"); | ||
904 | return ret; | ||
905 | } | ||
906 | } | ||
907 | /* no other ioctl requires translation */ | ||
908 | return dispatch_ioctl_command(cmd, arg); | ||
909 | } | ||
910 | |||
/*
 * Register the ioctl32 sub-system.  Nothing to do on current kernels;
 * kept as a hook for symmetry with pvfs2_ioctl32_cleanup().
 */
static int pvfs2_ioctl32_init(void)
{
	return 0;
}
915 | |||
/*
 * Unregister the ioctl32 sub-system.  Nothing to do on current
 * kernels; kept as a hook for symmetry with pvfs2_ioctl32_init().
 *
 * Fix: dropped the redundant bare "return;" at the end of a void
 * function (flagged by checkpatch).
 */
static void pvfs2_ioctl32_cleanup(void)
{
}
920 | |||
921 | #endif /* CONFIG_COMPAT is in .config */ | ||
922 | |||
923 | /* the assigned character device major number */ | ||
924 | static int pvfs2_dev_major; | ||
925 | |||
/*
 * Initialize pvfs2 device specific state:
 * Must be called at module load time only
 */
int pvfs2_dev_init(void)
{
	int ret;

	/* register the ioctl32 sub-system (currently a no-op hook) */
	ret = pvfs2_ioctl32_init();
	if (ret < 0)
		return ret;

	/* register pvfs2-req device; major 0 requests a dynamic major */
	pvfs2_dev_major = register_chrdev(0,
					  PVFS2_REQDEVICE_NAME,
					  &pvfs2_devreq_file_operations);
	if (pvfs2_dev_major < 0) {
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "Failed to register /dev/%s (error %d)\n",
			     PVFS2_REQDEVICE_NAME, pvfs2_dev_major);
		/* undo the ioctl32 registration on failure */
		pvfs2_ioctl32_cleanup();
		return pvfs2_dev_major;
	}

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "*** /dev/%s character device registered ***\n",
		     PVFS2_REQDEVICE_NAME);
	gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
		     PVFS2_REQDEVICE_NAME, pvfs2_dev_major);
	return 0;
}
958 | |||
/*
 * Tear down the device state set up by pvfs2_dev_init().
 * Must be called at module unload time only.
 */
void pvfs2_dev_cleanup(void)
{
	unregister_chrdev(pvfs2_dev_major, PVFS2_REQDEVICE_NAME);
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "*** /dev/%s character device unregistered ***\n",
		     PVFS2_REQDEVICE_NAME);
	/* unregister the ioctl32 sub-system */
	pvfs2_ioctl32_cleanup();
}
968 | |||
969 | static unsigned int pvfs2_devreq_poll(struct file *file, | ||
970 | struct poll_table_struct *poll_table) | ||
971 | { | ||
972 | int poll_revent_mask = 0; | ||
973 | |||
974 | if (open_access_count == 1) { | ||
975 | poll_wait(file, &pvfs2_request_list_waitq, poll_table); | ||
976 | |||
977 | spin_lock(&pvfs2_request_list_lock); | ||
978 | if (!list_empty(&pvfs2_request_list)) | ||
979 | poll_revent_mask |= POLL_IN; | ||
980 | spin_unlock(&pvfs2_request_list_lock); | ||
981 | } | ||
982 | return poll_revent_mask; | ||
983 | } | ||
984 | |||
/*
 * File operations for the pvfs2 request device (PVFS2_REQDEVICE_NAME).
 * The user-space client-core reads upcalls from and writes downcalls
 * to this device; pvfs2_devreq_release() enforces the one-opener rule.
 */
const struct file_operations pvfs2_devreq_file_operations = {
	.owner = THIS_MODULE,
	.read = pvfs2_devreq_read,
	.write_iter = pvfs2_devreq_write_iter,
	.open = pvfs2_devreq_open,
	.release = pvfs2_devreq_release,
	.unlocked_ioctl = pvfs2_devreq_ioctl,

#ifdef CONFIG_COMPAT		/* CONFIG_COMPAT is in .config */
	.compat_ioctl = pvfs2_devreq_compat_ioctl,
#endif
	.poll = pvfs2_devreq_poll
};
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c new file mode 100644 index 000000000000..9b5f4bb17874 --- /dev/null +++ b/fs/orangefs/dir.c | |||
@@ -0,0 +1,394 @@ | |||
1 | /* | ||
2 | * (C) 2001 Clemson University and The University of Chicago | ||
3 | * | ||
4 | * See COPYING in top-level directory. | ||
5 | */ | ||
6 | |||
7 | #include "protocol.h" | ||
8 | #include "pvfs2-kernel.h" | ||
9 | #include "pvfs2-bufmap.h" | ||
10 | |||
/*
 * Bookkeeping for one decoded readdir shared-memory buffer.
 */
struct readdir_handle_s {
	int buffer_index;	/* readdir shared-memory slot; -1 when unowned */
	struct pvfs2_readdir_response_s readdir_response; /* decoded entries */
	void *dents_buf;	/* raw trailer buffer; released with vfree() */
};
16 | |||
17 | /* | ||
18 | * decode routine needed by kmod to make sense of the shared page for readdirs. | ||
19 | */ | ||
20 | static long decode_dirents(char *ptr, struct pvfs2_readdir_response_s *readdir) | ||
21 | { | ||
22 | int i; | ||
23 | struct pvfs2_readdir_response_s *rd = | ||
24 | (struct pvfs2_readdir_response_s *) ptr; | ||
25 | char *buf = ptr; | ||
26 | char **pptr = &buf; | ||
27 | |||
28 | readdir->token = rd->token; | ||
29 | readdir->pvfs_dirent_outcount = rd->pvfs_dirent_outcount; | ||
30 | readdir->dirent_array = kmalloc(readdir->pvfs_dirent_outcount * | ||
31 | sizeof(*readdir->dirent_array), | ||
32 | GFP_KERNEL); | ||
33 | if (readdir->dirent_array == NULL) | ||
34 | return -ENOMEM; | ||
35 | *pptr += offsetof(struct pvfs2_readdir_response_s, dirent_array); | ||
36 | for (i = 0; i < readdir->pvfs_dirent_outcount; i++) { | ||
37 | dec_string(pptr, &readdir->dirent_array[i].d_name, | ||
38 | &readdir->dirent_array[i].d_length); | ||
39 | readdir->dirent_array[i].khandle = | ||
40 | *(struct pvfs2_khandle *) *pptr; | ||
41 | *pptr += 16; | ||
42 | } | ||
43 | return (unsigned long)*pptr - (unsigned long)ptr; | ||
44 | } | ||
45 | |||
46 | static long readdir_handle_ctor(struct readdir_handle_s *rhandle, void *buf, | ||
47 | int buffer_index) | ||
48 | { | ||
49 | long ret; | ||
50 | |||
51 | if (buf == NULL) { | ||
52 | gossip_err | ||
53 | ("Invalid NULL buffer specified in readdir_handle_ctor\n"); | ||
54 | return -ENOMEM; | ||
55 | } | ||
56 | if (buffer_index < 0) { | ||
57 | gossip_err | ||
58 | ("Invalid buffer index specified in readdir_handle_ctor\n"); | ||
59 | return -EINVAL; | ||
60 | } | ||
61 | rhandle->buffer_index = buffer_index; | ||
62 | rhandle->dents_buf = buf; | ||
63 | ret = decode_dirents(buf, &rhandle->readdir_response); | ||
64 | if (ret < 0) { | ||
65 | gossip_err("Could not decode readdir from buffer %ld\n", ret); | ||
66 | rhandle->buffer_index = -1; | ||
67 | gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", buf); | ||
68 | vfree(buf); | ||
69 | rhandle->dents_buf = NULL; | ||
70 | } | ||
71 | return ret; | ||
72 | } | ||
73 | |||
74 | static void readdir_handle_dtor(struct pvfs2_bufmap *bufmap, | ||
75 | struct readdir_handle_s *rhandle) | ||
76 | { | ||
77 | if (rhandle == NULL) | ||
78 | return; | ||
79 | |||
80 | /* kfree(NULL) is safe */ | ||
81 | kfree(rhandle->readdir_response.dirent_array); | ||
82 | rhandle->readdir_response.dirent_array = NULL; | ||
83 | |||
84 | if (rhandle->buffer_index >= 0) { | ||
85 | readdir_index_put(bufmap, rhandle->buffer_index); | ||
86 | rhandle->buffer_index = -1; | ||
87 | } | ||
88 | if (rhandle->dents_buf) { | ||
89 | gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", | ||
90 | rhandle->dents_buf); | ||
91 | vfree(rhandle->dents_buf); | ||
92 | rhandle->dents_buf = NULL; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * Read directory entries from an instance of an open directory. | ||
98 | * | ||
99 | * \note This routine was converted for the readdir to iterate change | ||
100 | * in "struct file_operations". "converted" mostly amounts to | ||
101 | * changing occurrences of "readdir" and "filldir" in the | ||
102 | * comments to "iterate" and "dir_emit". Also filldir calls | ||
103 | * were changed to dir_emit calls. | ||
104 | * | ||
105 | * \param dir_emit callback function called for each entry read. | ||
106 | * | ||
107 | * \retval <0 on error | ||
108 | * \retval 0 when directory has been completely traversed | ||
109 | * \retval >0 if we don't call dir_emit for all entries | ||
110 | * | ||
111 | * \note If the dir_emit call-back returns non-zero, then iterate should | ||
112 | * assume that it has had enough, and should return as well. | ||
113 | */ | ||
114 | static int pvfs2_readdir(struct file *file, struct dir_context *ctx) | ||
115 | { | ||
116 | struct pvfs2_bufmap *bufmap = NULL; | ||
117 | int ret = 0; | ||
118 | int buffer_index; | ||
119 | __u64 *ptoken = file->private_data; | ||
120 | __u64 pos = 0; | ||
121 | ino_t ino = 0; | ||
122 | struct dentry *dentry = file->f_path.dentry; | ||
123 | struct pvfs2_kernel_op_s *new_op = NULL; | ||
124 | struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(dentry->d_inode); | ||
125 | int buffer_full = 0; | ||
126 | struct readdir_handle_s rhandle; | ||
127 | int i = 0; | ||
128 | int len = 0; | ||
129 | ino_t current_ino = 0; | ||
130 | char *current_entry = NULL; | ||
131 | long bytes_decoded; | ||
132 | |||
133 | gossip_ldebug(GOSSIP_DIR_DEBUG, | ||
134 | "%s: ctx->pos:%lld, token = %llu\n", | ||
135 | __func__, | ||
136 | lld(ctx->pos), | ||
137 | llu(*ptoken)); | ||
138 | |||
139 | pos = (__u64) ctx->pos; | ||
140 | |||
141 | /* are we done? */ | ||
142 | if (pos == PVFS_READDIR_END) { | ||
143 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
144 | "Skipping to termination path\n"); | ||
145 | return 0; | ||
146 | } | ||
147 | |||
148 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
149 | "pvfs2_readdir called on %s (pos=%llu)\n", | ||
150 | dentry->d_name.name, llu(pos)); | ||
151 | |||
152 | rhandle.buffer_index = -1; | ||
153 | rhandle.dents_buf = NULL; | ||
154 | memset(&rhandle.readdir_response, 0, sizeof(rhandle.readdir_response)); | ||
155 | |||
156 | new_op = op_alloc(PVFS2_VFS_OP_READDIR); | ||
157 | if (!new_op) | ||
158 | return -ENOMEM; | ||
159 | |||
160 | new_op->uses_shared_memory = 1; | ||
161 | new_op->upcall.req.readdir.refn = pvfs2_inode->refn; | ||
162 | new_op->upcall.req.readdir.max_dirent_count = MAX_DIRENT_COUNT_READDIR; | ||
163 | |||
164 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
165 | "%s: upcall.req.readdir.refn.khandle: %pU\n", | ||
166 | __func__, | ||
167 | &new_op->upcall.req.readdir.refn.khandle); | ||
168 | |||
169 | /* | ||
170 | * NOTE: the position we send to the readdir upcall is out of | ||
171 | * sync with ctx->pos since: | ||
172 | * 1. pvfs2 doesn't include the "." and ".." entries that are | ||
173 | * added below. | ||
174 | * 2. the introduction of distributed directory logic makes token no | ||
175 | * longer be related to f_pos and pos. Instead an independent | ||
176 | * variable is used inside the function and stored in the | ||
177 | * private_data of the file structure. | ||
178 | */ | ||
179 | new_op->upcall.req.readdir.token = *ptoken; | ||
180 | |||
181 | get_new_buffer_index: | ||
182 | ret = readdir_index_get(&bufmap, &buffer_index); | ||
183 | if (ret < 0) { | ||
184 | gossip_lerr("pvfs2_readdir: readdir_index_get() failure (%d)\n", | ||
185 | ret); | ||
186 | goto out_free_op; | ||
187 | } | ||
188 | new_op->upcall.req.readdir.buf_index = buffer_index; | ||
189 | |||
190 | ret = service_operation(new_op, | ||
191 | "pvfs2_readdir", | ||
192 | get_interruptible_flag(dentry->d_inode)); | ||
193 | |||
194 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
195 | "Readdir downcall status is %d. ret:%d\n", | ||
196 | new_op->downcall.status, | ||
197 | ret); | ||
198 | |||
199 | if (ret == -EAGAIN && op_state_purged(new_op)) { | ||
200 | /* | ||
201 | * readdir shared memory aread has been wiped due to | ||
202 | * pvfs2-client-core restarting, so we must get a new | ||
203 | * index into the shared memory. | ||
204 | */ | ||
205 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
206 | "%s: Getting new buffer_index for retry of readdir..\n", | ||
207 | __func__); | ||
208 | readdir_index_put(bufmap, buffer_index); | ||
209 | goto get_new_buffer_index; | ||
210 | } | ||
211 | |||
212 | if (ret == -EIO && op_state_purged(new_op)) { | ||
213 | gossip_err("%s: Client is down. Aborting readdir call.\n", | ||
214 | __func__); | ||
215 | readdir_index_put(bufmap, buffer_index); | ||
216 | goto out_free_op; | ||
217 | } | ||
218 | |||
219 | if (ret < 0 || new_op->downcall.status != 0) { | ||
220 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
221 | "Readdir request failed. Status:%d\n", | ||
222 | new_op->downcall.status); | ||
223 | readdir_index_put(bufmap, buffer_index); | ||
224 | if (ret >= 0) | ||
225 | ret = new_op->downcall.status; | ||
226 | goto out_free_op; | ||
227 | } | ||
228 | |||
229 | bytes_decoded = | ||
230 | readdir_handle_ctor(&rhandle, | ||
231 | new_op->downcall.trailer_buf, | ||
232 | buffer_index); | ||
233 | if (bytes_decoded < 0) { | ||
234 | gossip_err("pvfs2_readdir: Could not decode trailer buffer into a readdir response %d\n", | ||
235 | ret); | ||
236 | ret = bytes_decoded; | ||
237 | readdir_index_put(bufmap, buffer_index); | ||
238 | goto out_free_op; | ||
239 | } | ||
240 | |||
241 | if (bytes_decoded != new_op->downcall.trailer_size) { | ||
242 | gossip_err("pvfs2_readdir: # bytes decoded (%ld) != trailer size (%ld)\n", | ||
243 | bytes_decoded, | ||
244 | (long)new_op->downcall.trailer_size); | ||
245 | ret = -EINVAL; | ||
246 | goto out_destroy_handle; | ||
247 | } | ||
248 | |||
249 | if (pos == 0) { | ||
250 | ino = get_ino_from_khandle(dentry->d_inode); | ||
251 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
252 | "%s: calling dir_emit of \".\" with pos = %llu\n", | ||
253 | __func__, | ||
254 | llu(pos)); | ||
255 | ret = dir_emit(ctx, ".", 1, ino, DT_DIR); | ||
256 | if (ret < 0) | ||
257 | goto out_destroy_handle; | ||
258 | ctx->pos++; | ||
259 | gossip_ldebug(GOSSIP_DIR_DEBUG, | ||
260 | "%s: ctx->pos:%lld\n", | ||
261 | __func__, | ||
262 | lld(ctx->pos)); | ||
263 | pos++; | ||
264 | } | ||
265 | |||
266 | if (pos == 1) { | ||
267 | ino = get_parent_ino_from_dentry(dentry); | ||
268 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
269 | "%s: calling dir_emit of \"..\" with pos = %llu\n", | ||
270 | __func__, | ||
271 | llu(pos)); | ||
272 | ret = dir_emit(ctx, "..", 2, ino, DT_DIR); | ||
273 | if (ret < 0) | ||
274 | goto out_destroy_handle; | ||
275 | ctx->pos++; | ||
276 | gossip_ldebug(GOSSIP_DIR_DEBUG, | ||
277 | "%s: ctx->pos:%lld\n", | ||
278 | __func__, | ||
279 | lld(ctx->pos)); | ||
280 | pos++; | ||
281 | } | ||
282 | |||
283 | for (i = 0; i < rhandle.readdir_response.pvfs_dirent_outcount; i++) { | ||
284 | len = rhandle.readdir_response.dirent_array[i].d_length; | ||
285 | current_entry = rhandle.readdir_response.dirent_array[i].d_name; | ||
286 | current_ino = pvfs2_khandle_to_ino( | ||
287 | &(rhandle.readdir_response.dirent_array[i].khandle)); | ||
288 | |||
289 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
290 | "calling dir_emit for %s with len %d, pos %ld\n", | ||
291 | current_entry, | ||
292 | len, | ||
293 | (unsigned long)pos); | ||
294 | ret = | ||
295 | dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN); | ||
296 | if (ret < 0) { | ||
297 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
298 | "dir_emit() failed. ret:%d\n", | ||
299 | ret); | ||
300 | if (i < 2) { | ||
301 | gossip_err("dir_emit failed on one of the first two true PVFS directory entries.\n"); | ||
302 | gossip_err("Duplicate entries may appear.\n"); | ||
303 | } | ||
304 | buffer_full = 1; | ||
305 | break; | ||
306 | } | ||
307 | ctx->pos++; | ||
308 | gossip_ldebug(GOSSIP_DIR_DEBUG, | ||
309 | "%s: ctx->pos:%lld\n", | ||
310 | __func__, | ||
311 | lld(ctx->pos)); | ||
312 | |||
313 | pos++; | ||
314 | } | ||
315 | |||
316 | /* this means that all of the dir_emit calls succeeded */ | ||
317 | if (i == rhandle.readdir_response.pvfs_dirent_outcount) { | ||
318 | /* update token */ | ||
319 | *ptoken = rhandle.readdir_response.token; | ||
320 | } else { | ||
321 | /* this means a dir_emit call failed */ | ||
322 | if (rhandle.readdir_response.token == PVFS_READDIR_END) { | ||
323 | /* | ||
324 | * If PVFS hit end of directory, then there | ||
325 | * is no way to do math on the token that it | ||
326 | * returned. Instead we go by ctx->pos but | ||
327 | * back up to account for the artificial . | ||
328 | * and .. entries. | ||
329 | */ | ||
330 | ctx->pos -= 3; | ||
331 | } else { | ||
332 | /* | ||
333 | * this means a dir_emit call failed. !!! need to set | ||
334 | * back to previous ctx->pos, no middle value allowed | ||
335 | */ | ||
336 | pos -= (i - 1); | ||
337 | ctx->pos -= (i - 1); | ||
338 | } | ||
339 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
340 | "at least one dir_emit call failed. Setting ctx->pos to: %lld\n", | ||
341 | lld(ctx->pos)); | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * Did we hit the end of the directory? | ||
346 | */ | ||
347 | if (rhandle.readdir_response.token == PVFS_READDIR_END && | ||
348 | !buffer_full) { | ||
349 | gossip_debug(GOSSIP_DIR_DEBUG, "End of dir detected; setting ctx->pos to PVFS_READDIR_END.\n"); | ||
350 | ctx->pos = PVFS_READDIR_END; | ||
351 | } | ||
352 | |||
353 | gossip_debug(GOSSIP_DIR_DEBUG, | ||
354 | "pos = %llu, token = %llu" | ||
355 | ", ctx->pos should have been %lld\n", | ||
356 | llu(pos), | ||
357 | llu(*ptoken), | ||
358 | lld(ctx->pos)); | ||
359 | |||
360 | out_destroy_handle: | ||
361 | readdir_handle_dtor(bufmap, &rhandle); | ||
362 | out_free_op: | ||
363 | op_release(new_op); | ||
364 | gossip_debug(GOSSIP_DIR_DEBUG, "pvfs2_readdir returning %d\n", ret); | ||
365 | return ret; | ||
366 | } | ||
367 | |||
368 | static int pvfs2_dir_open(struct inode *inode, struct file *file) | ||
369 | { | ||
370 | __u64 *ptoken; | ||
371 | |||
372 | file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL); | ||
373 | if (!file->private_data) | ||
374 | return -ENOMEM; | ||
375 | |||
376 | ptoken = file->private_data; | ||
377 | *ptoken = PVFS_READDIR_START; | ||
378 | return 0; | ||
379 | } | ||
380 | |||
/*
 * Close a directory: flush any pending inode state, then free the
 * readdir token allocated in pvfs2_dir_open().
 */
static int pvfs2_dir_release(struct inode *inode, struct file *file)
{
	pvfs2_flush_inode(inode);
	kfree(file->private_data);
	return 0;
}
387 | |||
/** PVFS2 implementation of VFS directory operations */
const struct file_operations pvfs2_dir_operations = {
	.read = generic_read_dir,	/* reads on a directory return -EISDIR */
	.iterate = pvfs2_readdir,
	.open = pvfs2_dir_open,
	.release = pvfs2_dir_release,
};
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c new file mode 100644 index 000000000000..8e26f9fac289 --- /dev/null +++ b/fs/orangefs/file.c | |||
@@ -0,0 +1,1019 @@ | |||
1 | /* | ||
2 | * (C) 2001 Clemson University and The University of Chicago | ||
3 | * | ||
4 | * See COPYING in top-level directory. | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * Linux VFS file operations. | ||
9 | */ | ||
10 | |||
11 | #include "protocol.h" | ||
12 | #include "pvfs2-kernel.h" | ||
13 | #include "pvfs2-bufmap.h" | ||
14 | #include <linux/fs.h> | ||
15 | #include <linux/pagemap.h> | ||
16 | |||
/*
 * Mark an I/O op complete under its lock and wake the client-core
 * daemon waiting in the device file.  NOTE: after this fires, the
 * daemon may free the op, so the caller must not touch it again.
 */
#define wake_up_daemon_for_return(op)			\
do {							\
	spin_lock(&op->lock);				\
	op->io_completed = 1;				\
	spin_unlock(&op->lock);				\
	wake_up_interruptible(&op->io_completion_waitq);\
} while (0)
24 | |||
25 | /* | ||
26 | * Copy to client-core's address space from the buffers specified | ||
27 | * by the iovec upto total_size bytes. | ||
28 | * NOTE: the iovector can either contain addresses which | ||
29 | * can futher be kernel-space or user-space addresses. | ||
30 | * or it can pointers to struct page's | ||
31 | */ | ||
32 | static int precopy_buffers(struct pvfs2_bufmap *bufmap, | ||
33 | int buffer_index, | ||
34 | const struct iovec *vec, | ||
35 | unsigned long nr_segs, | ||
36 | size_t total_size, | ||
37 | int from_user) | ||
38 | { | ||
39 | int ret = 0; | ||
40 | |||
41 | /* | ||
42 | * copy data from application/kernel by pulling it out | ||
43 | * of the iovec. | ||
44 | */ | ||
45 | /* Are we copying from User Virtual Addresses? */ | ||
46 | if (from_user) | ||
47 | ret = pvfs_bufmap_copy_iovec_from_user( | ||
48 | bufmap, | ||
49 | buffer_index, | ||
50 | vec, | ||
51 | nr_segs, | ||
52 | total_size); | ||
53 | /* Are we copying from Kernel Virtual Addresses? */ | ||
54 | else | ||
55 | ret = pvfs_bufmap_copy_iovec_from_kernel( | ||
56 | bufmap, | ||
57 | buffer_index, | ||
58 | vec, | ||
59 | nr_segs, | ||
60 | total_size); | ||
61 | if (ret < 0) | ||
62 | gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", | ||
63 | __func__, | ||
64 | (long)ret); | ||
65 | return ret; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Copy from client-core's address space to the buffers specified | ||
70 | * by the iovec upto total_size bytes. | ||
71 | * NOTE: the iovector can either contain addresses which | ||
72 | * can futher be kernel-space or user-space addresses. | ||
73 | * or it can pointers to struct page's | ||
74 | */ | ||
75 | static int postcopy_buffers(struct pvfs2_bufmap *bufmap, | ||
76 | int buffer_index, | ||
77 | const struct iovec *vec, | ||
78 | int nr_segs, | ||
79 | size_t total_size, | ||
80 | int to_user) | ||
81 | { | ||
82 | int ret = 0; | ||
83 | |||
84 | /* | ||
85 | * copy data to application/kernel by pushing it out to | ||
86 | * the iovec. NOTE; target buffers can be addresses or | ||
87 | * struct page pointers. | ||
88 | */ | ||
89 | if (total_size) { | ||
90 | /* Are we copying to User Virtual Addresses? */ | ||
91 | if (to_user) | ||
92 | ret = pvfs_bufmap_copy_to_user_iovec( | ||
93 | bufmap, | ||
94 | buffer_index, | ||
95 | vec, | ||
96 | nr_segs, | ||
97 | total_size); | ||
98 | /* Are we copying to Kern Virtual Addresses? */ | ||
99 | else | ||
100 | ret = pvfs_bufmap_copy_to_kernel_iovec( | ||
101 | bufmap, | ||
102 | buffer_index, | ||
103 | vec, | ||
104 | nr_segs, | ||
105 | total_size); | ||
106 | if (ret < 0) | ||
107 | gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", | ||
108 | __func__, | ||
109 | (long)ret); | ||
110 | } | ||
111 | return ret; | ||
112 | } | ||
113 | |||
/*
 * Post and wait for the I/O upcall to finish.
 *
 * Stages one bufmap-sized transfer through the shared memory region:
 *   1. (writes) copy caller data into the shared buffer,
 *   2. post the op to client-core and wait,
 *   3. (reads) copy the result back out to the caller's iovec.
 *
 * Returns bytes completed, or a negative errno.  On success the op is
 * handed back to the daemon (which frees it); on error it is released
 * here.  *offset is NOT advanced — callers do that themselves.
 */
static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
		loff_t *offset, struct iovec *vec, unsigned long nr_segs,
		size_t total_size, loff_t readahead_size, int to_user)
{
	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
	struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
	struct pvfs2_bufmap *bufmap = NULL;
	struct pvfs2_kernel_op_s *new_op = NULL;
	int buffer_index = -1;	/* -1 means "no shared buffer held" (see out:) */
	ssize_t ret;

	new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
	if (!new_op) {
		ret = -ENOMEM;
		goto out;
	}
	/* synchronous I/O */
	new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
	new_op->upcall.req.io.readahead_size = readahead_size;
	new_op->upcall.req.io.io_type = type;
	new_op->upcall.req.io.refn = pvfs2_inode->refn;

populate_shared_memory:
	/* get a shared buffer index */
	ret = pvfs_bufmap_get(&bufmap, &buffer_index);
	if (ret < 0) {
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s: pvfs_bufmap_get failure (%ld)\n",
			     __func__, (long)ret);
		goto out;
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): GET op %p -> buffer_index %d\n",
		     __func__,
		     handle,
		     new_op,
		     buffer_index);

	new_op->uses_shared_memory = 1;
	new_op->upcall.req.io.buf_index = buffer_index;
	new_op->upcall.req.io.count = total_size;
	new_op->upcall.req.io.offset = *offset;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): copy_to_user %d nr_segs %lu, offset: %llu total_size: %zd\n",
		     __func__,
		     handle,
		     to_user,
		     nr_segs,
		     llu(*offset),
		     total_size);
	/*
	 * Stage 1: copy the buffers into client-core's address space
	 * precopy_buffers only pertains to writes.
	 */
	if (type == PVFS_IO_WRITE) {
		ret = precopy_buffers(bufmap,
				      buffer_index,
				      vec,
				      nr_segs,
				      total_size,
				      to_user);
		if (ret < 0)
			goto out;
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Calling post_io_request with tag (%llu)\n",
		     __func__,
		     handle,
		     llu(new_op->tag));

	/* Stage 2: Service the I/O operation */
	ret = service_operation(new_op,
				type == PVFS_IO_WRITE ?
					"file_write" :
					"file_read",
				get_interruptible_flag(inode));

	/*
	 * If service_operation() returns -EAGAIN #and# the operation was
	 * purged from pvfs2_request_list or htable_ops_in_progress, then
	 * we know that the client was restarted, causing the shared memory
	 * area to be wiped clean. To restart a write operation in this
	 * case, we must re-copy the data from the user's iovec to a NEW
	 * shared memory location. To restart a read operation, we must get
	 * a new shared memory location.
	 */
	if (ret == -EAGAIN && op_state_purged(new_op)) {
		/* drop the stale buffer; the retry path re-acquires one */
		pvfs_bufmap_put(bufmap, buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s:going to repopulate_shared_memory.\n",
			     __func__);
		goto populate_shared_memory;
	}

	if (ret < 0) {
		handle_io_error();	/* defined in pvfs2-kernel.h */
		/*
		   don't write an error to syslog on signaled operation
		   termination unless we've got debugging turned on, as
		   this can happen regularly (i.e. ctrl-c)
		 */
		if (ret == -EINTR)
			gossip_debug(GOSSIP_FILE_DEBUG,
				     "%s: returning error %ld\n", __func__,
				     (long)ret);
		else
			gossip_err("%s: error in %s handle %pU, returning %zd\n",
				   __func__,
				   type == PVFS_IO_READ ?
					"read from" : "write to",
				   handle, ret);
		goto out;
	}

	/*
	 * Stage 3: Post copy buffers from client-core's address space
	 * postcopy_buffers only pertains to reads.
	 */
	if (type == PVFS_IO_READ) {
		ret = postcopy_buffers(bufmap,
				       buffer_index,
				       vec,
				       nr_segs,
				       new_op->downcall.resp.io.amt_complete,
				       to_user);
		if (ret < 0) {
			/*
			 * put error codes in downcall so that handle_io_error()
			 * preserves it properly
			 */
			new_op->downcall.status = ret;
			handle_io_error();
			goto out;
		}
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Amount written as returned by the sys-io call:%d\n",
		     __func__,
		     handle,
		     (int)new_op->downcall.resp.io.amt_complete);

	ret = new_op->downcall.resp.io.amt_complete;

	/*
	   tell the device file owner waiting on I/O that this read has
	   completed and it can return now.  in this exact case, on
	   wakeup the daemon will free the op, so we *cannot* touch it
	   after this.
	 */
	wake_up_daemon_for_return(new_op);
	new_op = NULL;

out:
	if (buffer_index >= 0) {
		pvfs_bufmap_put(bufmap, buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): PUT buffer_index %d\n",
			     __func__, handle, buffer_index);
		buffer_index = -1;
	}
	if (new_op) {
		op_release(new_op);
		new_op = NULL;
	}
	return ret;
}
285 | |||
/*
 * The reason we need to do this is to be able to support readv and writev
 * that are larger than (pvfs_bufmap_size_query()) Default is
 * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
 * create a new io vec descriptor for those memory addresses that
 * go beyond the limit. Return value for this routine is negative in case
 * of errors and 0 in case of success.
 *
 * Further, the new_nr_segs pointer is updated to hold the new value
 * of number of iovecs, the new_vec pointer is updated to hold the pointer
 * to the new split iovec, and the size array is an array of integers holding
 * the number of iovecs that straddle pvfs_bufmap_size_query().
 * The max_new_nr_segs value is computed by the caller and returned.
 * (It will be (count of all iov_len/ block_size) + 1).
 *
 * On success the caller owns (and must kfree) *new_vec and *seg_array.
 * On failure all allocations are released here and the OUT parameters
 * are left zeroed/NULL.
 */
static int split_iovecs(unsigned long max_new_nr_segs,		/* IN */
			unsigned long nr_segs,			/* IN */
			const struct iovec *original_iovec,	/* IN */
			unsigned long *new_nr_segs,		/* OUT */
			struct iovec **new_vec,			/* OUT */
			unsigned long *seg_count,		/* OUT */
			unsigned long **seg_array)		/* OUT */
{
	unsigned long seg;
	unsigned long count = 0;	/* bytes accumulated in current block */
	unsigned long begin_seg;
	unsigned long tmpnew_nr_segs = 0;
	struct iovec *new_iovec = NULL;
	struct iovec *orig_iovec;	/* scratch copy; mutated as we split */
	unsigned long *sizes = NULL;	/* per-block count of output segments */
	unsigned long sizes_count = 0;

	if (nr_segs <= 0 ||
	    original_iovec == NULL ||
	    new_nr_segs == NULL ||
	    new_vec == NULL ||
	    seg_count == NULL ||
	    seg_array == NULL ||
	    max_new_nr_segs <= 0) {
		gossip_err("Invalid parameters to split_iovecs\n");
		return -EINVAL;
	}
	*new_nr_segs = 0;
	*new_vec = NULL;
	*seg_count = 0;
	*seg_array = NULL;
	/* copy the passed in iovec descriptor to a temp structure */
	orig_iovec = kmalloc_array(nr_segs,
				   sizeof(*orig_iovec),
				   PVFS2_BUFMAP_GFP_FLAGS);
	if (orig_iovec == NULL) {
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(nr_segs * sizeof(*orig_iovec)));
		return -ENOMEM;
	}
	new_iovec = kcalloc(max_new_nr_segs,
			    sizeof(*new_iovec),
			    PVFS2_BUFMAP_GFP_FLAGS);
	if (new_iovec == NULL) {
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
		return -ENOMEM;
	}
	sizes = kcalloc(max_new_nr_segs,
			sizeof(*sizes),
			PVFS2_BUFMAP_GFP_FLAGS);
	if (sizes == NULL) {
		kfree(new_iovec);
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
		return -ENOMEM;
	}
	/* copy the passed in iovec to a temp structure */
	memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
	begin_seg = 0;
repeat:
	/*
	 * Walk the (mutated) source iovec from begin_seg.  Segments that
	 * fit in the current block are copied through; a segment that
	 * would cross the bufmap boundary is cut at the boundary, the
	 * remainder is left in orig_iovec[seg], and we restart the walk
	 * at that segment for the next block.
	 */
	for (seg = begin_seg; seg < nr_segs; seg++) {
		if (tmpnew_nr_segs >= max_new_nr_segs ||
		    sizes_count >= max_new_nr_segs) {
			kfree(sizes);
			kfree(orig_iovec);
			kfree(new_iovec);
			gossip_err
			    ("split_iovecs: exceeded the index limit (%lu)\n",
			    tmpnew_nr_segs);
			return -EINVAL;
		}
		if (count + orig_iovec[seg].iov_len <
		    pvfs_bufmap_size_query()) {
			count += orig_iovec[seg].iov_len;
			memcpy(&new_iovec[tmpnew_nr_segs],
			       &orig_iovec[seg],
			       sizeof(*new_iovec));
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
		} else {
			/* split this segment at the bufmap boundary */
			new_iovec[tmpnew_nr_segs].iov_base =
				orig_iovec[seg].iov_base;
			new_iovec[tmpnew_nr_segs].iov_len =
				(pvfs_bufmap_size_query() - count);
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
			sizes_count++;
			begin_seg = seg;
			orig_iovec[seg].iov_base +=
				(pvfs_bufmap_size_query() - count);
			orig_iovec[seg].iov_len -=
				(pvfs_bufmap_size_query() - count);
			count = 0;
			break;
		}
	}
	/* loop ran to completion only when every segment was consumed */
	if (seg != nr_segs)
		goto repeat;
	else
		sizes_count++;

	*new_nr_segs = tmpnew_nr_segs;
	/* new_iovec is freed by the caller */
	*new_vec = new_iovec;
	*seg_count = sizes_count;
	/* seg_array is also freed by the caller */
	*seg_array = sizes;
	kfree(orig_iovec);
	return 0;
}
417 | |||
/*
 * Compute an upper bound on the number of iovec segments that
 * split_iovecs() may produce for this request, and return the total
 * byte count of the request through *total_count.
 *
 * Returns the bound (>= nr_segs), or -EINVAL if the summed lengths
 * overflow ssize_t.
 */
static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
			     ssize_t *total_count)
{
	unsigned long i;
	long max_nr_iovecs;
	ssize_t total;	/* running total, folded modulo the bufmap size */
	ssize_t count;	/* true byte total of the whole request */

	total = 0;
	count = 0;
	max_nr_iovecs = 0;
	for (i = 0; i < nr_segs; i++) {
		const struct iovec *iv = &curr[i];

		count += iv->iov_len;
		/*
		 * overflow check: if either the running sum or this
		 * length has the sign bit set when viewed as ssize_t,
		 * the request is too large to represent.
		 */
		if (unlikely((ssize_t) (count | iv->iov_len) < 0))
			return -EINVAL;
		if (total + iv->iov_len < pvfs_bufmap_size_query()) {
			total += iv->iov_len;
			max_nr_iovecs++;
		} else {
			/* segment straddles one or more block boundaries */
			total =
			    (total + iv->iov_len - pvfs_bufmap_size_query());
			max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
		}
	}
	*total_count = count;
	return max_nr_iovecs;
}
447 | |||
448 | /* | ||
449 | * Common entry point for read/write/readv/writev | ||
450 | * This function will dispatch it to either the direct I/O | ||
451 | * or buffered I/O path depending on the mount options and/or | ||
452 | * augmented/extended metadata attached to the file. | ||
453 | * Note: File extended attributes override any mount options. | ||
454 | */ | ||
455 | static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file, | ||
456 | loff_t *offset, const struct iovec *iov, unsigned long nr_segs) | ||
457 | { | ||
458 | struct inode *inode = file->f_mapping->host; | ||
459 | struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode); | ||
460 | struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle; | ||
461 | ssize_t ret; | ||
462 | ssize_t total_count; | ||
463 | unsigned int to_free; | ||
464 | size_t count; | ||
465 | unsigned long seg; | ||
466 | unsigned long new_nr_segs = 0; | ||
467 | unsigned long max_new_nr_segs = 0; | ||
468 | unsigned long seg_count = 0; | ||
469 | unsigned long *seg_array = NULL; | ||
470 | struct iovec *iovecptr = NULL; | ||
471 | struct iovec *ptr = NULL; | ||
472 | |||
473 | total_count = 0; | ||
474 | ret = -EINVAL; | ||
475 | count = 0; | ||
476 | to_free = 0; | ||
477 | |||
478 | /* Compute total and max number of segments after split */ | ||
479 | max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count); | ||
480 | if (max_new_nr_segs < 0) { | ||
481 | gossip_lerr("%s: could not bound iovec %lu\n", | ||
482 | __func__, | ||
483 | max_new_nr_segs); | ||
484 | goto out; | ||
485 | } | ||
486 | |||
487 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
488 | "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", | ||
489 | __func__, | ||
490 | handle, | ||
491 | (int)count); | ||
492 | |||
493 | if (type == PVFS_IO_WRITE) { | ||
494 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
495 | "%s(%pU): proceeding with offset : %llu, " | ||
496 | "size %d\n", | ||
497 | __func__, | ||
498 | handle, | ||
499 | llu(*offset), | ||
500 | (int)count); | ||
501 | } | ||
502 | |||
503 | if (count == 0) { | ||
504 | ret = 0; | ||
505 | goto out; | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * if the total size of data transfer requested is greater than | ||
510 | * the kernel-set blocksize of PVFS2, then we split the iovecs | ||
511 | * such that no iovec description straddles a block size limit | ||
512 | */ | ||
513 | |||
514 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
515 | "%s: pvfs_bufmap_size:%d\n", | ||
516 | __func__, | ||
517 | pvfs_bufmap_size_query()); | ||
518 | |||
519 | if (count > pvfs_bufmap_size_query()) { | ||
520 | /* | ||
521 | * Split up the given iovec description such that | ||
522 | * no iovec descriptor straddles over the block-size limitation. | ||
523 | * This makes us our job easier to stage the I/O. | ||
524 | * In addition, this function will also compute an array | ||
525 | * with seg_count entries that will store the number of | ||
526 | * segments that straddle the block-size boundaries. | ||
527 | */ | ||
528 | ret = split_iovecs(max_new_nr_segs, /* IN */ | ||
529 | nr_segs, /* IN */ | ||
530 | iov, /* IN */ | ||
531 | &new_nr_segs, /* OUT */ | ||
532 | &iovecptr, /* OUT */ | ||
533 | &seg_count, /* OUT */ | ||
534 | &seg_array); /* OUT */ | ||
535 | if (ret < 0) { | ||
536 | gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n", | ||
537 | __func__, | ||
538 | ret); | ||
539 | goto out; | ||
540 | } | ||
541 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
542 | "%s: Splitting iovecs from %lu to %lu" | ||
543 | " [max_new %lu]\n", | ||
544 | __func__, | ||
545 | nr_segs, | ||
546 | new_nr_segs, | ||
547 | max_new_nr_segs); | ||
548 | /* We must free seg_array and iovecptr */ | ||
549 | to_free = 1; | ||
550 | } else { | ||
551 | new_nr_segs = nr_segs; | ||
552 | /* use the given iovec description */ | ||
553 | iovecptr = (struct iovec *)iov; | ||
554 | /* There is only 1 element in the seg_array */ | ||
555 | seg_count = 1; | ||
556 | /* and its value is the number of segments passed in */ | ||
557 | seg_array = &nr_segs; | ||
558 | /* We dont have to free up anything */ | ||
559 | to_free = 0; | ||
560 | } | ||
561 | ptr = iovecptr; | ||
562 | |||
563 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
564 | "%s(%pU) %zd@%llu\n", | ||
565 | __func__, | ||
566 | handle, | ||
567 | count, | ||
568 | llu(*offset)); | ||
569 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
570 | "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n", | ||
571 | __func__, | ||
572 | handle, | ||
573 | new_nr_segs, seg_count); | ||
574 | |||
575 | /* PVFS2_KERNEL_DEBUG is a CFLAGS define. */ | ||
576 | #ifdef PVFS2_KERNEL_DEBUG | ||
577 | for (seg = 0; seg < new_nr_segs; seg++) | ||
578 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
579 | "%s: %d) %p to %p [%d bytes]\n", | ||
580 | __func__, | ||
581 | (int)seg + 1, | ||
582 | iovecptr[seg].iov_base, | ||
583 | iovecptr[seg].iov_base + iovecptr[seg].iov_len, | ||
584 | (int)iovecptr[seg].iov_len); | ||
585 | for (seg = 0; seg < seg_count; seg++) | ||
586 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
587 | "%s: %zd) %lu\n", | ||
588 | __func__, | ||
589 | seg + 1, | ||
590 | seg_array[seg]); | ||
591 | #endif | ||
592 | seg = 0; | ||
593 | while (total_count < count) { | ||
594 | size_t each_count; | ||
595 | size_t amt_complete; | ||
596 | |||
597 | /* how much to transfer in this loop iteration */ | ||
598 | each_count = | ||
599 | (((count - total_count) > pvfs_bufmap_size_query()) ? | ||
600 | pvfs_bufmap_size_query() : | ||
601 | (count - total_count)); | ||
602 | |||
603 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
604 | "%s(%pU): size of each_count(%d)\n", | ||
605 | __func__, | ||
606 | handle, | ||
607 | (int)each_count); | ||
608 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
609 | "%s(%pU): BEFORE wait_for_io: offset is %d\n", | ||
610 | __func__, | ||
611 | handle, | ||
612 | (int)*offset); | ||
613 | |||
614 | ret = wait_for_direct_io(type, inode, offset, ptr, | ||
615 | seg_array[seg], each_count, 0, 1); | ||
616 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
617 | "%s(%pU): return from wait_for_io:%d\n", | ||
618 | __func__, | ||
619 | handle, | ||
620 | (int)ret); | ||
621 | |||
622 | if (ret < 0) | ||
623 | goto out; | ||
624 | |||
625 | /* advance the iovec pointer */ | ||
626 | ptr += seg_array[seg]; | ||
627 | seg++; | ||
628 | *offset += ret; | ||
629 | total_count += ret; | ||
630 | amt_complete = ret; | ||
631 | |||
632 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
633 | "%s(%pU): AFTER wait_for_io: offset is %d\n", | ||
634 | __func__, | ||
635 | handle, | ||
636 | (int)*offset); | ||
637 | |||
638 | /* | ||
639 | * if we got a short I/O operations, | ||
640 | * fall out and return what we got so far | ||
641 | */ | ||
642 | if (amt_complete < each_count) | ||
643 | break; | ||
644 | } /*end while */ | ||
645 | |||
646 | if (total_count > 0) | ||
647 | ret = total_count; | ||
648 | out: | ||
649 | if (to_free) { | ||
650 | kfree(iovecptr); | ||
651 | kfree(seg_array); | ||
652 | } | ||
653 | if (ret > 0) { | ||
654 | if (type == PVFS_IO_READ) { | ||
655 | file_accessed(file); | ||
656 | } else { | ||
657 | SetMtimeFlag(pvfs2_inode); | ||
658 | inode->i_mtime = CURRENT_TIME; | ||
659 | mark_inode_dirty_sync(inode); | ||
660 | } | ||
661 | } | ||
662 | |||
663 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
664 | "%s(%pU): Value(%d) returned.\n", | ||
665 | __func__, | ||
666 | handle, | ||
667 | (int)ret); | ||
668 | |||
669 | return ret; | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * Read data from a specified offset in a file (referenced by inode). | ||
674 | * Data may be placed either in a user or kernel buffer. | ||
675 | */ | ||
676 | ssize_t pvfs2_inode_read(struct inode *inode, | ||
677 | char __user *buf, | ||
678 | size_t count, | ||
679 | loff_t *offset, | ||
680 | loff_t readahead_size) | ||
681 | { | ||
682 | struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode); | ||
683 | size_t bufmap_size; | ||
684 | struct iovec vec; | ||
685 | ssize_t ret = -EINVAL; | ||
686 | |||
687 | g_pvfs2_stats.reads++; | ||
688 | |||
689 | vec.iov_base = buf; | ||
690 | vec.iov_len = count; | ||
691 | |||
692 | bufmap_size = pvfs_bufmap_size_query(); | ||
693 | if (count > bufmap_size) { | ||
694 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
695 | "%s: count is too large (%zd/%zd)!\n", | ||
696 | __func__, count, bufmap_size); | ||
697 | return -EINVAL; | ||
698 | } | ||
699 | |||
700 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
701 | "%s(%pU) %zd@%llu\n", | ||
702 | __func__, | ||
703 | &pvfs2_inode->refn.khandle, | ||
704 | count, | ||
705 | llu(*offset)); | ||
706 | |||
707 | ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1, | ||
708 | count, readahead_size, 0); | ||
709 | if (ret > 0) | ||
710 | *offset += ret; | ||
711 | |||
712 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
713 | "%s(%pU): Value(%zd) returned.\n", | ||
714 | __func__, | ||
715 | &pvfs2_inode->refn.khandle, | ||
716 | ret); | ||
717 | |||
718 | return ret; | ||
719 | } | ||
720 | |||
721 | static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | ||
722 | { | ||
723 | struct file *file = iocb->ki_filp; | ||
724 | loff_t pos = *(&iocb->ki_pos); | ||
725 | ssize_t rc = 0; | ||
726 | unsigned long nr_segs = iter->nr_segs; | ||
727 | |||
728 | BUG_ON(iocb->private); | ||
729 | |||
730 | gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n"); | ||
731 | |||
732 | g_pvfs2_stats.reads++; | ||
733 | |||
734 | rc = do_readv_writev(PVFS_IO_READ, | ||
735 | file, | ||
736 | &pos, | ||
737 | iter->iov, | ||
738 | nr_segs); | ||
739 | iocb->ki_pos = pos; | ||
740 | |||
741 | return rc; | ||
742 | } | ||
743 | |||
/*
 * ->write_iter handler: serialize writers on i_mutex, refresh the
 * inode size for O_APPEND, run the generic VFS write checks, then
 * push the iovecs through the common read/write path.
 */
static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = *(&iocb->ki_pos);
	unsigned long nr_segs = iter->nr_segs;
	ssize_t rc;

	BUG_ON(iocb->private);

	gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");

	mutex_lock(&file->f_mapping->host->i_mutex);

	/* Make sure generic_write_checks sees an up to date inode size. */
	if (file->f_flags & O_APPEND) {
		rc = pvfs2_inode_getattr(file->f_mapping->host,
					 PVFS_ATTR_SYS_SIZE);
		if (rc) {
			gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
				   __func__, rc);
			goto out;
		}
	}

	if (file->f_pos > i_size_read(file->f_mapping->host))
		pvfs2_i_size_write(file->f_mapping->host, file->f_pos);

	rc = generic_write_checks(iocb, iter);

	if (rc <= 0) {
		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
			   __func__, rc);
		goto out;
	}

	/*
	 * NOTE(review): 'pos' was sampled from iocb->ki_pos before
	 * generic_write_checks(), which may itself move ki_pos for
	 * O_APPEND — confirm the two cannot diverge here.
	 */
	rc = do_readv_writev(PVFS_IO_WRITE,
			     file,
			     &pos,
			     iter->iov,
			     nr_segs);
	if (rc < 0) {
		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
			   __func__, rc);
		goto out;
	}

	iocb->ki_pos = pos;
	g_pvfs2_stats.writes++;

out:

	mutex_unlock(&file->f_mapping->host->i_mutex);
	return rc;
}
798 | |||
/*
 * Perform a miscellaneous operation on a file.
 *
 * Only FS_IOC_GETFLAGS / FS_IOC_SETFLAGS are understood; the flag
 * word is stored server-side in the "user.pvfs2.meta_hint" extended
 * attribute.  Anything else returns -ENOTTY.
 */
long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret = -ENOTTY;
	__u64 val = 0;
	unsigned long uval;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "pvfs2_ioctl: called with cmd %d\n",
		     cmd);

	/*
	 * we understand some general ioctls on files, such as the immutable
	 * and append flags
	 */
	if (cmd == FS_IOC_GETFLAGS) {
		val = 0;
		/* absent xattr simply means "no flags set" */
		ret = pvfs2_xattr_get_default(file->f_path.dentry,
					      "user.pvfs2.meta_hint",
					      &val,
					      sizeof(val),
					      0);
		if (ret < 0 && ret != -ENODATA)
			return ret;
		else if (ret == -ENODATA)
			val = 0;
		uval = val;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
			     (unsigned long long)uval);
		/*
		 * NOTE(review): uval is unsigned long but is written
		 * through an int __user pointer (matching the generic
		 * GETFLAGS ABI) — confirm the intended width.
		 */
		return put_user(uval, (int __user *)arg);
	} else if (cmd == FS_IOC_SETFLAGS) {
		ret = 0;
		if (get_user(uval, (int __user *)arg))
			return -EFAULT;
		/*
		 * PVFS_MIRROR_FL is set internally when the mirroring mode
		 * is turned on for a file. The user is not allowed to turn
		 * on this bit, but the bit is present if the user first gets
		 * the flags and then updates the flags with some new
		 * settings. So, we ignore it in the following edit. bligon.
		 */
		if ((uval & ~PVFS_MIRROR_FL) &
		    (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
			gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
			return -EINVAL;
		}
		val = uval;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
			     (unsigned long long)val);
		ret = pvfs2_xattr_set_default(file->f_path.dentry,
					      "user.pvfs2.meta_hint",
					      &val,
					      sizeof(val),
					      0,
					      0);
	}

	return ret;
}
862 | |||
863 | /* | ||
864 | * Memory map a region of a file. | ||
865 | */ | ||
866 | static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma) | ||
867 | { | ||
868 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
869 | "pvfs2_file_mmap: called on %s\n", | ||
870 | (file ? | ||
871 | (char *)file->f_path.dentry->d_name.name : | ||
872 | (char *)"Unknown")); | ||
873 | |||
874 | /* set the sequential readahead hint */ | ||
875 | vma->vm_flags |= VM_SEQ_READ; | ||
876 | vma->vm_flags &= ~VM_RAND_READ; | ||
877 | return generic_file_mmap(file, vma); | ||
878 | } | ||
879 | |||
/* number of pages currently cached for an address_space */
#define mapping_nrpages(idata) ((idata)->nrpages)

/*
 * Called to notify the module that there are no more references to
 * this file (i.e. no processes have it open).
 *
 * \note Not called when each file is closed.
 */
int pvfs2_file_release(struct inode *inode, struct file *file)
{
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "pvfs2_file_release: called on %s\n",
		     file->f_path.dentry->d_name.name);

	pvfs2_flush_inode(inode);

	/*
	   remove all associated inode pages from the page cache and mmap
	   readahead cache (if any); this forces an expensive refresh of
	   data for the next caller of mmap (or 'get_block' accesses)
	 */
	if (file->f_path.dentry->d_inode &&
	    file->f_path.dentry->d_inode->i_mapping &&
	    mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
		truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
				     0);
	return 0;
}
908 | |||
909 | /* | ||
910 | * Push all data for a specific file onto permanent storage. | ||
911 | */ | ||
912 | int pvfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync) | ||
913 | { | ||
914 | int ret = -EINVAL; | ||
915 | struct pvfs2_inode_s *pvfs2_inode = | ||
916 | PVFS2_I(file->f_path.dentry->d_inode); | ||
917 | struct pvfs2_kernel_op_s *new_op = NULL; | ||
918 | |||
919 | /* required call */ | ||
920 | filemap_write_and_wait_range(file->f_mapping, start, end); | ||
921 | |||
922 | new_op = op_alloc(PVFS2_VFS_OP_FSYNC); | ||
923 | if (!new_op) | ||
924 | return -ENOMEM; | ||
925 | new_op->upcall.req.fsync.refn = pvfs2_inode->refn; | ||
926 | |||
927 | ret = service_operation(new_op, | ||
928 | "pvfs2_fsync", | ||
929 | get_interruptible_flag(file->f_path.dentry->d_inode)); | ||
930 | |||
931 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
932 | "pvfs2_fsync got return value of %d\n", | ||
933 | ret); | ||
934 | |||
935 | op_release(new_op); | ||
936 | |||
937 | pvfs2_flush_inode(file->f_path.dentry->d_inode); | ||
938 | return ret; | ||
939 | } | ||
940 | |||
941 | /* | ||
942 | * Change the file pointer position for an instance of an open file. | ||
943 | * | ||
944 | * \note If .llseek is overriden, we must acquire lock as described in | ||
945 | * Documentation/filesystems/Locking. | ||
946 | * | ||
947 | * Future upgrade could support SEEK_DATA and SEEK_HOLE but would | ||
948 | * require much changes to the FS | ||
949 | */ | ||
950 | loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin) | ||
951 | { | ||
952 | int ret = -EINVAL; | ||
953 | struct inode *inode = file->f_path.dentry->d_inode; | ||
954 | |||
955 | if (!inode) { | ||
956 | gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n"); | ||
957 | return ret; | ||
958 | } | ||
959 | |||
960 | if (origin == PVFS2_SEEK_END) { | ||
961 | /* | ||
962 | * revalidate the inode's file size. | ||
963 | * NOTE: We are only interested in file size here, | ||
964 | * so we set mask accordingly. | ||
965 | */ | ||
966 | ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE); | ||
967 | if (ret) { | ||
968 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
969 | "%s:%s:%d calling make bad inode\n", | ||
970 | __FILE__, | ||
971 | __func__, | ||
972 | __LINE__); | ||
973 | pvfs2_make_bad_inode(inode); | ||
974 | return ret; | ||
975 | } | ||
976 | } | ||
977 | |||
978 | gossip_debug(GOSSIP_FILE_DEBUG, | ||
979 | "pvfs2_file_llseek: offset is %ld | origin is %d | " | ||
980 | "inode size is %lu\n", | ||
981 | (long)offset, | ||
982 | origin, | ||
983 | (unsigned long)file->f_path.dentry->d_inode->i_size); | ||
984 | |||
985 | return generic_file_llseek(file, offset, origin); | ||
986 | } | ||
987 | |||
988 | /* | ||
989 | * Support local locks (locks that only this kernel knows about) | ||
990 | * if Orangefs was mounted -o local_lock. | ||
991 | */ | ||
992 | int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl) | ||
993 | { | ||
994 | int rc = -ENOLCK; | ||
995 | |||
996 | if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) { | ||
997 | if (cmd == F_GETLK) { | ||
998 | rc = 0; | ||
999 | posix_test_lock(filp, fl); | ||
1000 | } else { | ||
1001 | rc = posix_lock_file(filp, fl, NULL); | ||
1002 | } | ||
1003 | } | ||
1004 | |||
1005 | return rc; | ||
1006 | } | ||
1007 | |||
/** PVFS2 implementation of VFS file operations */
const struct file_operations pvfs2_file_operations = {
	.llseek = pvfs2_file_llseek,		/* revalidates size for SEEK_END */
	.read_iter = pvfs2_file_read_iter,	/* direct I/O via shared bufmap */
	.write_iter = pvfs2_file_write_iter,	/* serialized on i_mutex */
	.lock = pvfs2_lock,			/* local locks, -o local_lock only */
	.unlocked_ioctl = pvfs2_ioctl,		/* GET/SETFLAGS via xattr */
	.mmap = pvfs2_file_mmap,		/* sets sequential-readahead hint */
	.open = generic_file_open,
	.release = pvfs2_file_release,		/* truncates cached pages */
	.fsync = pvfs2_fsync,			/* writeback + server fsync */
};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c new file mode 100644 index 000000000000..feda00fcdd7d --- /dev/null +++ b/fs/orangefs/inode.c | |||
@@ -0,0 +1,469 @@ | |||
1 | /* | ||
2 | * (C) 2001 Clemson University and The University of Chicago | ||
3 | * | ||
4 | * See COPYING in top-level directory. | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * Linux VFS inode operations. | ||
9 | */ | ||
10 | |||
11 | #include "protocol.h" | ||
12 | #include "pvfs2-kernel.h" | ||
13 | #include "pvfs2-bufmap.h" | ||
14 | |||
/*
 * Read one page of a file from the OrangeFS server into the page cache.
 *
 * Any tail of the page past the bytes actually read is zero-filled, the
 * page flags are set to reflect success or failure, and the page is
 * unlocked before returning, as ->readpage() requires.
 *
 * Returns 0 on success, or the negative value returned by
 * pvfs2_inode_read() on failure.
 */
static int read_one_page(struct page *page)
{
	void *page_data;
	int ret;
	int max_block;
	ssize_t bytes_read = 0;
	struct inode *inode = page->mapping->host;
	const __u32 blocksize = PAGE_CACHE_SIZE;	/* inode->i_blksize */
	const __u32 blockbits = PAGE_CACHE_SHIFT;	/* inode->i_blkbits */

	gossip_debug(GOSSIP_INODE_DEBUG,
		     "pvfs2_readpage called with page %p\n",
		     page);
	page_data = pvfs2_kmap(page);

	/* number of whole-or-partial blocks covered by the file size */
	max_block = ((inode->i_size / blocksize) + 1);

	/* pages wholly beyond EOF are not read; they stay zero-filled */
	if (page->index < max_block) {
		loff_t blockptr_offset = (((loff_t) page->index) << blockbits);

		bytes_read = pvfs2_inode_read(inode,
					      page_data,
					      blocksize,
					      &blockptr_offset,
					      inode->i_size);
	}
	/* only zero remaining unread portions of the page data */
	if (bytes_read > 0)
		memset(page_data + bytes_read, 0, blocksize - bytes_read);
	else
		memset(page_data, 0, blocksize);
	/* takes care of potential aliasing */
	flush_dcache_page(page);
	if (bytes_read < 0) {
		ret = bytes_read;
		SetPageError(page);
	} else {
		SetPageUptodate(page);
		if (PageError(page))
			ClearPageError(page);
		ret = 0;
	}
	pvfs2_kunmap(page);
	/* unlock the page after the ->readpage() routine completes */
	unlock_page(page);
	return ret;
}
62 | |||
/*
 * ->readpage() address_space operation: the file argument is unused,
 * all the work happens in read_one_page().
 */
static int pvfs2_readpage(struct file *file, struct page *page)
{
	return read_one_page(page);
}
67 | |||
68 | static int pvfs2_readpages(struct file *file, | ||
69 | struct address_space *mapping, | ||
70 | struct list_head *pages, | ||
71 | unsigned nr_pages) | ||
72 | { | ||
73 | int page_idx; | ||
74 | int ret; | ||
75 | |||
76 | gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_readpages called\n"); | ||
77 | |||
78 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | ||
79 | struct page *page; | ||
80 | |||
81 | page = list_entry(pages->prev, struct page, lru); | ||
82 | list_del(&page->lru); | ||
83 | if (!add_to_page_cache(page, | ||
84 | mapping, | ||
85 | page->index, | ||
86 | GFP_KERNEL)) { | ||
87 | ret = read_one_page(page); | ||
88 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
89 | "failure adding page to cache, read_one_page returned: %d\n", | ||
90 | ret); | ||
91 | } else { | ||
92 | page_cache_release(page); | ||
93 | } | ||
94 | } | ||
95 | BUG_ON(!list_empty(pages)); | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static void pvfs2_invalidatepage(struct page *page, | ||
100 | unsigned int offset, | ||
101 | unsigned int length) | ||
102 | { | ||
103 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
104 | "pvfs2_invalidatepage called on page %p " | ||
105 | "(offset is %u)\n", | ||
106 | page, | ||
107 | offset); | ||
108 | |||
109 | ClearPageUptodate(page); | ||
110 | ClearPageMappedToDisk(page); | ||
111 | return; | ||
112 | |||
113 | } | ||
114 | |||
/*
 * ->releasepage() address_space operation.
 *
 * NOTE(review): returning 0 tells the VM the page may NOT be released,
 * unconditionally -- confirm that refusing every release is intentional
 * here, since Orangefs attaches no private data to its pages.
 */
static int pvfs2_releasepage(struct page *page, gfp_t foo)
{
	gossip_debug(GOSSIP_INODE_DEBUG,
		     "pvfs2_releasepage called on page %p\n",
		     page);
	return 0;
}
122 | |||
/*
 * Having a direct_IO entry point in the address_space_operations
 * struct causes the kernel to allow us to use O_DIRECT on
 * open. Nothing will ever call this thing, but in the future we
 * will need to be able to use O_DIRECT on open in order to support
 * AIO. Modeled after NFS; they do this too.
 */
130 | /* | ||
131 | static ssize_t pvfs2_direct_IO(int rw, | ||
132 | struct kiocb *iocb, | ||
133 | struct iov_iter *iter, | ||
134 | loff_t offset) | ||
135 | { | ||
136 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
137 | "pvfs2_direct_IO: %s\n", | ||
138 | iocb->ki_filp->f_path.dentry->d_name.name); | ||
139 | |||
140 | return -EINVAL; | ||
141 | } | ||
142 | */ | ||
143 | |||
/*
 * Backing device info for Orangefs: readahead is disabled
 * (ra_pages = 0) and the VM is told not to account dirty pages
 * or perform writeback on our behalf.
 */
struct backing_dev_info pvfs2_backing_dev_info = {
	.name = "pvfs2",
	.ra_pages = 0,
	.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};
149 | |||
/** PVFS2 implementation of address space operations */
const struct address_space_operations pvfs2_address_operations = {
	.readpage = pvfs2_readpage,
	.readpages = pvfs2_readpages,
	.invalidatepage = pvfs2_invalidatepage,
	.releasepage = pvfs2_releasepage,
	/* .direct_IO = pvfs2_direct_IO */	/* placeholder; see comment above */
};
158 | |||
/*
 * Change the size of @inode to iattr->ia_size: trim the page cache and
 * in-core size locally, then send a truncate upcall to the server.
 *
 * Returns 0 on success or a negative errno from the upcall.
 */
static int pvfs2_setattr_size(struct inode *inode, struct iattr *iattr)
{
	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
	struct pvfs2_kernel_op_s *new_op;
	loff_t orig_size = i_size_read(inode);
	int ret = -EINVAL;

	gossip_debug(GOSSIP_INODE_DEBUG,
		     "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
		     __func__,
		     get_khandle_from_ino(inode),
		     &pvfs2_inode->refn.khandle,
		     pvfs2_inode->refn.fs_id,
		     iattr->ia_size);

	/*
	 * NOTE(review): the in-core size and page cache are updated
	 * before the server truncate is known to have succeeded; if the
	 * upcall below fails, the local and remote sizes can disagree --
	 * confirm this is the intended recovery model.
	 */
	truncate_setsize(inode, iattr->ia_size);

	new_op = op_alloc(PVFS2_VFS_OP_TRUNCATE);
	if (!new_op)
		return -ENOMEM;

	new_op->upcall.req.truncate.refn = pvfs2_inode->refn;
	new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;

	ret = service_operation(new_op, __func__,
				get_interruptible_flag(inode));

	/*
	 * the truncate has no downcall members to retrieve, but
	 * the status value tells us if it went through ok or not
	 */
	gossip_debug(GOSSIP_INODE_DEBUG,
		     "pvfs2: pvfs2_truncate got return value of %d\n",
		     ret);

	op_release(new_op);

	if (ret != 0)
		return ret;

	/*
	 * Only change the c/mtime if we are changing the size or we are
	 * explicitly asked to change it. This handles the semantic difference
	 * between truncate() and ftruncate() as implemented in the VFS.
	 *
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set. For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (orig_size != i_size_read(inode) &&
	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
		iattr->ia_ctime = iattr->ia_mtime =
			current_fs_time(inode->i_sb);
		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
	}

	return ret;
}
218 | |||
/*
 * Change attributes of an object referenced by dentry.
 *
 * Validates the request with inode_change_ok(), handles a size change
 * through pvfs2_setattr_size(), copies the remaining attributes into
 * the in-core inode, then pushes them to the server via
 * pvfs2_inode_setattr().  A mode change additionally refreshes the
 * POSIX ACLs.  Returns 0 on success or a negative errno.
 */
int pvfs2_setattr(struct dentry *dentry, struct iattr *iattr)
{
	int ret = -EINVAL;
	struct inode *inode = dentry->d_inode;

	gossip_debug(GOSSIP_INODE_DEBUG,
		     "pvfs2_setattr: called on %s\n",
		     dentry->d_name.name);

	ret = inode_change_ok(inode, iattr);
	if (ret)
		goto out;

	/* only truncate when the size actually changes */
	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(inode)) {
		ret = pvfs2_setattr_size(inode, iattr);
		if (ret)
			goto out;
	}

	setattr_copy(inode, iattr);
	mark_inode_dirty(inode);

	ret = pvfs2_inode_setattr(inode, iattr);
	gossip_debug(GOSSIP_INODE_DEBUG,
		     "pvfs2_setattr: inode_setattr returned %d\n",
		     ret);

	if (!ret && (iattr->ia_valid & ATTR_MODE))
		/* change mod on a file that has ACLs */
		ret = posix_acl_chmod(inode, inode->i_mode);

out:
	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_setattr: returning %d\n", ret);
	return ret;
}
258 | |||
/*
 * Obtain attributes of an object given a dentry.
 *
 * Refreshes all inode attributes from the server, fills @kstat from
 * the in-core inode, and overrides the reported block size with the
 * filesystem's own value.  On failure the inode is flagged as bad.
 */
int pvfs2_getattr(struct vfsmount *mnt,
		  struct dentry *dentry,
		  struct kstat *kstat)
{
	int ret = -ENOENT;
	struct inode *inode = dentry->d_inode;
	struct pvfs2_inode_s *pvfs2_inode = NULL;

	gossip_debug(GOSSIP_INODE_DEBUG,
		     "pvfs2_getattr: called on %s\n",
		     dentry->d_name.name);

	/*
	 * A getattr is expected to return fresh values for every
	 * field/attribute of the inode, so we don't have much choice
	 * but to refresh them all from the server.
	 */
	ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
	if (ret == 0) {
		generic_fillattr(inode, kstat);
		/* override block size reported to stat */
		pvfs2_inode = PVFS2_I(inode);
		kstat->blksize = pvfs2_inode->blksize;
	} else {
		/* assume an I/O error and flag inode as bad */
		gossip_debug(GOSSIP_INODE_DEBUG,
			     "%s:%s:%d calling make bad inode\n",
			     __FILE__,
			     __func__,
			     __LINE__);
		pvfs2_make_bad_inode(inode);
	}
	return ret;
}
296 | |||
/* PVFS2 implementation of VFS inode operations for files */
struct inode_operations pvfs2_file_inode_operations = {
	.get_acl = pvfs2_get_acl,
	.set_acl = pvfs2_set_acl,
	.setattr = pvfs2_setattr,
	.getattr = pvfs2_getattr,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.listxattr = pvfs2_listxattr,
	.removexattr = generic_removexattr,
};
308 | |||
309 | static int pvfs2_init_iops(struct inode *inode) | ||
310 | { | ||
311 | inode->i_mapping->a_ops = &pvfs2_address_operations; | ||
312 | |||
313 | switch (inode->i_mode & S_IFMT) { | ||
314 | case S_IFREG: | ||
315 | inode->i_op = &pvfs2_file_inode_operations; | ||
316 | inode->i_fop = &pvfs2_file_operations; | ||
317 | inode->i_blkbits = PAGE_CACHE_SHIFT; | ||
318 | break; | ||
319 | case S_IFLNK: | ||
320 | inode->i_op = &pvfs2_symlink_inode_operations; | ||
321 | break; | ||
322 | case S_IFDIR: | ||
323 | inode->i_op = &pvfs2_dir_inode_operations; | ||
324 | inode->i_fop = &pvfs2_dir_operations; | ||
325 | break; | ||
326 | default: | ||
327 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
328 | "%s: unsupported mode\n", | ||
329 | __func__); | ||
330 | return -EINVAL; | ||
331 | } | ||
332 | |||
333 | return 0; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Given a PVFS2 object identifier (fsid, handle), convert it into a ino_t type | ||
338 | * that will be used as a hash-index from where the handle will | ||
339 | * be searched for in the VFS hash table of inodes. | ||
340 | */ | ||
341 | static inline ino_t pvfs2_handle_hash(struct pvfs2_object_kref *ref) | ||
342 | { | ||
343 | if (!ref) | ||
344 | return 0; | ||
345 | return pvfs2_khandle_to_ino(&(ref->khandle)); | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * Called to set up an inode from iget5_locked. | ||
350 | */ | ||
351 | static int pvfs2_set_inode(struct inode *inode, void *data) | ||
352 | { | ||
353 | struct pvfs2_object_kref *ref = (struct pvfs2_object_kref *) data; | ||
354 | struct pvfs2_inode_s *pvfs2_inode = NULL; | ||
355 | |||
356 | /* Make sure that we have sane parameters */ | ||
357 | if (!data || !inode) | ||
358 | return 0; | ||
359 | pvfs2_inode = PVFS2_I(inode); | ||
360 | if (!pvfs2_inode) | ||
361 | return 0; | ||
362 | pvfs2_inode->refn.fs_id = ref->fs_id; | ||
363 | pvfs2_inode->refn.khandle = ref->khandle; | ||
364 | return 0; | ||
365 | } | ||
366 | |||
367 | /* | ||
368 | * Called to determine if handles match. | ||
369 | */ | ||
370 | static int pvfs2_test_inode(struct inode *inode, void *data) | ||
371 | { | ||
372 | struct pvfs2_object_kref *ref = (struct pvfs2_object_kref *) data; | ||
373 | struct pvfs2_inode_s *pvfs2_inode = NULL; | ||
374 | |||
375 | pvfs2_inode = PVFS2_I(inode); | ||
376 | return (!PVFS_khandle_cmp(&(pvfs2_inode->refn.khandle), &(ref->khandle)) | ||
377 | && pvfs2_inode->refn.fs_id == ref->fs_id); | ||
378 | } | ||
379 | |||
380 | /* | ||
381 | * Front-end to lookup the inode-cache maintained by the VFS using the PVFS2 | ||
382 | * file handle. | ||
383 | * | ||
384 | * @sb: the file system super block instance. | ||
385 | * @ref: The PVFS2 object for which we are trying to locate an inode structure. | ||
386 | */ | ||
387 | struct inode *pvfs2_iget(struct super_block *sb, struct pvfs2_object_kref *ref) | ||
388 | { | ||
389 | struct inode *inode = NULL; | ||
390 | unsigned long hash; | ||
391 | int error; | ||
392 | |||
393 | hash = pvfs2_handle_hash(ref); | ||
394 | inode = iget5_locked(sb, hash, pvfs2_test_inode, pvfs2_set_inode, ref); | ||
395 | if (!inode || !(inode->i_state & I_NEW)) | ||
396 | return inode; | ||
397 | |||
398 | error = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT); | ||
399 | if (error) { | ||
400 | iget_failed(inode); | ||
401 | return ERR_PTR(error); | ||
402 | } | ||
403 | |||
404 | inode->i_ino = hash; /* needed for stat etc */ | ||
405 | pvfs2_init_iops(inode); | ||
406 | unlock_new_inode(inode); | ||
407 | |||
408 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
409 | "iget handle %pU, fsid %d hash %ld i_ino %lu\n", | ||
410 | &ref->khandle, | ||
411 | ref->fs_id, | ||
412 | hash, | ||
413 | inode->i_ino); | ||
414 | |||
415 | return inode; | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * Allocate an inode for a newly created file and insert it into the inode hash. | ||
420 | */ | ||
421 | struct inode *pvfs2_new_inode(struct super_block *sb, struct inode *dir, | ||
422 | int mode, dev_t dev, struct pvfs2_object_kref *ref) | ||
423 | { | ||
424 | unsigned long hash = pvfs2_handle_hash(ref); | ||
425 | struct inode *inode; | ||
426 | int error; | ||
427 | |||
428 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
429 | "pvfs2_get_custom_inode_common: called\n" | ||
430 | "(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n", | ||
431 | sb, | ||
432 | MAJOR(dev), | ||
433 | MINOR(dev), | ||
434 | mode); | ||
435 | |||
436 | inode = new_inode(sb); | ||
437 | if (!inode) | ||
438 | return NULL; | ||
439 | |||
440 | pvfs2_set_inode(inode, ref); | ||
441 | inode->i_ino = hash; /* needed for stat etc */ | ||
442 | |||
443 | error = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT); | ||
444 | if (error) | ||
445 | goto out_iput; | ||
446 | |||
447 | pvfs2_init_iops(inode); | ||
448 | |||
449 | inode->i_mode = mode; | ||
450 | inode->i_uid = current_fsuid(); | ||
451 | inode->i_gid = current_fsgid(); | ||
452 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
453 | inode->i_size = PAGE_CACHE_SIZE; | ||
454 | inode->i_rdev = dev; | ||
455 | |||
456 | error = insert_inode_locked4(inode, hash, pvfs2_test_inode, ref); | ||
457 | if (error < 0) | ||
458 | goto out_iput; | ||
459 | |||
460 | gossip_debug(GOSSIP_INODE_DEBUG, | ||
461 | "Initializing ACL's for inode %pU\n", | ||
462 | get_khandle_from_ino(inode)); | ||
463 | pvfs2_init_acl(inode, dir); | ||
464 | return inode; | ||
465 | |||
466 | out_iput: | ||
467 | iput(inode); | ||
468 | return ERR_PTR(error); | ||
469 | } | ||