aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Kleikamp <shaggy@austin.ibm.com>2006-03-14 18:05:45 -0500
committerDave Kleikamp <shaggy@austin.ibm.com>2006-03-14 18:05:45 -0500
commitc5111f504d2a9b0d258d7c4752b4093523315989 (patch)
tree6a52864aff79691689aea21cb0cb928327d5de5b /fs
parent69eb66d7da7dba2696281981347698e1693c2340 (diff)
parenta488edc914aa1d766a4e2c982b5ae03d5657ec1b (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/9p.c1
-rw-r--r--fs/9p/conv.c28
-rw-r--r--fs/9p/fid.c145
-rw-r--r--fs/9p/fid.h6
-rw-r--r--fs/9p/mux.c15
-rw-r--r--fs/9p/trans_fd.c1
-rw-r--r--fs/9p/v9fs.c3
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_dentry.c45
-rw-r--r--fs/9p/vfs_file.c106
-rw-r--r--fs/9p/vfs_inode.c470
-rw-r--r--fs/9p/vfs_super.c10
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/binfmt_elf.c5
-rw-r--r--fs/bio.c1
-rw-r--r--fs/buffer.c65
-rw-r--r--fs/cifs/cifsproto.h2
-rw-r--r--fs/cifs/cifssmb.c7
-rw-r--r--fs/cifs/connect.c21
-rw-r--r--fs/cifs/file.c22
-rw-r--r--fs/cifs/misc.c4
-rw-r--r--fs/cifs/transport.c1
-rw-r--r--fs/compat.c51
-rw-r--r--fs/compat_ioctl.c23
-rw-r--r--fs/configfs/configfs_internal.h11
-rw-r--r--fs/configfs/dir.c36
-rw-r--r--fs/configfs/file.c19
-rw-r--r--fs/configfs/inode.c120
-rw-r--r--fs/configfs/mount.c28
-rw-r--r--fs/configfs/symlink.c4
-rw-r--r--fs/cramfs/inode.c60
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/debugfs/file.c6
-rw-r--r--fs/direct-io.c9
-rw-r--r--fs/exec.c8
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/super.c5
-rw-r--r--fs/ext2/xattr.c6
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/inode.c19
-rw-r--r--fs/ext3/namei.c3
-rw-r--r--fs/fat/file.c50
-rw-r--r--fs/fat/misc.c14
-rw-r--r--fs/fcntl.c7
-rw-r--r--fs/fifo.c7
-rw-r--r--fs/file.c3
-rw-r--r--fs/file_table.c87
-rw-r--r--fs/fuse/dev.c46
-rw-r--r--fs/fuse/dir.c10
-rw-r--r--fs/fuse/file.c20
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c14
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c2
-rw-r--r--fs/inotify.c2
-rw-r--r--fs/jbd/checkpoint.c418
-rw-r--r--fs/jbd/commit.c3
-rw-r--r--fs/jbd/transaction.c10
-rw-r--r--fs/jffs/intrep.c2
-rw-r--r--fs/jffs2/nodelist.c3
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/scan.c2
-rw-r--r--fs/libfs.c1
-rw-r--r--fs/lockd/clntlock.c27
-rw-r--r--fs/lockd/clntproc.c20
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/namei.c66
-rw-r--r--fs/namespace.c58
-rw-r--r--fs/nfs/direct.c12
-rw-r--r--fs/nfs/nfs4proc.c2
-rw-r--r--fs/nfs/nfsroot.c3
-rw-r--r--fs/nfsd/nfs4proc.c6
-rw-r--r--fs/nfsd/nfssvc.c76
-rw-r--r--fs/ntfs/ChangeLog36
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/aops.c18
-rw-r--r--fs/ntfs/file.c10
-rw-r--r--fs/ntfs/inode.c49
-rw-r--r--fs/ntfs/layout.h25
-rw-r--r--fs/ntfs/mft.c8
-rw-r--r--fs/ntfs/ntfs.h10
-rw-r--r--fs/ntfs/super.c197
-rw-r--r--fs/ntfs/upcase.c10
-rw-r--r--fs/ntfs/volume.h28
-rw-r--r--fs/ocfs2/buffer_head_io.c10
-rw-r--r--fs/ocfs2/cluster/heartbeat.c5
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h2
-rw-r--r--fs/ocfs2/cluster/nodemanager.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c30
-rw-r--r--fs/ocfs2/cluster/tcp.h5
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h13
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c12
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c12
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c57
-rw-r--r--fs/ocfs2/dlm/dlmlock.c25
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c35
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c315
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c13
-rw-r--r--fs/ocfs2/dlm/userdlm.c2
-rw-r--r--fs/ocfs2/extent_map.c50
-rw-r--r--fs/ocfs2/file.c61
-rw-r--r--fs/ocfs2/heartbeat.c1
-rw-r--r--fs/ocfs2/inode.c52
-rw-r--r--fs/ocfs2/inode.h4
-rw-r--r--fs/ocfs2/journal.c163
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/ocfs2.h10
-rw-r--r--fs/ocfs2/ocfs2_fs.h1
-rw-r--r--fs/ocfs2/super.c22
-rw-r--r--fs/ocfs2/sysfile.c6
-rw-r--r--fs/ocfs2/uptodate.c12
-rw-r--r--fs/ocfs2/uptodate.h2
-rw-r--r--fs/partitions/ibm.c16
-rw-r--r--fs/pipe.c6
-rw-r--r--fs/proc/inode.c4
-rw-r--r--fs/proc/proc_misc.c2
-rw-r--r--fs/proc/root.c17
-rw-r--r--fs/proc/task_mmu.c11
-rw-r--r--fs/quota_v2.c2
-rw-r--r--fs/ramfs/inode.c3
-rw-r--r--fs/reiserfs/dir.c16
-rw-r--r--fs/reiserfs/file.c33
-rw-r--r--fs/reiserfs/fix_node.c50
-rw-r--r--fs/reiserfs/hashes.c1
-rw-r--r--fs/reiserfs/inode.c19
-rw-r--r--fs/reiserfs/journal.c143
-rw-r--r--fs/reiserfs/namei.c24
-rw-r--r--fs/reiserfs/procfs.c3
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/reiserfs/xattr.c128
-rw-r--r--fs/reiserfs/xattr_acl.c3
-rw-r--r--fs/select.c36
-rw-r--r--fs/smbfs/dir.c5
-rw-r--r--fs/stat.c22
-rw-r--r--fs/super.c20
-rw-r--r--fs/udf/balloc.c7
-rw-r--r--fs/udf/inode.c16
-rw-r--r--fs/udf/namei.c4
-rw-r--r--fs/udf/super.c18
-rw-r--r--fs/udf/udf_sb.h4
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/ufs/super.c10
-rw-r--r--fs/ufs/truncate.c72
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c52
-rw-r--r--fs/xfs/quota/xfs_qm.c11
-rw-r--r--fs/xfs/xfs_rtalloc.c29
152 files changed, 2653 insertions, 1845 deletions
diff --git a/fs/9p/9p.c b/fs/9p/9p.c
index 1a6d08761f39..f86a28d1d6a6 100644
--- a/fs/9p/9p.c
+++ b/fs/9p/9p.c
@@ -111,7 +111,6 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc,
111 if (!rc) 111 if (!rc)
112 return; 112 return;
113 113
114 dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id);
115 v9ses = a; 114 v9ses = a;
116 if (rc->id == RCLUNK) 115 if (rc->id == RCLUNK)
117 v9fs_put_idpool(fid, &v9ses->fidpool); 116 v9fs_put_idpool(fid, &v9ses->fidpool);
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index 32a9f99154e2..bf1f10067960 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -116,13 +116,19 @@ static void buf_put_int64(struct cbuf *buf, u64 val)
116 } 116 }
117} 117}
118 118
119static void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) 119static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
120{ 120{
121 char *ret;
122
123 ret = NULL;
121 if (buf_check_size(buf, slen + 2)) { 124 if (buf_check_size(buf, slen + 2)) {
122 buf_put_int16(buf, slen); 125 buf_put_int16(buf, slen);
126 ret = buf->p;
123 memcpy(buf->p, s, slen); 127 memcpy(buf->p, s, slen);
124 buf->p += slen; 128 buf->p += slen;
125 } 129 }
130
131 return ret;
126} 132}
127 133
128static inline void buf_put_string(struct cbuf *buf, const char *s) 134static inline void buf_put_string(struct cbuf *buf, const char *s)
@@ -430,15 +436,19 @@ static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p)
430static void 436static void
431v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) 437v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str)
432{ 438{
433 if (data) { 439 int len;
434 str->len = strlen(data); 440 char *s;
435 str->str = bufp->p; 441
436 } else { 442 if (data)
437 str->len = 0; 443 len = strlen(data);
438 str->str = NULL; 444 else
439 } 445 len = 0;
440 446
441 buf_put_stringn(bufp, data, str->len); 447 s = buf_put_stringn(bufp, data, len);
448 if (str) {
449 str->len = len;
450 str->str = s;
451 }
442} 452}
443 453
444static int 454static int
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index eda449778fa5..c4d13bf904d2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * V9FS FID Management 2 * V9FS FID Management
3 * 3 *
4 * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com> 4 * Copyright (C) 2005, 2006 by Eric Van Hensbergen <ericvh@gmail.com>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -40,7 +40,7 @@
40 * 40 *
41 */ 41 */
42 42
43static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) 43int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
44{ 44{
45 struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; 45 struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
46 dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid, 46 dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
@@ -57,7 +57,6 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
57 } 57 }
58 58
59 fid->uid = current->uid; 59 fid->uid = current->uid;
60 fid->pid = current->pid;
61 list_add(&fid->list, fid_list); 60 list_add(&fid->list, fid_list);
62 return 0; 61 return 0;
63} 62}
@@ -68,14 +67,11 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
68 * 67 *
69 */ 68 */
70 69
71struct v9fs_fid *v9fs_fid_create(struct dentry *dentry, 70struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid)
72 struct v9fs_session_info *v9ses, int fid, int create)
73{ 71{
74 struct v9fs_fid *new; 72 struct v9fs_fid *new;
75 73
76 dprintk(DEBUG_9P, "fid create dentry %p, fid %d, create %d\n", 74 dprintk(DEBUG_9P, "fid create fid %d\n", fid);
77 dentry, fid, create);
78
79 new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); 75 new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
80 if (new == NULL) { 76 if (new == NULL) {
81 dprintk(DEBUG_ERROR, "Out of Memory\n"); 77 dprintk(DEBUG_ERROR, "Out of Memory\n");
@@ -85,19 +81,13 @@ struct v9fs_fid *v9fs_fid_create(struct dentry *dentry,
85 new->fid = fid; 81 new->fid = fid;
86 new->v9ses = v9ses; 82 new->v9ses = v9ses;
87 new->fidopen = 0; 83 new->fidopen = 0;
88 new->fidcreate = create;
89 new->fidclunked = 0; 84 new->fidclunked = 0;
90 new->iounit = 0; 85 new->iounit = 0;
91 new->rdir_pos = 0; 86 new->rdir_pos = 0;
92 new->rdir_fcall = NULL; 87 new->rdir_fcall = NULL;
88 INIT_LIST_HEAD(&new->list);
93 89
94 if (v9fs_fid_insert(new, dentry) == 0) 90 return new;
95 return new;
96 else {
97 dprintk(DEBUG_ERROR, "Problems inserting to dentry\n");
98 kfree(new);
99 return NULL;
100 }
101} 91}
102 92
103/** 93/**
@@ -113,140 +103,29 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
113} 103}
114 104
115/** 105/**
116 * v9fs_fid_walk_up - walks from the process current directory
117 * up to the specified dentry.
118 */
119static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
120{
121 int fidnum, cfidnum, err;
122 struct v9fs_fid *cfid;
123 struct dentry *cde;
124 struct v9fs_session_info *v9ses;
125
126 v9ses = v9fs_inode2v9ses(current->fs->pwd->d_inode);
127 cfid = v9fs_fid_lookup(current->fs->pwd);
128 if (cfid == NULL) {
129 dprintk(DEBUG_ERROR, "process cwd doesn't have a fid\n");
130 return ERR_PTR(-ENOENT);
131 }
132
133 cfidnum = cfid->fid;
134 cde = current->fs->pwd;
135 /* TODO: take advantage of multiwalk */
136
137 fidnum = v9fs_get_idpool(&v9ses->fidpool);
138 if (fidnum < 0) {
139 dprintk(DEBUG_ERROR, "could not get a new fid num\n");
140 err = -ENOENT;
141 goto clunk_fid;
142 }
143
144 while (cde != dentry) {
145 if (cde == cde->d_parent) {
146 dprintk(DEBUG_ERROR, "can't find dentry\n");
147 err = -ENOENT;
148 goto clunk_fid;
149 }
150
151 err = v9fs_t_walk(v9ses, cfidnum, fidnum, "..", NULL);
152 if (err < 0) {
153 dprintk(DEBUG_ERROR, "problem walking to parent\n");
154 goto clunk_fid;
155 }
156
157 cfidnum = fidnum;
158 cde = cde->d_parent;
159 }
160
161 return v9fs_fid_create(dentry, v9ses, fidnum, 0);
162
163clunk_fid:
164 v9fs_t_clunk(v9ses, fidnum);
165 return ERR_PTR(err);
166}
167
168/**
169 * v9fs_fid_lookup - retrieve the right fid from a particular dentry 106 * v9fs_fid_lookup - retrieve the right fid from a particular dentry
170 * @dentry: dentry to look for fid in 107 * @dentry: dentry to look for fid in
171 * @type: intent of lookup (operation or traversal) 108 * @type: intent of lookup (operation or traversal)
172 * 109 *
173 * search list of fids associated with a dentry for a fid with a matching 110 * find a fid in the dentry
174 * thread id or uid. If that fails, look up the dentry's parents to see if you 111 *
175 * can find a matching fid. 112 * TODO: only match fids that have the same uid as current user
176 * 113 *
177 */ 114 */
178 115
179struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) 116struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
180{ 117{
181 struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; 118 struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
182 struct v9fs_fid *current_fid = NULL;
183 struct v9fs_fid *temp = NULL;
184 struct v9fs_fid *return_fid = NULL; 119 struct v9fs_fid *return_fid = NULL;
185 120
186 dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); 121 dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
187 122
188 if (fid_list) { 123 if (fid_list)
189 list_for_each_entry_safe(current_fid, temp, fid_list, list) { 124 return_fid = list_entry(fid_list->next, struct v9fs_fid, list);
190 if (!current_fid->fidcreate) {
191 return_fid = current_fid;
192 break;
193 }
194 }
195
196 if (!return_fid)
197 return_fid = current_fid;
198 }
199
200 /* we are at the root but didn't match */
201 if ((!return_fid) && (dentry->d_parent == dentry)) {
202 /* TODO: clone attach with new uid */
203 return_fid = current_fid;
204 }
205 125
206 if (!return_fid) { 126 if (!return_fid) {
207 struct dentry *par = current->fs->pwd->d_parent; 127 dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n");
208 int count = 1;
209 while (par != NULL) {
210 if (par == dentry)
211 break;
212 count++;
213 if (par == par->d_parent) {
214 dprintk(DEBUG_ERROR,
215 "got to root without finding dentry\n");
216 break;
217 }
218 par = par->d_parent;
219 }
220
221/* XXX - there may be some duplication we can get rid of */
222 if (par == dentry) {
223 return_fid = v9fs_fid_walk_up(dentry);
224 if (IS_ERR(return_fid))
225 return_fid = NULL;
226 }
227 } 128 }
228 129
229 return return_fid; 130 return return_fid;
230} 131}
231
232struct v9fs_fid *v9fs_fid_get_created(struct dentry *dentry)
233{
234 struct list_head *fid_list;
235 struct v9fs_fid *fid, *ftmp, *ret;
236
237 dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
238 fid_list = (struct list_head *)dentry->d_fsdata;
239 ret = NULL;
240 if (fid_list) {
241 list_for_each_entry_safe(fid, ftmp, fid_list, list) {
242 if (fid->fidcreate && fid->pid == current->pid) {
243 list_del(&fid->list);
244 ret = fid;
245 break;
246 }
247 }
248 }
249
250 dprintk(DEBUG_9P, "return %p\n", ret);
251 return ret;
252}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 84c673a44c83..1fc2dd08d75a 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -33,7 +33,6 @@ struct v9fs_fid {
33 33
34 u32 fid; 34 u32 fid;
35 unsigned char fidopen; /* set when fid is opened */ 35 unsigned char fidopen; /* set when fid is opened */
36 unsigned char fidcreate; /* set when fid was just created */
37 unsigned char fidclunked; /* set when fid has already been clunked */ 36 unsigned char fidclunked; /* set when fid has already been clunked */
38 37
39 struct v9fs_qid qid; 38 struct v9fs_qid qid;
@@ -45,7 +44,6 @@ struct v9fs_fid {
45 struct v9fs_fcall *rdir_fcall; 44 struct v9fs_fcall *rdir_fcall;
46 45
47 /* management stuff */ 46 /* management stuff */
48 pid_t pid; /* thread associated with this fid */
49 uid_t uid; /* user associated with this fid */ 47 uid_t uid; /* user associated with this fid */
50 48
51 /* private data */ 49 /* private data */
@@ -56,5 +54,5 @@ struct v9fs_fid {
56struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry); 54struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry);
57struct v9fs_fid *v9fs_fid_get_created(struct dentry *); 55struct v9fs_fid *v9fs_fid_get_created(struct dentry *);
58void v9fs_fid_destroy(struct v9fs_fid *fid); 56void v9fs_fid_destroy(struct v9fs_fid *fid);
59struct v9fs_fid *v9fs_fid_create(struct dentry *, 57struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid);
60 struct v9fs_session_info *v9ses, int fid, int create); 58int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry);
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 945cb368d451..ea1134eb47c8 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -471,10 +471,13 @@ static void v9fs_write_work(void *a)
471 } 471 }
472 472
473 spin_lock(&m->lock); 473 spin_lock(&m->lock);
474 req = 474again:
475 list_entry(m->unsent_req_list.next, struct v9fs_req, 475 req = list_entry(m->unsent_req_list.next, struct v9fs_req,
476 req_list); 476 req_list);
477 list_move_tail(&req->req_list, &m->req_list); 477 list_move_tail(&req->req_list, &m->req_list);
478 if (req->err == ERREQFLUSH)
479 goto again;
480
478 m->wbuf = req->tcall->sdata; 481 m->wbuf = req->tcall->sdata;
479 m->wsize = req->tcall->size; 482 m->wsize = req->tcall->size;
480 m->wpos = 0; 483 m->wpos = 0;
@@ -525,7 +528,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
525 struct v9fs_str *ename; 528 struct v9fs_str *ename;
526 529
527 tag = req->tag; 530 tag = req->tag;
528 if (req->rcall->id == RERROR && !req->err) { 531 if (!req->err && req->rcall->id == RERROR) {
529 ecode = req->rcall->params.rerror.errno; 532 ecode = req->rcall->params.rerror.errno;
530 ename = &req->rcall->params.rerror.error; 533 ename = &req->rcall->params.rerror.error;
531 534
@@ -551,7 +554,10 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
551 req->err = -EIO; 554 req->err = -EIO;
552 } 555 }
553 556
554 if (req->cb && req->err != ERREQFLUSH) { 557 if (req->err == ERREQFLUSH)
558 return;
559
560 if (req->cb) {
555 dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", 561 dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n",
556 req->tcall, req->rcall); 562 req->tcall, req->rcall);
557 563
@@ -812,6 +818,7 @@ v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err)
812 struct v9fs_mux_rpc *r; 818 struct v9fs_mux_rpc *r;
813 819
814 if (err == ERREQFLUSH) { 820 if (err == ERREQFLUSH) {
821 kfree(rc);
815 dprintk(DEBUG_MUX, "err req flush\n"); 822 dprintk(DEBUG_MUX, "err req flush\n");
816 return; 823 return;
817 } 824 }
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
index 1a28ef97a3d1..5b2ce21b10fa 100644
--- a/fs/9p/trans_fd.c
+++ b/fs/9p/trans_fd.c
@@ -80,6 +80,7 @@ static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len)
80 if (!trans || trans->status != Connected || !ts) 80 if (!trans || trans->status != Connected || !ts)
81 return -EIO; 81 return -EIO;
82 82
83 oldfs = get_fs();
83 set_fs(get_ds()); 84 set_fs(get_ds());
84 /* The cast to a user pointer is valid due to the set_fs() */ 85 /* The cast to a user pointer is valid due to the set_fs() */
85 ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos); 86 ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 5250c428fc1f..61352491ba36 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -66,7 +66,7 @@ static match_table_t tokens = {
66 {Opt_afid, "afid=%u"}, 66 {Opt_afid, "afid=%u"},
67 {Opt_rfdno, "rfdno=%u"}, 67 {Opt_rfdno, "rfdno=%u"},
68 {Opt_wfdno, "wfdno=%u"}, 68 {Opt_wfdno, "wfdno=%u"},
69 {Opt_debug, "debug=%u"}, 69 {Opt_debug, "debug=%x"},
70 {Opt_name, "name=%s"}, 70 {Opt_name, "name=%s"},
71 {Opt_remotename, "aname=%s"}, 71 {Opt_remotename, "aname=%s"},
72 {Opt_unix, "proto=unix"}, 72 {Opt_unix, "proto=unix"},
@@ -397,6 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
397 } 397 }
398 398
399 if (v9ses->afid != ~0) { 399 if (v9ses->afid != ~0) {
400 dprintk(DEBUG_ERROR, "afid not equal to ~0\n");
400 if (v9fs_t_clunk(v9ses, v9ses->afid)) 401 if (v9fs_t_clunk(v9ses, v9ses->afid))
401 dprintk(DEBUG_ERROR, "clunk failed\n"); 402 dprintk(DEBUG_ERROR, "clunk failed\n");
402 } 403 }
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 69cf2905dc90..a759278acaae 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,3 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
51int v9fs_file_open(struct inode *inode, struct file *file); 51int v9fs_file_open(struct inode *inode, struct file *file);
52void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); 52void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat);
53void v9fs_dentry_release(struct dentry *); 53void v9fs_dentry_release(struct dentry *);
54int v9fs_uflags2omode(int uflags);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 2dd806dac9f1..12c9cc926b71 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -43,47 +43,18 @@
43#include "fid.h" 43#include "fid.h"
44 44
45/** 45/**
46 * v9fs_dentry_validate - VFS dcache hook to validate cache 46 * v9fs_dentry_delete - called when dentry refcount equals 0
47 * @dentry: dentry that is being validated 47 * @dentry: dentry in question
48 * @nd: path data
49 * 48 *
50 * dcache really shouldn't be used for 9P2000 as at all due to 49 * By returning 1 here we should remove cacheing of unused
51 * potential attached semantics to directory traversal (walk). 50 * dentry components.
52 *
53 * FUTURE: look into how to use dcache to allow multi-stage
54 * walks in Plan 9 & potential for better dcache operation which
55 * would remain valid for Plan 9 semantics. Older versions
56 * had validation via stat for those interested. However, since
57 * stat has the same approximate overhead as walk there really
58 * is no difference. The only improvement would be from a
59 * time-decay cache like NFS has and that undermines the
60 * synchronous nature of 9P2000.
61 * 51 *
62 */ 52 */
63 53
64static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd) 54int v9fs_dentry_delete(struct dentry *dentry)
65{ 55{
66 struct dentry *dc = current->fs->pwd; 56 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
67 57 return 1;
68 dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
69 if (v9fs_fid_lookup(dentry)) {
70 dprintk(DEBUG_VFS, "VALID\n");
71 return 1;
72 }
73
74 while (dc != NULL) {
75 if (dc == dentry) {
76 dprintk(DEBUG_VFS, "VALID\n");
77 return 1;
78 }
79 if (dc == dc->d_parent)
80 break;
81
82 dc = dc->d_parent;
83 }
84
85 dprintk(DEBUG_VFS, "INVALID\n");
86 return 0;
87} 58}
88 59
89/** 60/**
@@ -118,6 +89,6 @@ void v9fs_dentry_release(struct dentry *dentry)
118} 89}
119 90
120struct dentry_operations v9fs_dentry_operations = { 91struct dentry_operations v9fs_dentry_operations = {
121 .d_revalidate = v9fs_dentry_validate, 92 .d_delete = v9fs_dentry_delete,
122 .d_release = v9fs_dentry_release, 93 .d_release = v9fs_dentry_release,
123}; 94};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c7e14d917215..de3a129698da 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -53,94 +53,70 @@
53int v9fs_file_open(struct inode *inode, struct file *file) 53int v9fs_file_open(struct inode *inode, struct file *file)
54{ 54{
55 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); 55 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
56 struct v9fs_fid *v9fid, *fid; 56 struct v9fs_fid *vfid;
57 struct v9fs_fcall *fcall = NULL; 57 struct v9fs_fcall *fcall = NULL;
58 int open_mode = 0; 58 int omode;
59 unsigned int iounit = 0; 59 int fid = V9FS_NOFID;
60 int newfid = -1; 60 int err;
61 long result = -1;
62 61
63 dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file); 62 dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file);
64 63
65 v9fid = v9fs_fid_get_created(file->f_dentry); 64 vfid = v9fs_fid_lookup(file->f_dentry);
66 if (!v9fid) 65 if (!vfid) {
67 v9fid = v9fs_fid_lookup(file->f_dentry);
68
69 if (!v9fid) {
70 dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n"); 66 dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
71 return -EBADF; 67 return -EBADF;
72 } 68 }
73 69
74 if (!v9fid->fidcreate) { 70 fid = v9fs_get_idpool(&v9ses->fidpool);
75 fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); 71 if (fid < 0) {
76 if (fid == NULL) {
77 dprintk(DEBUG_ERROR, "Out of Memory\n");
78 return -ENOMEM;
79 }
80
81 fid->fidopen = 0;
82 fid->fidcreate = 0;
83 fid->fidclunked = 0;
84 fid->iounit = 0;
85 fid->v9ses = v9ses;
86
87 newfid = v9fs_get_idpool(&v9ses->fidpool);
88 if (newfid < 0) {
89 eprintk(KERN_WARNING, "newfid fails!\n"); 72 eprintk(KERN_WARNING, "newfid fails!\n");
90 return -ENOSPC; 73 return -ENOSPC;
91 } 74 }
92 75
93 result = 76 err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL);
94 v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL, NULL); 77 if (err < 0) {
95
96 if (result < 0) {
97 v9fs_put_idpool(newfid, &v9ses->fidpool);
98 dprintk(DEBUG_ERROR, "rewalk didn't work\n"); 78 dprintk(DEBUG_ERROR, "rewalk didn't work\n");
99 return -EBADF; 79 goto put_fid;
80 }
81
82 vfid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
83 if (vfid == NULL) {
84 dprintk(DEBUG_ERROR, "out of memory\n");
85 goto clunk_fid;
100 } 86 }
101 87
102 fid->fid = newfid;
103 v9fid = fid;
104 /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */ 88 /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
105 /* translate open mode appropriately */ 89 /* translate open mode appropriately */
106 open_mode = file->f_flags & 0x3; 90 omode = v9fs_uflags2omode(file->f_flags);
91 err = v9fs_t_open(v9ses, fid, omode, &fcall);
92 if (err < 0) {
93 PRINT_FCALL_ERROR("open failed", fcall);
94 goto destroy_vfid;
95 }
107 96
108 if (file->f_flags & O_EXCL) 97 file->private_data = vfid;
109 open_mode |= V9FS_OEXCL; 98 vfid->fid = fid;
99 vfid->fidopen = 1;
100 vfid->fidclunked = 0;
101 vfid->iounit = fcall->params.ropen.iounit;
102 vfid->rdir_pos = 0;
103 vfid->rdir_fcall = NULL;
104 vfid->filp = file;
105 kfree(fcall);
110 106
111 if (v9ses->extended) { 107 return 0;
112 if (file->f_flags & O_TRUNC)
113 open_mode |= V9FS_OTRUNC;
114 108
115 if (file->f_flags & O_APPEND) 109destroy_vfid:
116 open_mode |= V9FS_OAPPEND; 110 v9fs_fid_destroy(vfid);
117 }
118 111
119 result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); 112clunk_fid:
120 if (result < 0) { 113 v9fs_t_clunk(v9ses, fid);
121 PRINT_FCALL_ERROR("open failed", fcall);
122 kfree(fcall);
123 return result;
124 }
125 114
126 iounit = fcall->params.ropen.iounit; 115put_fid:
116 v9fs_put_idpool(fid, &v9ses->fidpool);
127 kfree(fcall); 117 kfree(fcall);
128 } else {
129 /* create case */
130 newfid = v9fid->fid;
131 iounit = v9fid->iounit;
132 v9fid->fidcreate = 0;
133 }
134
135 file->private_data = v9fid;
136
137 v9fid->rdir_pos = 0;
138 v9fid->rdir_fcall = NULL;
139 v9fid->fidopen = 1;
140 v9fid->filp = file;
141 v9fid->iounit = iounit;
142 118
143 return 0; 119 return err;
144} 120}
145 121
146/** 122/**
@@ -289,9 +265,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
289 total += result; 265 total += result;
290 } while (count); 266 } while (count);
291 267
292 if(inode->i_mapping->nrpages)
293 invalidate_inode_pages2(inode->i_mapping); 268 invalidate_inode_pages2(inode->i_mapping);
294
295 return total; 269 return total;
296} 270}
297 271
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 91f552454c76..3ad8455f8577 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -125,6 +125,38 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
125 return res; 125 return res;
126} 126}
127 127
128int v9fs_uflags2omode(int uflags)
129{
130 int ret;
131
132 ret = 0;
133 switch (uflags&3) {
134 default:
135 case O_RDONLY:
136 ret = V9FS_OREAD;
137 break;
138
139 case O_WRONLY:
140 ret = V9FS_OWRITE;
141 break;
142
143 case O_RDWR:
144 ret = V9FS_ORDWR;
145 break;
146 }
147
148 if (uflags & O_EXCL)
149 ret |= V9FS_OEXCL;
150
151 if (uflags & O_TRUNC)
152 ret |= V9FS_OTRUNC;
153
154 if (uflags & O_APPEND)
155 ret |= V9FS_OAPPEND;
156
157 return ret;
158}
159
128/** 160/**
129 * v9fs_blank_wstat - helper function to setup a 9P stat structure 161 * v9fs_blank_wstat - helper function to setup a 9P stat structure
130 * @v9ses: 9P session info (for determining extended mode) 162 * @v9ses: 9P session info (for determining extended mode)
@@ -163,7 +195,7 @@ v9fs_blank_wstat(struct v9fs_wstat *wstat)
163 195
164struct inode *v9fs_get_inode(struct super_block *sb, int mode) 196struct inode *v9fs_get_inode(struct super_block *sb, int mode)
165{ 197{
166 struct inode *inode = NULL; 198 struct inode *inode;
167 struct v9fs_session_info *v9ses = sb->s_fs_info; 199 struct v9fs_session_info *v9ses = sb->s_fs_info;
168 200
169 dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); 201 dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
@@ -222,171 +254,133 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
222 return inode; 254 return inode;
223} 255}
224 256
225/**
226 * v9fs_create - helper function to create files and directories
227 * @dir: directory inode file is being created in
228 * @file_dentry: dentry file is being created in
229 * @perm: permissions file is being created with
230 * @open_mode: resulting open mode for file
231 *
232 */
233
234static int 257static int
235v9fs_create(struct inode *dir, 258v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name,
236 struct dentry *file_dentry, 259 u32 perm, u8 mode, u32 *fidp, struct v9fs_qid *qid, u32 *iounit)
237 unsigned int perm, unsigned int open_mode)
238{ 260{
239 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); 261 u32 fid;
240 struct super_block *sb = dir->i_sb;
241 struct v9fs_fid *dirfid =
242 v9fs_fid_lookup(file_dentry->d_parent);
243 struct v9fs_fid *fid = NULL;
244 struct inode *file_inode = NULL;
245 struct v9fs_fcall *fcall = NULL;
246 struct v9fs_qid qid;
247 int dirfidnum = -1;
248 long newfid = -1;
249 int result = 0;
250 unsigned int iounit = 0;
251 int wfidno = -1;
252 int err; 262 int err;
263 struct v9fs_fcall *fcall;
253 264
254 perm = unixmode2p9mode(v9ses, perm); 265 fid = v9fs_get_idpool(&v9ses->fidpool);
255 266 if (fid < 0) {
256 dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir,
257 file_dentry, perm, open_mode);
258
259 if (!dirfid)
260 return -EBADF;
261
262 dirfidnum = dirfid->fid;
263 if (dirfidnum < 0) {
264 dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n",
265 dir->i_ino);
266 return -EBADF;
267 }
268
269 if (file_dentry->d_inode) {
270 dprintk(DEBUG_ERROR,
271 "Odd. There is an inode for dir %lu, name :%s:\n",
272 dir->i_ino, file_dentry->d_name.name);
273 return -EEXIST;
274 }
275
276 newfid = v9fs_get_idpool(&v9ses->fidpool);
277 if (newfid < 0) {
278 eprintk(KERN_WARNING, "no free fids available\n"); 267 eprintk(KERN_WARNING, "no free fids available\n");
279 return -ENOSPC; 268 return -ENOSPC;
280 } 269 }
281 270
282 result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); 271 err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
283 if (result < 0) { 272 if (err < 0) {
284 PRINT_FCALL_ERROR("clone error", fcall); 273 PRINT_FCALL_ERROR("clone error", fcall);
285 v9fs_put_idpool(newfid, &v9ses->fidpool); 274 goto error;
286 newfid = -1;
287 goto CleanUpFid;
288 } 275 }
289
290 kfree(fcall); 276 kfree(fcall);
291 fcall = NULL;
292 277
293 result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, 278 err = v9fs_t_create(v9ses, fid, name, perm, mode, &fcall);
294 perm, open_mode, &fcall); 279 if (err < 0) {
295 if (result < 0) {
296 PRINT_FCALL_ERROR("create fails", fcall); 280 PRINT_FCALL_ERROR("create fails", fcall);
297 goto CleanUpFid; 281 goto error;
298 } 282 }
299 283
300 iounit = fcall->params.rcreate.iounit; 284 if (iounit)
301 qid = fcall->params.rcreate.qid; 285 *iounit = fcall->params.rcreate.iounit;
286
287 if (qid)
288 *qid = fcall->params.rcreate.qid;
289
290 if (fidp)
291 *fidp = fid;
292
302 kfree(fcall); 293 kfree(fcall);
303 fcall = NULL; 294 return 0;
304 295
305 if (!(perm&V9FS_DMDIR)) { 296error:
306 fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); 297 if (fid >= 0)
307 dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); 298 v9fs_put_idpool(fid, &v9ses->fidpool);
308 if (!fid) {
309 result = -ENOMEM;
310 goto CleanUpFid;
311 }
312 299
313 fid->qid = qid; 300 kfree(fcall);
314 fid->iounit = iounit; 301 return err;
315 } else { 302}
316 err = v9fs_t_clunk(v9ses, newfid); 303
317 newfid = -1; 304static struct v9fs_fid*
318 if (err < 0) 305v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
319 dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); 306{
320 } 307 int err;
308 u32 nfid;
309 struct v9fs_fid *ret;
310 struct v9fs_fcall *fcall;
321 311
322 /* walk to the newly created file and put the fid in the dentry */ 312 nfid = v9fs_get_idpool(&v9ses->fidpool);
323 wfidno = v9fs_get_idpool(&v9ses->fidpool); 313 if (nfid < 0) {
324 if (wfidno < 0) {
325 eprintk(KERN_WARNING, "no free fids available\n"); 314 eprintk(KERN_WARNING, "no free fids available\n");
326 return -ENOSPC; 315 return ERR_PTR(-ENOSPC);
327 } 316 }
328 317
329 result = v9fs_t_walk(v9ses, dirfidnum, wfidno, 318 err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
330 (char *) file_dentry->d_name.name, &fcall); 319 &fcall);
331 if (result < 0) { 320
332 PRINT_FCALL_ERROR("clone error", fcall); 321 if (err < 0) {
333 v9fs_put_idpool(wfidno, &v9ses->fidpool); 322 PRINT_FCALL_ERROR("walk error", fcall);
334 wfidno = -1; 323 v9fs_put_idpool(nfid, &v9ses->fidpool);
335 goto CleanUpFid; 324 goto error;
336 } 325 }
326
337 kfree(fcall); 327 kfree(fcall);
338 fcall = NULL; 328 fcall = NULL;
329 ret = v9fs_fid_create(v9ses, nfid);
330 if (!ret) {
331 err = -ENOMEM;
332 goto clunk_fid;
333 }
339 334
340 if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { 335 err = v9fs_fid_insert(ret, dentry);
341 v9fs_put_idpool(wfidno, &v9ses->fidpool); 336 if (err < 0) {
342 337 v9fs_fid_destroy(ret);
343 goto CleanUpFid; 338 goto clunk_fid;
344 } 339 }
345 340
346 if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) || 341 return ret;
347 (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
348 (perm & V9FS_DMDEVICE))
349 return 0;
350 342
351 result = v9fs_t_stat(v9ses, wfidno, &fcall); 343clunk_fid:
352 if (result < 0) { 344 v9fs_t_clunk(v9ses, nfid);
353 PRINT_FCALL_ERROR("stat error", fcall);
354 goto CleanUpFid;
355 }
356 345
346error:
347 kfree(fcall);
348 return ERR_PTR(err);
349}
357 350
358 file_inode = v9fs_get_inode(sb, 351struct inode *
359 p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode)); 352v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid,
353 struct super_block *sb)
354{
355 int err, umode;
356 struct inode *ret;
357 struct v9fs_fcall *fcall;
360 358
361 if ((!file_inode) || IS_ERR(file_inode)) { 359 ret = NULL;
362 dprintk(DEBUG_ERROR, "create inode failed\n"); 360 err = v9fs_t_stat(v9ses, fid, &fcall);
363 result = -EBADF; 361 if (err) {
364 goto CleanUpFid; 362 PRINT_FCALL_ERROR("stat error", fcall);
363 goto error;
365 } 364 }
366 365
367 v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb); 366 umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode);
368 kfree(fcall); 367 ret = v9fs_get_inode(sb, umode);
369 fcall = NULL; 368 if (IS_ERR(ret)) {
370 file_dentry->d_op = &v9fs_dentry_operations; 369 err = PTR_ERR(ret);
371 d_instantiate(file_dentry, file_inode); 370 ret = NULL;
371 goto error;
372 }
372 373
373 return 0; 374 v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb);
375 kfree(fcall);
376 return ret;
374 377
375 CleanUpFid: 378error:
376 kfree(fcall); 379 kfree(fcall);
377 fcall = NULL; 380 if (ret)
381 iput(ret);
378 382
379 if (newfid >= 0) { 383 return ERR_PTR(err);
380 err = v9fs_t_clunk(v9ses, newfid);
381 if (err < 0)
382 dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
383 }
384 if (wfidno >= 0) {
385 err = v9fs_t_clunk(v9ses, wfidno);
386 if (err < 0)
387 dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
388 }
389 return result;
390} 384}
391 385
392/** 386/**
@@ -440,20 +434,97 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
440 return result; 434 return result;
441} 435}
442 436
437static int
438v9fs_open_created(struct inode *inode, struct file *file)
439{
440 return 0;
441}
442
443/** 443/**
444 * v9fs_vfs_create - VFS hook to create files 444 * v9fs_vfs_create - VFS hook to create files
445 * @inode: directory inode that is being deleted 445 * @inode: directory inode that is being deleted
446 * @dentry: dentry that is being deleted 446 * @dentry: dentry that is being deleted
447 * @perm: create permissions 447 * @mode: create permissions
448 * @nd: path information 448 * @nd: path information
449 * 449 *
450 */ 450 */
451 451
452static int 452static int
453v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm, 453v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
454 struct nameidata *nd) 454 struct nameidata *nd)
455{ 455{
456 return v9fs_create(inode, dentry, perm, O_RDWR); 456 int err;
457 u32 fid, perm, iounit;
458 int flags;
459 struct v9fs_session_info *v9ses;
460 struct v9fs_fid *dfid, *vfid, *ffid;
461 struct inode *inode;
462 struct v9fs_qid qid;
463 struct file *filp;
464
465 inode = NULL;
466 vfid = NULL;
467 v9ses = v9fs_inode2v9ses(dir);
468 dfid = v9fs_fid_lookup(dentry->d_parent);
469 perm = unixmode2p9mode(v9ses, mode);
470
471 if (nd && nd->flags & LOOKUP_OPEN)
472 flags = nd->intent.open.flags - 1;
473 else
474 flags = O_RDWR;
475
476 err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
477 perm, v9fs_uflags2omode(flags), &fid, &qid, &iounit);
478
479 if (err)
480 goto error;
481
482 vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
483 if (IS_ERR(vfid)) {
484 err = PTR_ERR(vfid);
485 vfid = NULL;
486 goto error;
487 }
488
489 inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
490 if (IS_ERR(inode)) {
491 err = PTR_ERR(inode);
492 inode = NULL;
493 goto error;
494 }
495
496 dentry->d_op = &v9fs_dentry_operations;
497 d_instantiate(dentry, inode);
498
499 if (nd && nd->flags & LOOKUP_OPEN) {
500 ffid = v9fs_fid_create(v9ses, fid);
501 if (!ffid)
502 return -ENOMEM;
503
504 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
505 if (IS_ERR(filp)) {
506 v9fs_fid_destroy(ffid);
507 return PTR_ERR(filp);
508 }
509
510 ffid->rdir_pos = 0;
511 ffid->rdir_fcall = NULL;
512 ffid->fidopen = 1;
513 ffid->iounit = iounit;
514 ffid->filp = filp;
515 filp->private_data = ffid;
516 }
517
518 return 0;
519
520error:
521 if (vfid)
522 v9fs_fid_destroy(vfid);
523
524 if (inode)
525 iput(inode);
526
527 return err;
457} 528}
458 529
459/** 530/**
@@ -464,9 +535,57 @@ v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
464 * 535 *
465 */ 536 */
466 537
467static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode) 538static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
468{ 539{
469 return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY); 540 int err;
541 u32 fid, perm;
542 struct v9fs_session_info *v9ses;
543 struct v9fs_fid *dfid, *vfid;
544 struct inode *inode;
545
546 inode = NULL;
547 vfid = NULL;
548 v9ses = v9fs_inode2v9ses(dir);
549 dfid = v9fs_fid_lookup(dentry->d_parent);
550 perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
551
552 err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
553 perm, V9FS_OREAD, &fid, NULL, NULL);
554
555 if (err) {
556 dprintk(DEBUG_ERROR, "create error %d\n", err);
557 goto error;
558 }
559
560 err = v9fs_t_clunk(v9ses, fid);
561 if (err) {
562 dprintk(DEBUG_ERROR, "clunk error %d\n", err);
563 goto error;
564 }
565
566 vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
567 if (IS_ERR(vfid)) {
568 err = PTR_ERR(vfid);
569 vfid = NULL;
570 goto error;
571 }
572
573 inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
574 if (IS_ERR(inode)) {
575 err = PTR_ERR(inode);
576 inode = NULL;
577 goto error;
578 }
579
580 dentry->d_op = &v9fs_dentry_operations;
581 d_instantiate(dentry, inode);
582 return 0;
583
584error:
585 if (vfid)
586 v9fs_fid_destroy(vfid);
587
588 return err;
470} 589}
471 590
472/** 591/**
@@ -491,7 +610,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
491 int result = 0; 610 int result = 0;
492 611
493 dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", 612 dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
494 dir, dentry->d_iname, dentry, nameidata); 613 dir, dentry->d_name.name, dentry, nameidata);
495 614
496 sb = dir->i_sb; 615 sb = dir->i_sb;
497 v9ses = v9fs_inode2v9ses(dir); 616 v9ses = v9fs_inode2v9ses(dir);
@@ -516,9 +635,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
516 return ERR_PTR(-ENOSPC); 635 return ERR_PTR(-ENOSPC);
517 } 636 }
518 637
519 result = 638 result = v9fs_t_walk(v9ses, dirfidnum, newfid,
520 v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name, 639 (char *)dentry->d_name.name, NULL);
521 NULL);
522 if (result < 0) { 640 if (result < 0) {
523 v9fs_put_idpool(newfid, &v9ses->fidpool); 641 v9fs_put_idpool(newfid, &v9ses->fidpool);
524 if (result == -ENOENT) { 642 if (result == -ENOENT) {
@@ -551,13 +669,17 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
551 669
552 inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); 670 inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid);
553 671
554 fid = v9fs_fid_create(dentry, v9ses, newfid, 0); 672 fid = v9fs_fid_create(v9ses, newfid);
555 if (fid == NULL) { 673 if (fid == NULL) {
556 dprintk(DEBUG_ERROR, "couldn't insert\n"); 674 dprintk(DEBUG_ERROR, "couldn't insert\n");
557 result = -ENOMEM; 675 result = -ENOMEM;
558 goto FreeFcall; 676 goto FreeFcall;
559 } 677 }
560 678
679 result = v9fs_fid_insert(fid, dentry);
680 if (result < 0)
681 goto FreeFcall;
682
561 fid->qid = fcall->params.rstat.stat.qid; 683 fid->qid = fcall->params.rstat.stat.qid;
562 684
563 dentry->d_op = &v9fs_dentry_operations; 685 dentry->d_op = &v9fs_dentry_operations;
@@ -983,53 +1105,75 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
983static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, 1105static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
984 int mode, const char *extension) 1106 int mode, const char *extension)
985{ 1107{
986 int err, retval; 1108 int err;
1109 u32 fid, perm;
987 struct v9fs_session_info *v9ses; 1110 struct v9fs_session_info *v9ses;
1111 struct v9fs_fid *dfid, *vfid;
1112 struct inode *inode;
988 struct v9fs_fcall *fcall; 1113 struct v9fs_fcall *fcall;
989 struct v9fs_fid *fid;
990 struct v9fs_wstat wstat; 1114 struct v9fs_wstat wstat;
991 1115
992 v9ses = v9fs_inode2v9ses(dir);
993 retval = -EPERM;
994 fcall = NULL; 1116 fcall = NULL;
1117 inode = NULL;
1118 vfid = NULL;
1119 v9ses = v9fs_inode2v9ses(dir);
1120 dfid = v9fs_fid_lookup(dentry->d_parent);
1121 perm = unixmode2p9mode(v9ses, mode);
995 1122
996 if (!v9ses->extended) { 1123 if (!v9ses->extended) {
997 dprintk(DEBUG_ERROR, "not extended\n"); 1124 dprintk(DEBUG_ERROR, "not extended\n");
998 goto free_mem; 1125 return -EPERM;
999 } 1126 }
1000 1127
1001 /* issue a create */ 1128 err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
1002 retval = v9fs_create(dir, dentry, mode, 0); 1129 perm, V9FS_OREAD, &fid, NULL, NULL);
1003 if (retval != 0)
1004 goto free_mem;
1005 1130
1006 fid = v9fs_fid_get_created(dentry); 1131 if (err)
1007 if (!fid) { 1132 goto error;
1008 dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); 1133
1009 goto free_mem; 1134 err = v9fs_t_clunk(v9ses, fid);
1135 if (err)
1136 goto error;
1137
1138 vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
1139 if (IS_ERR(vfid)) {
1140 err = PTR_ERR(vfid);
1141 vfid = NULL;
1142 goto error;
1143 }
1144
1145 inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
1146 if (IS_ERR(inode)) {
1147 err = PTR_ERR(inode);
1148 inode = NULL;
1149 goto error;
1010 } 1150 }
1011 1151
1012 /* issue a Twstat */ 1152 /* issue a Twstat */
1013 v9fs_blank_wstat(&wstat); 1153 v9fs_blank_wstat(&wstat);
1014 wstat.muid = v9ses->name; 1154 wstat.muid = v9ses->name;
1015 wstat.extension = (char *) extension; 1155 wstat.extension = (char *) extension;
1016 retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); 1156 err = v9fs_t_wstat(v9ses, vfid->fid, &wstat, &fcall);
1017 if (retval < 0) {
1018 PRINT_FCALL_ERROR("wstat error", fcall);
1019 goto free_mem;
1020 }
1021
1022 err = v9fs_t_clunk(v9ses, fid->fid);
1023 if (err < 0) { 1157 if (err < 0) {
1024 dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); 1158 PRINT_FCALL_ERROR("wstat error", fcall);
1025 goto free_mem; 1159 goto error;
1026 } 1160 }
1027 1161
1028 d_drop(dentry); /* FID - will this also clunk? */ 1162 kfree(fcall);
1163 dentry->d_op = &v9fs_dentry_operations;
1164 d_instantiate(dentry, inode);
1165 return 0;
1029 1166
1030free_mem: 1167error:
1031 kfree(fcall); 1168 kfree(fcall);
1032 return retval; 1169 if (vfid)
1170 v9fs_fid_destroy(vfid);
1171
1172 if (inode)
1173 iput(inode);
1174
1175 return err;
1176
1033} 1177}
1034 1178
1035/** 1179/**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2c4fa75be025..d05318fa684e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -146,7 +146,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
146 inode->i_gid = gid; 146 inode->i_gid = gid;
147 147
148 root = d_alloc_root(inode); 148 root = d_alloc_root(inode);
149
150 if (!root) { 149 if (!root) {
151 retval = -ENOMEM; 150 retval = -ENOMEM;
152 goto put_back_sb; 151 goto put_back_sb;
@@ -158,15 +157,20 @@ static struct super_block *v9fs_get_sb(struct file_system_type
158 if (stat_result < 0) { 157 if (stat_result < 0) {
159 dprintk(DEBUG_ERROR, "stat error\n"); 158 dprintk(DEBUG_ERROR, "stat error\n");
160 v9fs_t_clunk(v9ses, newfid); 159 v9fs_t_clunk(v9ses, newfid);
161 v9fs_put_idpool(newfid, &v9ses->fidpool);
162 } else { 160 } else {
163 /* Setup the Root Inode */ 161 /* Setup the Root Inode */
164 root_fid = v9fs_fid_create(root, v9ses, newfid, 0); 162 root_fid = v9fs_fid_create(v9ses, newfid);
165 if (root_fid == NULL) { 163 if (root_fid == NULL) {
166 retval = -ENOMEM; 164 retval = -ENOMEM;
167 goto put_back_sb; 165 goto put_back_sb;
168 } 166 }
169 167
168 retval = v9fs_fid_insert(root_fid, root);
169 if (retval < 0) {
170 kfree(fcall);
171 goto put_back_sb;
172 }
173
170 root_fid->qid = fcall->params.rstat.stat.qid; 174 root_fid->qid = fcall->params.rstat.stat.qid;
171 root->d_inode->i_ino = 175 root->d_inode->i_ino =
172 v9fs_qid2ino(&fcall->params.rstat.stat.qid); 176 v9fs_qid2ino(&fcall->params.rstat.stat.qid);
diff --git a/fs/Kconfig b/fs/Kconfig
index ef78e3a42d32..e9749b0eecd8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -883,8 +883,6 @@ config CONFIGFS_FS
883 Both sysfs and configfs can and should exist together on the 883 Both sysfs and configfs can and should exist together on the
884 same system. One is not a replacement for the other. 884 same system. One is not a replacement for the other.
885 885
886 If unsure, say N.
887
888endmenu 886endmenu
889 887
890menu "Miscellaneous filesystems" 888menu "Miscellaneous filesystems"
@@ -1327,7 +1325,7 @@ config UFS_FS
1327 1325
1328config UFS_FS_WRITE 1326config UFS_FS_WRITE
1329 bool "UFS file system write support (DANGEROUS)" 1327 bool "UFS file system write support (DANGEROUS)"
1330 depends on UFS_FS && EXPERIMENTAL 1328 depends on UFS_FS && EXPERIMENTAL && BROKEN
1331 help 1329 help
1332 Say Y here if you want to try writing to UFS partitions. This is 1330 Say Y here if you want to try writing to UFS partitions. This is
1333 experimental, so you should back up your UFS partitions beforehand. 1331 experimental, so you should back up your UFS partitions beforehand.
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b117a441298..c2eac2a50bd2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -938,6 +938,11 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
938 kfree(elf_interpreter); 938 kfree(elf_interpreter);
939 } else { 939 } else {
940 elf_entry = loc->elf_ex.e_entry; 940 elf_entry = loc->elf_ex.e_entry;
941 if (BAD_ADDR(elf_entry)) {
942 send_sig(SIGSEGV, current, 0);
943 retval = -ENOEXEC; /* Nobody gets to see this, but.. */
944 goto out_free_dentry;
945 }
941 } 946 }
942 947
943 kfree(elf_phdata); 948 kfree(elf_phdata);
diff --git a/fs/bio.c b/fs/bio.c
index bbc442b8c867..1f3bb501c262 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -411,6 +411,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
411 411
412/** 412/**
413 * bio_add_pc_page - attempt to add page to bio 413 * bio_add_pc_page - attempt to add page to bio
414 * @q: the target queue
414 * @bio: destination bio 415 * @bio: destination bio
415 * @page: page to add 416 * @page: page to add
416 * @len: vec entry length 417 * @len: vec entry length
diff --git a/fs/buffer.c b/fs/buffer.c
index 3dc712f29d2d..62cfd17dc5fe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,6 +1022,7 @@ try_again:
1022 1022
1023 bh->b_state = 0; 1023 bh->b_state = 0;
1024 atomic_set(&bh->b_count, 0); 1024 atomic_set(&bh->b_count, 0);
1025 bh->b_private = NULL;
1025 bh->b_size = size; 1026 bh->b_size = size;
1026 1027
1027 /* Link the buffer to its page */ 1028 /* Link the buffer to its page */
@@ -2866,22 +2867,22 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2866 else if (test_set_buffer_locked(bh)) 2867 else if (test_set_buffer_locked(bh))
2867 continue; 2868 continue;
2868 2869
2869 get_bh(bh);
2870 if (rw == WRITE || rw == SWRITE) { 2870 if (rw == WRITE || rw == SWRITE) {
2871 if (test_clear_buffer_dirty(bh)) { 2871 if (test_clear_buffer_dirty(bh)) {
2872 bh->b_end_io = end_buffer_write_sync; 2872 bh->b_end_io = end_buffer_write_sync;
2873 get_bh(bh);
2873 submit_bh(WRITE, bh); 2874 submit_bh(WRITE, bh);
2874 continue; 2875 continue;
2875 } 2876 }
2876 } else { 2877 } else {
2877 if (!buffer_uptodate(bh)) { 2878 if (!buffer_uptodate(bh)) {
2878 bh->b_end_io = end_buffer_read_sync; 2879 bh->b_end_io = end_buffer_read_sync;
2880 get_bh(bh);
2879 submit_bh(rw, bh); 2881 submit_bh(rw, bh);
2880 continue; 2882 continue;
2881 } 2883 }
2882 } 2884 }
2883 unlock_buffer(bh); 2885 unlock_buffer(bh);
2884 put_bh(bh);
2885 } 2886 }
2886} 2887}
2887 2888
@@ -3050,6 +3051,66 @@ asmlinkage long sys_bdflush(int func, long data)
3050} 3051}
3051 3052
3052/* 3053/*
3054 * Migration function for pages with buffers. This function can only be used
3055 * if the underlying filesystem guarantees that no other references to "page"
3056 * exist.
3057 */
3058#ifdef CONFIG_MIGRATION
3059int buffer_migrate_page(struct page *newpage, struct page *page)
3060{
3061 struct address_space *mapping = page->mapping;
3062 struct buffer_head *bh, *head;
3063
3064 if (!mapping)
3065 return -EAGAIN;
3066
3067 if (!page_has_buffers(page))
3068 return migrate_page(newpage, page);
3069
3070 head = page_buffers(page);
3071
3072 if (migrate_page_remove_references(newpage, page, 3))
3073 return -EAGAIN;
3074
3075 bh = head;
3076 do {
3077 get_bh(bh);
3078 lock_buffer(bh);
3079 bh = bh->b_this_page;
3080
3081 } while (bh != head);
3082
3083 ClearPagePrivate(page);
3084 set_page_private(newpage, page_private(page));
3085 set_page_private(page, 0);
3086 put_page(page);
3087 get_page(newpage);
3088
3089 bh = head;
3090 do {
3091 set_bh_page(bh, newpage, bh_offset(bh));
3092 bh = bh->b_this_page;
3093
3094 } while (bh != head);
3095
3096 SetPagePrivate(newpage);
3097
3098 migrate_page_copy(newpage, page);
3099
3100 bh = head;
3101 do {
3102 unlock_buffer(bh);
3103 put_bh(bh);
3104 bh = bh->b_this_page;
3105
3106 } while (bh != head);
3107
3108 return 0;
3109}
3110EXPORT_SYMBOL(buffer_migrate_page);
3111#endif
3112
3113/*
3053 * Buffer-head allocation 3114 * Buffer-head allocation
3054 */ 3115 */
3055static kmem_cache_t *bh_cachep; 3116static kmem_cache_t *bh_cachep;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3c03aadaff0c..7b25463d3c14 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -52,7 +52,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
52 int * /* type of buf returned */ , const int long_op); 52 int * /* type of buf returned */ , const int long_op);
53extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid); 53extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
54extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length); 54extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
55extern int is_valid_oplock_break(struct smb_hdr *smb); 55extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
56extern int is_size_safe_to_change(struct cifsInodeInfo *); 56extern int is_size_safe_to_change(struct cifsInodeInfo *);
57extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); 57extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
58extern unsigned int smbCalcSize(struct smb_hdr *ptr); 58extern unsigned int smbCalcSize(struct smb_hdr *ptr);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 217323b0c896..b41e8b379652 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1048,13 +1048,14 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
1048 cifs_small_buf_release(iov[0].iov_base); 1048 cifs_small_buf_release(iov[0].iov_base);
1049 else if(resp_buf_type == CIFS_LARGE_BUFFER) 1049 else if(resp_buf_type == CIFS_LARGE_BUFFER)
1050 cifs_buf_release(iov[0].iov_base); 1050 cifs_buf_release(iov[0].iov_base);
1051 } else /* return buffer to caller to free */ /* BB FIXME how do we tell caller if it is not a large buffer */ { 1051 } else if(resp_buf_type != CIFS_NO_BUFFER) {
1052 *buf = iov[0].iov_base; 1052 /* return buffer to caller to free */
1053 *buf = iov[0].iov_base;
1053 if(resp_buf_type == CIFS_SMALL_BUFFER) 1054 if(resp_buf_type == CIFS_SMALL_BUFFER)
1054 *pbuf_type = CIFS_SMALL_BUFFER; 1055 *pbuf_type = CIFS_SMALL_BUFFER;
1055 else if(resp_buf_type == CIFS_LARGE_BUFFER) 1056 else if(resp_buf_type == CIFS_LARGE_BUFFER)
1056 *pbuf_type = CIFS_LARGE_BUFFER; 1057 *pbuf_type = CIFS_LARGE_BUFFER;
1057 } 1058 } /* else no valid buffer on return - leave as null */
1058 1059
1059 /* Note: On -EAGAIN error only caller can retry on handle based calls 1060 /* Note: On -EAGAIN error only caller can retry on handle based calls
1060 since file handle passed in no longer valid */ 1061 since file handle passed in no longer valid */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88f60aa52058..2a0c1f4ca0ae 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -630,7 +630,7 @@ multi_t2_fnd:
630 smallbuf = NULL; 630 smallbuf = NULL;
631 } 631 }
632 wake_up_process(task_to_wake); 632 wake_up_process(task_to_wake);
633 } else if ((is_valid_oplock_break(smb_buffer) == FALSE) 633 } else if ((is_valid_oplock_break(smb_buffer, server) == FALSE)
634 && (isMultiRsp == FALSE)) { 634 && (isMultiRsp == FALSE)) {
635 cERROR(1, ("No task to wake, unknown frame rcvd!")); 635 cERROR(1, ("No task to wake, unknown frame rcvd!"));
636 cifs_dump_mem("Received Data is: ",(char *)smb_buffer, 636 cifs_dump_mem("Received Data is: ",(char *)smb_buffer,
@@ -1785,11 +1785,20 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1785 } else if(volume_info.wsize) 1785 } else if(volume_info.wsize)
1786 cifs_sb->wsize = volume_info.wsize; 1786 cifs_sb->wsize = volume_info.wsize;
1787 else 1787 else
1788 cifs_sb->wsize = CIFSMaxBufSize; /* default */ 1788 cifs_sb->wsize =
1789 if(cifs_sb->rsize < PAGE_CACHE_SIZE) { 1789 min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE,
1790 cifs_sb->rsize = PAGE_CACHE_SIZE; 1790 127*1024);
1791 /* Windows ME does this */ 1791 /* old default of CIFSMaxBufSize was too small now
1792 cFYI(1,("Attempt to set readsize for mount to less than one page (4096)")); 1792 that SMB Write2 can send multiple pages in kvec.
1793 RFC1001 does not describe what happens when frame
1794 bigger than 128K is sent so use that as max in
1795 conjunction with 52K kvec constraint on arch with 4K
1796 page size */
1797
1798 if(cifs_sb->rsize < 2048) {
1799 cifs_sb->rsize = 2048;
1800 /* Windows ME may prefer this */
1801 cFYI(1,("readsize set to minimum 2048"));
1793 } 1802 }
1794 cifs_sb->mnt_uid = volume_info.linux_uid; 1803 cifs_sb->mnt_uid = volume_info.linux_uid;
1795 cifs_sb->mnt_gid = volume_info.linux_gid; 1804 cifs_sb->mnt_gid = volume_info.linux_gid;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 77c990f0cb98..675bd2568297 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1190,7 +1190,6 @@ retry:
1190 /* BB what if continued retry is 1190 /* BB what if continued retry is
1191 requested via mount flags? */ 1191 requested via mount flags? */
1192 set_bit(AS_EIO, &mapping->flags); 1192 set_bit(AS_EIO, &mapping->flags);
1193 SetPageError(page);
1194 } else { 1193 } else {
1195 cifs_stats_bytes_written(cifs_sb->tcon, 1194 cifs_stats_bytes_written(cifs_sb->tcon,
1196 bytes_written); 1195 bytes_written);
@@ -1198,6 +1197,13 @@ retry:
1198 } 1197 }
1199 for (i = 0; i < n_iov; i++) { 1198 for (i = 0; i < n_iov; i++) {
1200 page = pvec.pages[first + i]; 1199 page = pvec.pages[first + i];
1200 /* Should we also set page error on
1201 success rc but too little data written? */
1202 /* BB investigate retry logic on temporary
1203 server crash cases and how recovery works
1204 when page marked as error */
1205 if(rc)
1206 SetPageError(page);
1201 kunmap(page); 1207 kunmap(page);
1202 unlock_page(page); 1208 unlock_page(page);
1203 page_cache_release(page); 1209 page_cache_release(page);
@@ -1436,13 +1442,15 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1436 &bytes_read, &smb_read_data, 1442 &bytes_read, &smb_read_data,
1437 &buf_type); 1443 &buf_type);
1438 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 1444 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
1439 if (copy_to_user(current_offset,
1440 smb_read_data + 4 /* RFC1001 hdr */
1441 + le16_to_cpu(pSMBr->DataOffset),
1442 bytes_read)) {
1443 rc = -EFAULT;
1444 }
1445 if (smb_read_data) { 1445 if (smb_read_data) {
1446 if (copy_to_user(current_offset,
1447 smb_read_data +
1448 4 /* RFC1001 length field */ +
1449 le16_to_cpu(pSMBr->DataOffset),
1450 bytes_read)) {
1451 rc = -EFAULT;
1452 }
1453
1446 if(buf_type == CIFS_SMALL_BUFFER) 1454 if(buf_type == CIFS_SMALL_BUFFER)
1447 cifs_small_buf_release(smb_read_data); 1455 cifs_small_buf_release(smb_read_data);
1448 else if(buf_type == CIFS_LARGE_BUFFER) 1456 else if(buf_type == CIFS_LARGE_BUFFER)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 812c6bb0fe38..432ba15e2c2d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -475,7 +475,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
475 return 0; 475 return 0;
476} 476}
477int 477int
478is_valid_oplock_break(struct smb_hdr *buf) 478is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
479{ 479{
480 struct smb_com_lock_req * pSMB = (struct smb_com_lock_req *)buf; 480 struct smb_com_lock_req * pSMB = (struct smb_com_lock_req *)buf;
481 struct list_head *tmp; 481 struct list_head *tmp;
@@ -535,7 +535,7 @@ is_valid_oplock_break(struct smb_hdr *buf)
535 read_lock(&GlobalSMBSeslock); 535 read_lock(&GlobalSMBSeslock);
536 list_for_each(tmp, &GlobalTreeConnectionList) { 536 list_for_each(tmp, &GlobalTreeConnectionList) {
537 tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); 537 tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
538 if (tcon->tid == buf->Tid) { 538 if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) {
539 cifs_stats_inc(&tcon->num_oplock_brks); 539 cifs_stats_inc(&tcon->num_oplock_brks);
540 list_for_each(tmp1,&tcon->openFileList){ 540 list_for_each(tmp1,&tcon->openFileList){
541 netfile = list_entry(tmp1,struct cifsFileInfo, 541 netfile = list_entry(tmp1,struct cifsFileInfo,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7b98792150ea..b12cb8a7da7c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -498,7 +498,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
498 else 498 else
499 *pRespBufType = CIFS_SMALL_BUFFER; 499 *pRespBufType = CIFS_SMALL_BUFFER;
500 iov[0].iov_len = receive_len + 4; 500 iov[0].iov_len = receive_len + 4;
501 iov[1].iov_len = 0;
502 501
503 dump_smb(midQ->resp_buf, 80); 502 dump_smb(midQ->resp_buf, 80);
504 /* convert the length into a more usable form */ 503 /* convert the length into a more usable form */
diff --git a/fs/compat.c b/fs/compat.c
index ff0bafcff720..5333c7d7427f 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -73,17 +73,17 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
73 return do_utimes(AT_FDCWD, filename, t ? tv : NULL); 73 return do_utimes(AT_FDCWD, filename, t ? tv : NULL);
74} 74}
75 75
76asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t) 76asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
77{ 77{
78 struct timeval tv[2]; 78 struct timeval tv[2];
79 79
80 if (t) { 80 if (t) {
81 if (get_user(tv[0].tv_sec, &t[0].tv_sec) || 81 if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
82 get_user(tv[0].tv_usec, &t[0].tv_usec) || 82 get_user(tv[0].tv_usec, &t[0].tv_usec) ||
83 get_user(tv[1].tv_sec, &t[1].tv_sec) || 83 get_user(tv[1].tv_sec, &t[1].tv_sec) ||
84 get_user(tv[1].tv_usec, &t[1].tv_usec)) 84 get_user(tv[1].tv_usec, &t[1].tv_usec))
85 return -EFAULT; 85 return -EFAULT;
86 } 86 }
87 return do_utimes(dfd, filename, t ? tv : NULL); 87 return do_utimes(dfd, filename, t ? tv : NULL);
88} 88}
89 89
@@ -114,7 +114,7 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
114 return error; 114 return error;
115} 115}
116 116
117asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename, 117asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
118 struct compat_stat __user *statbuf, int flag) 118 struct compat_stat __user *statbuf, int flag)
119{ 119{
120 struct kstat stat; 120 struct kstat stat;
@@ -1326,7 +1326,7 @@ compat_sys_open(const char __user *filename, int flags, int mode)
1326 * O_LARGEFILE flag. 1326 * O_LARGEFILE flag.
1327 */ 1327 */
1328asmlinkage long 1328asmlinkage long
1329compat_sys_openat(int dfd, const char __user *filename, int flags, int mode) 1329compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode)
1330{ 1330{
1331 return do_sys_open(dfd, filename, flags, mode); 1331 return do_sys_open(dfd, filename, flags, mode);
1332} 1332}
@@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1751 ret = compat_core_sys_select(n, inp, outp, exp, &timeout); 1751 ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
1752 1752
1753 if (tvp) { 1753 if (tvp) {
1754 struct compat_timeval rtv;
1755
1754 if (current->personality & STICKY_TIMEOUTS) 1756 if (current->personality & STICKY_TIMEOUTS)
1755 goto sticky; 1757 goto sticky;
1756 tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); 1758 rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
1757 tv.tv_sec = timeout; 1759 rtv.tv_sec = timeout;
1758 if (copy_to_user(tvp, &tv, sizeof(tv))) { 1760 if (compat_timeval_compare(&rtv, &tv) >= 0)
1761 rtv = tv;
1762 if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
1759sticky: 1763sticky:
1760 /* 1764 /*
1761 * If an application puts its timeval in read-only 1765 * If an application puts its timeval in read-only
@@ -1781,7 +1785,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
1781{ 1785{
1782 compat_sigset_t ss32; 1786 compat_sigset_t ss32;
1783 sigset_t ksigmask, sigsaved; 1787 sigset_t ksigmask, sigsaved;
1784 long timeout = MAX_SCHEDULE_TIMEOUT; 1788 s64 timeout = MAX_SCHEDULE_TIMEOUT;
1785 struct compat_timespec ts; 1789 struct compat_timespec ts;
1786 int ret; 1790 int ret;
1787 1791
@@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
1822 } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); 1826 } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
1823 1827
1824 if (tsp && !(current->personality & STICKY_TIMEOUTS)) { 1828 if (tsp && !(current->personality & STICKY_TIMEOUTS)) {
1825 ts.tv_sec += timeout / HZ; 1829 struct compat_timespec rts;
1826 ts.tv_nsec += (timeout % HZ) * (1000000000/HZ); 1830
1827 if (ts.tv_nsec >= 1000000000) { 1831 rts.tv_sec = timeout / HZ;
1828 ts.tv_sec++; 1832 rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
1829 ts.tv_nsec -= 1000000000; 1833 if (rts.tv_nsec >= NSEC_PER_SEC) {
1834 rts.tv_sec++;
1835 rts.tv_nsec -= NSEC_PER_SEC;
1830 } 1836 }
1831 (void)copy_to_user(tsp, &ts, sizeof(ts)); 1837 if (compat_timespec_compare(&rts, &ts) >= 0)
1838 rts = ts;
1839 copy_to_user(tsp, &rts, sizeof(rts));
1832 } 1840 }
1833 1841
1834 if (ret == -ERESTARTNOHAND) { 1842 if (ret == -ERESTARTNOHAND) {
@@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1918 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1926 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1919 1927
1920 if (tsp && timeout >= 0) { 1928 if (tsp && timeout >= 0) {
1929 struct compat_timespec rts;
1930
1921 if (current->personality & STICKY_TIMEOUTS) 1931 if (current->personality & STICKY_TIMEOUTS)
1922 goto sticky; 1932 goto sticky;
1923 /* Yes, we know it's actually an s64, but it's also positive. */ 1933 /* Yes, we know it's actually an s64, but it's also positive. */
1924 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; 1934 rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
1925 ts.tv_sec = timeout; 1935 1000;
1926 if (copy_to_user(tsp, &ts, sizeof(ts))) { 1936 rts.tv_sec = timeout;
1937 if (compat_timespec_compare(&rts, &ts) >= 0)
1938 rts = ts;
1939 if (copy_to_user(tsp, &rts, sizeof(rts))) {
1927sticky: 1940sticky:
1928 /* 1941 /*
1929 * If an application puts its timeval in read-only 1942 * If an application puts its timeval in read-only
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5dd0207ffd46..c666769a875d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -446,7 +446,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
446 ifr = ifc.ifc_req; 446 ifr = ifc.ifc_req;
447 ifr32 = compat_ptr(ifc32.ifcbuf); 447 ifr32 = compat_ptr(ifc32.ifcbuf);
448 for (i = 0, j = 0; 448 for (i = 0, j = 0;
449 i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len; 449 i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
450 i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) { 450 i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
451 if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32))) 451 if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
452 return -EFAULT; 452 return -EFAULT;
@@ -931,8 +931,8 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
931static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 931static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
932{ 932{
933 int err, i; 933 int err, i;
934 sg_req_info_t *r; 934 sg_req_info_t __user *r;
935 struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg; 935 struct compat_sg_req_info __user *o = (void __user *)arg;
936 r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); 936 r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
937 err = sys_ioctl(fd,cmd,(unsigned long)r); 937 err = sys_ioctl(fd,cmd,(unsigned long)r);
938 if (err < 0) 938 if (err < 0)
@@ -2531,18 +2531,9 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
2531 val32 = kval; 2531 val32 = kval;
2532 return put_user(val32, (unsigned int __user *)arg); 2532 return put_user(val32, (unsigned int __user *)arg);
2533 case RTC_IRQP_SET32: 2533 case RTC_IRQP_SET32:
2534 return sys_ioctl(fd, RTC_IRQP_SET, arg);
2534 case RTC_EPOCH_SET32: 2535 case RTC_EPOCH_SET32:
2535 ret = get_user(val32, (unsigned int __user *)arg); 2536 return sys_ioctl(fd, RTC_EPOCH_SET, arg);
2536 if (ret)
2537 return ret;
2538 kval = val32;
2539
2540 set_fs(KERNEL_DS);
2541 ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ?
2542 RTC_IRQP_SET : RTC_EPOCH_SET,
2543 (unsigned long)&kval);
2544 set_fs(oldfs);
2545 return ret;
2546 default: 2537 default:
2547 /* unreached */ 2538 /* unreached */
2548 return -ENOIOCTLCMD; 2539 return -ENOIOCTLCMD;
@@ -2739,8 +2730,8 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon
2739static int 2730static int
2740lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 2731lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
2741{ 2732{
2742 struct compat_timeval *tc = (struct compat_timeval *)arg; 2733 struct compat_timeval __user *tc = (struct compat_timeval __user *)arg;
2743 struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval)); 2734 struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval));
2744 struct timeval ts; 2735 struct timeval ts;
2745 if (get_user(ts.tv_sec, &tc->tv_sec) || 2736 if (get_user(ts.tv_sec, &tc->tv_sec) ||
2746 get_user(ts.tv_usec, &tc->tv_usec) || 2737 get_user(ts.tv_usec, &tc->tv_usec) ||
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 8899d9c5f6bf..f70e46951b37 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -36,6 +36,7 @@ struct configfs_dirent {
36 int s_type; 36 int s_type;
37 umode_t s_mode; 37 umode_t s_mode;
38 struct dentry * s_dentry; 38 struct dentry * s_dentry;
39 struct iattr * s_iattr;
39}; 40};
40 41
41#define CONFIGFS_ROOT 0x0001 42#define CONFIGFS_ROOT 0x0001
@@ -48,10 +49,11 @@ struct configfs_dirent {
48#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) 49#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
49 50
50extern struct vfsmount * configfs_mount; 51extern struct vfsmount * configfs_mount;
52extern kmem_cache_t *configfs_dir_cachep;
51 53
52extern int configfs_is_root(struct config_item *item); 54extern int configfs_is_root(struct config_item *item);
53 55
54extern struct inode * configfs_new_inode(mode_t mode); 56extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
55extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); 57extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
56 58
57extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); 59extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
@@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
63 65
64extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); 66extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
65extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); 67extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
68extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
66 69
67extern int configfs_pin_fs(void); 70extern int configfs_pin_fs(void);
68extern void configfs_release_fs(void); 71extern void configfs_release_fs(void);
@@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
120 123
121static inline void release_configfs_dirent(struct configfs_dirent * sd) 124static inline void release_configfs_dirent(struct configfs_dirent * sd)
122{ 125{
123 if (!(sd->s_type & CONFIGFS_ROOT)) 126 if (!(sd->s_type & CONFIGFS_ROOT)) {
124 kfree(sd); 127 kfree(sd->s_iattr);
128 kmem_cache_free(configfs_dir_cachep, sd);
129 }
125} 130}
126 131
127static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) 132static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b668ec61527e..ca60e3abef45 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
72{ 72{
73 struct configfs_dirent * sd; 73 struct configfs_dirent * sd;
74 74
75 sd = kmalloc(sizeof(*sd), GFP_KERNEL); 75 sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL);
76 if (!sd) 76 if (!sd)
77 return NULL; 77 return NULL;
78 78
@@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p,
136 int error; 136 int error;
137 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 137 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
138 138
139 error = configfs_create(d, mode, init_dir); 139 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
140 CONFIGFS_DIR);
140 if (!error) { 141 if (!error) {
141 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 142 error = configfs_create(d, mode, init_dir);
142 CONFIGFS_DIR);
143 if (!error) { 143 if (!error) {
144 p->d_inode->i_nlink++; 144 p->d_inode->i_nlink++;
145 (d)->d_op = &configfs_dentry_ops; 145 (d)->d_op = &configfs_dentry_ops;
146 } else {
147 struct configfs_dirent *sd = d->d_fsdata;
148 if (sd) {
149 list_del_init(&sd->s_sibling);
150 configfs_put(sd);
151 }
146 } 152 }
147 } 153 }
148 return error; 154 return error;
@@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl,
182 int err = 0; 188 int err = 0;
183 umode_t mode = S_IFLNK | S_IRWXUGO; 189 umode_t mode = S_IFLNK | S_IRWXUGO;
184 190
185 err = configfs_create(dentry, mode, init_symlink); 191 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
192 CONFIGFS_ITEM_LINK);
186 if (!err) { 193 if (!err) {
187 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, 194 err = configfs_create(dentry, mode, init_symlink);
188 mode, CONFIGFS_ITEM_LINK);
189 if (!err) 195 if (!err)
190 dentry->d_op = &configfs_dentry_ops; 196 dentry->d_op = &configfs_dentry_ops;
197 else {
198 struct configfs_dirent *sd = dentry->d_fsdata;
199 if (sd) {
200 list_del_init(&sd->s_sibling);
201 configfs_put(sd);
202 }
203 }
191 } 204 }
192 return err; 205 return err;
193} 206}
@@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
241 struct configfs_attribute * attr = sd->s_element; 254 struct configfs_attribute * attr = sd->s_element;
242 int error; 255 int error;
243 256
257 dentry->d_fsdata = configfs_get(sd);
258 sd->s_dentry = dentry;
244 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); 259 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
245 if (error) 260 if (error) {
261 configfs_put(sd);
246 return error; 262 return error;
263 }
247 264
248 dentry->d_op = &configfs_dentry_ops; 265 dentry->d_op = &configfs_dentry_ops;
249 dentry->d_fsdata = configfs_get(sd);
250 sd->s_dentry = dentry;
251 d_rehash(dentry); 266 d_rehash(dentry);
252 267
253 return 0; 268 return 0;
@@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = {
839 .symlink = configfs_symlink, 854 .symlink = configfs_symlink,
840 .unlink = configfs_unlink, 855 .unlink = configfs_unlink,
841 .lookup = configfs_lookup, 856 .lookup = configfs_lookup,
857 .setattr = configfs_setattr,
842}; 858};
843 859
844#if 0 860#if 0
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c26cd61f13af..3921920d8716 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/dnotify.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <asm/uaccess.h> 30#include <asm/uaccess.h>
32#include <asm/semaphore.h> 31#include <asm/semaphore.h>
@@ -150,7 +149,7 @@ out:
150/** 149/**
151 * fill_write_buffer - copy buffer from userspace. 150 * fill_write_buffer - copy buffer from userspace.
152 * @buffer: data buffer for file. 151 * @buffer: data buffer for file.
153 * @userbuf: data from user. 152 * @buf: data from user.
154 * @count: number of bytes in @userbuf. 153 * @count: number of bytes in @userbuf.
155 * 154 *
156 * Allocate @buffer->page if it hasn't been already, then 155 * Allocate @buffer->page if it hasn't been already, then
@@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
177 176
178/** 177/**
179 * flush_write_buffer - push buffer to config_item. 178 * flush_write_buffer - push buffer to config_item.
180 * @file: file pointer. 179 * @dentry: dentry to the attribute
181 * @buffer: data buffer for file. 180 * @buffer: data buffer for file.
181 * @count: number of bytes
182 * 182 *
183 * Get the correct pointers for the config_item and the attribute we're 183 * Get the correct pointers for the config_item and the attribute we're
184 * dealing with, then call the store() method for the attribute, 184 * dealing with, then call the store() method for the attribute,
@@ -217,15 +217,16 @@ static ssize_t
217configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 217configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
218{ 218{
219 struct configfs_buffer * buffer = file->private_data; 219 struct configfs_buffer * buffer = file->private_data;
220 ssize_t len;
220 221
221 down(&buffer->sem); 222 down(&buffer->sem);
222 count = fill_write_buffer(buffer,buf,count); 223 len = fill_write_buffer(buffer, buf, count);
223 if (count > 0) 224 if (len > 0)
224 count = flush_write_buffer(file->f_dentry,buffer,count); 225 len = flush_write_buffer(file->f_dentry, buffer, count);
225 if (count > 0) 226 if (len > 0)
226 *ppos += count; 227 *ppos += len;
227 up(&buffer->sem); 228 up(&buffer->sem);
228 return count; 229 return len;
229} 230}
230 231
231static int check_perm(struct inode * inode, struct file * file) 232static int check_perm(struct inode * inode, struct file * file)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 6577c588de9d..c153bd9534cb 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/namei.h> 32#include <linux/namei.h>
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/capability.h>
34 35
35#include <linux/configfs.h> 36#include <linux/configfs.h>
36#include "configfs_internal.h" 37#include "configfs_internal.h"
@@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = {
48 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 49 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
49}; 50};
50 51
51struct inode * configfs_new_inode(mode_t mode) 52static struct inode_operations configfs_inode_operations ={
53 .setattr = configfs_setattr,
54};
55
56int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
57{
58 struct inode * inode = dentry->d_inode;
59 struct configfs_dirent * sd = dentry->d_fsdata;
60 struct iattr * sd_iattr;
61 unsigned int ia_valid = iattr->ia_valid;
62 int error;
63
64 if (!sd)
65 return -EINVAL;
66
67 sd_iattr = sd->s_iattr;
68
69 error = inode_change_ok(inode, iattr);
70 if (error)
71 return error;
72
73 error = inode_setattr(inode, iattr);
74 if (error)
75 return error;
76
77 if (!sd_iattr) {
78 /* setting attributes for the first time, allocate now */
79 sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL);
80 if (!sd_iattr)
81 return -ENOMEM;
82 /* assign default attributes */
83 memset(sd_iattr, 0, sizeof(struct iattr));
84 sd_iattr->ia_mode = sd->s_mode;
85 sd_iattr->ia_uid = 0;
86 sd_iattr->ia_gid = 0;
87 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
88 sd->s_iattr = sd_iattr;
89 }
90
91 /* attributes were changed atleast once in past */
92
93 if (ia_valid & ATTR_UID)
94 sd_iattr->ia_uid = iattr->ia_uid;
95 if (ia_valid & ATTR_GID)
96 sd_iattr->ia_gid = iattr->ia_gid;
97 if (ia_valid & ATTR_ATIME)
98 sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
99 inode->i_sb->s_time_gran);
100 if (ia_valid & ATTR_MTIME)
101 sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
102 inode->i_sb->s_time_gran);
103 if (ia_valid & ATTR_CTIME)
104 sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
105 inode->i_sb->s_time_gran);
106 if (ia_valid & ATTR_MODE) {
107 umode_t mode = iattr->ia_mode;
108
109 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
110 mode &= ~S_ISGID;
111 sd_iattr->ia_mode = sd->s_mode = mode;
112 }
113
114 return error;
115}
116
117static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
118{
119 inode->i_mode = mode;
120 inode->i_uid = 0;
121 inode->i_gid = 0;
122 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
123}
124
125static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
126{
127 inode->i_mode = iattr->ia_mode;
128 inode->i_uid = iattr->ia_uid;
129 inode->i_gid = iattr->ia_gid;
130 inode->i_atime = iattr->ia_atime;
131 inode->i_mtime = iattr->ia_mtime;
132 inode->i_ctime = iattr->ia_ctime;
133}
134
135struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
52{ 136{
53 struct inode * inode = new_inode(configfs_sb); 137 struct inode * inode = new_inode(configfs_sb);
54 if (inode) { 138 if (inode) {
55 inode->i_mode = mode;
56 inode->i_uid = 0;
57 inode->i_gid = 0;
58 inode->i_blksize = PAGE_CACHE_SIZE; 139 inode->i_blksize = PAGE_CACHE_SIZE;
59 inode->i_blocks = 0; 140 inode->i_blocks = 0;
60 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
61 inode->i_mapping->a_ops = &configfs_aops; 141 inode->i_mapping->a_ops = &configfs_aops;
62 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 142 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
143 inode->i_op = &configfs_inode_operations;
144
145 if (sd->s_iattr) {
146 /* sysfs_dirent has non-default attributes
147 * get them for the new inode from persistent copy
148 * in sysfs_dirent
149 */
150 set_inode_attr(inode, sd->s_iattr);
151 } else
152 set_default_inode_attr(inode, mode);
63 } 153 }
64 return inode; 154 return inode;
65} 155}
@@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
70 struct inode * inode = NULL; 160 struct inode * inode = NULL;
71 if (dentry) { 161 if (dentry) {
72 if (!dentry->d_inode) { 162 if (!dentry->d_inode) {
73 if ((inode = configfs_new_inode(mode))) { 163 struct configfs_dirent *sd = dentry->d_fsdata;
164 if ((inode = configfs_new_inode(mode, sd))) {
74 if (dentry->d_parent && dentry->d_parent->d_inode) { 165 if (dentry->d_parent && dentry->d_parent->d_inode) {
75 struct inode *p_inode = dentry->d_parent->d_inode; 166 struct inode *p_inode = dentry->d_parent->d_inode;
76 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 167 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
@@ -103,10 +194,9 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
103 */ 194 */
104const unsigned char * configfs_get_name(struct configfs_dirent *sd) 195const unsigned char * configfs_get_name(struct configfs_dirent *sd)
105{ 196{
106 struct attribute * attr; 197 struct configfs_attribute *attr;
107 198
108 if (!sd || !sd->s_element) 199 BUG_ON(!sd || !sd->s_element);
109 BUG();
110 200
111 /* These always have a dentry, so use that */ 201 /* These always have a dentry, so use that */
112 if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) 202 if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
@@ -114,7 +204,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
114 204
115 if (sd->s_type & CONFIGFS_ITEM_ATTR) { 205 if (sd->s_type & CONFIGFS_ITEM_ATTR) {
116 attr = sd->s_element; 206 attr = sd->s_element;
117 return attr->name; 207 return attr->ca_name;
118 } 208 }
119 return NULL; 209 return NULL;
120} 210}
@@ -130,13 +220,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
130 220
131 if (dentry) { 221 if (dentry) {
132 spin_lock(&dcache_lock); 222 spin_lock(&dcache_lock);
223 spin_lock(&dentry->d_lock);
133 if (!(d_unhashed(dentry) && dentry->d_inode)) { 224 if (!(d_unhashed(dentry) && dentry->d_inode)) {
134 dget_locked(dentry); 225 dget_locked(dentry);
135 __d_drop(dentry); 226 __d_drop(dentry);
227 spin_unlock(&dentry->d_lock);
136 spin_unlock(&dcache_lock); 228 spin_unlock(&dcache_lock);
137 simple_unlink(parent->d_inode, dentry); 229 simple_unlink(parent->d_inode, dentry);
138 } else 230 } else {
231 spin_unlock(&dentry->d_lock);
139 spin_unlock(&dcache_lock); 232 spin_unlock(&dcache_lock);
233 }
140 } 234 }
141} 235}
142 236
@@ -145,6 +239,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
145 struct configfs_dirent * sd; 239 struct configfs_dirent * sd;
146 struct configfs_dirent * parent_sd = dir->d_fsdata; 240 struct configfs_dirent * parent_sd = dir->d_fsdata;
147 241
242 if (dir->d_inode == NULL)
243 /* no inode means this hasn't been made visible yet */
244 return;
245
148 mutex_lock(&dir->d_inode->i_mutex); 246 mutex_lock(&dir->d_inode->i_mutex);
149 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 247 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
150 if (!sd->s_element) 248 if (!sd->s_element)
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 1a2f6f6a4d91..f920d30478e5 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -38,6 +38,7 @@
38 38
39struct vfsmount * configfs_mount = NULL; 39struct vfsmount * configfs_mount = NULL;
40struct super_block * configfs_sb = NULL; 40struct super_block * configfs_sb = NULL;
41kmem_cache_t *configfs_dir_cachep;
41static int configfs_mnt_count = 0; 42static int configfs_mnt_count = 0;
42 43
43static struct super_operations configfs_ops = { 44static struct super_operations configfs_ops = {
@@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = {
62 .s_children = LIST_HEAD_INIT(configfs_root.s_children), 63 .s_children = LIST_HEAD_INIT(configfs_root.s_children),
63 .s_element = &configfs_root_group.cg_item, 64 .s_element = &configfs_root_group.cg_item,
64 .s_type = CONFIGFS_ROOT, 65 .s_type = CONFIGFS_ROOT,
66 .s_iattr = NULL,
65}; 67};
66 68
67static int configfs_fill_super(struct super_block *sb, void *data, int silent) 69static int configfs_fill_super(struct super_block *sb, void *data, int silent)
@@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
73 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 75 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
74 sb->s_magic = CONFIGFS_MAGIC; 76 sb->s_magic = CONFIGFS_MAGIC;
75 sb->s_op = &configfs_ops; 77 sb->s_op = &configfs_ops;
78 sb->s_time_gran = 1;
76 configfs_sb = sb; 79 configfs_sb = sb;
77 80
78 inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); 81 inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
82 &configfs_root);
79 if (inode) { 83 if (inode) {
80 inode->i_op = &configfs_dir_inode_operations; 84 inode->i_op = &configfs_dir_inode_operations;
81 inode->i_fop = &configfs_dir_operations; 85 inode->i_fop = &configfs_dir_operations;
@@ -128,19 +132,31 @@ static decl_subsys(config, NULL, NULL);
128 132
129static int __init configfs_init(void) 133static int __init configfs_init(void)
130{ 134{
131 int err; 135 int err = -ENOMEM;
136
137 configfs_dir_cachep = kmem_cache_create("configfs_dir_cache",
138 sizeof(struct configfs_dirent),
139 0, 0, NULL, NULL);
140 if (!configfs_dir_cachep)
141 goto out;
132 142
133 kset_set_kset_s(&config_subsys, kernel_subsys); 143 kset_set_kset_s(&config_subsys, kernel_subsys);
134 err = subsystem_register(&config_subsys); 144 err = subsystem_register(&config_subsys);
135 if (err) 145 if (err) {
136 return err; 146 kmem_cache_destroy(configfs_dir_cachep);
147 configfs_dir_cachep = NULL;
148 goto out;
149 }
137 150
138 err = register_filesystem(&configfs_fs_type); 151 err = register_filesystem(&configfs_fs_type);
139 if (err) { 152 if (err) {
140 printk(KERN_ERR "configfs: Unable to register filesystem!\n"); 153 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
141 subsystem_unregister(&config_subsys); 154 subsystem_unregister(&config_subsys);
155 kmem_cache_destroy(configfs_dir_cachep);
156 configfs_dir_cachep = NULL;
142 } 157 }
143 158
159out:
144 return err; 160 return err;
145} 161}
146 162
@@ -148,11 +164,13 @@ static void __exit configfs_exit(void)
148{ 164{
149 unregister_filesystem(&configfs_fs_type); 165 unregister_filesystem(&configfs_fs_type);
150 subsystem_unregister(&config_subsys); 166 subsystem_unregister(&config_subsys);
167 kmem_cache_destroy(configfs_dir_cachep);
168 configfs_dir_cachep = NULL;
151} 169}
152 170
153MODULE_AUTHOR("Oracle"); 171MODULE_AUTHOR("Oracle");
154MODULE_LICENSE("GPL"); 172MODULE_LICENSE("GPL");
155MODULE_VERSION("0.0.1"); 173MODULE_VERSION("0.0.2");
156MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); 174MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
157 175
158module_init(configfs_init); 176module_init(configfs_init);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 50f5840521a9..e5512e295cf2 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -162,8 +162,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
162 if (!(sd->s_type & CONFIGFS_ITEM_LINK)) 162 if (!(sd->s_type & CONFIGFS_ITEM_LINK))
163 goto out; 163 goto out;
164 164
165 if (dentry->d_parent == configfs_sb->s_root) 165 BUG_ON(dentry->d_parent == configfs_sb->s_root);
166 BUG();
167 166
168 sl = sd->s_element; 167 sl = sd->s_element;
169 168
@@ -277,5 +276,6 @@ struct inode_operations configfs_symlink_inode_operations = {
277 .follow_link = configfs_follow_link, 276 .follow_link = configfs_follow_link,
278 .readlink = generic_readlink, 277 .readlink = generic_readlink,
279 .put_link = configfs_put_link, 278 .put_link = configfs_put_link,
279 .setattr = configfs_setattr,
280}; 280};
281 281
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 7fe85415ae7c..8ad52f5bf255 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -36,7 +36,7 @@ static DECLARE_MUTEX(read_mutex);
36 36
37/* These two macros may change in future, to provide better st_ino 37/* These two macros may change in future, to provide better st_ino
38 semantics. */ 38 semantics. */
39#define CRAMINO(x) ((x)->offset?(x)->offset<<2:1) 39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 40#define OFFSET(x) ((x)->i_ino)
41 41
42 42
@@ -66,8 +66,36 @@ static int cramfs_iget5_test(struct inode *inode, void *opaque)
66 66
67static int cramfs_iget5_set(struct inode *inode, void *opaque) 67static int cramfs_iget5_set(struct inode *inode, void *opaque)
68{ 68{
69 static struct timespec zerotime;
69 struct cramfs_inode *cramfs_inode = opaque; 70 struct cramfs_inode *cramfs_inode = opaque;
71 inode->i_mode = cramfs_inode->mode;
72 inode->i_uid = cramfs_inode->uid;
73 inode->i_size = cramfs_inode->size;
74 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
75 inode->i_blksize = PAGE_CACHE_SIZE;
76 inode->i_gid = cramfs_inode->gid;
77 /* Struct copy intentional */
78 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
70 inode->i_ino = CRAMINO(cramfs_inode); 79 inode->i_ino = CRAMINO(cramfs_inode);
80 /* inode->i_nlink is left 1 - arguably wrong for directories,
81 but it's the best we can do without reading the directory
82 contents. 1 yields the right result in GNU find, even
83 without -noleaf option. */
84 if (S_ISREG(inode->i_mode)) {
85 inode->i_fop = &generic_ro_fops;
86 inode->i_data.a_ops = &cramfs_aops;
87 } else if (S_ISDIR(inode->i_mode)) {
88 inode->i_op = &cramfs_dir_inode_operations;
89 inode->i_fop = &cramfs_directory_operations;
90 } else if (S_ISLNK(inode->i_mode)) {
91 inode->i_op = &page_symlink_inode_operations;
92 inode->i_data.a_ops = &cramfs_aops;
93 } else {
94 inode->i_size = 0;
95 inode->i_blocks = 0;
96 init_special_inode(inode, inode->i_mode,
97 old_decode_dev(cramfs_inode->size));
98 }
71 return 0; 99 return 0;
72} 100}
73 101
@@ -77,37 +105,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
77 struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), 105 struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
78 cramfs_iget5_test, cramfs_iget5_set, 106 cramfs_iget5_test, cramfs_iget5_set,
79 cramfs_inode); 107 cramfs_inode);
80 static struct timespec zerotime;
81
82 if (inode && (inode->i_state & I_NEW)) { 108 if (inode && (inode->i_state & I_NEW)) {
83 inode->i_mode = cramfs_inode->mode;
84 inode->i_uid = cramfs_inode->uid;
85 inode->i_size = cramfs_inode->size;
86 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
87 inode->i_blksize = PAGE_CACHE_SIZE;
88 inode->i_gid = cramfs_inode->gid;
89 /* Struct copy intentional */
90 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
91 inode->i_ino = CRAMINO(cramfs_inode);
92 /* inode->i_nlink is left 1 - arguably wrong for directories,
93 but it's the best we can do without reading the directory
94 contents. 1 yields the right result in GNU find, even
95 without -noleaf option. */
96 if (S_ISREG(inode->i_mode)) {
97 inode->i_fop = &generic_ro_fops;
98 inode->i_data.a_ops = &cramfs_aops;
99 } else if (S_ISDIR(inode->i_mode)) {
100 inode->i_op = &cramfs_dir_inode_operations;
101 inode->i_fop = &cramfs_directory_operations;
102 } else if (S_ISLNK(inode->i_mode)) {
103 inode->i_op = &page_symlink_inode_operations;
104 inode->i_data.a_ops = &cramfs_aops;
105 } else {
106 inode->i_size = 0;
107 inode->i_blocks = 0;
108 init_special_inode(inode, inode->i_mode,
109 old_decode_dev(cramfs_inode->size));
110 }
111 unlock_new_inode(inode); 109 unlock_new_inode(inode);
112 } 110 }
113 return inode; 111 return inode;
diff --git a/fs/dcache.c b/fs/dcache.c
index 86bdb93789c6..11dc83092d4a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -743,7 +743,9 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
743 dentry->d_op = NULL; 743 dentry->d_op = NULL;
744 dentry->d_fsdata = NULL; 744 dentry->d_fsdata = NULL;
745 dentry->d_mounted = 0; 745 dentry->d_mounted = 0;
746#ifdef CONFIG_PROFILING
746 dentry->d_cookie = NULL; 747 dentry->d_cookie = NULL;
748#endif
747 INIT_HLIST_NODE(&dentry->d_hash); 749 INIT_HLIST_NODE(&dentry->d_hash);
748 INIT_LIST_HEAD(&dentry->d_lru); 750 INIT_LIST_HEAD(&dentry->d_lru);
749 INIT_LIST_HEAD(&dentry->d_subdirs); 751 INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1734,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages)
1734 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1736 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1735 1737
1736 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 1738 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
1737 SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); 1739 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1738 1740
1739 dcache_init(mempages); 1741 dcache_init(mempages);
1740 inode_init(mempages); 1742 inode_init(mempages);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index efc97d9b7860..d575452cd9f7 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -56,7 +56,7 @@ static u64 debugfs_u8_get(void *data)
56DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); 56DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
57 57
58/** 58/**
59 * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. 59 * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write an unsigned 8 bit value.
60 * 60 *
61 * @name: a pointer to a string containing the name of the file to create. 61 * @name: a pointer to a string containing the name of the file to create.
62 * @mode: the permission that the file should have 62 * @mode: the permission that the file should have
@@ -98,7 +98,7 @@ static u64 debugfs_u16_get(void *data)
98DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); 98DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
99 99
100/** 100/**
101 * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. 101 * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write an unsigned 16 bit value.
102 * 102 *
103 * @name: a pointer to a string containing the name of the file to create. 103 * @name: a pointer to a string containing the name of the file to create.
104 * @mode: the permission that the file should have 104 * @mode: the permission that the file should have
@@ -140,7 +140,7 @@ static u64 debugfs_u32_get(void *data)
140DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); 140DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
141 141
142/** 142/**
143 * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. 143 * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write an unsigned 32 bit value.
144 * 144 *
145 * @name: a pointer to a string containing the name of the file to create. 145 * @name: a pointer to a string containing the name of the file to create.
146 * @mode: the permission that the file should have 146 * @mode: the permission that the file should have
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 30dbbd1df511..848044af7e16 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -857,6 +857,7 @@ do_holes:
857 /* Handle holes */ 857 /* Handle holes */
858 if (!buffer_mapped(map_bh)) { 858 if (!buffer_mapped(map_bh)) {
859 char *kaddr; 859 char *kaddr;
860 loff_t i_size_aligned;
860 861
861 /* AKPM: eargh, -ENOTBLK is a hack */ 862 /* AKPM: eargh, -ENOTBLK is a hack */
862 if (dio->rw == WRITE) { 863 if (dio->rw == WRITE) {
@@ -864,8 +865,14 @@ do_holes:
864 return -ENOTBLK; 865 return -ENOTBLK;
865 } 866 }
866 867
868 /*
869 * Be sure to account for a partial block as the
870 * last block in the file
871 */
872 i_size_aligned = ALIGN(i_size_read(dio->inode),
873 1 << blkbits);
867 if (dio->block_in_file >= 874 if (dio->block_in_file >=
868 i_size_read(dio->inode)>>blkbits) { 875 i_size_aligned >> blkbits) {
869 /* We hit eof */ 876 /* We hit eof */
870 page_cache_release(page); 877 page_cache_release(page);
871 goto out; 878 goto out;
diff --git a/fs/exec.c b/fs/exec.c
index 055378d2513e..0b515ac53134 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -885,6 +885,12 @@ int flush_old_exec(struct linux_binprm * bprm)
885 current->flags &= ~PF_RANDOMIZE; 885 current->flags &= ~PF_RANDOMIZE;
886 flush_thread(); 886 flush_thread();
887 887
888 /* Set the new mm task size. We have to do that late because it may
889 * depend on TIF_32BIT which is only updated in flush_thread() on
890 * some architectures like powerpc
891 */
892 current->mm->task_size = TASK_SIZE;
893
888 if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 894 if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
889 file_permission(bprm->file, MAY_READ) || 895 file_permission(bprm->file, MAY_READ) ||
890 (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { 896 (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
@@ -1403,7 +1409,7 @@ static void zap_threads (struct mm_struct *mm)
1403 do_each_thread(g,p) { 1409 do_each_thread(g,p) {
1404 if (mm == p->mm && p != tsk && 1410 if (mm == p->mm && p != tsk &&
1405 p->ptrace && p->parent->mm == mm) { 1411 p->ptrace && p->parent->mm == mm) {
1406 __ptrace_unlink(p); 1412 __ptrace_detach(p, 0);
1407 } 1413 }
1408 } while_each_thread(g,p); 1414 } while_each_thread(g,p);
1409 write_unlock_irq(&tasklist_lock); 1415 write_unlock_irq(&tasklist_lock);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 35acc43b897f..da52b4a5db64 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -220,7 +220,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
220 struct ext2_inode_info *ei = EXT2_I(inode); 220 struct ext2_inode_info *ei = EXT2_I(inode);
221 int name_index; 221 int name_index;
222 void *value = NULL; 222 void *value = NULL;
223 size_t size; 223 size_t size = 0;
224 int error; 224 int error;
225 225
226 if (S_ISLNK(inode->i_mode)) 226 if (S_ISLNK(inode->i_mode))
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 74714af4ae69..e52765219e16 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -605,7 +605,7 @@ got:
605 insert_inode_hash(inode); 605 insert_inode_hash(inode);
606 606
607 if (DQUOT_ALLOC_INODE(inode)) { 607 if (DQUOT_ALLOC_INODE(inode)) {
608 err = -ENOSPC; 608 err = -EDQUOT;
609 goto fail_drop; 609 goto fail_drop;
610 } 610 }
611 611
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e7d3f0522d01..a717837f272e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -706,6 +706,7 @@ struct address_space_operations ext2_aops = {
706 .bmap = ext2_bmap, 706 .bmap = ext2_bmap,
707 .direct_IO = ext2_direct_IO, 707 .direct_IO = ext2_direct_IO,
708 .writepages = ext2_writepages, 708 .writepages = ext2_writepages,
709 .migratepage = buffer_migrate_page,
709}; 710};
710 711
711struct address_space_operations ext2_aops_xip = { 712struct address_space_operations ext2_aops_xip = {
@@ -723,6 +724,7 @@ struct address_space_operations ext2_nobh_aops = {
723 .bmap = ext2_bmap, 724 .bmap = ext2_bmap,
724 .direct_IO = ext2_direct_IO, 725 .direct_IO = ext2_direct_IO,
725 .writepages = ext2_writepages, 726 .writepages = ext2_writepages,
727 .migratepage = buffer_migrate_page,
726}; 728};
727 729
728/* 730/*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 8d6819846fc9..cb6f9bd658de 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -221,6 +221,11 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
221 seq_puts(seq, ",grpquota"); 221 seq_puts(seq, ",grpquota");
222#endif 222#endif
223 223
224#if defined(CONFIG_EXT2_FS_XIP)
225 if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
226 seq_puts(seq, ",xip");
227#endif
228
224 return 0; 229 return 0;
225} 230}
226 231
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index a2ca3107d475..86ae8e93adb9 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -792,18 +792,20 @@ ext2_xattr_delete_inode(struct inode *inode)
792 ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); 792 ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
793 get_bh(bh); 793 get_bh(bh);
794 bforget(bh); 794 bforget(bh);
795 unlock_buffer(bh);
795 } else { 796 } else {
796 HDR(bh)->h_refcount = cpu_to_le32( 797 HDR(bh)->h_refcount = cpu_to_le32(
797 le32_to_cpu(HDR(bh)->h_refcount) - 1); 798 le32_to_cpu(HDR(bh)->h_refcount) - 1);
798 if (ce) 799 if (ce)
799 mb_cache_entry_release(ce); 800 mb_cache_entry_release(ce);
801 ea_bdebug(bh, "refcount now=%d",
802 le32_to_cpu(HDR(bh)->h_refcount));
803 unlock_buffer(bh);
800 mark_buffer_dirty(bh); 804 mark_buffer_dirty(bh);
801 if (IS_SYNC(inode)) 805 if (IS_SYNC(inode))
802 sync_dirty_buffer(bh); 806 sync_dirty_buffer(bh);
803 DQUOT_FREE_BLOCK(inode, 1); 807 DQUOT_FREE_BLOCK(inode, 1);
804 } 808 }
805 ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
806 unlock_buffer(bh);
807 EXT2_I(inode)->i_file_acl = 0; 809 EXT2_I(inode)->i_file_acl = 0;
808 810
809cleanup: 811cleanup:
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 47a9da2dfb4f..0d21d558b87a 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -226,7 +226,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
226 struct ext3_inode_info *ei = EXT3_I(inode); 226 struct ext3_inode_info *ei = EXT3_I(inode);
227 int name_index; 227 int name_index;
228 void *value = NULL; 228 void *value = NULL;
229 size_t size; 229 size_t size = 0;
230 int error; 230 int error;
231 231
232 if (S_ISLNK(inode->i_mode)) 232 if (S_ISLNK(inode->i_mode))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 8824e84f8a56..0384e539b88f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1559,6 +1559,7 @@ static struct address_space_operations ext3_ordered_aops = {
1559 .invalidatepage = ext3_invalidatepage, 1559 .invalidatepage = ext3_invalidatepage,
1560 .releasepage = ext3_releasepage, 1560 .releasepage = ext3_releasepage,
1561 .direct_IO = ext3_direct_IO, 1561 .direct_IO = ext3_direct_IO,
1562 .migratepage = buffer_migrate_page,
1562}; 1563};
1563 1564
1564static struct address_space_operations ext3_writeback_aops = { 1565static struct address_space_operations ext3_writeback_aops = {
@@ -1572,6 +1573,7 @@ static struct address_space_operations ext3_writeback_aops = {
1572 .invalidatepage = ext3_invalidatepage, 1573 .invalidatepage = ext3_invalidatepage,
1573 .releasepage = ext3_releasepage, 1574 .releasepage = ext3_releasepage,
1574 .direct_IO = ext3_direct_IO, 1575 .direct_IO = ext3_direct_IO,
1576 .migratepage = buffer_migrate_page,
1575}; 1577};
1576 1578
1577static struct address_space_operations ext3_journalled_aops = { 1579static struct address_space_operations ext3_journalled_aops = {
@@ -1622,15 +1624,14 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1622 * For "nobh" option, we can only work if we don't need to 1624 * For "nobh" option, we can only work if we don't need to
1623 * read-in the page - otherwise we create buffers to do the IO. 1625 * read-in the page - otherwise we create buffers to do the IO.
1624 */ 1626 */
1625 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { 1627 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1626 if (PageUptodate(page)) { 1628 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1627 kaddr = kmap_atomic(page, KM_USER0); 1629 kaddr = kmap_atomic(page, KM_USER0);
1628 memset(kaddr + offset, 0, length); 1630 memset(kaddr + offset, 0, length);
1629 flush_dcache_page(page); 1631 flush_dcache_page(page);
1630 kunmap_atomic(kaddr, KM_USER0); 1632 kunmap_atomic(kaddr, KM_USER0);
1631 set_page_dirty(page); 1633 set_page_dirty(page);
1632 goto unlock; 1634 goto unlock;
1633 }
1634 } 1635 }
1635 1636
1636 if (!page_has_buffers(page)) 1637 if (!page_has_buffers(page))
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8bd8ac077704..b8f5cd1e540d 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2141,7 +2141,8 @@ retry:
2141 * We have a transaction open. All is sweetness. It also sets 2141 * We have a transaction open. All is sweetness. It also sets
2142 * i_size in generic_commit_write(). 2142 * i_size in generic_commit_write().
2143 */ 2143 */
2144 err = page_symlink(inode, symname, l); 2144 err = __page_symlink(inode, symname, l,
2145 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2145 if (err) { 2146 if (err) {
2146 ext3_dec_count(handle, inode); 2147 ext3_dec_count(handle, inode);
2147 ext3_mark_inode_dirty(handle, inode); 2148 ext3_mark_inode_dirty(handle, inode);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e99c5a73b39e..88aa1ae13f9f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -210,10 +210,30 @@ static int fat_free(struct inode *inode, int skip)
210 if (MSDOS_I(inode)->i_start == 0) 210 if (MSDOS_I(inode)->i_start == 0)
211 return 0; 211 return 0;
212 212
213 /* 213 fat_cache_inval_inode(inode);
214 * Write a new EOF, and get the remaining cluster chain for freeing. 214
215 */
216 wait = IS_DIRSYNC(inode); 215 wait = IS_DIRSYNC(inode);
216 i_start = free_start = MSDOS_I(inode)->i_start;
217 i_logstart = MSDOS_I(inode)->i_logstart;
218
219 /* First, we write the new file size. */
220 if (!skip) {
221 MSDOS_I(inode)->i_start = 0;
222 MSDOS_I(inode)->i_logstart = 0;
223 }
224 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
225 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
226 if (wait) {
227 err = fat_sync_inode(inode);
228 if (err) {
229 MSDOS_I(inode)->i_start = i_start;
230 MSDOS_I(inode)->i_logstart = i_logstart;
231 return err;
232 }
233 } else
234 mark_inode_dirty(inode);
235
236 /* Write a new EOF, and get the remaining cluster chain for freeing. */
217 if (skip) { 237 if (skip) {
218 struct fat_entry fatent; 238 struct fat_entry fatent;
219 int ret, fclus, dclus; 239 int ret, fclus, dclus;
@@ -244,35 +264,11 @@ static int fat_free(struct inode *inode, int skip)
244 return ret; 264 return ret;
245 265
246 free_start = ret; 266 free_start = ret;
247 i_start = i_logstart = 0;
248 fat_cache_inval_inode(inode);
249 } else {
250 fat_cache_inval_inode(inode);
251
252 i_start = free_start = MSDOS_I(inode)->i_start;
253 i_logstart = MSDOS_I(inode)->i_logstart;
254 MSDOS_I(inode)->i_start = 0;
255 MSDOS_I(inode)->i_logstart = 0;
256 } 267 }
257 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
258 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
259 if (wait) {
260 err = fat_sync_inode(inode);
261 if (err)
262 goto error;
263 } else
264 mark_inode_dirty(inode);
265 inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); 268 inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9);
266 269
267 /* Freeing the remained cluster chain */ 270 /* Freeing the remained cluster chain */
268 return fat_free_clusters(inode, free_start); 271 return fat_free_clusters(inode, free_start);
269
270error:
271 if (i_start) {
272 MSDOS_I(inode)->i_start = i_start;
273 MSDOS_I(inode)->i_logstart = i_logstart;
274 }
275 return err;
276} 272}
277 273
278void fat_truncate(struct inode *inode) 274void fat_truncate(struct inode *inode)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 32fb0a3f1da4..944652e9dde1 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -196,19 +196,9 @@ EXPORT_SYMBOL_GPL(fat_date_unix2dos);
196 196
197int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) 197int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
198{ 198{
199 int i, e, err = 0; 199 int i, err = 0;
200 200
201 for (i = 0; i < nr_bhs; i++) { 201 ll_rw_block(SWRITE, nr_bhs, bhs);
202 lock_buffer(bhs[i]);
203 if (test_clear_buffer_dirty(bhs[i])) {
204 get_bh(bhs[i]);
205 bhs[i]->b_end_io = end_buffer_write_sync;
206 e = submit_bh(WRITE, bhs[i]);
207 if (!err && e)
208 err = e;
209 } else
210 unlock_buffer(bhs[i]);
211 }
212 for (i = 0; i < nr_bhs; i++) { 202 for (i = 0; i < nr_bhs; i++) {
213 wait_on_buffer(bhs[i]); 203 wait_on_buffer(bhs[i]);
214 if (buffer_eopnotsupp(bhs[i])) { 204 if (buffer_eopnotsupp(bhs[i])) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5f96786d1c73..dc4a7007f4e7 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -208,8 +208,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
208 struct inode * inode = filp->f_dentry->d_inode; 208 struct inode * inode = filp->f_dentry->d_inode;
209 int error = 0; 209 int error = 0;
210 210
211 /* O_APPEND cannot be cleared if the file is marked as append-only */ 211 /*
212 if (!(arg & O_APPEND) && IS_APPEND(inode)) 212 * O_APPEND cannot be cleared if the file is marked as append-only
213 * and the file is open for write.
214 */
215 if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
213 return -EPERM; 216 return -EPERM;
214 217
215 /* O_NOATIME can only be set by the owner or superuser */ 218 /* O_NOATIME can only be set by the owner or superuser */
diff --git a/fs/fifo.c b/fs/fifo.c
index 923371b753ab..d13fcd3ec803 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -34,10 +34,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
34{ 34{
35 int ret; 35 int ret;
36 36
37 ret = -ERESTARTSYS; 37 mutex_lock(PIPE_MUTEX(*inode));
38 if (mutex_lock_interruptible(PIPE_MUTEX(*inode)))
39 goto err_nolock_nocleanup;
40
41 if (!inode->i_pipe) { 38 if (!inode->i_pipe) {
42 ret = -ENOMEM; 39 ret = -ENOMEM;
43 if(!pipe_new(inode)) 40 if(!pipe_new(inode))
@@ -140,8 +137,6 @@ err:
140 137
141err_nocleanup: 138err_nocleanup:
142 mutex_unlock(PIPE_MUTEX(*inode)); 139 mutex_unlock(PIPE_MUTEX(*inode));
143
144err_nolock_nocleanup:
145 return ret; 140 return ret;
146} 141}
147 142
diff --git a/fs/file.c b/fs/file.c
index fd066b261c75..cea7cbea11d0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
379void __init files_defer_init(void) 379void __init files_defer_init(void)
380{ 380{
381 int i; 381 int i;
382 /* Really early - can't use for_each_cpu */ 382 for_each_cpu(i)
383 for (i = 0; i < NR_CPUS; i++)
384 fdtable_defer_list_init(i); 383 fdtable_defer_list_init(i);
385} 384}
diff --git a/fs/file_table.c b/fs/file_table.c
index 768b58167543..44fabeaa9415 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) 5 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
6 */ 6 */
7 7
8#include <linux/config.h>
8#include <linux/string.h> 9#include <linux/string.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
10#include <linux/file.h> 11#include <linux/file.h>
@@ -19,52 +20,67 @@
19#include <linux/capability.h> 20#include <linux/capability.h>
20#include <linux/cdev.h> 21#include <linux/cdev.h>
21#include <linux/fsnotify.h> 22#include <linux/fsnotify.h>
23#include <linux/sysctl.h>
24#include <linux/percpu_counter.h>
25
26#include <asm/atomic.h>
22 27
23/* sysctl tunables... */ 28/* sysctl tunables... */
24struct files_stat_struct files_stat = { 29struct files_stat_struct files_stat = {
25 .max_files = NR_FILE 30 .max_files = NR_FILE
26}; 31};
27 32
28EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
29
30/* public. Not pretty! */ 33/* public. Not pretty! */
31 __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 34__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
32 35
33static DEFINE_SPINLOCK(filp_count_lock); 36static struct percpu_counter nr_files __cacheline_aligned_in_smp;
34 37
35/* slab constructors and destructors are called from arbitrary 38static inline void file_free_rcu(struct rcu_head *head)
36 * context and must be fully threaded - use a local spinlock
37 * to protect files_stat.nr_files
38 */
39void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags)
40{ 39{
41 if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 40 struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
42 SLAB_CTOR_CONSTRUCTOR) { 41 kmem_cache_free(filp_cachep, f);
43 unsigned long flags;
44 spin_lock_irqsave(&filp_count_lock, flags);
45 files_stat.nr_files++;
46 spin_unlock_irqrestore(&filp_count_lock, flags);
47 }
48} 42}
49 43
50void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) 44static inline void file_free(struct file *f)
51{ 45{
52 unsigned long flags; 46 percpu_counter_dec(&nr_files);
53 spin_lock_irqsave(&filp_count_lock, flags); 47 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
54 files_stat.nr_files--;
55 spin_unlock_irqrestore(&filp_count_lock, flags);
56} 48}
57 49
58static inline void file_free_rcu(struct rcu_head *head) 50/*
51 * Return the total number of open files in the system
52 */
53static int get_nr_files(void)
59{ 54{
60 struct file *f = container_of(head, struct file, f_u.fu_rcuhead); 55 return percpu_counter_read_positive(&nr_files);
61 kmem_cache_free(filp_cachep, f);
62} 56}
63 57
64static inline void file_free(struct file *f) 58/*
59 * Return the maximum number of open files in the system
60 */
61int get_max_files(void)
65{ 62{
66 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); 63 return files_stat.max_files;
67} 64}
65EXPORT_SYMBOL_GPL(get_max_files);
66
67/*
68 * Handle nr_files sysctl
69 */
70#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
71int proc_nr_files(ctl_table *table, int write, struct file *filp,
72 void __user *buffer, size_t *lenp, loff_t *ppos)
73{
74 files_stat.nr_files = get_nr_files();
75 return proc_dointvec(table, write, filp, buffer, lenp, ppos);
76}
77#else
78int proc_nr_files(ctl_table *table, int write, struct file *filp,
79 void __user *buffer, size_t *lenp, loff_t *ppos)
80{
81 return -ENOSYS;
82}
83#endif
68 84
69/* Find an unused file structure and return a pointer to it. 85/* Find an unused file structure and return a pointer to it.
70 * Returns NULL, if there are no more free file structures or 86 * Returns NULL, if there are no more free file structures or
@@ -78,14 +94,20 @@ struct file *get_empty_filp(void)
78 /* 94 /*
79 * Privileged users can go above max_files 95 * Privileged users can go above max_files
80 */ 96 */
81 if (files_stat.nr_files >= files_stat.max_files && 97 if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
82 !capable(CAP_SYS_ADMIN)) 98 /*
83 goto over; 99 * percpu_counters are inaccurate. Do an expensive check before
100 * we go and fail.
101 */
102 if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
103 goto over;
104 }
84 105
85 f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); 106 f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
86 if (f == NULL) 107 if (f == NULL)
87 goto fail; 108 goto fail;
88 109
110 percpu_counter_inc(&nr_files);
89 memset(f, 0, sizeof(*f)); 111 memset(f, 0, sizeof(*f));
90 if (security_file_alloc(f)) 112 if (security_file_alloc(f))
91 goto fail_sec; 113 goto fail_sec;
@@ -101,10 +123,10 @@ struct file *get_empty_filp(void)
101 123
102over: 124over:
103 /* Ran out of filps - report that */ 125 /* Ran out of filps - report that */
104 if (files_stat.nr_files > old_max) { 126 if (get_nr_files() > old_max) {
105 printk(KERN_INFO "VFS: file-max limit %d reached\n", 127 printk(KERN_INFO "VFS: file-max limit %d reached\n",
106 files_stat.max_files); 128 get_max_files());
107 old_max = files_stat.nr_files; 129 old_max = get_nr_files();
108 } 130 }
109 goto fail; 131 goto fail;
110 132
@@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages)
276 if (files_stat.max_files < NR_FILE) 298 if (files_stat.max_files < NR_FILE)
277 files_stat.max_files = NR_FILE; 299 files_stat.max_files = NR_FILE;
278 files_defer_init(); 300 files_defer_init();
301 percpu_counter_init(&nr_files);
279} 302}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4526da8907c6..0c9a2ee54c91 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -66,6 +66,12 @@ static void restore_sigs(sigset_t *oldset)
66 sigprocmask(SIG_SETMASK, oldset, NULL); 66 sigprocmask(SIG_SETMASK, oldset, NULL);
67} 67}
68 68
69/*
70 * Reset request, so that it can be reused
71 *
72 * The caller must be _very_ careful to make sure, that it is holding
73 * the only reference to req
74 */
69void fuse_reset_request(struct fuse_req *req) 75void fuse_reset_request(struct fuse_req *req)
70{ 76{
71 int preallocated = req->preallocated; 77 int preallocated = req->preallocated;
@@ -120,9 +126,9 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc)
120 return do_get_request(fc); 126 return do_get_request(fc);
121} 127}
122 128
129/* Must be called with fuse_lock held */
123static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) 130static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
124{ 131{
125 spin_lock(&fuse_lock);
126 if (req->preallocated) { 132 if (req->preallocated) {
127 atomic_dec(&fc->num_waiting); 133 atomic_dec(&fc->num_waiting);
128 list_add(&req->list, &fc->unused_list); 134 list_add(&req->list, &fc->unused_list);
@@ -134,11 +140,19 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
134 fc->outstanding_debt--; 140 fc->outstanding_debt--;
135 else 141 else
136 up(&fc->outstanding_sem); 142 up(&fc->outstanding_sem);
137 spin_unlock(&fuse_lock);
138} 143}
139 144
140void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) 145void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
141{ 146{
147 if (atomic_dec_and_test(&req->count)) {
148 spin_lock(&fuse_lock);
149 fuse_putback_request(fc, req);
150 spin_unlock(&fuse_lock);
151 }
152}
153
154static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req)
155{
142 if (atomic_dec_and_test(&req->count)) 156 if (atomic_dec_and_test(&req->count))
143 fuse_putback_request(fc, req); 157 fuse_putback_request(fc, req);
144} 158}
@@ -163,26 +177,36 @@ void fuse_release_background(struct fuse_req *req)
163 * still waiting), the 'end' callback is called if given, else the 177 * still waiting), the 'end' callback is called if given, else the
164 * reference to the request is released 178 * reference to the request is released
165 * 179 *
180 * Releasing extra reference for foreground requests must be done
181 * within the same locked region as setting state to finished. This
182 * is because fuse_reset_request() may be called after request is
183 * finished and it must be the sole possessor. If request is
184 * interrupted and put in the background, it will return with an error
185 * and hence never be reset and reused.
186 *
166 * Called with fuse_lock, unlocks it 187 * Called with fuse_lock, unlocks it
167 */ 188 */
168static void request_end(struct fuse_conn *fc, struct fuse_req *req) 189static void request_end(struct fuse_conn *fc, struct fuse_req *req)
169{ 190{
170 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
171 req->end = NULL;
172 list_del(&req->list); 191 list_del(&req->list);
173 req->state = FUSE_REQ_FINISHED; 192 req->state = FUSE_REQ_FINISHED;
174 spin_unlock(&fuse_lock); 193 if (!req->background) {
175 if (req->background) { 194 wake_up(&req->waitq);
195 fuse_put_request_locked(fc, req);
196 spin_unlock(&fuse_lock);
197 } else {
198 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
199 req->end = NULL;
200 spin_unlock(&fuse_lock);
176 down_read(&fc->sbput_sem); 201 down_read(&fc->sbput_sem);
177 if (fc->mounted) 202 if (fc->mounted)
178 fuse_release_background(req); 203 fuse_release_background(req);
179 up_read(&fc->sbput_sem); 204 up_read(&fc->sbput_sem);
205 if (end)
206 end(fc, req);
207 else
208 fuse_put_request(fc, req);
180 } 209 }
181 wake_up(&req->waitq);
182 if (end)
183 end(fc, req);
184 else
185 fuse_put_request(fc, req);
186} 210}
187 211
188/* 212/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 21fd59c7bc24..c72a8a97935c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -111,6 +111,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
111 111
112 /* Doesn't hurt to "reset" the validity timeout */ 112 /* Doesn't hurt to "reset" the validity timeout */
113 fuse_invalidate_entry_cache(entry); 113 fuse_invalidate_entry_cache(entry);
114
115 /* For negative dentries, always do a fresh lookup */
114 if (!inode) 116 if (!inode)
115 return 0; 117 return 0;
116 118
@@ -122,6 +124,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
122 fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); 124 fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
123 request_send(fc, req); 125 request_send(fc, req);
124 err = req->out.h.error; 126 err = req->out.h.error;
127 /* Zero nodeid is same as -ENOENT */
128 if (!err && !outarg.nodeid)
129 err = -ENOENT;
125 if (!err) { 130 if (!err) {
126 struct fuse_inode *fi = get_fuse_inode(inode); 131 struct fuse_inode *fi = get_fuse_inode(inode);
127 if (outarg.nodeid != get_node_id(inode)) { 132 if (outarg.nodeid != get_node_id(inode)) {
@@ -190,8 +195,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
190 fuse_lookup_init(req, dir, entry, &outarg); 195 fuse_lookup_init(req, dir, entry, &outarg);
191 request_send(fc, req); 196 request_send(fc, req);
192 err = req->out.h.error; 197 err = req->out.h.error;
193 if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) || 198 /* Zero nodeid is same as -ENOENT, but with valid timeout */
194 !valid_mode(outarg.attr.mode))) 199 if (!err && outarg.nodeid &&
200 (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode)))
195 err = -EIO; 201 err = -EIO;
196 if (!err && outarg.nodeid) { 202 if (!err && outarg.nodeid) {
197 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 203 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a7ef5e716f3c..6f05379b0a0d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -116,9 +116,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
116/* Special case for failed iget in CREATE */ 116/* Special case for failed iget in CREATE */
117static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 117static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
118{ 118{
119 u64 nodeid = req->in.h.nodeid; 119 /* If called from end_io_requests(), req has more than one
120 fuse_reset_request(req); 120 reference and fuse_reset_request() cannot work */
121 fuse_send_forget(fc, req, nodeid, 1); 121 if (fc->connected) {
122 u64 nodeid = req->in.h.nodeid;
123 fuse_reset_request(req);
124 fuse_send_forget(fc, req, nodeid, 1);
125 } else
126 fuse_put_request(fc, req);
122} 127}
123 128
124void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, 129void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
@@ -335,9 +340,14 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
335 loff_t pos = page_offset(req->pages[0]); 340 loff_t pos = page_offset(req->pages[0]);
336 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 341 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
337 req->out.page_zeroing = 1; 342 req->out.page_zeroing = 1;
338 req->end = fuse_readpages_end;
339 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 343 fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
340 request_send_background(fc, req); 344 if (fc->async_read) {
345 req->end = fuse_readpages_end;
346 request_send_background(fc, req);
347 } else {
348 request_send(fc, req);
349 fuse_readpages_end(fc, req);
350 }
341} 351}
342 352
343struct fuse_readpages_data { 353struct fuse_readpages_data {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 46cf933aa3bf..4a83adfec968 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,9 @@ struct fuse_conn {
272 reply, before any other request, and never cleared */ 272 reply, before any other request, and never cleared */
273 unsigned conn_error : 1; 273 unsigned conn_error : 1;
274 274
275 /** Do readpages asynchronously? Only set in INIT */
276 unsigned async_read : 1;
277
275 /* 278 /*
276 * The following bitfields are only for optimization purposes 279 * The following bitfields are only for optimization purposes
277 * and hence races in setting them will not cause malfunction 280 * and hence races in setting them will not cause malfunction
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c755a0440a66..879e6fba9480 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -473,6 +473,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
473 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) 473 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
474 fc->conn_error = 1; 474 fc->conn_error = 1;
475 else { 475 else {
476 unsigned long ra_pages;
477
478 if (arg->minor >= 6) {
479 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
480 if (arg->flags & FUSE_ASYNC_READ)
481 fc->async_read = 1;
482 } else
483 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
484
485 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
476 fc->minor = arg->minor; 486 fc->minor = arg->minor;
477 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; 487 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
478 } 488 }
@@ -496,6 +506,8 @@ static void fuse_send_init(struct fuse_conn *fc)
496 506
497 arg->major = FUSE_KERNEL_VERSION; 507 arg->major = FUSE_KERNEL_VERSION;
498 arg->minor = FUSE_KERNEL_MINOR_VERSION; 508 arg->minor = FUSE_KERNEL_MINOR_VERSION;
509 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
510 arg->flags |= FUSE_ASYNC_READ;
499 req->in.h.opcode = FUSE_INIT; 511 req->in.h.opcode = FUSE_INIT;
500 req->in.numargs = 1; 512 req->in.numargs = 1;
501 req->in.args[0].size = sizeof(*arg); 513 req->in.args[0].size = sizeof(*arg);
@@ -552,8 +564,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
552 fc->user_id = d.user_id; 564 fc->user_id = d.user_id;
553 fc->group_id = d.group_id; 565 fc->group_id = d.group_id;
554 fc->max_read = d.max_read; 566 fc->max_read = d.max_read;
555 if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
556 fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
557 567
558 /* Used by get_root_inode() */ 568 /* Used by get_root_inode() */
559 sb->s_fs_info = fc; 569 sb->s_fs_info = fc;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f568102da1e8..b35195289945 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -72,8 +72,8 @@ huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
72 unsigned long start = vma->vm_start; 72 unsigned long start = vma->vm_start;
73 unsigned long end = vma->vm_end; 73 unsigned long end = vma->vm_end;
74 unsigned long hugepages = (end - start) >> HPAGE_SHIFT; 74 unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
75 pgoff_t next = vma->vm_pgoff; 75 pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
76 pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); 76 pgoff_t endpg = next + hugepages;
77 77
78 pagevec_init(&pvec, 0); 78 pagevec_init(&pvec, 0);
79 while (next < endpg) { 79 while (next < endpg) {
diff --git a/fs/inode.c b/fs/inode.c
index 108138d4e909..d0be6159eb7f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1179,7 +1179,7 @@ EXPORT_SYMBOL(bmap);
1179/** 1179/**
1180 * touch_atime - update the access time 1180 * touch_atime - update the access time
1181 * @mnt: mount the inode is accessed on 1181 * @mnt: mount the inode is accessed on
1182 * @inode: inode accessed 1182 * @dentry: dentry accessed
1183 * 1183 *
1184 * Update the accessed time on an inode and mark it for writeback. 1184 * Update the accessed time on an inode and mark it for writeback.
1185 * This function automatically handles read only file systems and media, 1185 * This function automatically handles read only file systems and media,
diff --git a/fs/inotify.c b/fs/inotify.c
index 878ccca61213..3041503bde02 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -967,7 +967,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
967 mask_add = 1; 967 mask_add = 1;
968 968
969 /* don't let user-space set invalid bits: we don't want flags set */ 969 /* don't let user-space set invalid bits: we don't want flags set */
970 mask &= IN_ALL_EVENTS; 970 mask &= IN_ALL_EVENTS | IN_ONESHOT;
971 if (unlikely(!mask)) { 971 if (unlikely(!mask)) {
972 ret = -EINVAL; 972 ret = -EINVAL;
973 goto out; 973 goto out;
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index e6265a0b56b8..543ed543d1e5 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,75 +24,29 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25 25
26/* 26/*
27 * Unlink a buffer from a transaction checkpoint list. 27 * Unlink a buffer from a transaction.
28 * 28 *
29 * Called with j_list_lock held. 29 * Called with j_list_lock held.
30 */ 30 */
31 31
32static void __buffer_unlink_first(struct journal_head *jh) 32static inline void __buffer_unlink(struct journal_head *jh)
33{ 33{
34 transaction_t *transaction; 34 transaction_t *transaction;
35 35
36 transaction = jh->b_cp_transaction; 36 transaction = jh->b_cp_transaction;
37 jh->b_cp_transaction = NULL;
37 38
38 jh->b_cpnext->b_cpprev = jh->b_cpprev; 39 jh->b_cpnext->b_cpprev = jh->b_cpprev;
39 jh->b_cpprev->b_cpnext = jh->b_cpnext; 40 jh->b_cpprev->b_cpnext = jh->b_cpnext;
40 if (transaction->t_checkpoint_list == jh) { 41 if (transaction->t_checkpoint_list == jh)
41 transaction->t_checkpoint_list = jh->b_cpnext; 42 transaction->t_checkpoint_list = jh->b_cpnext;
42 if (transaction->t_checkpoint_list == jh) 43 if (transaction->t_checkpoint_list == jh)
43 transaction->t_checkpoint_list = NULL; 44 transaction->t_checkpoint_list = NULL;
44 }
45}
46
47/*
48 * Unlink a buffer from a transaction checkpoint(io) list.
49 *
50 * Called with j_list_lock held.
51 */
52
53static inline void __buffer_unlink(struct journal_head *jh)
54{
55 transaction_t *transaction;
56
57 transaction = jh->b_cp_transaction;
58
59 __buffer_unlink_first(jh);
60 if (transaction->t_checkpoint_io_list == jh) {
61 transaction->t_checkpoint_io_list = jh->b_cpnext;
62 if (transaction->t_checkpoint_io_list == jh)
63 transaction->t_checkpoint_io_list = NULL;
64 }
65}
66
67/*
68 * Move a buffer from the checkpoint list to the checkpoint io list
69 *
70 * Called with j_list_lock held
71 */
72
73static inline void __buffer_relink_io(struct journal_head *jh)
74{
75 transaction_t *transaction;
76
77 transaction = jh->b_cp_transaction;
78 __buffer_unlink_first(jh);
79
80 if (!transaction->t_checkpoint_io_list) {
81 jh->b_cpnext = jh->b_cpprev = jh;
82 } else {
83 jh->b_cpnext = transaction->t_checkpoint_io_list;
84 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
85 jh->b_cpprev->b_cpnext = jh;
86 jh->b_cpnext->b_cpprev = jh;
87 }
88 transaction->t_checkpoint_io_list = jh;
89} 45}
90 46
91/* 47/*
92 * Try to release a checkpointed buffer from its transaction. 48 * Try to release a checkpointed buffer from its transaction.
93 * Returns 1 if we released it and 2 if we also released the 49 * Returns 1 if we released it.
94 * whole transaction.
95 *
96 * Requires j_list_lock 50 * Requires j_list_lock
97 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 51 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
98 */ 52 */
@@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
103 57
104 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 58 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
105 JBUFFER_TRACE(jh, "remove from checkpoint list"); 59 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __journal_remove_checkpoint(jh) + 1; 60 __journal_remove_checkpoint(jh);
107 jbd_unlock_bh_state(bh); 61 jbd_unlock_bh_state(bh);
108 journal_remove_journal_head(bh); 62 journal_remove_journal_head(bh);
109 BUFFER_TRACE(bh, "release"); 63 BUFFER_TRACE(bh, "release");
110 __brelse(bh); 64 __brelse(bh);
65 ret = 1;
111 } else { 66 } else {
112 jbd_unlock_bh_state(bh); 67 jbd_unlock_bh_state(bh);
113 } 68 }
@@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
162} 117}
163 118
164/* 119/*
165 * Clean up transaction's list of buffers submitted for io. 120 * Clean up a transaction's checkpoint list.
166 * We wait for any pending IO to complete and remove any clean 121 *
167 * buffers. Note that we take the buffers in the opposite ordering 122 * We wait for any pending IO to complete and make sure any clean
168 * from the one in which they were submitted for IO. 123 * buffers are removed from the transaction.
124 *
125 * Return 1 if we performed any actions which might have destroyed the
126 * checkpoint. (journal_remove_checkpoint() deletes the transaction when
127 * the last checkpoint buffer is cleansed)
169 * 128 *
170 * Called with j_list_lock held. 129 * Called with j_list_lock held.
171 */ 130 */
172 131static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
173static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
174{ 132{
175 struct journal_head *jh; 133 struct journal_head *jh, *next_jh, *last_jh;
176 struct buffer_head *bh; 134 struct buffer_head *bh;
177 tid_t this_tid; 135 int ret = 0;
178 int released = 0; 136
179 137 assert_spin_locked(&journal->j_list_lock);
180 this_tid = transaction->t_tid; 138 jh = transaction->t_checkpoint_list;
181restart: 139 if (!jh)
182 /* Didn't somebody clean up the transaction in the meanwhile */ 140 return 0;
183 if (journal->j_checkpoint_transactions != transaction || 141
184 transaction->t_tid != this_tid) 142 last_jh = jh->b_cpprev;
185 return; 143 next_jh = jh;
186 while (!released && transaction->t_checkpoint_io_list) { 144 do {
187 jh = transaction->t_checkpoint_io_list; 145 jh = next_jh;
188 bh = jh2bh(jh); 146 bh = jh2bh(jh);
189 if (!jbd_trylock_bh_state(bh)) {
190 jbd_sync_bh(journal, bh);
191 spin_lock(&journal->j_list_lock);
192 goto restart;
193 }
194 if (buffer_locked(bh)) { 147 if (buffer_locked(bh)) {
195 atomic_inc(&bh->b_count); 148 atomic_inc(&bh->b_count);
196 spin_unlock(&journal->j_list_lock); 149 spin_unlock(&journal->j_list_lock);
197 jbd_unlock_bh_state(bh);
198 wait_on_buffer(bh); 150 wait_on_buffer(bh);
199 /* the journal_head may have gone by now */ 151 /* the journal_head may have gone by now */
200 BUFFER_TRACE(bh, "brelse"); 152 BUFFER_TRACE(bh, "brelse");
201 __brelse(bh); 153 __brelse(bh);
202 spin_lock(&journal->j_list_lock); 154 goto out_return_1;
203 goto restart;
204 } 155 }
156
205 /* 157 /*
206 * Now in whatever state the buffer currently is, we know that 158 * This is foul
207 * it has been written out and so we can drop it from the list
208 */ 159 */
209 released = __journal_remove_checkpoint(jh); 160 if (!jbd_trylock_bh_state(bh)) {
210 jbd_unlock_bh_state(bh); 161 jbd_sync_bh(journal, bh);
211 } 162 goto out_return_1;
163 }
164
165 if (jh->b_transaction != NULL) {
166 transaction_t *t = jh->b_transaction;
167 tid_t tid = t->t_tid;
168
169 spin_unlock(&journal->j_list_lock);
170 jbd_unlock_bh_state(bh);
171 log_start_commit(journal, tid);
172 log_wait_commit(journal, tid);
173 goto out_return_1;
174 }
175
176 /*
177 * AKPM: I think the buffer_jbddirty test is redundant - it
178 * shouldn't have NULL b_transaction?
179 */
180 next_jh = jh->b_cpnext;
181 if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
182 BUFFER_TRACE(bh, "remove from checkpoint");
183 __journal_remove_checkpoint(jh);
184 jbd_unlock_bh_state(bh);
185 journal_remove_journal_head(bh);
186 __brelse(bh);
187 ret = 1;
188 } else {
189 jbd_unlock_bh_state(bh);
190 }
191 } while (jh != last_jh);
192
193 return ret;
194out_return_1:
195 spin_lock(&journal->j_list_lock);
196 return 1;
212} 197}
213 198
214#define NR_BATCH 64 199#define NR_BATCH 64
@@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
218{ 203{
219 int i; 204 int i;
220 205
206 spin_unlock(&journal->j_list_lock);
221 ll_rw_block(SWRITE, *batch_count, bhs); 207 ll_rw_block(SWRITE, *batch_count, bhs);
208 spin_lock(&journal->j_list_lock);
222 for (i = 0; i < *batch_count; i++) { 209 for (i = 0; i < *batch_count; i++) {
223 struct buffer_head *bh = bhs[i]; 210 struct buffer_head *bh = bhs[i];
224 clear_buffer_jwrite(bh); 211 clear_buffer_jwrite(bh);
@@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
234 * Return 1 if something happened which requires us to abort the current 221 * Return 1 if something happened which requires us to abort the current
235 * scan of the checkpoint list. 222 * scan of the checkpoint list.
236 * 223 *
237 * Called with j_list_lock held and drops it if 1 is returned 224 * Called with j_list_lock held.
238 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 225 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
239 */ 226 */
240static int __process_buffer(journal_t *journal, struct journal_head *jh, 227static int __flush_buffer(journal_t *journal, struct journal_head *jh,
241 struct buffer_head **bhs, int *batch_count) 228 struct buffer_head **bhs, int *batch_count,
229 int *drop_count)
242{ 230{
243 struct buffer_head *bh = jh2bh(jh); 231 struct buffer_head *bh = jh2bh(jh);
244 int ret = 0; 232 int ret = 0;
245 233
246 if (buffer_locked(bh)) { 234 if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
247 get_bh(bh); 235 J_ASSERT_JH(jh, jh->b_transaction == NULL);
248 spin_unlock(&journal->j_list_lock);
249 jbd_unlock_bh_state(bh);
250 wait_on_buffer(bh);
251 /* the journal_head may have gone by now */
252 BUFFER_TRACE(bh, "brelse");
253 put_bh(bh);
254 ret = 1;
255 }
256 else if (jh->b_transaction != NULL) {
257 transaction_t *t = jh->b_transaction;
258 tid_t tid = t->t_tid;
259 236
260 spin_unlock(&journal->j_list_lock);
261 jbd_unlock_bh_state(bh);
262 log_start_commit(journal, tid);
263 log_wait_commit(journal, tid);
264 ret = 1;
265 }
266 else if (!buffer_dirty(bh)) {
267 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
268 BUFFER_TRACE(bh, "remove from checkpoint");
269 __journal_remove_checkpoint(jh);
270 spin_unlock(&journal->j_list_lock);
271 jbd_unlock_bh_state(bh);
272 journal_remove_journal_head(bh);
273 put_bh(bh);
274 ret = 1;
275 }
276 else {
277 /* 237 /*
278 * Important: we are about to write the buffer, and 238 * Important: we are about to write the buffer, and
279 * possibly block, while still holding the journal lock. 239 * possibly block, while still holding the journal lock.
@@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
286 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 246 J_ASSERT_BH(bh, !buffer_jwrite(bh));
287 set_buffer_jwrite(bh); 247 set_buffer_jwrite(bh);
288 bhs[*batch_count] = bh; 248 bhs[*batch_count] = bh;
289 __buffer_relink_io(jh);
290 jbd_unlock_bh_state(bh); 249 jbd_unlock_bh_state(bh);
291 (*batch_count)++; 250 (*batch_count)++;
292 if (*batch_count == NR_BATCH) { 251 if (*batch_count == NR_BATCH) {
293 spin_unlock(&journal->j_list_lock);
294 __flush_batch(journal, bhs, batch_count); 252 __flush_batch(journal, bhs, batch_count);
295 ret = 1; 253 ret = 1;
296 } 254 }
255 } else {
256 int last_buffer = 0;
257 if (jh->b_cpnext == jh) {
258 /* We may be about to drop the transaction. Tell the
259 * caller that the lists have changed.
260 */
261 last_buffer = 1;
262 }
263 if (__try_to_free_cp_buf(jh)) {
264 (*drop_count)++;
265 ret = last_buffer;
266 }
297 } 267 }
298 return ret; 268 return ret;
299} 269}
300 270
301/* 271/*
302 * Perform an actual checkpoint. We take the first transaction on the 272 * Perform an actual checkpoint. We don't write out only enough to
303 * list of transactions to be checkpointed and send all its buffers 273 * satisfy the current blocked requests: rather we submit a reasonably
304 * to disk. We submit larger chunks of data at once. 274 * sized chunk of the outstanding data to disk at once for
275 * efficiency. __log_wait_for_space() will retry if we didn't free enough.
305 * 276 *
277 * However, we _do_ take into account the amount requested so that once
278 * the IO has been queued, we can return as soon as enough of it has
279 * completed to disk.
280 *
306 * The journal should be locked before calling this function. 281 * The journal should be locked before calling this function.
307 */ 282 */
308int log_do_checkpoint(journal_t *journal) 283int log_do_checkpoint(journal_t *journal)
309{ 284{
310 transaction_t *transaction;
311 tid_t this_tid;
312 int result; 285 int result;
286 int batch_count = 0;
287 struct buffer_head *bhs[NR_BATCH];
313 288
314 jbd_debug(1, "Start checkpoint\n"); 289 jbd_debug(1, "Start checkpoint\n");
315 290
@@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal)
324 return result; 299 return result;
325 300
326 /* 301 /*
327 * OK, we need to start writing disk blocks. Take one transaction 302 * OK, we need to start writing disk blocks. Try to free up a
328 * and write it. 303 * quarter of the log in a single checkpoint if we can.
329 */ 304 */
330 spin_lock(&journal->j_list_lock);
331 if (!journal->j_checkpoint_transactions)
332 goto out;
333 transaction = journal->j_checkpoint_transactions;
334 this_tid = transaction->t_tid;
335restart:
336 /* 305 /*
337 * If someone cleaned up this transaction while we slept, we're 306 * AKPM: check this code. I had a feeling a while back that it
338 * done (maybe it's a new transaction, but it fell at the same 307 * degenerates into a busy loop at unmount time.
339 * address).
340 */ 308 */
341 if (journal->j_checkpoint_transactions == transaction && 309 spin_lock(&journal->j_list_lock);
342 transaction->t_tid == this_tid) { 310 while (journal->j_checkpoint_transactions) {
343 int batch_count = 0; 311 transaction_t *transaction;
344 struct buffer_head *bhs[NR_BATCH]; 312 struct journal_head *jh, *last_jh, *next_jh;
345 struct journal_head *jh; 313 int drop_count = 0;
346 int retry = 0; 314 int cleanup_ret, retry = 0;
347 315 tid_t this_tid;
348 while (!retry && transaction->t_checkpoint_list) { 316
317 transaction = journal->j_checkpoint_transactions;
318 this_tid = transaction->t_tid;
319 jh = transaction->t_checkpoint_list;
320 last_jh = jh->b_cpprev;
321 next_jh = jh;
322 do {
349 struct buffer_head *bh; 323 struct buffer_head *bh;
350 324
351 jh = transaction->t_checkpoint_list; 325 jh = next_jh;
326 next_jh = jh->b_cpnext;
352 bh = jh2bh(jh); 327 bh = jh2bh(jh);
353 if (!jbd_trylock_bh_state(bh)) { 328 if (!jbd_trylock_bh_state(bh)) {
354 jbd_sync_bh(journal, bh); 329 jbd_sync_bh(journal, bh);
330 spin_lock(&journal->j_list_lock);
355 retry = 1; 331 retry = 1;
356 break; 332 break;
357 } 333 }
358 retry = __process_buffer(journal, jh, bhs, 334 retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
359 &batch_count); 335 if (cond_resched_lock(&journal->j_list_lock)) {
360 if (!retry &&
361 lock_need_resched(&journal->j_list_lock)) {
362 spin_unlock(&journal->j_list_lock);
363 retry = 1; 336 retry = 1;
364 break; 337 break;
365 } 338 }
366 } 339 } while (jh != last_jh && !retry);
367 340
368 if (batch_count) { 341 if (batch_count) {
369 if (!retry) {
370 spin_unlock(&journal->j_list_lock);
371 retry = 1;
372 }
373 __flush_batch(journal, bhs, &batch_count); 342 __flush_batch(journal, bhs, &batch_count);
343 retry = 1;
374 } 344 }
375 345
376 if (retry) {
377 spin_lock(&journal->j_list_lock);
378 goto restart;
379 }
380 /* 346 /*
381 * Now we have cleaned up the first transaction's checkpoint 347 * If someone cleaned up this transaction while we slept, we're
382 * list. Let's clean up the second one. 348 * done
349 */
350 if (journal->j_checkpoint_transactions != transaction)
351 break;
352 if (retry)
353 continue;
354 /*
355 * Maybe it's a new transaction, but it fell at the same
356 * address
383 */ 357 */
384 __wait_cp_io(journal, transaction); 358 if (transaction->t_tid != this_tid)
359 continue;
360 /*
361 * We have walked the whole transaction list without
362 * finding anything to write to disk. We had better be
363 * able to make some progress or we are in trouble.
364 */
365 cleanup_ret = __cleanup_transaction(journal, transaction);
366 J_ASSERT(drop_count != 0 || cleanup_ret != 0);
367 if (journal->j_checkpoint_transactions != transaction)
368 break;
385 } 369 }
386out:
387 spin_unlock(&journal->j_list_lock); 370 spin_unlock(&journal->j_list_lock);
388 result = cleanup_journal_tail(journal); 371 result = cleanup_journal_tail(journal);
389 if (result < 0) 372 if (result < 0)
390 return result; 373 return result;
374
391 return 0; 375 return 0;
392} 376}
393 377
@@ -472,91 +456,52 @@ int cleanup_journal_tail(journal_t *journal)
472/* Checkpoint list management */ 456/* Checkpoint list management */
473 457
474/* 458/*
475 * journal_clean_one_cp_list
476 *
477 * Find all the written-back checkpoint buffers in the given list and release them.
478 *
479 * Called with the journal locked.
480 * Called with j_list_lock held.
481 * Returns number of bufers reaped (for debug)
482 */
483
484static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
485{
486 struct journal_head *last_jh;
487 struct journal_head *next_jh = jh;
488 int ret, freed = 0;
489
490 *released = 0;
491 if (!jh)
492 return 0;
493
494 last_jh = jh->b_cpprev;
495 do {
496 jh = next_jh;
497 next_jh = jh->b_cpnext;
498 /* Use trylock because of the ranking */
499 if (jbd_trylock_bh_state(jh2bh(jh))) {
500 ret = __try_to_free_cp_buf(jh);
501 if (ret) {
502 freed++;
503 if (ret == 2) {
504 *released = 1;
505 return freed;
506 }
507 }
508 }
509 /*
510 * This function only frees up some memory if possible so we
511 * don't have an obligation to finish processing. Bail out if
512 * preemption requested:
513 */
514 if (need_resched())
515 return freed;
516 } while (jh != last_jh);
517
518 return freed;
519}
520
521/*
522 * journal_clean_checkpoint_list 459 * journal_clean_checkpoint_list
523 * 460 *
524 * Find all the written-back checkpoint buffers in the journal and release them. 461 * Find all the written-back checkpoint buffers in the journal and release them.
525 * 462 *
526 * Called with the journal locked. 463 * Called with the journal locked.
527 * Called with j_list_lock held. 464 * Called with j_list_lock held.
528 * Returns number of buffers reaped (for debug) 465 * Returns number of buffers reaped (for debug)
529 */ 466 */
530 467
531int __journal_clean_checkpoint_list(journal_t *journal) 468int __journal_clean_checkpoint_list(journal_t *journal)
532{ 469{
533 transaction_t *transaction, *last_transaction, *next_transaction; 470 transaction_t *transaction, *last_transaction, *next_transaction;
534 int ret = 0, released; 471 int ret = 0;
535 472
536 transaction = journal->j_checkpoint_transactions; 473 transaction = journal->j_checkpoint_transactions;
537 if (!transaction) 474 if (transaction == 0)
538 goto out; 475 goto out;
539 476
540 last_transaction = transaction->t_cpprev; 477 last_transaction = transaction->t_cpprev;
541 next_transaction = transaction; 478 next_transaction = transaction;
542 do { 479 do {
480 struct journal_head *jh;
481
543 transaction = next_transaction; 482 transaction = next_transaction;
544 next_transaction = transaction->t_cpnext; 483 next_transaction = transaction->t_cpnext;
545 ret += journal_clean_one_cp_list(transaction-> 484 jh = transaction->t_checkpoint_list;
546 t_checkpoint_list, &released); 485 if (jh) {
547 if (need_resched()) 486 struct journal_head *last_jh = jh->b_cpprev;
548 goto out; 487 struct journal_head *next_jh = jh;
549 if (released) 488
550 continue; 489 do {
551 /* 490 jh = next_jh;
552 * It is essential that we are as careful as in the case of 491 next_jh = jh->b_cpnext;
553 * t_checkpoint_list with removing the buffer from the list as 492 /* Use trylock because of the ranking */
554 * we can possibly see not yet submitted buffers on io_list 493 if (jbd_trylock_bh_state(jh2bh(jh)))
555 */ 494 ret += __try_to_free_cp_buf(jh);
556 ret += journal_clean_one_cp_list(transaction-> 495 /*
557 t_checkpoint_io_list, &released); 496 * This function only frees up some memory
558 if (need_resched()) 497 * if possible so we dont have an obligation
559 goto out; 498 * to finish processing. Bail out if preemption
499 * requested:
500 */
501 if (need_resched())
502 goto out;
503 } while (jh != last_jh);
504 }
560 } while (transaction != last_transaction); 505 } while (transaction != last_transaction);
561out: 506out:
562 return ret; 507 return ret;
@@ -571,22 +516,18 @@ out:
571 * buffer updates committed in that transaction have safely been stored 516 * buffer updates committed in that transaction have safely been stored
572 * elsewhere on disk. To achieve this, all of the buffers in a 517 * elsewhere on disk. To achieve this, all of the buffers in a
573 * transaction need to be maintained on the transaction's checkpoint 518 * transaction need to be maintained on the transaction's checkpoint
574 * lists until they have been rewritten, at which point this function is 519 * list until they have been rewritten, at which point this function is
575 * called to remove the buffer from the existing transaction's 520 * called to remove the buffer from the existing transaction's
576 * checkpoint lists. 521 * checkpoint list.
577 *
578 * The function returns 1 if it frees the transaction, 0 otherwise.
579 * 522 *
580 * This function is called with the journal locked. 523 * This function is called with the journal locked.
581 * This function is called with j_list_lock held. 524 * This function is called with j_list_lock held.
582 * This function is called with jbd_lock_bh_state(jh2bh(jh))
583 */ 525 */
584 526
585int __journal_remove_checkpoint(struct journal_head *jh) 527void __journal_remove_checkpoint(struct journal_head *jh)
586{ 528{
587 transaction_t *transaction; 529 transaction_t *transaction;
588 journal_t *journal; 530 journal_t *journal;
589 int ret = 0;
590 531
591 JBUFFER_TRACE(jh, "entry"); 532 JBUFFER_TRACE(jh, "entry");
592 533
@@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
597 journal = transaction->t_journal; 538 journal = transaction->t_journal;
598 539
599 __buffer_unlink(jh); 540 __buffer_unlink(jh);
600 jh->b_cp_transaction = NULL;
601 541
602 if (transaction->t_checkpoint_list != NULL || 542 if (transaction->t_checkpoint_list != NULL)
603 transaction->t_checkpoint_io_list != NULL)
604 goto out; 543 goto out;
605 JBUFFER_TRACE(jh, "transaction has no more buffers"); 544 JBUFFER_TRACE(jh, "transaction has no more buffers");
606 545
@@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
626 /* Just in case anybody was waiting for more transactions to be 565 /* Just in case anybody was waiting for more transactions to be
627 checkpointed... */ 566 checkpointed... */
628 wake_up(&journal->j_wait_logspace); 567 wake_up(&journal->j_wait_logspace);
629 ret = 1;
630out: 568out:
631 JBUFFER_TRACE(jh, "exit"); 569 JBUFFER_TRACE(jh, "exit");
632 return ret;
633} 570}
634 571
635/* 572/*
@@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
691 J_ASSERT(transaction->t_shadow_list == NULL); 628 J_ASSERT(transaction->t_shadow_list == NULL);
692 J_ASSERT(transaction->t_log_list == NULL); 629 J_ASSERT(transaction->t_log_list == NULL);
693 J_ASSERT(transaction->t_checkpoint_list == NULL); 630 J_ASSERT(transaction->t_checkpoint_list == NULL);
694 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
695 J_ASSERT(transaction->t_updates == 0); 631 J_ASSERT(transaction->t_updates == 0);
696 J_ASSERT(journal->j_committing_transaction != transaction); 632 J_ASSERT(journal->j_committing_transaction != transaction);
697 J_ASSERT(journal->j_running_transaction != transaction); 633 J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 29e62d98bae6..002ad2bbc769 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -829,8 +829,7 @@ restart_loop:
829 journal->j_committing_transaction = NULL; 829 journal->j_committing_transaction = NULL;
830 spin_unlock(&journal->j_state_lock); 830 spin_unlock(&journal->j_state_lock);
831 831
832 if (commit_transaction->t_checkpoint_list == NULL && 832 if (commit_transaction->t_checkpoint_list == NULL) {
833 commit_transaction->t_checkpoint_io_list == NULL) {
834 __journal_drop_transaction(journal, commit_transaction); 833 __journal_drop_transaction(journal, commit_transaction);
835 } else { 834 } else {
836 if (journal->j_checkpoint_transactions == NULL) { 835 if (journal->j_checkpoint_transactions == NULL) {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 429f4b263cf1..ca917973c2c0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1308,6 +1308,7 @@ int journal_stop(handle_t *handle)
1308 transaction_t *transaction = handle->h_transaction; 1308 transaction_t *transaction = handle->h_transaction;
1309 journal_t *journal = transaction->t_journal; 1309 journal_t *journal = transaction->t_journal;
1310 int old_handle_count, err; 1310 int old_handle_count, err;
1311 pid_t pid;
1311 1312
1312 J_ASSERT(transaction->t_updates > 0); 1313 J_ASSERT(transaction->t_updates > 0);
1313 J_ASSERT(journal_current_handle() == handle); 1314 J_ASSERT(journal_current_handle() == handle);
@@ -1333,8 +1334,15 @@ int journal_stop(handle_t *handle)
1333 * It doesn't cost much - we're about to run a commit and sleep 1334 * It doesn't cost much - we're about to run a commit and sleep
1334 * on IO anyway. Speeds up many-threaded, many-dir operations 1335 * on IO anyway. Speeds up many-threaded, many-dir operations
1335 * by 30x or more... 1336 * by 30x or more...
1337 *
1338 * But don't do this if this process was the most recent one to
1339 * perform a synchronous write. We do this to detect the case where a
1340 * single process is doing a stream of sync writes. No point in waiting
1341 * for joiners in that case.
1336 */ 1342 */
1337 if (handle->h_sync) { 1343 pid = current->pid;
1344 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1345 journal->j_last_sync_writer = pid;
1338 do { 1346 do {
1339 old_handle_count = transaction->t_handle_count; 1347 old_handle_count = transaction->t_handle_count;
1340 schedule_timeout_uninterruptible(1); 1348 schedule_timeout_uninterruptible(1);
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index b2e95421d932..ce7b54b0b2b7 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -1965,7 +1965,7 @@ retry:
1965 iovec_cnt++; 1965 iovec_cnt++;
1966 1966
1967 if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) { 1967 if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) {
1968 static char allff[3]={255,255,255}; 1968 static unsigned char allff[3]={255,255,255};
1969 /* Add some extra padding if necessary */ 1969 /* Add some extra padding if necessary */
1970 node_iovec[iovec_cnt].iov_base = allff; 1970 node_iovec[iovec_cnt].iov_base = allff;
1971 node_iovec[iovec_cnt].iov_len = 1971 node_iovec[iovec_cnt].iov_len =
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index b635e167a3fa..d4d0c41490cd 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
406 int err = 0, pointed = 0; 406 int err = 0, pointed = 0;
407 struct jffs2_eraseblock *jeb; 407 struct jffs2_eraseblock *jeb;
408 unsigned char *buffer; 408 unsigned char *buffer;
409 uint32_t crc, ofs, retlen, len; 409 uint32_t crc, ofs, len;
410 size_t retlen;
410 411
411 BUG_ON(tn->csize == 0); 412 BUG_ON(tn->csize == 0);
412 413
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 5f0652df5d47..f1695642d0f7 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
112 * negative error code on failure. 112 * negative error code on failure.
113 */ 113 */
114static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 114static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
115 struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp, 115 struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp,
116 uint32_t *latest_mctime, uint32_t *mctime_ver) 116 uint32_t *latest_mctime, uint32_t *mctime_ver)
117{ 117{
118 struct jffs2_full_dirent *fd; 118 struct jffs2_full_dirent *fd;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3e51dd1da8aa..cf55b221fc2b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -233,7 +233,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
233 c->nextblock->dirty_size = 0; 233 c->nextblock->dirty_size = 0;
234 } 234 }
235#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 235#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
236 if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { 236 if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) {
237 /* If we're going to start writing into a block which already 237 /* If we're going to start writing into a block which already
238 contains data, and the end of the data isn't page-aligned, 238 contains data, and the end of the data isn't page-aligned,
239 skip a little and align it. */ 239 skip a little and align it. */
diff --git a/fs/libfs.c b/fs/libfs.c
index 63c020e6589e..71fd08fa4103 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -388,6 +388,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
388 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 388 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
389 inode->i_op = &simple_dir_inode_operations; 389 inode->i_op = &simple_dir_inode_operations;
390 inode->i_fop = &simple_dir_operations; 390 inode->i_fop = &simple_dir_operations;
391 inode->i_nlink = 2;
391 root = d_alloc_root(inode); 392 root = d_alloc_root(inode);
392 if (!root) { 393 if (!root) {
393 iput(inode); 394 iput(inode);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 3eaf6e701087..da6354baa0b8 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout)
111/* 111/*
112 * The server lockd has called us back to tell us the lock was granted 112 * The server lockd has called us back to tell us the lock was granted
113 */ 113 */
114u32 114u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
115nlmclnt_grant(struct nlm_lock *lock)
116{ 115{
116 const struct file_lock *fl = &lock->fl;
117 const struct nfs_fh *fh = &lock->fh;
117 struct nlm_wait *block; 118 struct nlm_wait *block;
118 u32 res = nlm_lck_denied; 119 u32 res = nlm_lck_denied;
119 120
@@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock)
122 * Warning: must not use cookie to match it! 123 * Warning: must not use cookie to match it!
123 */ 124 */
124 list_for_each_entry(block, &nlm_blocked, b_list) { 125 list_for_each_entry(block, &nlm_blocked, b_list) {
125 if (nlm_compare_locks(block->b_lock, &lock->fl)) { 126 struct file_lock *fl_blocked = block->b_lock;
126 /* Alright, we found a lock. Set the return status 127
127 * and wake up the caller 128 if (!nlm_compare_locks(fl_blocked, fl))
128 */ 129 continue;
129 block->b_status = NLM_LCK_GRANTED; 130 if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
130 wake_up(&block->b_wait); 131 continue;
131 res = nlm_granted; 132 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0)
132 } 133 continue;
134 /* Alright, we found a lock. Set the return status
135 * and wake up the caller
136 */
137 block->b_status = NLM_LCK_GRANTED;
138 wake_up(&block->b_wait);
139 res = nlm_granted;
133 } 140 }
134 return res; 141 return res;
135} 142}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 145524039577..970b6a6aa337 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -22,12 +22,14 @@
22#define NLMDBG_FACILITY NLMDBG_CLIENT 22#define NLMDBG_FACILITY NLMDBG_CLIENT
23#define NLMCLNT_GRACE_WAIT (5*HZ) 23#define NLMCLNT_GRACE_WAIT (5*HZ)
24#define NLMCLNT_POLL_TIMEOUT (30*HZ) 24#define NLMCLNT_POLL_TIMEOUT (30*HZ)
25#define NLMCLNT_MAX_RETRIES 3
25 26
26static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); 27static int nlmclnt_test(struct nlm_rqst *, struct file_lock *);
27static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); 28static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
28static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); 29static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
29static int nlm_stat_to_errno(u32 stat); 30static int nlm_stat_to_errno(u32 stat);
30static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); 31static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
32static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *);
31 33
32static const struct rpc_call_ops nlmclnt_unlock_ops; 34static const struct rpc_call_ops nlmclnt_unlock_ops;
33static const struct rpc_call_ops nlmclnt_cancel_ops; 35static const struct rpc_call_ops nlmclnt_cancel_ops;
@@ -598,7 +600,7 @@ out_unblock:
598 nlmclnt_finish_block(req); 600 nlmclnt_finish_block(req);
599 /* Cancel the blocked request if it is still pending */ 601 /* Cancel the blocked request if it is still pending */
600 if (resp->status == NLM_LCK_BLOCKED) 602 if (resp->status == NLM_LCK_BLOCKED)
601 nlmclnt_cancel(host, fl); 603 nlmclnt_cancel(host, req->a_args.block, fl);
602out: 604out:
603 nlmclnt_release_lockargs(req); 605 nlmclnt_release_lockargs(req);
604 return status; 606 return status;
@@ -660,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
660 * reclaimed while we're stuck in the unlock call. */ 662 * reclaimed while we're stuck in the unlock call. */
661 fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED; 663 fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED;
662 664
665 /*
666 * Note: the server is supposed to either grant us the unlock
667 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
668 * case, we want to unlock.
669 */
670 do_vfs_lock(fl);
671
663 if (req->a_flags & RPC_TASK_ASYNC) { 672 if (req->a_flags & RPC_TASK_ASYNC) {
664 status = nlmclnt_async_call(req, NLMPROC_UNLOCK, 673 status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
665 &nlmclnt_unlock_ops); 674 &nlmclnt_unlock_ops);
666 /* Hrmf... Do the unlock early since locks_remove_posix() 675 /* Hrmf... Do the unlock early since locks_remove_posix()
667 * really expects us to free the lock synchronously */ 676 * really expects us to free the lock synchronously */
668 do_vfs_lock(fl);
669 if (status < 0) { 677 if (status < 0) {
670 nlmclnt_release_lockargs(req); 678 nlmclnt_release_lockargs(req);
671 kfree(req); 679 kfree(req);
@@ -678,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
678 if (status < 0) 686 if (status < 0)
679 return status; 687 return status;
680 688
681 do_vfs_lock(fl);
682 if (resp->status == NLM_LCK_GRANTED) 689 if (resp->status == NLM_LCK_GRANTED)
683 return 0; 690 return 0;
684 691
@@ -728,8 +735,7 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = {
728 * We always use an async RPC call for this in order not to hang a 735 * We always use an async RPC call for this in order not to hang a
729 * process that has been Ctrl-C'ed. 736 * process that has been Ctrl-C'ed.
730 */ 737 */
731int 738static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl)
732nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
733{ 739{
734 struct nlm_rqst *req; 740 struct nlm_rqst *req;
735 unsigned long flags; 741 unsigned long flags;
@@ -750,6 +756,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
750 req->a_flags = RPC_TASK_ASYNC; 756 req->a_flags = RPC_TASK_ASYNC;
751 757
752 nlmclnt_setlockargs(req, fl); 758 nlmclnt_setlockargs(req, fl);
759 req->a_args.block = block;
753 760
754 status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); 761 status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
755 if (status < 0) { 762 if (status < 0) {
@@ -801,6 +808,9 @@ die:
801 return; 808 return;
802 809
803retry_cancel: 810retry_cancel:
811 /* Don't ever retry more than 3 times */
812 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
813 goto die;
804 nlm_rebind_host(req->a_host); 814 nlm_rebind_host(req->a_host);
805 rpc_restart_call(task); 815 rpc_restart_call(task);
806 rpc_delay(task, 30 * HZ); 816 rpc_delay(task, 30 * HZ);
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4063095d849e..b10f913aa06a 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
228 resp->cookie = argp->cookie; 228 resp->cookie = argp->cookie;
229 229
230 dprintk("lockd: GRANTED called\n"); 230 dprintk("lockd: GRANTED called\n");
231 resp->status = nlmclnt_grant(&argp->lock); 231 resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock);
232 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 232 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
233 return rpc_success; 233 return rpc_success;
234} 234}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3bc437e0cf5b..35681d9cf1fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
256 resp->cookie = argp->cookie; 256 resp->cookie = argp->cookie;
257 257
258 dprintk("lockd: GRANTED called\n"); 258 dprintk("lockd: GRANTED called\n");
259 resp->status = nlmclnt_grant(&argp->lock); 259 resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock);
260 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 260 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
261 return rpc_success; 261 return rpc_success;
262} 262}
diff --git a/fs/namei.c b/fs/namei.c
index 4acdac043b6b..8dc2b038d5d9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -790,7 +790,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
790 790
791 inode = nd->dentry->d_inode; 791 inode = nd->dentry->d_inode;
792 if (nd->depth) 792 if (nd->depth)
793 lookup_flags = LOOKUP_FOLLOW; 793 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
794 794
795 /* At this point we know we have a real path component. */ 795 /* At this point we know we have a real path component. */
796 for(;;) { 796 for(;;) {
@@ -885,7 +885,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
885last_with_slashes: 885last_with_slashes:
886 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 886 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
887last_component: 887last_component:
888 nd->flags &= ~LOOKUP_CONTINUE; 888 /* Clear LOOKUP_CONTINUE iff it was previously unset */
889 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
889 if (lookup_flags & LOOKUP_PARENT) 890 if (lookup_flags & LOOKUP_PARENT)
890 goto lookup_parent; 891 goto lookup_parent;
891 if (this.name[0] == '.') switch (this.len) { 892 if (this.name[0] == '.') switch (this.len) {
@@ -1069,6 +1070,8 @@ static int fastcall do_path_lookup(int dfd, const char *name,
1069 unsigned int flags, struct nameidata *nd) 1070 unsigned int flags, struct nameidata *nd)
1070{ 1071{
1071 int retval = 0; 1072 int retval = 0;
1073 int fput_needed;
1074 struct file *file;
1072 1075
1073 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1076 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1074 nd->flags = flags; 1077 nd->flags = flags;
@@ -1090,29 +1093,22 @@ static int fastcall do_path_lookup(int dfd, const char *name,
1090 nd->mnt = mntget(current->fs->pwdmnt); 1093 nd->mnt = mntget(current->fs->pwdmnt);
1091 nd->dentry = dget(current->fs->pwd); 1094 nd->dentry = dget(current->fs->pwd);
1092 } else { 1095 } else {
1093 struct file *file;
1094 int fput_needed;
1095 struct dentry *dentry; 1096 struct dentry *dentry;
1096 1097
1097 file = fget_light(dfd, &fput_needed); 1098 file = fget_light(dfd, &fput_needed);
1098 if (!file) { 1099 retval = -EBADF;
1099 retval = -EBADF; 1100 if (!file)
1100 goto out_fail; 1101 goto unlock_fail;
1101 }
1102 1102
1103 dentry = file->f_dentry; 1103 dentry = file->f_dentry;
1104 1104
1105 if (!S_ISDIR(dentry->d_inode->i_mode)) { 1105 retval = -ENOTDIR;
1106 retval = -ENOTDIR; 1106 if (!S_ISDIR(dentry->d_inode->i_mode))
1107 fput_light(file, fput_needed); 1107 goto fput_unlock_fail;
1108 goto out_fail;
1109 }
1110 1108
1111 retval = file_permission(file, MAY_EXEC); 1109 retval = file_permission(file, MAY_EXEC);
1112 if (retval) { 1110 if (retval)
1113 fput_light(file, fput_needed); 1111 goto fput_unlock_fail;
1114 goto out_fail;
1115 }
1116 1112
1117 nd->mnt = mntget(file->f_vfsmnt); 1113 nd->mnt = mntget(file->f_vfsmnt);
1118 nd->dentry = dget(dentry); 1114 nd->dentry = dget(dentry);
@@ -1123,10 +1119,17 @@ static int fastcall do_path_lookup(int dfd, const char *name,
1123 current->total_link_count = 0; 1119 current->total_link_count = 0;
1124 retval = link_path_walk(name, nd); 1120 retval = link_path_walk(name, nd);
1125out: 1121out:
1126 if (unlikely(current->audit_context 1122 if (likely(retval == 0)) {
1127 && nd && nd->dentry && nd->dentry->d_inode)) 1123 if (unlikely(current->audit_context && nd && nd->dentry &&
1124 nd->dentry->d_inode))
1128 audit_inode(name, nd->dentry->d_inode, flags); 1125 audit_inode(name, nd->dentry->d_inode, flags);
1129out_fail: 1126 }
1127 return retval;
1128
1129fput_unlock_fail:
1130 fput_light(file, fput_needed);
1131unlock_fail:
1132 read_unlock(&current->fs->lock);
1130 return retval; 1133 return retval;
1131} 1134}
1132 1135
@@ -1161,6 +1164,7 @@ static int __path_lookup_intent_open(int dfd, const char *name,
1161 1164
1162/** 1165/**
1163 * path_lookup_open - lookup a file path with open intent 1166 * path_lookup_open - lookup a file path with open intent
1167 * @dfd: the directory to use as base, or AT_FDCWD
1164 * @name: pointer to file name 1168 * @name: pointer to file name
1165 * @lookup_flags: lookup intent flags 1169 * @lookup_flags: lookup intent flags
1166 * @nd: pointer to nameidata 1170 * @nd: pointer to nameidata
@@ -1175,6 +1179,7 @@ int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
1175 1179
1176/** 1180/**
1177 * path_lookup_create - lookup a file path with open + create intent 1181 * path_lookup_create - lookup a file path with open + create intent
1182 * @dfd: the directory to use as base, or AT_FDCWD
1178 * @name: pointer to file name 1183 * @name: pointer to file name
1179 * @lookup_flags: lookup intent flags 1184 * @lookup_flags: lookup intent flags
1180 * @nd: pointer to nameidata 1185 * @nd: pointer to nameidata
@@ -2219,13 +2224,17 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2219 * and other special files. --ADM 2224 * and other special files. --ADM
2220 */ 2225 */
2221asmlinkage long sys_linkat(int olddfd, const char __user *oldname, 2226asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2222 int newdfd, const char __user *newname) 2227 int newdfd, const char __user *newname,
2228 int flags)
2223{ 2229{
2224 struct dentry *new_dentry; 2230 struct dentry *new_dentry;
2225 struct nameidata nd, old_nd; 2231 struct nameidata nd, old_nd;
2226 int error; 2232 int error;
2227 char * to; 2233 char * to;
2228 2234
2235 if (flags != 0)
2236 return -EINVAL;
2237
2229 to = getname(newname); 2238 to = getname(newname);
2230 if (IS_ERR(to)) 2239 if (IS_ERR(to))
2231 return PTR_ERR(to); 2240 return PTR_ERR(to);
@@ -2258,7 +2267,7 @@ exit:
2258 2267
2259asmlinkage long sys_link(const char __user *oldname, const char __user *newname) 2268asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
2260{ 2269{
2261 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname); 2270 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2262} 2271}
2263 2272
2264/* 2273/*
@@ -2604,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2604 } 2613 }
2605} 2614}
2606 2615
2607int page_symlink(struct inode *inode, const char *symname, int len) 2616int __page_symlink(struct inode *inode, const char *symname, int len,
2617 gfp_t gfp_mask)
2608{ 2618{
2609 struct address_space *mapping = inode->i_mapping; 2619 struct address_space *mapping = inode->i_mapping;
2610 struct page *page = grab_cache_page(mapping, 0); 2620 struct page *page;
2611 int err = -ENOMEM; 2621 int err = -ENOMEM;
2612 char *kaddr; 2622 char *kaddr;
2613 2623
2624 page = find_or_create_page(mapping, 0, gfp_mask);
2614 if (!page) 2625 if (!page)
2615 goto fail; 2626 goto fail;
2616 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); 2627 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
@@ -2645,6 +2656,12 @@ fail:
2645 return err; 2656 return err;
2646} 2657}
2647 2658
2659int page_symlink(struct inode *inode, const char *symname, int len)
2660{
2661 return __page_symlink(inode, symname, len,
2662 mapping_gfp_mask(inode->i_mapping));
2663}
2664
2648struct inode_operations page_symlink_inode_operations = { 2665struct inode_operations page_symlink_inode_operations = {
2649 .readlink = generic_readlink, 2666 .readlink = generic_readlink,
2650 .follow_link = page_follow_link_light, 2667 .follow_link = page_follow_link_light,
@@ -2663,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len);
2663EXPORT_SYMBOL(page_follow_link_light); 2680EXPORT_SYMBOL(page_follow_link_light);
2664EXPORT_SYMBOL(page_put_link); 2681EXPORT_SYMBOL(page_put_link);
2665EXPORT_SYMBOL(page_readlink); 2682EXPORT_SYMBOL(page_readlink);
2683EXPORT_SYMBOL(__page_symlink);
2666EXPORT_SYMBOL(page_symlink); 2684EXPORT_SYMBOL(page_symlink);
2667EXPORT_SYMBOL(page_symlink_inode_operations); 2685EXPORT_SYMBOL(page_symlink_inode_operations);
2668EXPORT_SYMBOL(path_lookup); 2686EXPORT_SYMBOL(path_lookup);
diff --git a/fs/namespace.c b/fs/namespace.c
index ce97becff461..058a44865beb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -494,7 +494,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
494 p->mnt_namespace = NULL; 494 p->mnt_namespace = NULL;
495 list_del_init(&p->mnt_child); 495 list_del_init(&p->mnt_child);
496 if (p->mnt_parent != p) 496 if (p->mnt_parent != p)
497 mnt->mnt_mountpoint->d_mounted--; 497 p->mnt_mountpoint->d_mounted--;
498 change_mnt_propagation(p, MS_PRIVATE); 498 change_mnt_propagation(p, MS_PRIVATE);
499 } 499 }
500} 500}
@@ -1325,27 +1325,17 @@ dput_out:
1325 return retval; 1325 return retval;
1326} 1326}
1327 1327
1328int copy_namespace(int flags, struct task_struct *tsk) 1328/*
1329 * Allocate a new namespace structure and populate it with contents
1330 * copied from the namespace of the passed in task structure.
1331 */
1332struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
1329{ 1333{
1330 struct namespace *namespace = tsk->namespace; 1334 struct namespace *namespace = tsk->namespace;
1331 struct namespace *new_ns; 1335 struct namespace *new_ns;
1332 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; 1336 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
1333 struct fs_struct *fs = tsk->fs;
1334 struct vfsmount *p, *q; 1337 struct vfsmount *p, *q;
1335 1338
1336 if (!namespace)
1337 return 0;
1338
1339 get_namespace(namespace);
1340
1341 if (!(flags & CLONE_NEWNS))
1342 return 0;
1343
1344 if (!capable(CAP_SYS_ADMIN)) {
1345 put_namespace(namespace);
1346 return -EPERM;
1347 }
1348
1349 new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); 1339 new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
1350 if (!new_ns) 1340 if (!new_ns)
1351 goto out; 1341 goto out;
@@ -1396,8 +1386,6 @@ int copy_namespace(int flags, struct task_struct *tsk)
1396 } 1386 }
1397 up_write(&namespace_sem); 1387 up_write(&namespace_sem);
1398 1388
1399 tsk->namespace = new_ns;
1400
1401 if (rootmnt) 1389 if (rootmnt)
1402 mntput(rootmnt); 1390 mntput(rootmnt);
1403 if (pwdmnt) 1391 if (pwdmnt)
@@ -1405,12 +1393,40 @@ int copy_namespace(int flags, struct task_struct *tsk)
1405 if (altrootmnt) 1393 if (altrootmnt)
1406 mntput(altrootmnt); 1394 mntput(altrootmnt);
1407 1395
1408 put_namespace(namespace); 1396out:
1409 return 0; 1397 return new_ns;
1398}
1399
1400int copy_namespace(int flags, struct task_struct *tsk)
1401{
1402 struct namespace *namespace = tsk->namespace;
1403 struct namespace *new_ns;
1404 int err = 0;
1405
1406 if (!namespace)
1407 return 0;
1408
1409 get_namespace(namespace);
1410
1411 if (!(flags & CLONE_NEWNS))
1412 return 0;
1413
1414 if (!capable(CAP_SYS_ADMIN)) {
1415 err = -EPERM;
1416 goto out;
1417 }
1418
1419 new_ns = dup_namespace(tsk, tsk->fs);
1420 if (!new_ns) {
1421 err = -ENOMEM;
1422 goto out;
1423 }
1424
1425 tsk->namespace = new_ns;
1410 1426
1411out: 1427out:
1412 put_namespace(namespace); 1428 put_namespace(namespace);
1413 return -ENOMEM; 1429 return err;
1414} 1430}
1415 1431
1416asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, 1432asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 10ae377e68ff..4e9b3a1b36c5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -57,6 +57,7 @@
57#define NFSDBG_FACILITY NFSDBG_VFS 57#define NFSDBG_FACILITY NFSDBG_VFS
58#define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) 58#define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT)
59 59
60static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
60static kmem_cache_t *nfs_direct_cachep; 61static kmem_cache_t *nfs_direct_cachep;
61 62
62/* 63/*
@@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
107 page_count, (rw == READ), 0, 108 page_count, (rw == READ), 0,
108 *pages, NULL); 109 *pages, NULL);
109 up_read(&current->mm->mmap_sem); 110 up_read(&current->mm->mmap_sem);
111 /*
112 * If we got fewer pages than expected from get_user_pages(),
113 * the user buffer runs off the end of a mapping; return EFAULT.
114 */
115 if (result >= 0 && result < page_count) {
116 nfs_free_user_pages(*pages, result, 0);
117 *pages = NULL;
118 result = -EFAULT;
119 }
110 } 120 }
111 return result; 121 return result;
112} 122}
@@ -481,7 +491,7 @@ retry:
481 if (wdata->verf.committed != NFS_FILE_SYNC) { 491 if (wdata->verf.committed != NFS_FILE_SYNC) {
482 need_commit = 1; 492 need_commit = 1;
483 if (memcmp(&first_verf.verifier, &wdata->verf.verifier, 493 if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
484 sizeof(first_verf.verifier))); 494 sizeof(first_verf.verifier)))
485 goto sync_retry; 495 goto sync_retry;
486 } 496 }
487 497
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 984ca3454d04..f8c0066e02e1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
1430 if (status == 0) 1430 if (status == 0)
1431 status = nfs4_do_fsinfo(server, fhandle, info); 1431 status = nfs4_do_fsinfo(server, fhandle, info);
1432out: 1432out:
1433 return status; 1433 return nfs4_map_errors(status);
1434} 1434}
1435 1435
1436static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 1436static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index e897e00c2c9d..c0a754ecdee6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -465,10 +465,11 @@ static int __init root_nfs_ports(void)
465 "number from server, using default\n"); 465 "number from server, using default\n");
466 port = nfsd_port; 466 port = nfsd_port;
467 } 467 }
468 nfs_port = htons(port); 468 nfs_port = port;
469 dprintk("Root-NFS: Portmapper on server returned %d " 469 dprintk("Root-NFS: Portmapper on server returned %d "
470 "as nfsd port\n", port); 470 "as nfsd port\n", port);
471 } 471 }
472 nfs_port = htons(nfs_port);
472 473
473 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { 474 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) {
474 printk(KERN_ERR "Root-NFS: Unable to get mountd port " 475 printk(KERN_ERR "Root-NFS: Unable to get mountd port "
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a00fe8686293..6d63f1d9e5f5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -195,10 +195,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
195 195
196 /* Openowner is now set, so sequence id will get bumped. Now we need 196 /* Openowner is now set, so sequence id will get bumped. Now we need
197 * these checks before we do any creates: */ 197 * these checks before we do any creates: */
198 status = nfserr_grace;
198 if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) 199 if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
199 return nfserr_grace; 200 goto out;
201 status = nfserr_no_grace;
200 if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) 202 if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
201 return nfserr_no_grace; 203 goto out;
202 204
203 switch (open->op_claim_type) { 205 switch (open->op_claim_type) {
204 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 206 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 89ed04696865..1d163b616915 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -64,6 +64,32 @@ struct nfsd_list {
64}; 64};
65static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list); 65static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list);
66 66
67#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
68static struct svc_stat nfsd_acl_svcstats;
69static struct svc_version * nfsd_acl_version[] = {
70 [2] = &nfsd_acl_version2,
71 [3] = &nfsd_acl_version3,
72};
73
74#define NFSD_ACL_MINVERS 2
75#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0]))
76static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS];
77
78static struct svc_program nfsd_acl_program = {
79 .pg_prog = NFS_ACL_PROGRAM,
80 .pg_nvers = NFSD_ACL_NRVERS,
81 .pg_vers = nfsd_acl_versions,
82 .pg_name = "nfsd",
83 .pg_class = "nfsd",
84 .pg_stats = &nfsd_acl_svcstats,
85 .pg_authenticate = &svc_set_client,
86};
87
88static struct svc_stat nfsd_acl_svcstats = {
89 .program = &nfsd_acl_program,
90};
91#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
92
67static struct svc_version * nfsd_version[] = { 93static struct svc_version * nfsd_version[] = {
68 [2] = &nfsd_version2, 94 [2] = &nfsd_version2,
69#if defined(CONFIG_NFSD_V3) 95#if defined(CONFIG_NFSD_V3)
@@ -79,6 +105,9 @@ static struct svc_version * nfsd_version[] = {
79static struct svc_version *nfsd_versions[NFSD_NRVERS]; 105static struct svc_version *nfsd_versions[NFSD_NRVERS];
80 106
81struct svc_program nfsd_program = { 107struct svc_program nfsd_program = {
108#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
109 .pg_next = &nfsd_acl_program,
110#endif
82 .pg_prog = NFS_PROGRAM, /* program number */ 111 .pg_prog = NFS_PROGRAM, /* program number */
83 .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ 112 .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */
84 .pg_vers = nfsd_versions, /* version table */ 113 .pg_vers = nfsd_versions, /* version table */
@@ -147,6 +176,26 @@ nfsd_svc(unsigned short port, int nrservs)
147 nfsd_program.pg_vers[i] = nfsd_version[i]; 176 nfsd_program.pg_vers[i] = nfsd_version[i];
148 } 177 }
149 178
179
180#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
181 found_one = 0;
182
183 for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) {
184 if (NFSCTL_VERISSET(nfsd_versbits, i)) {
185 nfsd_acl_program.pg_vers[i] =
186 nfsd_acl_version[i];
187 found_one = 1;
188 } else
189 nfsd_acl_program.pg_vers[i] = NULL;
190 }
191
192 if (!found_one) {
193 for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++)
194 nfsd_acl_program.pg_vers[i] =
195 nfsd_acl_version[i];
196 }
197#endif
198
150 atomic_set(&nfsd_busy, 0); 199 atomic_set(&nfsd_busy, 0);
151 error = -ENOMEM; 200 error = -ENOMEM;
152 nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); 201 nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE);
@@ -411,30 +460,3 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
411 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 460 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
412 return 1; 461 return 1;
413} 462}
414
415#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
416static struct svc_stat nfsd_acl_svcstats;
417static struct svc_version * nfsd_acl_version[] = {
418 [2] = &nfsd_acl_version2,
419 [3] = &nfsd_acl_version3,
420};
421
422#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0]))
423static struct svc_program nfsd_acl_program = {
424 .pg_prog = NFS_ACL_PROGRAM,
425 .pg_nvers = NFSD_ACL_NRVERS,
426 .pg_vers = nfsd_acl_version,
427 .pg_name = "nfsd",
428 .pg_class = "nfsd",
429 .pg_stats = &nfsd_acl_svcstats,
430 .pg_authenticate = &svc_set_client,
431};
432
433static struct svc_stat nfsd_acl_svcstats = {
434 .program = &nfsd_acl_program,
435};
436
437#define nfsd_acl_program_p &nfsd_acl_program
438#else
439#define nfsd_acl_program_p NULL
440#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 02f44094bda9..9d8ffa89e2c2 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,9 +1,9 @@
1ToDo/Notes: 1ToDo/Notes:
2 - Find and fix bugs. 2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are 3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_sem is 4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers 5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_sem that we play nice... Also need to be careful 6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage. 7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed 8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index 9 write and the other attribute resize/write cases like index
@@ -19,6 +19,24 @@ ToDo/Notes:
19 - Enable the code for setting the NT4 compatibility flag when we start 19 - Enable the code for setting the NT4 compatibility flag when we start
20 making NTFS 1.2 specific modifications. 20 making NTFS 1.2 specific modifications.
21 21
222.1.26 - Minor bug fixes and updates.
23
24 - Fix a potential overflow in file.c where a cast to s64 was missing in
25 a left shift of a page index.
26 - The struct inode has had its i_sem semaphore changed to a mutex named
27 i_mutex.
28 - We have struct kmem_cache now so use it instead of the typedef
29 kmem_cache_t. (Pekka Enberg)
30 - Implement support for sector sizes above 512 bytes (up to the maximum
31 supported by NTFS which is 4096 bytes).
32 - Do more detailed reporting of why we cannot mount read-write by
33 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
34 - Miscellaneous updates to layout.h.
35 - Cope with attribute list attribute having invalid flags. Windows
36 copes with this and even chkdsk does not detect or fix this so we
37 have to cope with it, too. Thanks to Pawel Kot for reporting the
38 problem.
39
222.1.25 - (Almost) fully implement write(2) and truncate(2). 402.1.25 - (Almost) fully implement write(2) and truncate(2).
23 41
24 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and 42 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
@@ -373,7 +391,7 @@ ToDo/Notes:
373 single one of them had an mst error. (Thanks to Ken MacFerrin for 391 single one of them had an mst error. (Thanks to Ken MacFerrin for
374 the bug report.) 392 the bug report.)
375 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date() 393 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
376 where we failed to release i_sem on the $Quota/$Q attribute inode. 394 where we failed to release i_mutex on the $Quota/$Q attribute inode.
377 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup(). 395 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
378 - Add mapping of unmapped buffers to all remaining code paths, i.e. 396 - Add mapping of unmapped buffers to all remaining code paths, i.e.
379 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(), 397 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
@@ -874,7 +892,7 @@ ToDo/Notes:
874 clusters. (Philipp Thomas) 892 clusters. (Philipp Thomas)
875 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a 893 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
876 multiple of the block_size but not the cluster size. (Szabolcs 894 multiple of the block_size but not the cluster size. (Szabolcs
877 Szakacsits <szaka@sienet.hu>) 895 Szakacsits)
878 896
8792.1.2 - Important bug fixes aleviating the hangs in statfs. 8972.1.2 - Important bug fixes aleviating the hangs in statfs.
880 898
@@ -884,7 +902,7 @@ ToDo/Notes:
884 902
885 - Add handling for initialized_size != data_size in compressed files. 903 - Add handling for initialized_size != data_size in compressed files.
886 - Reduce function local stack usage from 0x3d4 bytes to just noise in 904 - Reduce function local stack usage from 0x3d4 bytes to just noise in
887 fs/ntfs/upcase.c. (Randy Dunlap <rdunlap@xenotime.net>) 905 fs/ntfs/upcase.c. (Randy Dunlap)
888 - Remove compiler warnings for newer gcc. 906 - Remove compiler warnings for newer gcc.
889 - Pages are no longer kmapped by mm/filemap.c::generic_file_write() 907 - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
890 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately 908 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
@@ -1201,11 +1219,11 @@ ToDo/Notes:
1201 the kernel. We probably want a kernel generic init_address_space() 1219 the kernel. We probably want a kernel generic init_address_space()
1202 function... 1220 function...
1203 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The 1221 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
1204 only caller of ->readdir() is vfs_readdir() which holds i_sem during 1222 only caller of ->readdir() is vfs_readdir() which holds i_mutex
1205 the call, and i_sem is sufficient protection against changes in the 1223 during the call, and i_mutex is sufficient protection against changes
1206 directory inode (including ->i_size). 1224 in the directory inode (including ->i_size).
1207 - Use generic_file_llseek() for directories (as opposed to 1225 - Use generic_file_llseek() for directories (as opposed to
1208 default_llseek()) as this downs i_sem instead of the BKL which is 1226 default_llseek()) as this downs i_mutex instead of the BKL which is
1209 what we now need for exclusion against ->f_pos changes considering we 1227 what we now need for exclusion against ->f_pos changes considering we
1210 no longer take the BKL in ntfs_readdir(). 1228 no longer take the BKL in ntfs_readdir().
1211 1229
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index d0d45d1c853a..d95fac7fdeb6 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.26\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 1c0a4315876a..7e361da770b3 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
2 * aops.c - NTFS kernel address space operations and page cache handling. 2 * aops.c - NTFS kernel address space operations and page cache handling.
3 * Part of the Linux-NTFS project. 3 * Part of the Linux-NTFS project.
4 * 4 *
5 * Copyright (c) 2001-2005 Anton Altaparmakov 5 * Copyright (c) 2001-2006 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon 6 * Copyright (c) 2002 Richard Russon
7 * 7 *
8 * This program/include file is free software; you can redistribute it and/or 8 * This program/include file is free software; you can redistribute it and/or
@@ -200,8 +200,8 @@ static int ntfs_read_block(struct page *page)
200 /* $MFT/$DATA must have its complete runlist in memory at all times. */ 200 /* $MFT/$DATA must have its complete runlist in memory at all times. */
201 BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); 201 BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
202 202
203 blocksize_bits = VFS_I(ni)->i_blkbits; 203 blocksize = vol->sb->s_blocksize;
204 blocksize = 1 << blocksize_bits; 204 blocksize_bits = vol->sb->s_blocksize_bits;
205 205
206 if (!page_has_buffers(page)) { 206 if (!page_has_buffers(page)) {
207 create_empty_buffers(page, blocksize, 0); 207 create_empty_buffers(page, blocksize, 0);
@@ -569,10 +569,8 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
569 569
570 BUG_ON(!NInoNonResident(ni)); 570 BUG_ON(!NInoNonResident(ni));
571 BUG_ON(NInoMstProtected(ni)); 571 BUG_ON(NInoMstProtected(ni));
572 572 blocksize = vol->sb->s_blocksize;
573 blocksize_bits = vi->i_blkbits; 573 blocksize_bits = vol->sb->s_blocksize_bits;
574 blocksize = 1 << blocksize_bits;
575
576 if (!page_has_buffers(page)) { 574 if (!page_has_buffers(page)) {
577 BUG_ON(!PageUptodate(page)); 575 BUG_ON(!PageUptodate(page));
578 create_empty_buffers(page, blocksize, 576 create_empty_buffers(page, blocksize,
@@ -949,8 +947,8 @@ static int ntfs_write_mst_block(struct page *page,
949 */ 947 */
950 BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || 948 BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
951 (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); 949 (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
952 bh_size_bits = vi->i_blkbits; 950 bh_size = vol->sb->s_blocksize;
953 bh_size = 1 << bh_size_bits; 951 bh_size_bits = vol->sb->s_blocksize_bits;
954 max_bhs = PAGE_CACHE_SIZE / bh_size; 952 max_bhs = PAGE_CACHE_SIZE / bh_size;
955 BUG_ON(!max_bhs); 953 BUG_ON(!max_bhs);
956 BUG_ON(max_bhs > MAX_BUF_PER_PAGE); 954 BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
@@ -1596,7 +1594,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
1596 1594
1597 BUG_ON(!PageUptodate(page)); 1595 BUG_ON(!PageUptodate(page));
1598 end = ofs + ni->itype.index.block_size; 1596 end = ofs + ni->itype.index.block_size;
1599 bh_size = 1 << VFS_I(ni)->i_blkbits; 1597 bh_size = VFS_I(ni)->i_sb->s_blocksize;
1600 spin_lock(&mapping->private_lock); 1598 spin_lock(&mapping->private_lock);
1601 if (unlikely(!page_has_buffers(page))) { 1599 if (unlikely(!page_has_buffers(page))) {
1602 spin_unlock(&mapping->private_lock); 1600 spin_unlock(&mapping->private_lock);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index fb413d3d8618..5027d3d1b3fe 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2005 Anton Altaparmakov 4 * Copyright (c) 2001-2006 Anton Altaparmakov
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -248,7 +248,7 @@ do_non_resident_extend:
248 * enough to make ntfs_writepage() work. 248 * enough to make ntfs_writepage() work.
249 */ 249 */
250 write_lock_irqsave(&ni->size_lock, flags); 250 write_lock_irqsave(&ni->size_lock, flags);
251 ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT; 251 ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
252 if (ni->initialized_size > new_init_size) 252 if (ni->initialized_size > new_init_size)
253 ni->initialized_size = new_init_size; 253 ni->initialized_size = new_init_size;
254 write_unlock_irqrestore(&ni->size_lock, flags); 254 write_unlock_irqrestore(&ni->size_lock, flags);
@@ -529,8 +529,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
529 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 529 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
530 vi->i_ino, ni->type, pages[0]->index, nr_pages, 530 vi->i_ino, ni->type, pages[0]->index, nr_pages,
531 (long long)pos, bytes); 531 (long long)pos, bytes);
532 blocksize_bits = vi->i_blkbits; 532 blocksize = vol->sb->s_blocksize;
533 blocksize = 1 << blocksize_bits; 533 blocksize_bits = vol->sb->s_blocksize_bits;
534 u = 0; 534 u = 0;
535 do { 535 do {
536 struct page *page = pages[u]; 536 struct page *page = pages[u];
@@ -1525,7 +1525,7 @@ static inline int ntfs_commit_pages_after_non_resident_write(
1525 1525
1526 vi = pages[0]->mapping->host; 1526 vi = pages[0]->mapping->host;
1527 ni = NTFS_I(vi); 1527 ni = NTFS_I(vi);
1528 blocksize = 1 << vi->i_blkbits; 1528 blocksize = vi->i_sb->s_blocksize;
1529 end = pos + bytes; 1529 end = pos + bytes;
1530 u = 0; 1530 u = 0;
1531 do { 1531 do {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ea1bd3feea1b..55263b7de9c0 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -677,13 +677,28 @@ static int ntfs_read_locked_inode(struct inode *vi)
677 ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); 677 ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
678 NInoSetAttrList(ni); 678 NInoSetAttrList(ni);
679 a = ctx->attr; 679 a = ctx->attr;
680 if (a->flags & ATTR_IS_ENCRYPTED || 680 if (a->flags & ATTR_COMPRESSION_MASK) {
681 a->flags & ATTR_COMPRESSION_MASK ||
682 a->flags & ATTR_IS_SPARSE) {
683 ntfs_error(vi->i_sb, "Attribute list attribute is " 681 ntfs_error(vi->i_sb, "Attribute list attribute is "
684 "compressed/encrypted/sparse."); 682 "compressed.");
685 goto unm_err_out; 683 goto unm_err_out;
686 } 684 }
685 if (a->flags & ATTR_IS_ENCRYPTED ||
686 a->flags & ATTR_IS_SPARSE) {
687 if (a->non_resident) {
688 ntfs_error(vi->i_sb, "Non-resident attribute "
689 "list attribute is encrypted/"
690 "sparse.");
691 goto unm_err_out;
692 }
693 ntfs_warning(vi->i_sb, "Resident attribute list "
694 "attribute in inode 0x%lx is marked "
695 "encrypted/sparse which is not true. "
696 "However, Windows allows this and "
697 "chkdsk does not detect or correct it "
698 "so we will just ignore the invalid "
699 "flags and pretend they are not set.",
700 vi->i_ino);
701 }
687 /* Now allocate memory for the attribute list. */ 702 /* Now allocate memory for the attribute list. */
688 ni->attr_list_size = (u32)ntfs_attr_size(a); 703 ni->attr_list_size = (u32)ntfs_attr_size(a);
689 ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); 704 ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
@@ -1809,19 +1824,33 @@ int ntfs_read_inode_mount(struct inode *vi)
1809 } else /* if (!err) */ { 1824 } else /* if (!err) */ {
1810 ATTR_LIST_ENTRY *al_entry, *next_al_entry; 1825 ATTR_LIST_ENTRY *al_entry, *next_al_entry;
1811 u8 *al_end; 1826 u8 *al_end;
1827 static const char *es = " Not allowed. $MFT is corrupt. "
1828 "You should run chkdsk.";
1812 1829
1813 ntfs_debug("Attribute list attribute found in $MFT."); 1830 ntfs_debug("Attribute list attribute found in $MFT.");
1814 NInoSetAttrList(ni); 1831 NInoSetAttrList(ni);
1815 a = ctx->attr; 1832 a = ctx->attr;
1816 if (a->flags & ATTR_IS_ENCRYPTED || 1833 if (a->flags & ATTR_COMPRESSION_MASK) {
1817 a->flags & ATTR_COMPRESSION_MASK ||
1818 a->flags & ATTR_IS_SPARSE) {
1819 ntfs_error(sb, "Attribute list attribute is " 1834 ntfs_error(sb, "Attribute list attribute is "
1820 "compressed/encrypted/sparse. Not " 1835 "compressed.%s", es);
1821 "allowed. $MFT is corrupt. You should "
1822 "run chkdsk.");
1823 goto put_err_out; 1836 goto put_err_out;
1824 } 1837 }
1838 if (a->flags & ATTR_IS_ENCRYPTED ||
1839 a->flags & ATTR_IS_SPARSE) {
1840 if (a->non_resident) {
1841 ntfs_error(sb, "Non-resident attribute list "
1842 "attribute is encrypted/"
1843 "sparse.%s", es);
1844 goto put_err_out;
1845 }
1846 ntfs_warning(sb, "Resident attribute list attribute "
1847 "in $MFT system file is marked "
1848 "encrypted/sparse which is not true. "
1849 "However, Windows allows this and "
1850 "chkdsk does not detect or correct it "
1851 "so we will just ignore the invalid "
1852 "flags and pretend they are not set.");
1853 }
1825 /* Now allocate memory for the attribute list. */ 1854 /* Now allocate memory for the attribute list. */
1826 ni->attr_list_size = (u32)ntfs_attr_size(a); 1855 ni->attr_list_size = (u32)ntfs_attr_size(a);
1827 ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); 1856 ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index f5678d5d7919..bb408d4dcbb0 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -838,15 +838,19 @@ enum {
838 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, 838 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
839 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask 839 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
840 is used to to obtain all flags that are valid for setting. */ 840 is used to to obtain all flags that are valid for setting. */
841
842 /* 841 /*
843 * The following flags are only present in the FILE_NAME attribute (in 842 * The following flag is only present in the FILE_NAME attribute (in
844 * the field file_attributes). 843 * the field file_attributes).
845 */ 844 */
846 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), 845 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000),
847 /* Note, this is a copy of the corresponding bit from the mft record, 846 /* Note, this is a copy of the corresponding bit from the mft record,
848 telling us whether this is a directory or not, i.e. whether it has 847 telling us whether this is a directory or not, i.e. whether it has
849 an index root attribute or not. */ 848 an index root attribute or not. */
849 /*
850 * The following flag is present both in the STANDARD_INFORMATION
851 * attribute and in the FILE_NAME attribute (in the field
852 * file_attributes).
853 */
850 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), 854 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000),
851 /* Note, this is a copy of the corresponding bit from the mft record, 855 /* Note, this is a copy of the corresponding bit from the mft record,
852 telling us whether this file has a view index present (eg. object id 856 telling us whether this file has a view index present (eg. object id
@@ -1071,9 +1075,15 @@ typedef struct {
1071 modified. */ 1075 modified. */
1072/* 20*/ sle64 last_access_time; /* Time this mft record was last 1076/* 20*/ sle64 last_access_time; /* Time this mft record was last
1073 accessed. */ 1077 accessed. */
1074/* 28*/ sle64 allocated_size; /* Byte size of allocated space for the 1078/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space
1075 data attribute. NOTE: Is a multiple 1079 for the data attribute. So for
1076 of the cluster size. */ 1080 normal $DATA, this is the
1081 allocated_size from the unnamed
1082 $DATA attribute and for compressed
1083 and/or sparse $DATA, this is the
1084 compressed_size from the unnamed
1085 $DATA attribute. NOTE: This is a
1086 multiple of the cluster size. */
1077/* 30*/ sle64 data_size; /* Byte size of actual data in data 1087/* 30*/ sle64 data_size; /* Byte size of actual data in data
1078 attribute. */ 1088 attribute. */
1079/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ 1089/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
@@ -1904,12 +1914,13 @@ enum {
1904 VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), 1914 VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010),
1905 VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), 1915 VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020),
1906 1916
1917 VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000),
1907 VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), 1918 VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000),
1908 1919
1909 VOLUME_FLAGS_MASK = const_cpu_to_le16(0x803f), 1920 VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f),
1910 1921
1911 /* To make our life easier when checking if we must mount read-only. */ 1922 /* To make our life easier when checking if we must mount read-only. */
1912 VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0x8027), 1923 VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027),
1913} __attribute__ ((__packed__)); 1924} __attribute__ ((__packed__));
1914 1925
1915typedef le16 VOLUME_FLAGS; 1926typedef le16 VOLUME_FLAGS;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0c65cbb8c5cf..6499aafc2258 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2005 Anton Altaparmakov 4 * Copyright (c) 2001-2006 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -473,7 +473,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
473 runlist_element *rl; 473 runlist_element *rl;
474 unsigned int block_start, block_end, m_start, m_end, page_ofs; 474 unsigned int block_start, block_end, m_start, m_end, page_ofs;
475 int i_bhs, nr_bhs, err = 0; 475 int i_bhs, nr_bhs, err = 0;
476 unsigned char blocksize_bits = vol->mftmirr_ino->i_blkbits; 476 unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
477 477
478 ntfs_debug("Entering for inode 0x%lx.", mft_no); 478 ntfs_debug("Entering for inode 0x%lx.", mft_no);
479 BUG_ON(!max_bhs); 479 BUG_ON(!max_bhs);
@@ -672,8 +672,8 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
672{ 672{
673 ntfs_volume *vol = ni->vol; 673 ntfs_volume *vol = ni->vol;
674 struct page *page = ni->page; 674 struct page *page = ni->page;
675 unsigned char blocksize_bits = vol->mft_ino->i_blkbits; 675 unsigned int blocksize = vol->sb->s_blocksize;
676 unsigned int blocksize = 1 << blocksize_bits; 676 unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
677 int max_bhs = vol->mft_record_size / blocksize; 677 int max_bhs = vol->mft_record_size / blocksize;
678 struct buffer_head *bhs[max_bhs]; 678 struct buffer_head *bhs[max_bhs];
679 struct buffer_head *bh, *head; 679 struct buffer_head *bh, *head;
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index 446b5014115c..653d2a5c4899 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -50,11 +50,11 @@ typedef enum {
50/* Global variables. */ 50/* Global variables. */
51 51
52/* Slab caches (from super.c). */ 52/* Slab caches (from super.c). */
53extern kmem_cache_t *ntfs_name_cache; 53extern struct kmem_cache *ntfs_name_cache;
54extern kmem_cache_t *ntfs_inode_cache; 54extern struct kmem_cache *ntfs_inode_cache;
55extern kmem_cache_t *ntfs_big_inode_cache; 55extern struct kmem_cache *ntfs_big_inode_cache;
56extern kmem_cache_t *ntfs_attr_ctx_cache; 56extern struct kmem_cache *ntfs_attr_ctx_cache;
57extern kmem_cache_t *ntfs_index_ctx_cache; 57extern struct kmem_cache *ntfs_index_ctx_cache;
58 58
59/* The various operations structs defined throughout the driver files. */ 59/* The various operations structs defined throughout the driver files. */
60extern struct address_space_operations ntfs_aops; 60extern struct address_space_operations ntfs_aops;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index c3a3f1a8310b..368a8ec10668 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2005 Anton Altaparmakov 4 * Copyright (c) 2001-2006 Anton Altaparmakov
5 * Copyright (c) 2001,2002 Richard Russon 5 * Copyright (c) 2001,2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -22,6 +22,7 @@
22 22
23#include <linux/stddef.h> 23#include <linux/stddef.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/slab.h>
25#include <linux/string.h> 26#include <linux/string.h>
26#include <linux/spinlock.h> 27#include <linux/spinlock.h>
27#include <linux/blkdev.h> /* For bdev_hardsect_size(). */ 28#include <linux/blkdev.h> /* For bdev_hardsect_size(). */
@@ -471,9 +472,16 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
471 ntfs_error(sb, "Volume is dirty and read-only%s", es); 472 ntfs_error(sb, "Volume is dirty and read-only%s", es);
472 return -EROFS; 473 return -EROFS;
473 } 474 }
475 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
476 ntfs_error(sb, "Volume has been modified by chkdsk "
477 "and is read-only%s", es);
478 return -EROFS;
479 }
474 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 480 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
475 ntfs_error(sb, "Volume has unsupported flags set and " 481 ntfs_error(sb, "Volume has unsupported flags set "
476 "is read-only%s", es); 482 "(0x%x) and is read-only%s",
483 (unsigned)le16_to_cpu(vol->vol_flags),
484 es);
477 return -EROFS; 485 return -EROFS;
478 } 486 }
479 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 487 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
@@ -641,7 +649,7 @@ static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
641{ 649{
642 const char *read_err_str = "Unable to read %s boot sector."; 650 const char *read_err_str = "Unable to read %s boot sector.";
643 struct buffer_head *bh_primary, *bh_backup; 651 struct buffer_head *bh_primary, *bh_backup;
644 long nr_blocks = NTFS_SB(sb)->nr_blocks; 652 sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
645 653
646 /* Try to read primary boot sector. */ 654 /* Try to read primary boot sector. */
647 if ((bh_primary = sb_bread(sb, 0))) { 655 if ((bh_primary = sb_bread(sb, 0))) {
@@ -688,13 +696,18 @@ hotfix_primary_boot_sector:
688 /* 696 /*
689 * If we managed to read sector zero and the volume is not 697 * If we managed to read sector zero and the volume is not
690 * read-only, copy the found, valid backup boot sector to the 698 * read-only, copy the found, valid backup boot sector to the
691 * primary boot sector. 699 * primary boot sector. Note we only copy the actual boot
700 * sector structure, not the actual whole device sector as that
701 * may be bigger and would potentially damage the $Boot system
702 * file (FIXME: Would be nice to know if the backup boot sector
703 * on a large sector device contains the whole boot loader or
704 * just the first 512 bytes).
692 */ 705 */
693 if (!(sb->s_flags & MS_RDONLY)) { 706 if (!(sb->s_flags & MS_RDONLY)) {
694 ntfs_warning(sb, "Hot-fix: Recovering invalid primary " 707 ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
695 "boot sector from backup copy."); 708 "boot sector from backup copy.");
696 memcpy(bh_primary->b_data, bh_backup->b_data, 709 memcpy(bh_primary->b_data, bh_backup->b_data,
697 sb->s_blocksize); 710 NTFS_BLOCK_SIZE);
698 mark_buffer_dirty(bh_primary); 711 mark_buffer_dirty(bh_primary);
699 sync_dirty_buffer(bh_primary); 712 sync_dirty_buffer(bh_primary);
700 if (buffer_uptodate(bh_primary)) { 713 if (buffer_uptodate(bh_primary)) {
@@ -733,9 +746,13 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
733 vol->sector_size); 746 vol->sector_size);
734 ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, 747 ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
735 vol->sector_size_bits); 748 vol->sector_size_bits);
736 if (vol->sector_size != vol->sb->s_blocksize) 749 if (vol->sector_size < vol->sb->s_blocksize) {
737 ntfs_warning(vol->sb, "The boot sector indicates a sector size " 750 ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
738 "different from the device sector size."); 751 "device block size (%lu). This is not "
752 "supported. Sorry.", vol->sector_size,
753 vol->sb->s_blocksize);
754 return FALSE;
755 }
739 ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); 756 ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
740 sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; 757 sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
741 ntfs_debug("sectors_per_cluster_bits = 0x%x", 758 ntfs_debug("sectors_per_cluster_bits = 0x%x",
@@ -748,16 +765,11 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
748 ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, 765 ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
749 vol->cluster_size); 766 vol->cluster_size);
750 ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); 767 ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
751 ntfs_debug("vol->cluster_size_bits = %i (0x%x)", 768 ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
752 vol->cluster_size_bits, vol->cluster_size_bits); 769 if (vol->cluster_size < vol->sector_size) {
753 if (vol->sector_size > vol->cluster_size) { 770 ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
754 ntfs_error(vol->sb, "Sector sizes above the cluster size are " 771 "sector size (%i). This is not supported. "
755 "not supported. Sorry."); 772 "Sorry.", vol->cluster_size, vol->sector_size);
756 return FALSE;
757 }
758 if (vol->sb->s_blocksize > vol->cluster_size) {
759 ntfs_error(vol->sb, "Cluster sizes smaller than the device "
760 "sector size are not supported. Sorry.");
761 return FALSE; 773 return FALSE;
762 } 774 }
763 clusters_per_mft_record = b->clusters_per_mft_record; 775 clusters_per_mft_record = b->clusters_per_mft_record;
@@ -786,11 +798,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
786 * we store $MFT/$DATA, the table of mft records in the page cache. 798 * we store $MFT/$DATA, the table of mft records in the page cache.
787 */ 799 */
788 if (vol->mft_record_size > PAGE_CACHE_SIZE) { 800 if (vol->mft_record_size > PAGE_CACHE_SIZE) {
789 ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the " 801 ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
790 "page cache size on your system %lu (0x%lx). " 802 "PAGE_CACHE_SIZE on your system (%lu). "
791 "This is not supported. Sorry.", 803 "This is not supported. Sorry.",
792 vol->mft_record_size, vol->mft_record_size, 804 vol->mft_record_size, PAGE_CACHE_SIZE);
793 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE); 805 return FALSE;
806 }
807 /* We cannot support mft record sizes below the sector size. */
808 if (vol->mft_record_size < vol->sector_size) {
809 ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
810 "sector size (%i). This is not supported. "
811 "Sorry.", vol->mft_record_size,
812 vol->sector_size);
794 return FALSE; 813 return FALSE;
795 } 814 }
796 clusters_per_index_record = b->clusters_per_index_record; 815 clusters_per_index_record = b->clusters_per_index_record;
@@ -816,6 +835,14 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
816 ntfs_debug("vol->index_record_size_bits = %i (0x%x)", 835 ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
817 vol->index_record_size_bits, 836 vol->index_record_size_bits,
818 vol->index_record_size_bits); 837 vol->index_record_size_bits);
838 /* We cannot support index record sizes below the sector size. */
839 if (vol->index_record_size < vol->sector_size) {
840 ntfs_error(vol->sb, "Index record size (%i) is smaller than "
841 "the sector size (%i). This is not "
842 "supported. Sorry.", vol->index_record_size,
843 vol->sector_size);
844 return FALSE;
845 }
819 /* 846 /*
820 * Get the size of the volume in clusters and check for 64-bit-ness. 847 * Get the size of the volume in clusters and check for 64-bit-ness.
821 * Windows currently only uses 32 bits to save the clusters so we do 848 * Windows currently only uses 32 bits to save the clusters so we do
@@ -845,15 +872,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
845 } 872 }
846 ll = sle64_to_cpu(b->mft_lcn); 873 ll = sle64_to_cpu(b->mft_lcn);
847 if (ll >= vol->nr_clusters) { 874 if (ll >= vol->nr_clusters) {
848 ntfs_error(vol->sb, "MFT LCN is beyond end of volume. Weird."); 875 ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
876 "volume. Weird.", (unsigned long long)ll,
877 (unsigned long long)ll);
849 return FALSE; 878 return FALSE;
850 } 879 }
851 vol->mft_lcn = ll; 880 vol->mft_lcn = ll;
852 ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); 881 ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
853 ll = sle64_to_cpu(b->mftmirr_lcn); 882 ll = sle64_to_cpu(b->mftmirr_lcn);
854 if (ll >= vol->nr_clusters) { 883 if (ll >= vol->nr_clusters) {
855 ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume. " 884 ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
856 "Weird."); 885 "of volume. Weird.", (unsigned long long)ll,
886 (unsigned long long)ll);
857 return FALSE; 887 return FALSE;
858 } 888 }
859 vol->mftmirr_lcn = ll; 889 vol->mftmirr_lcn = ll;
@@ -1822,11 +1852,24 @@ get_ctx_vol_failed:
1822 /* Make sure that no unsupported volume flags are set. */ 1852 /* Make sure that no unsupported volume flags are set. */
1823 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 1853 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
1824 static const char *es1a = "Volume is dirty"; 1854 static const char *es1a = "Volume is dirty";
1825 static const char *es1b = "Volume has unsupported flags set"; 1855 static const char *es1b = "Volume has been modified by chkdsk";
1826 static const char *es2 = ". Run chkdsk and mount in Windows."; 1856 static const char *es1c = "Volume has unsupported flags set";
1827 const char *es1; 1857 static const char *es2a = ". Run chkdsk and mount in Windows.";
1828 1858 static const char *es2b = ". Mount in Windows.";
1829 es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b; 1859 const char *es1, *es2;
1860
1861 es2 = es2a;
1862 if (vol->vol_flags & VOLUME_IS_DIRTY)
1863 es1 = es1a;
1864 else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
1865 es1 = es1b;
1866 es2 = es2b;
1867 } else {
1868 es1 = es1c;
1869 ntfs_warning(sb, "Unsupported volume flags 0x%x "
1870 "encountered.",
1871 (unsigned)le16_to_cpu(vol->vol_flags));
1872 }
1830 /* If a read-write mount, convert it to a read-only mount. */ 1873 /* If a read-write mount, convert it to a read-only mount. */
1831 if (!(sb->s_flags & MS_RDONLY)) { 1874 if (!(sb->s_flags & MS_RDONLY)) {
1832 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | 1875 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
@@ -2685,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2685 ntfs_volume *vol; 2728 ntfs_volume *vol;
2686 struct buffer_head *bh; 2729 struct buffer_head *bh;
2687 struct inode *tmp_ino; 2730 struct inode *tmp_ino;
2688 int result; 2731 int blocksize, result;
2689 2732
2690 ntfs_debug("Entering."); 2733 ntfs_debug("Entering.");
2691#ifndef NTFS_RW 2734#ifndef NTFS_RW
@@ -2724,60 +2767,85 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2724 if (!parse_options(vol, (char*)opt)) 2767 if (!parse_options(vol, (char*)opt))
2725 goto err_out_now; 2768 goto err_out_now;
2726 2769
2770 /* We support sector sizes up to the PAGE_CACHE_SIZE. */
2771 if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
2772 if (!silent)
2773 ntfs_error(sb, "Device has unsupported sector size "
2774 "(%i). The maximum supported sector "
2775 "size on this architecture is %lu "
2776 "bytes.",
2777 bdev_hardsect_size(sb->s_bdev),
2778 PAGE_CACHE_SIZE);
2779 goto err_out_now;
2780 }
2727 /* 2781 /*
2728 * TODO: Fail safety check. In the future we should really be able to 2782 * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
2729 * cope with this being the case, but for now just bail out. 2783 * sector size, whichever is bigger.
2730 */ 2784 */
2731 if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) { 2785 blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
2786 if (blocksize < NTFS_BLOCK_SIZE) {
2732 if (!silent) 2787 if (!silent)
2733 ntfs_error(sb, "Device has unsupported hardsect_size."); 2788 ntfs_error(sb, "Unable to set device block size.");
2734 goto err_out_now; 2789 goto err_out_now;
2735 } 2790 }
2736 2791 BUG_ON(blocksize != sb->s_blocksize);
2737 /* Setup the device access block size to NTFS_BLOCK_SIZE. */ 2792 ntfs_debug("Set device block size to %i bytes (block size bits %i).",
2738 if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) { 2793 blocksize, sb->s_blocksize_bits);
2794 /* Determine the size of the device in units of block_size bytes. */
2795 if (!i_size_read(sb->s_bdev->bd_inode)) {
2739 if (!silent) 2796 if (!silent)
2740 ntfs_error(sb, "Unable to set block size."); 2797 ntfs_error(sb, "Unable to determine device size.");
2741 goto err_out_now; 2798 goto err_out_now;
2742 } 2799 }
2743
2744 /* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */
2745 vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> 2800 vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
2746 NTFS_BLOCK_SIZE_BITS; 2801 sb->s_blocksize_bits;
2747
2748 /* Read the boot sector and return unlocked buffer head to it. */ 2802 /* Read the boot sector and return unlocked buffer head to it. */
2749 if (!(bh = read_ntfs_boot_sector(sb, silent))) { 2803 if (!(bh = read_ntfs_boot_sector(sb, silent))) {
2750 if (!silent) 2804 if (!silent)
2751 ntfs_error(sb, "Not an NTFS volume."); 2805 ntfs_error(sb, "Not an NTFS volume.");
2752 goto err_out_now; 2806 goto err_out_now;
2753 } 2807 }
2754
2755 /* 2808 /*
2756 * Extract the data from the boot sector and setup the ntfs super block 2809 * Extract the data from the boot sector and setup the ntfs volume
2757 * using it. 2810 * using it.
2758 */ 2811 */
2759 result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); 2812 result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
2760
2761 /* Initialize the cluster and mft allocators. */
2762 ntfs_setup_allocators(vol);
2763
2764 brelse(bh); 2813 brelse(bh);
2765
2766 if (!result) { 2814 if (!result) {
2767 if (!silent) 2815 if (!silent)
2768 ntfs_error(sb, "Unsupported NTFS filesystem."); 2816 ntfs_error(sb, "Unsupported NTFS filesystem.");
2769 goto err_out_now; 2817 goto err_out_now;
2770 } 2818 }
2771
2772 /* 2819 /*
2773 * TODO: When we start coping with sector sizes different from 2820 * If the boot sector indicates a sector size bigger than the current
2774 * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the 2821 * device block size, switch the device block size to the sector size.
2775 * device (probably to NTFS_BLOCK_SIZE). 2822 * TODO: It may be possible to support this case even when the set
2823 * below fails, we would just be breaking up the i/o for each sector
2824 * into multiple blocks for i/o purposes but otherwise it should just
2825 * work. However it is safer to leave disabled until someone hits this
2826 * error message and then we can get them to try it without the setting
2827 * so we know for sure that it works.
2776 */ 2828 */
2777 2829 if (vol->sector_size > blocksize) {
2830 blocksize = sb_set_blocksize(sb, vol->sector_size);
2831 if (blocksize != vol->sector_size) {
2832 if (!silent)
2833 ntfs_error(sb, "Unable to set device block "
2834 "size to sector size (%i).",
2835 vol->sector_size);
2836 goto err_out_now;
2837 }
2838 BUG_ON(blocksize != sb->s_blocksize);
2839 vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
2840 sb->s_blocksize_bits;
2841 ntfs_debug("Changed device block size to %i bytes (block size "
2842 "bits %i) to match volume sector size.",
2843 blocksize, sb->s_blocksize_bits);
2844 }
2845 /* Initialize the cluster and mft allocators. */
2846 ntfs_setup_allocators(vol);
2778 /* Setup remaining fields in the super block. */ 2847 /* Setup remaining fields in the super block. */
2779 sb->s_magic = NTFS_SB_MAGIC; 2848 sb->s_magic = NTFS_SB_MAGIC;
2780
2781 /* 2849 /*
2782 * Ntfs allows 63 bits for the file size, i.e. correct would be: 2850 * Ntfs allows 63 bits for the file size, i.e. correct would be:
2783 * sb->s_maxbytes = ~0ULL >> 1; 2851 * sb->s_maxbytes = ~0ULL >> 1;
@@ -2787,9 +2855,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2787 * without overflowing the index or to 2^63 - 1, whichever is smaller. 2855 * without overflowing the index or to 2^63 - 1, whichever is smaller.
2788 */ 2856 */
2789 sb->s_maxbytes = MAX_LFS_FILESIZE; 2857 sb->s_maxbytes = MAX_LFS_FILESIZE;
2790 2858 /* Ntfs measures time in 100ns intervals. */
2791 sb->s_time_gran = 100; 2859 sb->s_time_gran = 100;
2792
2793 /* 2860 /*
2794 * Now load the metadata required for the page cache and our address 2861 * Now load the metadata required for the page cache and our address
2795 * space operations to function. We do this by setting up a specialised 2862 * space operations to function. We do this by setting up a specialised
@@ -2987,14 +3054,14 @@ err_out_now:
2987 * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN 3054 * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
2988 * (255) Unicode characters + a terminating NULL Unicode character. 3055 * (255) Unicode characters + a terminating NULL Unicode character.
2989 */ 3056 */
2990kmem_cache_t *ntfs_name_cache; 3057struct kmem_cache *ntfs_name_cache;
2991 3058
2992/* Slab caches for efficient allocation/deallocation of inodes. */ 3059/* Slab caches for efficient allocation/deallocation of inodes. */
2993kmem_cache_t *ntfs_inode_cache; 3060struct kmem_cache *ntfs_inode_cache;
2994kmem_cache_t *ntfs_big_inode_cache; 3061struct kmem_cache *ntfs_big_inode_cache;
2995 3062
2996/* Init once constructor for the inode slab cache. */ 3063/* Init once constructor for the inode slab cache. */
2997static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep, 3064static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep,
2998 unsigned long flags) 3065 unsigned long flags)
2999{ 3066{
3000 ntfs_inode *ni = (ntfs_inode *)foo; 3067 ntfs_inode *ni = (ntfs_inode *)foo;
@@ -3008,8 +3075,8 @@ static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep,
3008 * Slab caches to optimize allocations and deallocations of attribute search 3075 * Slab caches to optimize allocations and deallocations of attribute search
3009 * contexts and index contexts, respectively. 3076 * contexts and index contexts, respectively.
3010 */ 3077 */
3011kmem_cache_t *ntfs_attr_ctx_cache; 3078struct kmem_cache *ntfs_attr_ctx_cache;
3012kmem_cache_t *ntfs_index_ctx_cache; 3079struct kmem_cache *ntfs_index_ctx_cache;
3013 3080
3014/* Driver wide semaphore. */ 3081/* Driver wide semaphore. */
3015DECLARE_MUTEX(ntfs_lock); 3082DECLARE_MUTEX(ntfs_lock);
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 879cdf1d5bd3..9101807dc81a 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -3,10 +3,7 @@
3 * Part of the Linux-NTFS project. 3 * Part of the Linux-NTFS project.
4 * 4 *
5 * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org> 5 * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
6 * Copyright (c) 2001-2004 Anton Altaparmakov 6 * Copyright (c) 2001-2006 Anton Altaparmakov
7 *
8 * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov.
9 * Modified for kernel inclusion 10 September 2001 by Anton Altparmakov.
10 * 7 *
11 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the Free 9 * under the terms of the GNU General Public License as published by the Free
@@ -75,12 +72,13 @@ ntfschar *generate_default_upcase(void)
75 if (!uc) 72 if (!uc)
76 return uc; 73 return uc;
77 memset(uc, 0, default_upcase_len * sizeof(ntfschar)); 74 memset(uc, 0, default_upcase_len * sizeof(ntfschar));
75 /* Generate the little endian Unicode upcase table used by ntfs. */
78 for (i = 0; i < default_upcase_len; i++) 76 for (i = 0; i < default_upcase_len; i++)
79 uc[i] = cpu_to_le16(i); 77 uc[i] = cpu_to_le16(i);
80 for (r = 0; uc_run_table[r][0]; r++) 78 for (r = 0; uc_run_table[r][0]; r++)
81 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) 79 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
82 uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) + 80 uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) +
83 uc_run_table[r][2])); 81 uc_run_table[r][2]);
84 for (r = 0; uc_dup_table[r][0]; r++) 82 for (r = 0; uc_dup_table[r][0]; r++)
85 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) 83 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
86 uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1); 84 uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 375cd20a9f61..406ab55dfb32 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -2,7 +2,7 @@
2 * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part 2 * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
3 * of the Linux-NTFS project. 3 * of the Linux-NTFS project.
4 * 4 *
5 * Copyright (c) 2001-2005 Anton Altaparmakov 5 * Copyright (c) 2001-2006 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon 6 * Copyright (c) 2002 Richard Russon
7 * 7 *
8 * This program/include file is free software; you can redistribute it and/or 8 * This program/include file is free software; you can redistribute it and/or
@@ -41,10 +41,8 @@ typedef struct {
41 * structure has stabilized... (AIA) 41 * structure has stabilized... (AIA)
42 */ 42 */
43 /* Device specifics. */ 43 /* Device specifics. */
44 struct super_block *sb; /* Pointer back to the super_block, 44 struct super_block *sb; /* Pointer back to the super_block. */
45 so we don't have to get the offset 45 LCN nr_blocks; /* Number of sb->s_blocksize bytes
46 every time. */
47 LCN nr_blocks; /* Number of NTFS_BLOCK_SIZE bytes
48 sized blocks on the device. */ 46 sized blocks on the device. */
49 /* Configuration provided by user at mount time. */ 47 /* Configuration provided by user at mount time. */
50 unsigned long flags; /* Miscellaneous flags, see below. */ 48 unsigned long flags; /* Miscellaneous flags, see below. */
@@ -141,8 +139,8 @@ typedef enum {
141 NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ 139 NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */
142 NV_CaseSensitive, /* 1: Treat file names as case sensitive and 140 NV_CaseSensitive, /* 1: Treat file names as case sensitive and
143 create filenames in the POSIX namespace. 141 create filenames in the POSIX namespace.
144 Otherwise be case insensitive and create 142 Otherwise be case insensitive but still
145 file names in WIN32 namespace. */ 143 create file names in POSIX namespace. */
146 NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ 144 NV_LogFileEmpty, /* 1: $LogFile journal is empty. */
147 NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ 145 NV_QuotaOutOfDate, /* 1: $Quota is out of date. */
148 NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ 146 NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */
@@ -153,7 +151,7 @@ typedef enum {
153 * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() 151 * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
154 * functions. 152 * functions.
155 */ 153 */
156#define NVOL_FNS(flag) \ 154#define DEFINE_NVOL_BIT_OPS(flag) \
157static inline int NVol##flag(ntfs_volume *vol) \ 155static inline int NVol##flag(ntfs_volume *vol) \
158{ \ 156{ \
159 return test_bit(NV_##flag, &(vol)->flags); \ 157 return test_bit(NV_##flag, &(vol)->flags); \
@@ -168,12 +166,12 @@ static inline void NVolClear##flag(ntfs_volume *vol) \
168} 166}
169 167
170/* Emit the ntfs volume bitops functions. */ 168/* Emit the ntfs volume bitops functions. */
171NVOL_FNS(Errors) 169DEFINE_NVOL_BIT_OPS(Errors)
172NVOL_FNS(ShowSystemFiles) 170DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
173NVOL_FNS(CaseSensitive) 171DEFINE_NVOL_BIT_OPS(CaseSensitive)
174NVOL_FNS(LogFileEmpty) 172DEFINE_NVOL_BIT_OPS(LogFileEmpty)
175NVOL_FNS(QuotaOutOfDate) 173DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
176NVOL_FNS(UsnJrnlStamped) 174DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
177NVOL_FNS(SparseEnabled) 175DEFINE_NVOL_BIT_OPS(SparseEnabled)
178 176
179#endif /* _LINUX_NTFS_VOLUME_H */ 177#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d424041b38e9..bae3d7548bea 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
58 goto out; 58 goto out;
59 } 59 }
60 60
61 down(&OCFS2_I(inode)->ip_io_sem); 61 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
62 62
63 lock_buffer(bh); 63 lock_buffer(bh);
64 set_buffer_uptodate(bh); 64 set_buffer_uptodate(bh);
@@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
82 brelse(bh); 82 brelse(bh);
83 } 83 }
84 84
85 up(&OCFS2_I(inode)->ip_io_sem); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
86out: 86out:
87 mlog_exit(ret); 87 mlog_exit(ret);
88 return ret; 88 return ret;
@@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
125 flags &= ~OCFS2_BH_CACHED; 125 flags &= ~OCFS2_BH_CACHED;
126 126
127 if (inode) 127 if (inode)
128 down(&OCFS2_I(inode)->ip_io_sem); 128 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
129 for (i = 0 ; i < nr ; i++) { 129 for (i = 0 ; i < nr ; i++) {
130 if (bhs[i] == NULL) { 130 if (bhs[i] == NULL) {
131 bhs[i] = sb_getblk(sb, block++); 131 bhs[i] = sb_getblk(sb, block++);
132 if (bhs[i] == NULL) { 132 if (bhs[i] == NULL) {
133 if (inode) 133 if (inode)
134 up(&OCFS2_I(inode)->ip_io_sem); 134 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
135 status = -EIO; 135 status = -EIO;
136 mlog_errno(status); 136 mlog_errno(status);
137 goto bail; 137 goto bail;
@@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
220 ocfs2_set_buffer_uptodate(inode, bh); 220 ocfs2_set_buffer_uptodate(inode, bh);
221 } 221 }
222 if (inode) 222 if (inode)
223 up(&OCFS2_I(inode)->ip_io_sem); 223 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
224 224
225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, 225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); 226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 7307ba528913..d08971d29b63 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -917,8 +917,9 @@ static int o2hb_thread(void *data)
917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
918 918
919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", 919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
920 before_hb.tv_sec, before_hb.tv_usec, 920 before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
921 after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); 921 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
922 elapsed_msec);
922 923
923 if (elapsed_msec < reg->hr_timeout_ms) { 924 if (elapsed_msec < reg->hr_timeout_ms) {
924 /* the kthread api has blocked signals for us so no 925 /* the kthread api has blocked signals for us so no
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index fd741cea5705..636593bf4d17 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -74,6 +74,7 @@ struct mlog_attribute {
74#define define_mask(_name) { \ 74#define define_mask(_name) { \
75 .attr = { \ 75 .attr = { \
76 .name = #_name, \ 76 .name = #_name, \
77 .owner = THIS_MODULE, \
77 .mode = S_IRUGO | S_IWUSR, \ 78 .mode = S_IRUGO | S_IWUSR, \
78 }, \ 79 }, \
79 .mask = ML_##_name, \ 80 .mask = ML_##_name, \
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index e8c56a3d9c64..2cadc3009c83 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -256,7 +256,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
256 } \ 256 } \
257} while (0) 257} while (0)
258 258
259#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) 259#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) || (defined(CONFIG_UML_X86) && defined(CONFIG_64BIT))
260#define MLFi64 "lld" 260#define MLFi64 "lld"
261#define MLFu64 "llu" 261#define MLFu64 "llu"
262#define MLFx64 "llx" 262#define MLFx64 "llx"
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cf7828f23361..e1fceb8aa32d 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -756,7 +756,7 @@ static int __init init_o2nm(void)
756 if (!ocfs2_table_header) { 756 if (!ocfs2_table_header) {
757 printk(KERN_ERR "nodemanager: unable to register sysctl\n"); 757 printk(KERN_ERR "nodemanager: unable to register sysctl\n");
758 ret = -ENOMEM; /* or something. */ 758 ret = -ENOMEM; /* or something. */
759 goto out; 759 goto out_o2net;
760 } 760 }
761 761
762 ret = o2net_register_hb_callbacks(); 762 ret = o2net_register_hb_callbacks();
@@ -780,6 +780,8 @@ out_callbacks:
780 o2net_unregister_hb_callbacks(); 780 o2net_unregister_hb_callbacks();
781out_sysctl: 781out_sysctl:
782 unregister_sysctl_table(ocfs2_table_header); 782 unregister_sysctl_table(ocfs2_table_header);
783out_o2net:
784 o2net_exit();
783out: 785out:
784 return ret; 786 return ret;
785} 787}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 35d92c01a972..0f60cc0d3985 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data)
1285 mlog(ML_NOTICE, "here are some times that might help debug the " 1285 mlog(ML_NOTICE, "here are some times that might help debug the "
1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1288 sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, 1288 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
1289 now.tv_sec, now.tv_usec, 1289 now.tv_sec, (long) now.tv_usec,
1290 sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, 1290 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
1291 sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, 1291 sc->sc_tv_advance_start.tv_sec,
1292 sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, 1292 (long) sc->sc_tv_advance_start.tv_usec,
1293 sc->sc_tv_advance_stop.tv_sec,
1294 (long) sc->sc_tv_advance_stop.tv_usec,
1293 sc->sc_msg_key, sc->sc_msg_type, 1295 sc->sc_msg_key, sc->sc_msg_type,
1294 sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, 1296 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
1295 sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); 1297 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
1296 1298
1297 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 1299 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1298} 1300}
@@ -1316,7 +1318,7 @@ static void o2net_start_connect(void *arg)
1316{ 1318{
1317 struct o2net_node *nn = arg; 1319 struct o2net_node *nn = arg;
1318 struct o2net_sock_container *sc = NULL; 1320 struct o2net_sock_container *sc = NULL;
1319 struct o2nm_node *node = NULL; 1321 struct o2nm_node *node = NULL, *mynode = NULL;
1320 struct socket *sock = NULL; 1322 struct socket *sock = NULL;
1321 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1323 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1322 int ret = 0; 1324 int ret = 0;
@@ -1332,6 +1334,12 @@ static void o2net_start_connect(void *arg)
1332 goto out; 1334 goto out;
1333 } 1335 }
1334 1336
1337 mynode = o2nm_get_node_by_num(o2nm_this_node());
1338 if (mynode == NULL) {
1339 ret = 0;
1340 goto out;
1341 }
1342
1335 spin_lock(&nn->nn_lock); 1343 spin_lock(&nn->nn_lock);
1336 /* see if we already have one pending or have given up */ 1344 /* see if we already have one pending or have given up */
1337 if (nn->nn_sc || nn->nn_persistent_error) 1345 if (nn->nn_sc || nn->nn_persistent_error)
@@ -1359,12 +1367,14 @@ static void o2net_start_connect(void *arg)
1359 sock->sk->sk_allocation = GFP_ATOMIC; 1367 sock->sk->sk_allocation = GFP_ATOMIC;
1360 1368
1361 myaddr.sin_family = AF_INET; 1369 myaddr.sin_family = AF_INET;
1370 myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
1362 myaddr.sin_port = (__force u16)htons(0); /* any port */ 1371 myaddr.sin_port = (__force u16)htons(0); /* any port */
1363 1372
1364 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, 1373 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
1365 sizeof(myaddr)); 1374 sizeof(myaddr));
1366 if (ret) { 1375 if (ret) {
1367 mlog(0, "bind failed: %d\n", ret); 1376 mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n",
1377 ret, NIPQUAD(mynode->nd_ipv4_address));
1368 goto out; 1378 goto out;
1369 } 1379 }
1370 1380
@@ -1405,6 +1415,8 @@ out:
1405 sc_put(sc); 1415 sc_put(sc);
1406 if (node) 1416 if (node)
1407 o2nm_node_put(node); 1417 o2nm_node_put(node);
1418 if (mynode)
1419 o2nm_node_put(mynode);
1408 1420
1409 return; 1421 return;
1410} 1422}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index a6f4585501c8..616ff2b8434a 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -85,13 +85,10 @@ enum {
85 O2NET_DRIVER_READY, 85 O2NET_DRIVER_READY,
86}; 86};
87 87
88int o2net_init_tcp_sock(struct inode *inode);
89int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, 88int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
90 u8 target_node, int *status); 89 u8 target_node, int *status);
91int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, 90int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
92 size_t veclen, u8 target_node, int *status); 91 size_t veclen, u8 target_node, int *status);
93int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
94 struct inode *group);
95 92
96int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, 93int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
97 o2net_msg_handler_func *func, void *data, 94 o2net_msg_handler_func *func, void *data,
@@ -107,7 +104,5 @@ void o2net_disconnect_node(struct o2nm_node *node);
107 104
108int o2net_init(void); 105int o2net_init(void);
109void o2net_exit(void); 106void o2net_exit(void);
110int o2net_proc_init(struct proc_dir_entry *parent);
111void o2net_proc_exit(struct proc_dir_entry *parent);
112 107
113#endif /* O2CLUSTER_TCP_H */ 108#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 3fecba0a6023..9c772583744a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,9 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_BITS 7 40#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head))
41#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
42#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
43 41
44enum dlm_ast_type { 42enum dlm_ast_type {
45 DLM_AST = 0, 43 DLM_AST = 0,
@@ -87,7 +85,7 @@ enum dlm_ctxt_state {
87struct dlm_ctxt 85struct dlm_ctxt
88{ 86{
89 struct list_head list; 87 struct list_head list;
90 struct list_head *resources; 88 struct hlist_head *lockres_hash;
91 struct list_head dirty_list; 89 struct list_head dirty_list;
92 struct list_head purge_list; 90 struct list_head purge_list;
93 struct list_head pending_asts; 91 struct list_head pending_asts;
@@ -208,13 +206,16 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
208#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 206#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
209#define DLM_LOCK_RES_MIGRATING 0x00000020 207#define DLM_LOCK_RES_MIGRATING 0x00000020
210 208
209/* max milliseconds to wait to sync up a network failure with a node death */
210#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
211
211#define DLM_PURGE_INTERVAL_MS (8 * 1000) 212#define DLM_PURGE_INTERVAL_MS (8 * 1000)
212 213
213struct dlm_lock_resource 214struct dlm_lock_resource
214{ 215{
215 /* WARNING: Please see the comment in dlm_init_lockres before 216 /* WARNING: Please see the comment in dlm_init_lockres before
216 * adding fields here. */ 217 * adding fields here. */
217 struct list_head list; 218 struct hlist_node hash_node;
218 struct kref refs; 219 struct kref refs;
219 220
220 /* please keep these next 3 in this order 221 /* please keep these next 3 in this order
@@ -657,6 +658,8 @@ void dlm_complete_thread(struct dlm_ctxt *dlm);
657int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); 658int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
658void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); 659void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
659void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 660void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
661int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
662int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
660 663
661void dlm_put(struct dlm_ctxt *dlm); 664void dlm_put(struct dlm_ctxt *dlm);
662struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 665struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 6001b22a997d..f66e2d818ccd 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
392 } else { 392 } else {
393 mlog_errno(tmpret); 393 mlog_errno(tmpret);
394 if (dlm_is_host_down(tmpret)) { 394 if (dlm_is_host_down(tmpret)) {
395 /* instead of logging the same network error over
396 * and over, sleep here and wait for the heartbeat
397 * to notice the node is dead. times out after 5s. */
398 dlm_wait_for_node_death(dlm, res->owner,
399 DLM_NODE_DEATH_WAIT_MAX);
395 ret = DLM_RECOVERING; 400 ret = DLM_RECOVERING;
396 mlog(0, "node %u died so returning DLM_RECOVERING " 401 mlog(0, "node %u died so returning DLM_RECOVERING "
397 "from convert message!\n", res->owner); 402 "from convert message!\n", res->owner);
@@ -421,7 +426,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
421 struct dlm_lockstatus *lksb; 426 struct dlm_lockstatus *lksb;
422 enum dlm_status status = DLM_NORMAL; 427 enum dlm_status status = DLM_NORMAL;
423 u32 flags; 428 u32 flags;
424 int call_ast = 0, kick_thread = 0; 429 int call_ast = 0, kick_thread = 0, ast_reserved = 0;
425 430
426 if (!dlm_grab(dlm)) { 431 if (!dlm_grab(dlm)) {
427 dlm_error(DLM_REJECTED); 432 dlm_error(DLM_REJECTED);
@@ -490,6 +495,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
490 status = __dlm_lockres_state_to_status(res); 495 status = __dlm_lockres_state_to_status(res);
491 if (status == DLM_NORMAL) { 496 if (status == DLM_NORMAL) {
492 __dlm_lockres_reserve_ast(res); 497 __dlm_lockres_reserve_ast(res);
498 ast_reserved = 1;
493 res->state |= DLM_LOCK_RES_IN_PROGRESS; 499 res->state |= DLM_LOCK_RES_IN_PROGRESS;
494 status = __dlmconvert_master(dlm, res, lock, flags, 500 status = __dlmconvert_master(dlm, res, lock, flags,
495 cnv->requested_type, 501 cnv->requested_type,
@@ -512,10 +518,10 @@ leave:
512 else 518 else
513 dlm_lock_put(lock); 519 dlm_lock_put(lock);
514 520
515 /* either queue the ast or release it */ 521 /* either queue the ast or release it, if reserved */
516 if (call_ast) 522 if (call_ast)
517 dlm_queue_ast(dlm, lock); 523 dlm_queue_ast(dlm, lock);
518 else 524 else if (ast_reserved)
519 dlm_lockres_release_ast(dlm, res); 525 dlm_lockres_release_ast(dlm, res);
520 526
521 if (kick_thread) 527 if (kick_thread)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index f339fe27975a..54f61b76ab51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -117,8 +117,8 @@ EXPORT_SYMBOL_GPL(dlm_print_one_lock);
117void dlm_dump_lock_resources(struct dlm_ctxt *dlm) 117void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
118{ 118{
119 struct dlm_lock_resource *res; 119 struct dlm_lock_resource *res;
120 struct list_head *iter; 120 struct hlist_node *iter;
121 struct list_head *bucket; 121 struct hlist_head *bucket;
122 int i; 122 int i;
123 123
124 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", 124 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
@@ -129,12 +129,10 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
129 } 129 }
130 130
131 spin_lock(&dlm->spinlock); 131 spin_lock(&dlm->spinlock);
132 for (i=0; i<DLM_HASH_SIZE; i++) { 132 for (i=0; i<DLM_HASH_BUCKETS; i++) {
133 bucket = &(dlm->resources[i]); 133 bucket = &(dlm->lockres_hash[i]);
134 list_for_each(iter, bucket) { 134 hlist_for_each_entry(res, iter, bucket, hash_node)
135 res = list_entry(iter, struct dlm_lock_resource, list);
136 dlm_print_one_lock_resource(res); 135 dlm_print_one_lock_resource(res);
137 }
138 } 136 }
139 spin_unlock(&dlm->spinlock); 137 spin_unlock(&dlm->spinlock);
140} 138}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index da3c22045f89..8f3a9e3106fd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -77,26 +77,26 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
77 77
78void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) 78void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
79{ 79{
80 list_del_init(&lockres->list); 80 hlist_del_init(&lockres->hash_node);
81 dlm_lockres_put(lockres); 81 dlm_lockres_put(lockres);
82} 82}
83 83
84void __dlm_insert_lockres(struct dlm_ctxt *dlm, 84void __dlm_insert_lockres(struct dlm_ctxt *dlm,
85 struct dlm_lock_resource *res) 85 struct dlm_lock_resource *res)
86{ 86{
87 struct list_head *bucket; 87 struct hlist_head *bucket;
88 struct qstr *q; 88 struct qstr *q;
89 89
90 assert_spin_locked(&dlm->spinlock); 90 assert_spin_locked(&dlm->spinlock);
91 91
92 q = &res->lockname; 92 q = &res->lockname;
93 q->hash = full_name_hash(q->name, q->len); 93 q->hash = full_name_hash(q->name, q->len);
94 bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); 94 bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
95 95
96 /* get a reference for our hashtable */ 96 /* get a reference for our hashtable */
97 dlm_lockres_get(res); 97 dlm_lockres_get(res);
98 98
99 list_add_tail(&res->list, bucket); 99 hlist_add_head(&res->hash_node, bucket);
100} 100}
101 101
102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -104,9 +104,9 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
104 unsigned int len) 104 unsigned int len)
105{ 105{
106 unsigned int hash; 106 unsigned int hash;
107 struct list_head *iter; 107 struct hlist_node *iter;
108 struct dlm_lock_resource *tmpres=NULL; 108 struct dlm_lock_resource *tmpres=NULL;
109 struct list_head *bucket; 109 struct hlist_head *bucket;
110 110
111 mlog_entry("%.*s\n", len, name); 111 mlog_entry("%.*s\n", len, name);
112 112
@@ -114,11 +114,11 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
114 114
115 hash = full_name_hash(name, len); 115 hash = full_name_hash(name, len);
116 116
117 bucket = &(dlm->resources[hash & DLM_HASH_MASK]); 117 bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
118 118
119 /* check for pre-existing lock */ 119 /* check for pre-existing lock */
120 list_for_each(iter, bucket) { 120 hlist_for_each(iter, bucket) {
121 tmpres = list_entry(iter, struct dlm_lock_resource, list); 121 tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
122 if (tmpres->lockname.len == len && 122 if (tmpres->lockname.len == len &&
123 memcmp(tmpres->lockname.name, name, len) == 0) { 123 memcmp(tmpres->lockname.name, name, len) == 0) {
124 dlm_lockres_get(tmpres); 124 dlm_lockres_get(tmpres);
@@ -193,8 +193,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
193 193
194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
195{ 195{
196 if (dlm->resources) 196 if (dlm->lockres_hash)
197 free_page((unsigned long) dlm->resources); 197 free_page((unsigned long) dlm->lockres_hash);
198 198
199 if (dlm->name) 199 if (dlm->name)
200 kfree(dlm->name); 200 kfree(dlm->name);
@@ -303,10 +303,10 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
303 mlog(0, "Migrating locks from domain %s\n", dlm->name); 303 mlog(0, "Migrating locks from domain %s\n", dlm->name);
304restart: 304restart:
305 spin_lock(&dlm->spinlock); 305 spin_lock(&dlm->spinlock);
306 for (i=0; i<DLM_HASH_SIZE; i++) { 306 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
307 while (!list_empty(&dlm->resources[i])) { 307 while (!hlist_empty(&dlm->lockres_hash[i])) {
308 res = list_entry(dlm->resources[i].next, 308 res = hlist_entry(dlm->lockres_hash[i].first,
309 struct dlm_lock_resource, list); 309 struct dlm_lock_resource, hash_node);
310 /* need reference when manually grabbing lockres */ 310 /* need reference when manually grabbing lockres */
311 dlm_lockres_get(res); 311 dlm_lockres_get(res);
312 /* this should unhash the lockres 312 /* this should unhash the lockres
@@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
573 spin_lock(&dlm_domain_lock); 573 spin_lock(&dlm_domain_lock);
574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len); 574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
575 /* Once the dlm ctxt is marked as leaving then we don't want 575 /* Once the dlm ctxt is marked as leaving then we don't want
576 * to be put in someone's domain map. */ 576 * to be put in someone's domain map.
577 * Also, explicitly disallow joining at certain troublesome
578 * times (ie. during recovery). */
577 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 579 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
580 int bit = query->node_idx;
578 spin_lock(&dlm->spinlock); 581 spin_lock(&dlm->spinlock);
579 582
580 if (dlm->dlm_state == DLM_CTXT_NEW && 583 if (dlm->dlm_state == DLM_CTXT_NEW &&
@@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
586 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { 589 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
587 /* Disallow parallel joins. */ 590 /* Disallow parallel joins. */
588 response = JOIN_DISALLOW; 591 response = JOIN_DISALLOW;
592 } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
593 mlog(ML_NOTICE, "node %u trying to join, but recovery "
594 "is ongoing.\n", bit);
595 response = JOIN_DISALLOW;
596 } else if (test_bit(bit, dlm->recovery_map)) {
597 mlog(ML_NOTICE, "node %u trying to join, but it "
598 "still needs recovery.\n", bit);
599 response = JOIN_DISALLOW;
600 } else if (test_bit(bit, dlm->domain_map)) {
601 mlog(ML_NOTICE, "node %u trying to join, but it "
602 "is still in the domain! needs recovery?\n",
603 bit);
604 response = JOIN_DISALLOW;
589 } else { 605 } else {
590 /* Alright we're fully a part of this domain 606 /* Alright we're fully a part of this domain
591 * so we keep some state as to who's joining 607 * so we keep some state as to who's joining
@@ -1175,18 +1191,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1175 goto leave; 1191 goto leave;
1176 } 1192 }
1177 1193
1178 dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); 1194 dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
1179 if (!dlm->resources) { 1195 if (!dlm->lockres_hash) {
1180 mlog_errno(-ENOMEM); 1196 mlog_errno(-ENOMEM);
1181 kfree(dlm->name); 1197 kfree(dlm->name);
1182 kfree(dlm); 1198 kfree(dlm);
1183 dlm = NULL; 1199 dlm = NULL;
1184 goto leave; 1200 goto leave;
1185 } 1201 }
1186 memset(dlm->resources, 0, PAGE_SIZE);
1187 1202
1188 for (i=0; i<DLM_HASH_SIZE; i++) 1203 for (i=0; i<DLM_HASH_BUCKETS; i++)
1189 INIT_LIST_HEAD(&dlm->resources[i]); 1204 INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
1190 1205
1191 strcpy(dlm->name, domain); 1206 strcpy(dlm->name, domain);
1192 dlm->key = key; 1207 dlm->key = key;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d1a0038557a3..671d4ff222cc 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
220 dlm_error(status); 220 dlm_error(status);
221 dlm_revert_pending_lock(res, lock); 221 dlm_revert_pending_lock(res, lock);
222 dlm_lock_put(lock); 222 dlm_lock_put(lock);
223 } else if (dlm_is_recovery_lock(res->lockname.name,
224 res->lockname.len)) {
225 /* special case for the $RECOVERY lock.
226 * there will never be an AST delivered to put
227 * this lock on the proper secondary queue
228 * (granted), so do it manually. */
229 mlog(0, "%s: $RECOVERY lock for this node (%u) is "
230 "mastered by %u; got lock, manually granting (no ast)\n",
231 dlm->name, dlm->node_num, res->owner);
232 list_del_init(&lock->list);
233 list_add_tail(&lock->list, &res->granted);
223 } 234 }
224 spin_unlock(&res->spinlock); 235 spin_unlock(&res->spinlock);
225 236
@@ -646,7 +657,19 @@ retry_lock:
646 mlog(0, "retrying lock with migration/" 657 mlog(0, "retrying lock with migration/"
647 "recovery/in progress\n"); 658 "recovery/in progress\n");
648 msleep(100); 659 msleep(100);
649 dlm_wait_for_recovery(dlm); 660 /* no waiting for dlm_reco_thread */
661 if (recovery) {
662 if (status == DLM_RECOVERING) {
663 mlog(0, "%s: got RECOVERING "
664 "for $REOCVERY lock, master "
665 "was %u\n", dlm->name,
666 res->owner);
667 dlm_wait_for_node_death(dlm, res->owner,
668 DLM_NODE_DEATH_WAIT_MAX);
669 }
670 } else {
671 dlm_wait_for_recovery(dlm);
672 }
650 goto retry_lock; 673 goto retry_lock;
651 } 674 }
652 675
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 27e984f7e4cd..847dd3cc4cf5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -564,7 +564,7 @@ static void dlm_lockres_release(struct kref *kref)
564 564
565 /* By the time we're ready to blow this guy away, we shouldn't 565 /* By the time we're ready to blow this guy away, we shouldn't
566 * be on any lists. */ 566 * be on any lists. */
567 BUG_ON(!list_empty(&res->list)); 567 BUG_ON(!hlist_unhashed(&res->hash_node));
568 BUG_ON(!list_empty(&res->granted)); 568 BUG_ON(!list_empty(&res->granted));
569 BUG_ON(!list_empty(&res->converting)); 569 BUG_ON(!list_empty(&res->converting));
570 BUG_ON(!list_empty(&res->blocked)); 570 BUG_ON(!list_empty(&res->blocked));
@@ -605,7 +605,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
605 605
606 init_waitqueue_head(&res->wq); 606 init_waitqueue_head(&res->wq);
607 spin_lock_init(&res->spinlock); 607 spin_lock_init(&res->spinlock);
608 INIT_LIST_HEAD(&res->list); 608 INIT_HLIST_NODE(&res->hash_node);
609 INIT_LIST_HEAD(&res->granted); 609 INIT_LIST_HEAD(&res->granted);
610 INIT_LIST_HEAD(&res->converting); 610 INIT_LIST_HEAD(&res->converting);
611 INIT_LIST_HEAD(&res->blocked); 611 INIT_LIST_HEAD(&res->blocked);
@@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1051 while (node >= 0) { 1051 while (node >= 0) {
1052 if (sc == NODE_UP) { 1052 if (sc == NODE_UP) {
1053 /* a node came up. easy. might not even need 1053 /* a node came up. clear any old vote from
1054 * to talk to it if its node number is higher 1054 * the response map and set it in the vote map
1055 * or if we are already blocked. */ 1055 * then restart the mastery. */
1056 mlog(0, "node up! %d\n", node); 1056 mlog(ML_NOTICE, "node %d up while restarting\n", node);
1057 if (blocked)
1058 goto next;
1059
1060 if (node > dlm->node_num) {
1061 mlog(0, "node > this node. skipping.\n");
1062 goto next;
1063 }
1064 1057
1065 /* redo the master request, but only for the new node */ 1058 /* redo the master request, but only for the new node */
1066 mlog(0, "sending request to new node\n"); 1059 mlog(0, "sending request to new node\n");
@@ -2005,6 +1998,15 @@ fail:
2005 break; 1998 break;
2006 1999
2007 mlog(0, "timed out during migration\n"); 2000 mlog(0, "timed out during migration\n");
2001 /* avoid hang during shutdown when migrating lockres
2002 * to a node which also goes down */
2003 if (dlm_is_node_dead(dlm, target)) {
2004 mlog(0, "%s:%.*s: expected migration target %u "
2005 "is no longer up. restarting.\n",
2006 dlm->name, res->lockname.len,
2007 res->lockname.name, target);
2008 ret = -ERESTARTSYS;
2009 }
2008 } 2010 }
2009 if (ret == -ERESTARTSYS) { 2011 if (ret == -ERESTARTSYS) {
2010 /* migration failed, detach and clean up mle */ 2012 /* migration failed, detach and clean up mle */
@@ -2480,7 +2482,9 @@ top:
2480 atomic_set(&mle->woken, 1); 2482 atomic_set(&mle->woken, 1);
2481 spin_unlock(&mle->spinlock); 2483 spin_unlock(&mle->spinlock);
2482 wake_up(&mle->wq); 2484 wake_up(&mle->wq);
2483 /* final put will take care of list removal */ 2485 /* do not need events any longer, so detach
2486 * from heartbeat */
2487 __dlm_mle_detach_hb_events(dlm, mle);
2484 __dlm_put_mle(mle); 2488 __dlm_put_mle(mle);
2485 } 2489 }
2486 continue; 2490 continue;
@@ -2535,6 +2539,9 @@ top:
2535 spin_unlock(&res->spinlock); 2539 spin_unlock(&res->spinlock);
2536 dlm_lockres_put(res); 2540 dlm_lockres_put(res);
2537 2541
2542 /* about to get rid of mle, detach from heartbeat */
2543 __dlm_mle_detach_hb_events(dlm, mle);
2544
2538 /* dump the mle */ 2545 /* dump the mle */
2539 spin_lock(&dlm->master_lock); 2546 spin_lock(&dlm->master_lock);
2540 __dlm_put_mle(mle); 2547 __dlm_put_mle(mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0c8eb1093f00..1e232000f3f7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -39,6 +39,7 @@
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/timer.h> 40#include <linux/timer.h>
41#include <linux/kthread.h> 41#include <linux/kthread.h>
42#include <linux/delay.h>
42 43
43 44
44#include "cluster/heartbeat.h" 45#include "cluster/heartbeat.h"
@@ -256,6 +257,45 @@ static int dlm_recovery_thread(void *data)
256 return 0; 257 return 0;
257} 258}
258 259
260/* returns true when the recovery master has contacted us, i.e. dlm->reco.new_master is no longer O2NM_INVALID_NODE_NUM; the field is sampled under dlm->spinlock */
261static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
262{
263 int ready;
264 spin_lock(&dlm->spinlock);
265 ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
266 spin_unlock(&dlm->spinlock);
267 return ready;
268}
269
270/* returns true if node is no longer in the domain (its bit is clear in
271 * dlm->domain_map): could be dead or just not joined. takes dlm->spinlock. */
272int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
273{
274 int dead;
275 spin_lock(&dlm->spinlock);
276 dead = !test_bit(node, dlm->domain_map); /* a set bit means the node is still in the domain */
277 spin_unlock(&dlm->spinlock);
278 return dead;
279}
280/* sleep on dlm->dlm_reco_thread_wq until dlm_is_node_dead(dlm, node) holds; timeout is in milliseconds, and 0 means wait with no time limit */
281int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
282{
283 if (timeout) {
284 mlog(ML_NOTICE, "%s: waiting %dms for notification of "
285 "death of node %u\n", dlm->name, timeout, node);
286 wait_event_timeout(dlm->dlm_reco_thread_wq,
287 dlm_is_node_dead(dlm, node),
288 msecs_to_jiffies(timeout));
289 } else {
290 mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
291 "of death of node %u\n", dlm->name, node);
292 wait_event(dlm->dlm_reco_thread_wq,
293 dlm_is_node_dead(dlm, node));
294 }
295 /* for now, return 0: the result of the wait (timed out or not) is not reported to the caller */
296 return 0;
297}
298
259/* callers of the top-level api calls (dlmlock/dlmunlock) should 299/* callers of the top-level api calls (dlmlock/dlmunlock) should
260 * block on the dlm->reco.event when recovery is in progress. 300 * block on the dlm->reco.event when recovery is in progress.
261 * the dlm recovery thread will set this state when it begins 301 * the dlm recovery thread will set this state when it begins
@@ -297,6 +337,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
297static int dlm_do_recovery(struct dlm_ctxt *dlm) 337static int dlm_do_recovery(struct dlm_ctxt *dlm)
298{ 338{
299 int status = 0; 339 int status = 0;
340 int ret;
300 341
301 spin_lock(&dlm->spinlock); 342 spin_lock(&dlm->spinlock);
302 343
@@ -343,10 +384,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
343 goto master_here; 384 goto master_here;
344 385
345 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { 386 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
346 /* choose a new master */ 387 /* choose a new master, returns 0 if this node
347 if (!dlm_pick_recovery_master(dlm)) { 388 * is the master, -EEXIST if it's another node.
389 * this does not return until a new master is chosen
390 * or recovery completes entirely. */
391 ret = dlm_pick_recovery_master(dlm);
392 if (!ret) {
348 /* already notified everyone. go. */ 393 /* already notified everyone. go. */
349 dlm->reco.new_master = dlm->node_num;
350 goto master_here; 394 goto master_here;
351 } 395 }
352 mlog(0, "another node will master this recovery session.\n"); 396 mlog(0, "another node will master this recovery session.\n");
@@ -371,8 +415,13 @@ master_here:
371 if (status < 0) { 415 if (status < 0) {
372 mlog(ML_ERROR, "error %d remastering locks for node %u, " 416 mlog(ML_ERROR, "error %d remastering locks for node %u, "
373 "retrying.\n", status, dlm->reco.dead_node); 417 "retrying.\n", status, dlm->reco.dead_node);
418 /* yield a bit to allow any final network messages
419 * to get handled on remaining nodes */
420 msleep(100);
374 } else { 421 } else {
375 /* success! see if any other nodes need recovery */ 422 /* success! see if any other nodes need recovery */
423 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
424 dlm->name, dlm->reco.dead_node, dlm->node_num);
376 dlm_reset_recovery(dlm); 425 dlm_reset_recovery(dlm);
377 } 426 }
378 dlm_end_recovery(dlm); 427 dlm_end_recovery(dlm);
@@ -477,7 +526,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
477 BUG(); 526 BUG();
478 break; 527 break;
479 case DLM_RECO_NODE_DATA_DEAD: 528 case DLM_RECO_NODE_DATA_DEAD:
480 mlog(0, "node %u died after " 529 mlog(ML_NOTICE, "node %u died after "
481 "requesting recovery info for " 530 "requesting recovery info for "
482 "node %u\n", ndata->node_num, 531 "node %u\n", ndata->node_num,
483 dead_node); 532 dead_node);
@@ -485,6 +534,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
485 // start all over 534 // start all over
486 destroy = 1; 535 destroy = 1;
487 status = -EAGAIN; 536 status = -EAGAIN;
537 /* instead of spinning like crazy here,
538 * wait for the domain map to catch up
539 * with the network state. otherwise this
540 * can be hit hundreds of times before
541 * the node is really seen as dead. */
542 wait_event_timeout(dlm->dlm_reco_thread_wq,
543 dlm_is_node_dead(dlm,
544 ndata->node_num),
545 msecs_to_jiffies(1000));
546 mlog(0, "waited 1 sec for %u, "
547 "dead? %s\n", ndata->node_num,
548 dlm_is_node_dead(dlm, ndata->node_num) ?
549 "yes" : "no");
488 goto leave; 550 goto leave;
489 case DLM_RECO_NODE_DATA_RECEIVING: 551 case DLM_RECO_NODE_DATA_RECEIVING:
490 case DLM_RECO_NODE_DATA_REQUESTED: 552 case DLM_RECO_NODE_DATA_REQUESTED:
@@ -678,11 +740,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
678 dlm = item->dlm; 740 dlm = item->dlm;
679 dead_node = item->u.ral.dead_node; 741 dead_node = item->u.ral.dead_node;
680 reco_master = item->u.ral.reco_master; 742 reco_master = item->u.ral.reco_master;
743 mres = (struct dlm_migratable_lockres *)data;
744
745 if (dead_node != dlm->reco.dead_node ||
746 reco_master != dlm->reco.new_master) {
747 /* show extra debug info if the recovery state is messed */
748 mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
749 "request(dead=%u, master=%u)\n",
750 dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
751 dead_node, reco_master);
752 mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
753 "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
754 dlm->name, mres->lockname_len, mres->lockname, mres->master,
755 mres->num_locks, mres->total_locks, mres->flags,
756 mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
757 mres->ml[0].type, mres->ml[0].convert_type,
758 mres->ml[0].highest_blocked, mres->ml[0].node);
759 BUG();
760 }
681 BUG_ON(dead_node != dlm->reco.dead_node); 761 BUG_ON(dead_node != dlm->reco.dead_node);
682 BUG_ON(reco_master != dlm->reco.new_master); 762 BUG_ON(reco_master != dlm->reco.new_master);
683 763
684 mres = (struct dlm_migratable_lockres *)data;
685
686 /* lock resources should have already been moved to the 764 /* lock resources should have already been moved to the
687 * dlm->reco.resources list. now move items from that list 765 * dlm->reco.resources list. now move items from that list
688 * to a temp list if the dead owner matches. note that the 766 * to a temp list if the dead owner matches. note that the
@@ -757,15 +835,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
757 continue; 835 continue;
758 836
759 switch (ndata->state) { 837 switch (ndata->state) {
838 /* should have moved beyond INIT but not to FINALIZE yet */
760 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
761 case DLM_RECO_NODE_DATA_DEAD: 840 case DLM_RECO_NODE_DATA_DEAD:
762 case DLM_RECO_NODE_DATA_DONE:
763 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 841 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
764 mlog(ML_ERROR, "bad ndata state for node %u:" 842 mlog(ML_ERROR, "bad ndata state for node %u:"
765 " state=%d\n", ndata->node_num, 843 " state=%d\n", ndata->node_num,
766 ndata->state); 844 ndata->state);
767 BUG(); 845 BUG();
768 break; 846 break;
847 /* these states are possible at this point, anywhere along
848 * the line of recovery */
849 case DLM_RECO_NODE_DATA_DONE:
769 case DLM_RECO_NODE_DATA_RECEIVING: 850 case DLM_RECO_NODE_DATA_RECEIVING:
770 case DLM_RECO_NODE_DATA_REQUESTED: 851 case DLM_RECO_NODE_DATA_REQUESTED:
771 case DLM_RECO_NODE_DATA_REQUESTING: 852 case DLM_RECO_NODE_DATA_REQUESTING:
@@ -799,13 +880,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
799{ 880{
800 struct dlm_lock_resource *res; 881 struct dlm_lock_resource *res;
801 struct list_head *iter, *iter2; 882 struct list_head *iter, *iter2;
883 struct dlm_lock *lock;
802 884
803 spin_lock(&dlm->spinlock); 885 spin_lock(&dlm->spinlock);
804 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 886 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
805 res = list_entry (iter, struct dlm_lock_resource, recovering); 887 res = list_entry (iter, struct dlm_lock_resource, recovering);
888 /* always prune any $RECOVERY entries for dead nodes,
889 * otherwise hangs can occur during later recovery */
806 if (dlm_is_recovery_lock(res->lockname.name, 890 if (dlm_is_recovery_lock(res->lockname.name,
807 res->lockname.len)) 891 res->lockname.len)) {
892 spin_lock(&res->spinlock);
893 list_for_each_entry(lock, &res->granted, list) {
894 if (lock->ml.node == dead_node) {
895 mlog(0, "AHA! there was "
896 "a $RECOVERY lock for dead "
897 "node %u (%s)!\n",
898 dead_node, dlm->name);
899 list_del_init(&lock->list);
900 dlm_lock_put(lock);
901 break;
902 }
903 }
904 spin_unlock(&res->spinlock);
808 continue; 905 continue;
906 }
907
809 if (res->owner == dead_node) { 908 if (res->owner == dead_node) {
810 mlog(0, "found lockres owned by dead node while " 909 mlog(0, "found lockres owned by dead node while "
811 "doing recovery for node %u. sending it.\n", 910 "doing recovery for node %u. sending it.\n",
@@ -1179,7 +1278,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1179again: 1278again:
1180 ret = dlm_lockres_master_requery(dlm, res, &real_master); 1279 ret = dlm_lockres_master_requery(dlm, res, &real_master);
1181 if (ret < 0) { 1280 if (ret < 0) {
1182 mlog(0, "dlm_lockres_master_requery failure: %d\n", 1281 mlog(0, "dlm_lockres_master_requery ret=%d\n",
1183 ret); 1282 ret);
1184 goto again; 1283 goto again;
1185 } 1284 }
@@ -1594,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1594 u8 dead_node, u8 new_master) 1693 u8 dead_node, u8 new_master)
1595{ 1694{
1596 int i; 1695 int i;
1597 struct list_head *iter, *iter2, *bucket; 1696 struct list_head *iter, *iter2;
1697 struct hlist_node *hash_iter;
1698 struct hlist_head *bucket;
1699
1598 struct dlm_lock_resource *res; 1700 struct dlm_lock_resource *res;
1599 1701
1600 mlog_entry_void(); 1702 mlog_entry_void();
@@ -1618,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1618 * for now we need to run the whole hash, clear 1720 * for now we need to run the whole hash, clear
1619 * the RECOVERING state and set the owner 1721 * the RECOVERING state and set the owner
1620 * if necessary */ 1722 * if necessary */
1621 for (i=0; i<DLM_HASH_SIZE; i++) { 1723 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
1622 bucket = &(dlm->resources[i]); 1724 bucket = &(dlm->lockres_hash[i]);
1623 list_for_each(iter, bucket) { 1725 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
1624 res = list_entry (iter, struct dlm_lock_resource, list);
1625 if (res->state & DLM_LOCK_RES_RECOVERING) { 1726 if (res->state & DLM_LOCK_RES_RECOVERING) {
1626 if (res->owner == dead_node) { 1727 if (res->owner == dead_node) {
1627 mlog(0, "(this=%u) res %.*s owner=%u " 1728 mlog(0, "(this=%u) res %.*s owner=%u "
@@ -1753,10 +1854,11 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
1753 1854
1754static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) 1855static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1755{ 1856{
1756 struct list_head *iter; 1857 struct hlist_node *iter;
1757 struct dlm_lock_resource *res; 1858 struct dlm_lock_resource *res;
1758 int i; 1859 int i;
1759 struct list_head *bucket; 1860 struct hlist_head *bucket;
1861 struct dlm_lock *lock;
1760 1862
1761 1863
1762 /* purge any stale mles */ 1864 /* purge any stale mles */
@@ -1776,14 +1878,28 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1776 * can be kicked again to see if any ASTs or BASTs 1878 * can be kicked again to see if any ASTs or BASTs
1777 * need to be fired as a result. 1879 * need to be fired as a result.
1778 */ 1880 */
1779 for (i=0; i<DLM_HASH_SIZE; i++) { 1881 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
1780 bucket = &(dlm->resources[i]); 1882 bucket = &(dlm->lockres_hash[i]);
1781 list_for_each(iter, bucket) { 1883 hlist_for_each_entry(res, iter, bucket, hash_node) {
1782 res = list_entry (iter, struct dlm_lock_resource, list); 1884 /* always prune any $RECOVERY entries for dead nodes,
1885 * otherwise hangs can occur during later recovery */
1783 if (dlm_is_recovery_lock(res->lockname.name, 1886 if (dlm_is_recovery_lock(res->lockname.name,
1784 res->lockname.len)) 1887 res->lockname.len)) {
1888 spin_lock(&res->spinlock);
1889 list_for_each_entry(lock, &res->granted, list) {
1890 if (lock->ml.node == dead_node) {
1891 mlog(0, "AHA! there was "
1892 "a $RECOVERY lock for dead "
1893 "node %u (%s)!\n",
1894 dead_node, dlm->name);
1895 list_del_init(&lock->list);
1896 dlm_lock_put(lock);
1897 break;
1898 }
1899 }
1900 spin_unlock(&res->spinlock);
1785 continue; 1901 continue;
1786 1902 }
1787 spin_lock(&res->spinlock); 1903 spin_lock(&res->spinlock);
1788 /* zero the lvb if necessary */ 1904 /* zero the lvb if necessary */
1789 dlm_revalidate_lvb(dlm, res, dead_node); 1905 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -1869,12 +1985,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
1869 return; 1985 return;
1870 1986
1871 spin_lock(&dlm->spinlock); 1987 spin_lock(&dlm->spinlock);
1872
1873 set_bit(idx, dlm->live_nodes_map); 1988 set_bit(idx, dlm->live_nodes_map);
1874 1989 /* do NOT notify mle attached to the heartbeat events.
1875 /* notify any mles attached to the heartbeat events */ 1990 * new nodes are not interesting in mastery until joined. */
1876 dlm_hb_event_notify_attached(dlm, idx, 1);
1877
1878 spin_unlock(&dlm->spinlock); 1991 spin_unlock(&dlm->spinlock);
1879 1992
1880 dlm_put(dlm); 1993 dlm_put(dlm);
@@ -1897,7 +2010,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
1897 mlog(0, "unlockast for recovery lock fired!\n"); 2010 mlog(0, "unlockast for recovery lock fired!\n");
1898} 2011}
1899 2012
1900 2013/*
2014 * dlm_pick_recovery_master will continually attempt to use
2015 * dlmlock() on the special "$RECOVERY" lockres with the
2016 * LKM_NOQUEUE flag to get an EX. every thread that enters
2017 * this function on each node racing to become the recovery
2018 * master will not stop attempting this until either:
2019 * a) this node gets the EX (and becomes the recovery master),
2020 * or b) dlm->reco.new_master gets set to some nodenum
2021 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2022 * so each time a recovery master is needed, the entire cluster
2023 * will sync at this point. if the new master dies, that will
2024 * be detected in dlm_do_recovery */
1901static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) 2025static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1902{ 2026{
1903 enum dlm_status ret; 2027 enum dlm_status ret;
@@ -1906,23 +2030,69 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1906 2030
1907 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2031 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
1908 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2032 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
1909retry: 2033again:
1910 memset(&lksb, 0, sizeof(lksb)); 2034 memset(&lksb, 0, sizeof(lksb));
1911 2035
1912 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2036 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
1913 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); 2037 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
1914 2038
2039 mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
2040 dlm->name, ret, lksb.status);
2041
1915 if (ret == DLM_NORMAL) { 2042 if (ret == DLM_NORMAL) {
1916 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2043 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
1917 dlm->name, dlm->node_num); 2044 dlm->name, dlm->node_num);
1918 /* I am master, send message to all nodes saying 2045
1919 * that I am beginning a recovery session */ 2046 /* got the EX lock. check to see if another node
1920 status = dlm_send_begin_reco_message(dlm, 2047 * just became the reco master */
1921 dlm->reco.dead_node); 2048 if (dlm_reco_master_ready(dlm)) {
2049 mlog(0, "%s: got reco EX lock, but %u will "
2050 "do the recovery\n", dlm->name,
2051 dlm->reco.new_master);
2052 status = -EEXIST;
2053 } else {
2054 status = 0;
2055
2056 /* see if recovery was already finished elsewhere */
2057 spin_lock(&dlm->spinlock);
2058 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2059 status = -EINVAL;
2060 mlog(0, "%s: got reco EX lock, but "
2061 "node got recovered already\n", dlm->name);
2062 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2063 mlog(ML_ERROR, "%s: new master is %u "
2064 "but no dead node!\n",
2065 dlm->name, dlm->reco.new_master);
2066 BUG();
2067 }
2068 }
2069 spin_unlock(&dlm->spinlock);
2070 }
2071
2072 /* if this node has actually become the recovery master,
2073 * set the master and send the messages to begin recovery */
2074 if (!status) {
2075 mlog(0, "%s: dead=%u, this=%u, sending "
2076 "begin_reco now\n", dlm->name,
2077 dlm->reco.dead_node, dlm->node_num);
2078 status = dlm_send_begin_reco_message(dlm,
2079 dlm->reco.dead_node);
2080 /* this always succeeds */
2081 BUG_ON(status);
2082
2083 /* set the new_master to this node */
2084 spin_lock(&dlm->spinlock);
2085 dlm->reco.new_master = dlm->node_num;
2086 spin_unlock(&dlm->spinlock);
2087 }
1922 2088
1923 /* recovery lock is a special case. ast will not get fired, 2089 /* recovery lock is a special case. ast will not get fired,
1924 * so just go ahead and unlock it. */ 2090 * so just go ahead and unlock it. */
1925 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); 2091 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
2092 if (ret == DLM_DENIED) {
2093 mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
2094 ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
2095 }
1926 if (ret != DLM_NORMAL) { 2096 if (ret != DLM_NORMAL) {
1927 /* this would really suck. this could only happen 2097 /* this would really suck. this could only happen
1928 * if there was a network error during the unlock 2098 * if there was a network error during the unlock
@@ -1930,20 +2100,42 @@ retry:
1930 * is actually "done" and the lock structure is 2100 * is actually "done" and the lock structure is
1931 * even freed. we can continue, but only 2101 * even freed. we can continue, but only
1932 * because this specific lock name is special. */ 2102 * because this specific lock name is special. */
1933 mlog(0, "dlmunlock returned %d\n", ret); 2103 mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
1934 }
1935
1936 if (status < 0) {
1937 mlog(0, "failed to send recovery message. "
1938 "must retry with new node map.\n");
1939 goto retry;
1940 } 2104 }
1941 } else if (ret == DLM_NOTQUEUED) { 2105 } else if (ret == DLM_NOTQUEUED) {
1942 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2106 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
1943 dlm->name, dlm->node_num); 2107 dlm->name, dlm->node_num);
1944 /* another node is master. wait on 2108 /* another node is master. wait on
1945 * reco.new_master != O2NM_INVALID_NODE_NUM */ 2109 * reco.new_master != O2NM_INVALID_NODE_NUM
2110 * for at most one second */
2111 wait_event_timeout(dlm->dlm_reco_thread_wq,
2112 dlm_reco_master_ready(dlm),
2113 msecs_to_jiffies(1000));
2114 if (!dlm_reco_master_ready(dlm)) {
2115 mlog(0, "%s: reco master taking awhile\n",
2116 dlm->name);
2117 goto again;
2118 }
2119 /* another node has informed this one that it is reco master */
2120 mlog(0, "%s: reco master %u is ready to recover %u\n",
2121 dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
1946 status = -EEXIST; 2122 status = -EEXIST;
2123 } else {
2124 struct dlm_lock_resource *res;
2125
2126 /* dlmlock returned something other than NOTQUEUED or NORMAL */
2127 mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
2128 "lksb.status=%s\n", dlm->name, dlm_errname(ret),
2129 dlm_errname(lksb.status));
2130 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2131 DLM_RECOVERY_LOCK_NAME_LEN);
2132 if (res) {
2133 dlm_print_one_lock_resource(res);
2134 dlm_lockres_put(res);
2135 } else {
2136 mlog(ML_ERROR, "recovery lock not found\n");
2137 }
2138 BUG();
1947 } 2139 }
1948 2140
1949 return status; 2141 return status;
@@ -1982,7 +2174,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1982 mlog(0, "not sending begin reco to self\n"); 2174 mlog(0, "not sending begin reco to self\n");
1983 continue; 2175 continue;
1984 } 2176 }
1985 2177retry:
1986 ret = -EINVAL; 2178 ret = -EINVAL;
1987 mlog(0, "attempting to send begin reco msg to %d\n", 2179 mlog(0, "attempting to send begin reco msg to %d\n",
1988 nodenum); 2180 nodenum);
@@ -1991,8 +2183,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1991 /* negative status is handled ok by caller here */ 2183 /* negative status is handled ok by caller here */
1992 if (ret >= 0) 2184 if (ret >= 0)
1993 ret = status; 2185 ret = status;
2186 if (dlm_is_host_down(ret)) {
2187 /* node is down. not involved in recovery
2188 * so just keep going */
2189 mlog(0, "%s: node %u was down when sending "
2190 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2191 ret = 0;
2192 }
1994 if (ret < 0) { 2193 if (ret < 0) {
1995 struct dlm_lock_resource *res; 2194 struct dlm_lock_resource *res;
2195 /* this is now a serious problem, possibly ENOMEM
2196 * in the network stack. must retry */
1996 mlog_errno(ret); 2197 mlog_errno(ret);
1997 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2198 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
1998 " returned %d\n", dlm->name, nodenum, ret); 2199 " returned %d\n", dlm->name, nodenum, ret);
@@ -2004,7 +2205,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2004 } else { 2205 } else {
2005 mlog(ML_ERROR, "recovery lock not found\n"); 2206 mlog(ML_ERROR, "recovery lock not found\n");
2006 } 2207 }
2007 break; 2208 /* sleep for a bit in hopes that we can avoid
2209 * another ENOMEM */
2210 msleep(100);
2211 goto retry;
2008 } 2212 }
2009 } 2213 }
2010 2214
@@ -2027,19 +2231,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2027 2231
2028 spin_lock(&dlm->spinlock); 2232 spin_lock(&dlm->spinlock);
2029 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2233 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2030 mlog(0, "new_master already set to %u!\n", 2234 if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
2031 dlm->reco.new_master); 2235 mlog(0, "%s: new_master %u died, changing "
2236 "to %u\n", dlm->name, dlm->reco.new_master,
2237 br->node_idx);
2238 } else {
2239 mlog(0, "%s: new_master %u NOT DEAD, changing "
2240 "to %u\n", dlm->name, dlm->reco.new_master,
2241 br->node_idx);
2242 /* may not have seen the new master as dead yet */
2243 }
2032 } 2244 }
2033 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2245 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2034 mlog(0, "dead_node already set to %u!\n", 2246 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2035 dlm->reco.dead_node); 2247 "node %u changing it to %u\n", dlm->name,
2248 dlm->reco.dead_node, br->node_idx, br->dead_node);
2036 } 2249 }
2037 dlm->reco.new_master = br->node_idx; 2250 dlm->reco.new_master = br->node_idx;
2038 dlm->reco.dead_node = br->dead_node; 2251 dlm->reco.dead_node = br->dead_node;
2039 if (!test_bit(br->dead_node, dlm->recovery_map)) { 2252 if (!test_bit(br->dead_node, dlm->recovery_map)) {
2040 mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " 2253 mlog(0, "recovery master %u sees %u as dead, but this "
2041 "node has not yet. marking %u as dead\n", 2254 "node has not yet. marking %u as dead\n",
2042 br->node_idx, br->dead_node, br->dead_node); 2255 br->node_idx, br->dead_node, br->dead_node);
2256 if (!test_bit(br->dead_node, dlm->domain_map) ||
2257 !test_bit(br->dead_node, dlm->live_nodes_map))
2258 mlog(0, "%u not in domain/live_nodes map "
2259 "so setting it in reco map manually\n",
2260 br->dead_node);
2261 set_bit(br->dead_node, dlm->recovery_map);
2043 __dlm_hb_node_down(dlm, br->dead_node); 2262 __dlm_hb_node_down(dlm, br->dead_node);
2044 } 2263 }
2045 spin_unlock(&dlm->spinlock); 2264 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index cec2ce1cd318..c95f08d2e925 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
189 DLM_UNLOCK_REGRANT_LOCK| 189 DLM_UNLOCK_REGRANT_LOCK|
190 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 190 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
191 } else if (status == DLM_RECOVERING ||
192 status == DLM_MIGRATING ||
193 status == DLM_FORWARD) {
194 /* must clear the actions because this unlock
195 * is about to be retried. cannot free or do
196 * any list manipulation. */
197 mlog(0, "%s:%.*s: clearing actions, %s\n",
198 dlm->name, res->lockname.len,
199 res->lockname.name,
200 status==DLM_RECOVERING?"recovering":
201 (status==DLM_MIGRATING?"migrating":
202 "forward"));
203 actions = 0;
191 } 204 }
192 if (flags & LKM_CANCEL) 205 if (flags & LKM_CANCEL)
193 lock->cancel_pending = 0; 206 lock->cancel_pending = 0;
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index e1fdd288796e..c3764f4744ee 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -27,7 +27,7 @@
27 * Boston, MA 021110-1307, USA. 27 * Boston, MA 021110-1307, USA.
28 */ 28 */
29 29
30#include <asm/signal.h> 30#include <linux/signal.h>
31 31
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/fs.h> 33#include <linux/fs.h>
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2fb40cd296a..e6f207eebab4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -181,6 +181,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
181 ret = -EBADR; 181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) { 182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret); 183 mlog_errno(ret);
184 ocfs2_error(inode->i_sb,
185 "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
186 i,
187 le64_to_cpu(rec->e_blkno),
188 OCFS2_I(inode)->ip_blkno,
189 OCFS2_I(inode)->ip_clusters);
184 goto out_free; 190 goto out_free;
185 } 191 }
186 192
@@ -226,6 +232,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
226 ret = -EBADR; 232 ret = -EBADR;
227 if (blkno) { 233 if (blkno) {
228 mlog_errno(ret); 234 mlog_errno(ret);
235 ocfs2_error(inode->i_sb,
236 "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n",
237 cpos, clusters,
238 OCFS2_I(inode)->ip_blkno,
239 blkno, i,
240 le64_to_cpu(rec->e_blkno));
229 goto out_free; 241 goto out_free;
230 } 242 }
231 243
@@ -238,6 +250,10 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
238 */ 250 */
239 ret = -EBADR; 251 ret = -EBADR;
240 if (!blkno) { 252 if (!blkno) {
253 ocfs2_error(inode->i_sb,
254 "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n",
255 cpos, clusters,
256 OCFS2_I(inode)->ip_blkno);
241 mlog_errno(ret); 257 mlog_errno(ret);
242 goto out_free; 258 goto out_free;
243 } 259 }
@@ -262,11 +278,24 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
262 el = &eb->h_list; 278 el = &eb->h_list;
263 } 279 }
264 280
265 if (el->l_tree_depth) 281 BUG_ON(el->l_tree_depth);
266 BUG();
267 282
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 283 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i]; 284 rec = &el->l_recs[i];
285
286 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
287 OCFS2_I(inode)->ip_clusters) {
288 ret = -EBADR;
289 mlog_errno(ret);
290 ocfs2_error(inode->i_sb,
291 "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
292 i,
293 le64_to_cpu(rec->e_blkno),
294 OCFS2_I(inode)->ip_blkno,
295 OCFS2_I(inode)->ip_clusters);
296 return ret;
297 }
298
270 ret = ocfs2_extent_map_insert(inode, rec, 299 ret = ocfs2_extent_map_insert(inode, rec,
271 le16_to_cpu(el->l_tree_depth)); 300 le16_to_cpu(el->l_tree_depth));
272 if (ret) { 301 if (ret) {
@@ -364,8 +393,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode,
364 return ret; 393 return ret;
365 } 394 }
366 395
367 if (ent->e_tree_depth) 396 /* FIXME: Make sure this isn't a corruption */
368 BUG(); /* FIXME: Make sure this isn't a corruption */ 397 BUG_ON(ent->e_tree_depth);
369 398
370 *ret_ent = ent; 399 *ret_ent = ent;
371 400
@@ -423,8 +452,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode,
423 le32_to_cpu(rec->e_clusters), NULL, 452 le32_to_cpu(rec->e_clusters), NULL,
424 NULL); 453 NULL);
425 454
426 if (!old_ent) 455 BUG_ON(!old_ent);
427 BUG();
428 456
429 ret = -EEXIST; 457 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth) 458 if (old_ent->e_tree_depth < tree_depth)
@@ -528,6 +556,10 @@ static int ocfs2_extent_map_insert(struct inode *inode,
528 OCFS2_I(inode)->ip_map.em_clusters) { 556 OCFS2_I(inode)->ip_map.em_clusters) {
529 ret = -EBADR; 557 ret = -EBADR;
530 mlog_errno(ret); 558 mlog_errno(ret);
559 ocfs2_error(inode->i_sb,
560 "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n",
561 le64_to_cpu(rec->e_blkno),
562 OCFS2_I(inode)->ip_blkno);
531 return ret; 563 return ret;
532 } 564 }
533 565
@@ -590,12 +622,12 @@ static int ocfs2_extent_map_insert(struct inode *inode,
590 * Existing record in the extent map: 622 * Existing record in the extent map:
591 * 623 *
592 * cpos = 10, len = 10 624 * cpos = 10, len = 10
593 * |---------| 625 * |---------|
594 * 626 *
595 * New Record: 627 * New Record:
596 * 628 *
597 * cpos = 10, len = 20 629 * cpos = 10, len = 20
598 * |------------------| 630 * |------------------|
599 * 631 *
600 * The passed record is the new on-disk record. The new_clusters value 632 * The passed record is the new on-disk record. The new_clusters value
601 * is how many clusters were added to the file. If the append is a 633 * is how many clusters were added to the file. If the append is a
@@ -988,7 +1020,7 @@ int __init init_ocfs2_extent_maps(void)
988 return 0; 1020 return 0;
989} 1021}
990 1022
991void __exit exit_ocfs2_extent_maps(void) 1023void exit_ocfs2_extent_maps(void)
992{ 1024{
993 kmem_cache_destroy(ocfs2_em_ent_cachep); 1025 kmem_cache_destroy(ocfs2_em_ent_cachep);
994} 1026}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index eaf33caa0a1f..8a4048b55fdc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -933,9 +933,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
933 struct file *filp = iocb->ki_filp; 933 struct file *filp = iocb->ki_filp;
934 struct inode *inode = filp->f_dentry->d_inode; 934 struct inode *inode = filp->f_dentry->d_inode;
935 loff_t newsize, saved_pos; 935 loff_t newsize, saved_pos;
936#ifdef OCFS2_ORACORE_WORKAROUNDS
937 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
938#endif
939 936
940 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 937 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
941 (unsigned int)count, 938 (unsigned int)count,
@@ -951,14 +948,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
951 return -EIO; 948 return -EIO;
952 } 949 }
953 950
954#ifdef OCFS2_ORACORE_WORKAROUNDS
955 /* ugh, work around some applications which open everything O_DIRECT +
956 * O_APPEND and really don't mean to use O_DIRECT. */
957 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
958 (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
959 filp->f_flags &= ~O_DIRECT;
960#endif
961
962 mutex_lock(&inode->i_mutex); 951 mutex_lock(&inode->i_mutex);
963 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 952 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
964 if (filp->f_flags & O_DIRECT) { 953 if (filp->f_flags & O_DIRECT) {
@@ -1022,8 +1011,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1022 } 1011 }
1023 newsize = count + saved_pos; 1012 newsize = count + saved_pos;
1024 1013
1025 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", 1014 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1026 saved_pos, newsize, i_size_read(inode)); 1015 (long long) saved_pos, (long long) newsize,
1016 (long long) i_size_read(inode));
1027 1017
1028 /* No need for a higher level metadata lock if we're 1018 /* No need for a higher level metadata lock if we're
1029 * never going past i_size. */ 1019 * never going past i_size. */
@@ -1042,8 +1032,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1042 spin_unlock(&OCFS2_I(inode)->ip_lock); 1032 spin_unlock(&OCFS2_I(inode)->ip_lock);
1043 1033
1044 mlog(0, "Writing at EOF, may need more allocation: " 1034 mlog(0, "Writing at EOF, may need more allocation: "
1045 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", 1035 "i_size = %lld, newsize = %lld, need %u clusters\n",
1046 i_size_read(inode), newsize, clusters); 1036 (long long) i_size_read(inode), (long long) newsize,
1037 clusters);
1047 1038
1048 /* We only want to continue the rest of this loop if 1039 /* We only want to continue the rest of this loop if
1049 * our extend will actually require more 1040 * our extend will actually require more
@@ -1077,27 +1068,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1077 /* communicate with ocfs2_dio_end_io */ 1068 /* communicate with ocfs2_dio_end_io */
1078 ocfs2_iocb_set_rw_locked(iocb); 1069 ocfs2_iocb_set_rw_locked(iocb);
1079 1070
1080#ifdef OCFS2_ORACORE_WORKAROUNDS 1071 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
1081 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
1082 filp->f_flags & O_DIRECT) {
1083 unsigned int saved_flags = filp->f_flags;
1084 int sector_size = 1 << osb->s_sectsize_bits;
1085
1086 if ((saved_pos & (sector_size - 1)) ||
1087 (count & (sector_size - 1)) ||
1088 ((unsigned long)buf & (sector_size - 1))) {
1089 filp->f_flags |= O_SYNC;
1090 filp->f_flags &= ~O_DIRECT;
1091 }
1092
1093 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1094 &iocb->ki_pos);
1095
1096 filp->f_flags = saved_flags;
1097 } else
1098#endif
1099 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1100 &iocb->ki_pos);
1101 1072
1102 /* buffered aio wouldn't have proper lock coverage today */ 1073 /* buffered aio wouldn't have proper lock coverage today */
1103 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1074 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -1138,9 +1109,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1138 int ret = 0, rw_level = -1, have_alloc_sem = 0; 1109 int ret = 0, rw_level = -1, have_alloc_sem = 0;
1139 struct file *filp = iocb->ki_filp; 1110 struct file *filp = iocb->ki_filp;
1140 struct inode *inode = filp->f_dentry->d_inode; 1111 struct inode *inode = filp->f_dentry->d_inode;
1141#ifdef OCFS2_ORACORE_WORKAROUNDS
1142 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1143#endif
1144 1112
1145 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 1113 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
1146 (unsigned int)count, 1114 (unsigned int)count,
@@ -1153,21 +1121,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1153 goto bail; 1121 goto bail;
1154 } 1122 }
1155 1123
1156#ifdef OCFS2_ORACORE_WORKAROUNDS
1157 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
1158 if (filp->f_flags & O_DIRECT) {
1159 int sector_size = 1 << osb->s_sectsize_bits;
1160
1161 if ((pos & (sector_size - 1)) ||
1162 (count & (sector_size - 1)) ||
1163 ((unsigned long)buf & (sector_size - 1)) ||
1164 (i_size_read(inode) & (sector_size -1))) {
1165 filp->f_flags &= ~O_DIRECT;
1166 }
1167 }
1168 }
1169#endif
1170
1171 /* 1124 /*
1172 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1125 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1173 * need locks to protect pending reads from racing with truncate. 1126 * need locks to protect pending reads from racing with truncate.
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0bbd22f46c80..cbfd45a97a63 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -67,6 +67,7 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb)
67 ocfs2_node_map_init(&osb->mounted_map); 67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map); 68 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map); 69 ocfs2_node_map_init(&osb->umount_map);
70 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
70} 71}
71 72
72static void ocfs2_do_node_down(int node_num, 73static void ocfs2_do_node_down(int node_num,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d4ecc0627716..315472a5c192 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -41,6 +41,7 @@
41#include "dlmglue.h" 41#include "dlmglue.h"
42#include "extent_map.h" 42#include "extent_map.h"
43#include "file.h" 43#include "file.h"
44#include "heartbeat.h"
44#include "inode.h" 45#include "inode.h"
45#include "journal.h" 46#include "journal.h"
46#include "namei.h" 47#include "namei.h"
@@ -544,6 +545,42 @@ bail:
544 return status; 545 return status;
545} 546}
546 547
548/*
549 * Serialize with orphan dir recovery. If the process doing
550 * recovery on this orphan dir does an iget() with the dir
551 * i_mutex held, we'll deadlock here. Instead we detect this
552 * and exit early - recovery will wipe this inode for us.
553 */
554static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
555 int slot)
556{
557 int ret = 0;
558
559 spin_lock(&osb->osb_lock);
560 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
561 mlog(0, "Recovery is happening on orphan dir %d, will skip "
562 "this inode\n", slot);
563 ret = -EDEADLK;
564 goto out;
565 }
566 /* This signals to the orphan recovery process that it should
567 * wait for us to handle the wipe. */
568 osb->osb_orphan_wipes[slot]++;
569out:
570 spin_unlock(&osb->osb_lock);
571 return ret;
572}
573
574static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
575 int slot)
576{
577 spin_lock(&osb->osb_lock);
578 osb->osb_orphan_wipes[slot]--;
579 spin_unlock(&osb->osb_lock);
580
581 wake_up(&osb->osb_wipe_event);
582}
583
547static int ocfs2_wipe_inode(struct inode *inode, 584static int ocfs2_wipe_inode(struct inode *inode,
548 struct buffer_head *di_bh) 585 struct buffer_head *di_bh)
549{ 586{
@@ -555,6 +592,11 @@ static int ocfs2_wipe_inode(struct inode *inode,
555 /* We've already voted on this so it should be readonly - no 592 /* We've already voted on this so it should be readonly - no
556 * spinlock needed. */ 593 * spinlock needed. */
557 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; 594 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
595
596 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
597 if (status)
598 return status;
599
558 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 600 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
559 ORPHAN_DIR_SYSTEM_INODE, 601 ORPHAN_DIR_SYSTEM_INODE,
560 orphaned_slot); 602 orphaned_slot);
@@ -597,6 +639,7 @@ bail_unlock_dir:
597 brelse(orphan_dir_bh); 639 brelse(orphan_dir_bh);
598bail: 640bail:
599 iput(orphan_dir_inode); 641 iput(orphan_dir_inode);
642 ocfs2_signal_wipe_completion(osb, orphaned_slot);
600 643
601 return status; 644 return status;
602} 645}
@@ -822,7 +865,8 @@ void ocfs2_delete_inode(struct inode *inode)
822 865
823 status = ocfs2_wipe_inode(inode, di_bh); 866 status = ocfs2_wipe_inode(inode, di_bh);
824 if (status < 0) { 867 if (status < 0) {
825 mlog_errno(status); 868 if (status != -EDEADLK)
869 mlog_errno(status);
826 goto bail_unlock_inode; 870 goto bail_unlock_inode;
827 } 871 }
828 872
@@ -903,10 +947,10 @@ void ocfs2_clear_inode(struct inode *inode)
903 "Clear inode of %"MLFu64", inode is locked\n", 947 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno); 948 oi->ip_blkno);
905 949
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), 950 mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
907 "Clear inode of %"MLFu64", io_sem is locked\n", 951 "Clear inode of %"MLFu64", io_mutex is locked\n",
908 oi->ip_blkno); 952 oi->ip_blkno);
909 up(&oi->ip_io_sem); 953 mutex_unlock(&oi->ip_io_mutex);
910 954
911 /* 955 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1 956 * down_trylock() returns 0, down_write_trylock() returns 1
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9b0177433653..84c507961287 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,10 +46,10 @@ struct ocfs2_inode_info
46 struct list_head ip_io_markers; 46 struct list_head ip_io_markers;
47 int ip_orphaned_slot; 47 int ip_orphaned_slot;
48 48
49 struct semaphore ip_io_sem; 49 struct mutex ip_io_mutex;
50 50
51 /* Used by the journalling code to attach an inode to a 51 /* Used by the journalling code to attach an inode to a
52 * handle. These are protected by ip_io_sem in order to lock 52 * handle. These are protected by ip_io_mutex in order to lock
53 * out other I/O to the inode until we either commit or 53 * out other I/O to the inode until we either commit or
54 * abort. */ 54 * abort. */
55 struct list_head ip_handle_list; 55 struct list_head ip_handle_list;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 303c8d96457f..4be801f4559b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
147 147
148 mlog_entry("(max_buffs = %d)\n", max_buffs); 148 mlog_entry("(max_buffs = %d)\n", max_buffs);
149 149
150 if (!osb || !osb->journal->j_journal) 150 BUG_ON(!osb || !osb->journal->j_journal);
151 BUG();
152 151
153 if (ocfs2_is_hard_readonly(osb)) { 152 if (ocfs2_is_hard_readonly(osb)) {
154 ret = -EROFS; 153 ret = -EROFS;
@@ -401,7 +400,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
401 * j_trans_barrier for us. */ 400 * j_trans_barrier for us. */
402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); 401 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
403 402
404 down(&OCFS2_I(inode)->ip_io_sem); 403 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
405 switch (type) { 404 switch (type) {
406 case OCFS2_JOURNAL_ACCESS_CREATE: 405 case OCFS2_JOURNAL_ACCESS_CREATE:
407 case OCFS2_JOURNAL_ACCESS_WRITE: 406 case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -416,7 +415,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
416 status = -EINVAL; 415 status = -EINVAL;
417 mlog(ML_ERROR, "Uknown access type!\n"); 416 mlog(ML_ERROR, "Uknown access type!\n");
418 } 417 }
419 up(&OCFS2_I(inode)->ip_io_sem); 418 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
420 419
421 if (status < 0) 420 if (status < 0)
422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 421 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -561,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
561 SET_INODE_JOURNAL(inode); 560 SET_INODE_JOURNAL(inode);
562 OCFS2_I(inode)->ip_open_count++; 561 OCFS2_I(inode)->ip_open_count++;
563 562
564 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 563 /* Skip recovery waits here - journal inode metadata never
564 * changes in a live cluster so it can be considered an
565 * exception to the rule. */
566 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
567 OCFS2_META_LOCK_RECOVERY);
565 if (status < 0) { 568 if (status < 0) {
566 if (status != -ERESTARTSYS) 569 if (status != -ERESTARTSYS)
567 mlog(ML_ERROR, "Could not get lock on journal!\n"); 570 mlog(ML_ERROR, "Could not get lock on journal!\n");
@@ -672,8 +675,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
672 675
673 mlog_entry_void(); 676 mlog_entry_void();
674 677
675 if (!osb) 678 BUG_ON(!osb);
676 BUG();
677 679
678 journal = osb->journal; 680 journal = osb->journal;
679 if (!journal) 681 if (!journal)
@@ -805,8 +807,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
805 807
806 mlog_entry_void(); 808 mlog_entry_void();
807 809
808 if (!journal) 810 BUG_ON(!journal);
809 BUG();
810 811
811 status = journal_wipe(journal->j_journal, full); 812 status = journal_wipe(journal->j_journal, full);
812 if (status < 0) { 813 if (status < 0) {
@@ -1072,10 +1073,10 @@ restart:
1072 NULL); 1073 NULL);
1073 1074
1074bail: 1075bail:
1075 down(&osb->recovery_lock); 1076 mutex_lock(&osb->recovery_lock);
1076 if (!status && 1077 if (!status &&
1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1078 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1078 up(&osb->recovery_lock); 1079 mutex_unlock(&osb->recovery_lock);
1079 goto restart; 1080 goto restart;
1080 } 1081 }
1081 1082
@@ -1083,7 +1084,7 @@ bail:
1083 mb(); /* sync with ocfs2_recovery_thread_running */ 1084 mb(); /* sync with ocfs2_recovery_thread_running */
1084 wake_up(&osb->recovery_event); 1085 wake_up(&osb->recovery_event);
1085 1086
1086 up(&osb->recovery_lock); 1087 mutex_unlock(&osb->recovery_lock);
1087 1088
1088 mlog_exit(status); 1089 mlog_exit(status);
1089 /* no one is callint kthread_stop() for us so the kthread() api 1090 /* no one is callint kthread_stop() for us so the kthread() api
@@ -1098,7 +1099,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1099 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1099 node_num, osb->node_num); 1100 node_num, osb->node_num);
1100 1101
1101 down(&osb->recovery_lock); 1102 mutex_lock(&osb->recovery_lock);
1102 if (osb->disable_recovery) 1103 if (osb->disable_recovery)
1103 goto out; 1104 goto out;
1104 1105
@@ -1120,7 +1121,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1120 } 1121 }
1121 1122
1122out: 1123out:
1123 up(&osb->recovery_lock); 1124 mutex_unlock(&osb->recovery_lock);
1124 wake_up(&osb->recovery_event); 1125 wake_up(&osb->recovery_event);
1125 1126
1126 mlog_exit_void(); 1127 mlog_exit_void();
@@ -1271,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1271 1272
1272 /* Should not ever be called to recover ourselves -- in that 1273 /* Should not ever be called to recover ourselves -- in that
1273 * case we should've called ocfs2_journal_load instead. */ 1274 * case we should've called ocfs2_journal_load instead. */
1274 if (osb->node_num == node_num) 1275 BUG_ON(osb->node_num == node_num);
1275 BUG();
1276 1276
1277 slot_num = ocfs2_node_num_to_slot(si, node_num); 1277 slot_num = ocfs2_node_num_to_slot(si, node_num);
1278 if (slot_num == OCFS2_INVALID_SLOT) { 1278 if (slot_num == OCFS2_INVALID_SLOT) {
@@ -1408,21 +1408,17 @@ bail:
1408 return status; 1408 return status;
1409} 1409}
1410 1410
1411static int ocfs2_recover_orphans(struct ocfs2_super *osb, 1411static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1412 int slot) 1412 int slot,
1413 struct inode **head)
1413{ 1414{
1414 int status = 0; 1415 int status;
1415 int have_disk_lock = 0;
1416 struct inode *inode = NULL;
1417 struct inode *iter;
1418 struct inode *orphan_dir_inode = NULL; 1416 struct inode *orphan_dir_inode = NULL;
1417 struct inode *iter;
1419 unsigned long offset, blk, local; 1418 unsigned long offset, blk, local;
1420 struct buffer_head *bh = NULL; 1419 struct buffer_head *bh = NULL;
1421 struct ocfs2_dir_entry *de; 1420 struct ocfs2_dir_entry *de;
1422 struct super_block *sb = osb->sb; 1421 struct super_block *sb = osb->sb;
1423 struct ocfs2_inode_info *oi;
1424
1425 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1426 1422
1427 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1423 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1428 ORPHAN_DIR_SYSTEM_INODE, 1424 ORPHAN_DIR_SYSTEM_INODE,
@@ -1430,17 +1426,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1430 if (!orphan_dir_inode) { 1426 if (!orphan_dir_inode) {
1431 status = -ENOENT; 1427 status = -ENOENT;
1432 mlog_errno(status); 1428 mlog_errno(status);
1433 goto out; 1429 return status;
1434 } 1430 }
1435 1431
1436 mutex_lock(&orphan_dir_inode->i_mutex); 1432 mutex_lock(&orphan_dir_inode->i_mutex);
1437 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); 1433 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
1438 if (status < 0) { 1434 if (status < 0) {
1439 mutex_unlock(&orphan_dir_inode->i_mutex);
1440 mlog_errno(status); 1435 mlog_errno(status);
1441 goto out; 1436 goto out;
1442 } 1437 }
1443 have_disk_lock = 1;
1444 1438
1445 offset = 0; 1439 offset = 0;
1446 iter = NULL; 1440 iter = NULL;
@@ -1451,11 +1445,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1451 if (!bh) 1445 if (!bh)
1452 status = -EINVAL; 1446 status = -EINVAL;
1453 if (status < 0) { 1447 if (status < 0) {
1454 mutex_unlock(&orphan_dir_inode->i_mutex);
1455 if (bh) 1448 if (bh)
1456 brelse(bh); 1449 brelse(bh);
1457 mlog_errno(status); 1450 mlog_errno(status);
1458 goto out; 1451 goto out_unlock;
1459 } 1452 }
1460 1453
1461 local = 0; 1454 local = 0;
@@ -1465,11 +1458,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1465 1458
1466 if (!ocfs2_check_dir_entry(orphan_dir_inode, 1459 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1467 de, bh, local)) { 1460 de, bh, local)) {
1468 mutex_unlock(&orphan_dir_inode->i_mutex);
1469 status = -EINVAL; 1461 status = -EINVAL;
1470 mlog_errno(status); 1462 mlog_errno(status);
1471 brelse(bh); 1463 brelse(bh);
1472 goto out; 1464 goto out_unlock;
1473 } 1465 }
1474 1466
1475 local += le16_to_cpu(de->rec_len); 1467 local += le16_to_cpu(de->rec_len);
@@ -1504,18 +1496,95 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1504 1496
1505 mlog(0, "queue orphan %"MLFu64"\n", 1497 mlog(0, "queue orphan %"MLFu64"\n",
1506 OCFS2_I(iter)->ip_blkno); 1498 OCFS2_I(iter)->ip_blkno);
1507 OCFS2_I(iter)->ip_next_orphan = inode; 1499 /* No locking is required for the next_orphan
1508 inode = iter; 1500 * queue as there is only ever a single
1501 * process doing orphan recovery. */
1502 OCFS2_I(iter)->ip_next_orphan = *head;
1503 *head = iter;
1509 } 1504 }
1510 brelse(bh); 1505 brelse(bh);
1511 } 1506 }
1512 mutex_unlock(&orphan_dir_inode->i_mutex);
1513 1507
1508out_unlock:
1514 ocfs2_meta_unlock(orphan_dir_inode, 0); 1509 ocfs2_meta_unlock(orphan_dir_inode, 0);
1515 have_disk_lock = 0; 1510out:
1516 1511 mutex_unlock(&orphan_dir_inode->i_mutex);
1517 iput(orphan_dir_inode); 1512 iput(orphan_dir_inode);
1518 orphan_dir_inode = NULL; 1513 return status;
1514}
1515
1516static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
1517 int slot)
1518{
1519 int ret;
1520
1521 spin_lock(&osb->osb_lock);
1522 ret = !osb->osb_orphan_wipes[slot];
1523 spin_unlock(&osb->osb_lock);
1524 return ret;
1525}
1526
1527static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
1528 int slot)
1529{
1530 spin_lock(&osb->osb_lock);
1531 /* Mark ourselves such that new processes in delete_inode()
1532 * know to quit early. */
1533 ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
1534 while (osb->osb_orphan_wipes[slot]) {
1535 /* If any processes are already in the middle of an
1536 * orphan wipe on this dir, then we need to wait for
1537 * them. */
1538 spin_unlock(&osb->osb_lock);
1539 wait_event_interruptible(osb->osb_wipe_event,
1540 ocfs2_orphan_recovery_can_continue(osb, slot));
1541 spin_lock(&osb->osb_lock);
1542 }
1543 spin_unlock(&osb->osb_lock);
1544}
1545
1546static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
1547 int slot)
1548{
1549 ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
1550}
1551
1552/*
1553 * Orphan recovery. Each mounted node has it's own orphan dir which we
1554 * must run during recovery. Our strategy here is to build a list of
1555 * the inodes in the orphan dir and iget/iput them. The VFS does
1556 * (most) of the rest of the work.
1557 *
1558 * Orphan recovery can happen at any time, not just mount so we have a
1559 * couple of extra considerations.
1560 *
1561 * - We grab as many inodes as we can under the orphan dir lock -
1562 * doing iget() outside the orphan dir risks getting a reference on
1563 * an invalid inode.
1564 * - We must be sure not to deadlock with other processes on the
1565 * system wanting to run delete_inode(). This can happen when they go
1566 * to lock the orphan dir and the orphan recovery process attempts to
1567 * iget() inside the orphan dir lock. This can be avoided by
1568 * advertising our state to ocfs2_delete_inode().
1569 */
1570static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1571 int slot)
1572{
1573 int ret = 0;
1574 struct inode *inode = NULL;
1575 struct inode *iter;
1576 struct ocfs2_inode_info *oi;
1577
1578 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1579
1580 ocfs2_mark_recovering_orphan_dir(osb, slot);
1581 ret = ocfs2_queue_orphans(osb, slot, &inode);
1582 ocfs2_clear_recovering_orphan_dir(osb, slot);
1583
1584 /* Error here should be noted, but we want to continue with as
1585 * many queued inodes as we've got. */
1586 if (ret)
1587 mlog_errno(ret);
1519 1588
1520 while (inode) { 1589 while (inode) {
1521 oi = OCFS2_I(inode); 1590 oi = OCFS2_I(inode);
@@ -1541,14 +1610,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1541 inode = iter; 1610 inode = iter;
1542 } 1611 }
1543 1612
1544out: 1613 return ret;
1545 if (have_disk_lock)
1546 ocfs2_meta_unlock(orphan_dir_inode, 0);
1547
1548 if (orphan_dir_inode)
1549 iput(orphan_dir_inode);
1550
1551 return status;
1552} 1614}
1553 1615
1554static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1616static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
@@ -1584,10 +1646,9 @@ static int ocfs2_commit_thread(void *arg)
1584 while (!(kthread_should_stop() && 1646 while (!(kthread_should_stop() &&
1585 atomic_read(&journal->j_num_trans) == 0)) { 1647 atomic_read(&journal->j_num_trans) == 0)) {
1586 1648
1587 wait_event_interruptible_timeout(osb->checkpoint_event, 1649 wait_event_interruptible(osb->checkpoint_event,
1588 atomic_read(&journal->j_num_trans) 1650 atomic_read(&journal->j_num_trans)
1589 || kthread_should_stop(), 1651 || kthread_should_stop());
1590 OCFS2_CHECKPOINT_INTERVAL);
1591 1652
1592 status = ocfs2_commit_cache(osb); 1653 status = ocfs2_commit_cache(osb);
1593 if (status < 0) 1654 if (status < 0)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7d0a816184fa..2f3a6acdac45 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -29,8 +29,6 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/jbd.h> 30#include <linux/jbd.h>
31 31
32#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ)
33
34enum ocfs2_journal_state { 32enum ocfs2_journal_state {
35 OCFS2_JOURNAL_FREE = 0, 33 OCFS2_JOURNAL_FREE = 0,
36 OCFS2_JOURNAL_LOADED, 34 OCFS2_JOURNAL_LOADED,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f468c600cf92..e89de9b6e491 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -33,6 +33,7 @@
33#include <linux/rbtree.h> 33#include <linux/rbtree.h>
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h>
36 37
37#include "cluster/nodemanager.h" 38#include "cluster/nodemanager.h"
38#include "cluster/heartbeat.h" 39#include "cluster/heartbeat.h"
@@ -173,9 +174,6 @@ enum ocfs2_mount_options
173 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 174 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
174 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 175 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
175 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 176 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
176#ifdef OCFS2_ORACORE_WORKAROUNDS
177 OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
178#endif
179}; 177};
180 178
181#define OCFS2_OSB_SOFT_RO 0x0001 179#define OCFS2_OSB_SOFT_RO 0x0001
@@ -233,7 +231,7 @@ struct ocfs2_super
233 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ 231 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
234 232
235 atomic_t vol_state; 233 atomic_t vol_state;
236 struct semaphore recovery_lock; 234 struct mutex recovery_lock;
237 struct task_struct *recovery_thread_task; 235 struct task_struct *recovery_thread_task;
238 int disable_recovery; 236 int disable_recovery;
239 wait_queue_head_t checkpoint_event; 237 wait_queue_head_t checkpoint_event;
@@ -289,6 +287,10 @@ struct ocfs2_super
289 struct inode *osb_tl_inode; 287 struct inode *osb_tl_inode;
290 struct buffer_head *osb_tl_bh; 288 struct buffer_head *osb_tl_bh;
291 struct work_struct osb_truncate_log_wq; 289 struct work_struct osb_truncate_log_wq;
290
291 struct ocfs2_node_map osb_recovering_orphan_dirs;
292 unsigned int *osb_orphan_wipes;
293 wait_queue_head_t osb_wipe_event;
292}; 294};
293 295
294#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 296#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index dfb8a5bedfc8..c5b1ac547c15 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -138,7 +138,6 @@
138 138
139/* Journal limits (in bytes) */ 139/* Journal limits (in bytes) */
140#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 140#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
141#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
142 141
143struct ocfs2_system_inode_info { 142struct ocfs2_system_inode_info {
144 char *si_name; 143 char *si_name;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 364d64bd5f10..8dd3aafec499 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data,
932 oi->ip_dir_start_lookup = 0; 932 oi->ip_dir_start_lookup = 0;
933 933
934 init_rwsem(&oi->ip_alloc_sem); 934 init_rwsem(&oi->ip_alloc_sem);
935 init_MUTEX(&(oi->ip_io_sem)); 935 mutex_init(&oi->ip_io_mutex);
936 936
937 oi->ip_blkno = 0ULL; 937 oi->ip_blkno = 0ULL;
938 oi->ip_clusters = 0; 938 oi->ip_clusters = 0;
@@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1137 1137
1138 /* disable any new recovery threads and wait for any currently 1138 /* disable any new recovery threads and wait for any currently
1139 * running ones to exit. Do this before setting the vol_state. */ 1139 * running ones to exit. Do this before setting the vol_state. */
1140 down(&osb->recovery_lock); 1140 mutex_lock(&osb->recovery_lock);
1141 osb->disable_recovery = 1; 1141 osb->disable_recovery = 1;
1142 up(&osb->recovery_lock); 1142 mutex_unlock(&osb->recovery_lock);
1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); 1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1144 1144
1145 /* At this point, we know that no more recovery threads can be 1145 /* At this point, we know that no more recovery threads can be
@@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1254 osb->sb = sb; 1254 osb->sb = sb;
1255 /* Save off for ocfs2_rw_direct */ 1255 /* Save off for ocfs2_rw_direct */
1256 osb->s_sectsize_bits = blksize_bits(sector_size); 1256 osb->s_sectsize_bits = blksize_bits(sector_size);
1257 if (!osb->s_sectsize_bits) 1257 BUG_ON(!osb->s_sectsize_bits);
1258 BUG();
1259 1258
1260 osb->net_response_ids = 0; 1259 osb->net_response_ids = 0;
1261 spin_lock_init(&osb->net_response_lock); 1260 spin_lock_init(&osb->net_response_lock);
@@ -1283,7 +1282,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1283 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 1282 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1284 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1283 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1285 1284
1286 init_MUTEX(&osb->recovery_lock); 1285 mutex_init(&osb->recovery_lock);
1287 1286
1288 osb->disable_recovery = 0; 1287 osb->disable_recovery = 0;
1289 osb->recovery_thread_task = NULL; 1288 osb->recovery_thread_task = NULL;
@@ -1326,6 +1325,16 @@ static int ocfs2_initialize_super(struct super_block *sb,
1326 } 1325 }
1327 mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); 1326 mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
1328 1327
1328 init_waitqueue_head(&osb->osb_wipe_event);
1329 osb->osb_orphan_wipes = kcalloc(osb->max_slots,
1330 sizeof(*osb->osb_orphan_wipes),
1331 GFP_KERNEL);
1332 if (!osb->osb_orphan_wipes) {
1333 status = -ENOMEM;
1334 mlog_errno(status);
1335 goto bail;
1336 }
1337
1329 osb->s_feature_compat = 1338 osb->s_feature_compat =
1330 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); 1339 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
1331 osb->s_feature_ro_compat = 1340 osb->s_feature_ro_compat =
@@ -1639,6 +1648,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
1639 if (osb->slot_info) 1648 if (osb->slot_info)
1640 ocfs2_free_slot_info(osb->slot_info); 1649 ocfs2_free_slot_info(osb->slot_info);
1641 1650
1651 kfree(osb->osb_orphan_wipes);
1642 /* FIXME 1652 /* FIXME
1643 * This belongs in journal shutdown, but because we have to 1653 * This belongs in journal shutdown, but because we have to
1644 * allocate osb->journal at the start of ocfs2_initalize_osb(), 1654 * allocate osb->journal at the start of ocfs2_initalize_osb(),
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 600a8bc5b541..fc29cb7a437d 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
77 if (arr && ((inode = *arr) != NULL)) { 77 if (arr && ((inode = *arr) != NULL)) {
78 /* get a ref in addition to the array ref */ 78 /* get a ref in addition to the array ref */
79 inode = igrab(inode); 79 inode = igrab(inode);
80 if (!inode) 80 BUG_ON(!inode);
81 BUG();
82 81
83 return inode; 82 return inode;
84 } 83 }
@@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
89 /* add one more if putting into array for first time */ 88 /* add one more if putting into array for first time */
90 if (arr && inode) { 89 if (arr && inode) {
91 *arr = igrab(inode); 90 *arr = igrab(inode);
92 if (!*arr) 91 BUG_ON(!*arr);
93 BUG();
94 } 92 }
95 return inode; 93 return inode;
96} 94}
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 3a0458fd3e1b..300b5bedfb21 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -388,7 +388,7 @@ out_free:
388 } 388 }
389} 389}
390 390
391/* Item insertion is guarded by ip_io_sem, so the insertion path takes 391/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
392 * advantage of this by not rechecking for a duplicate insert during 392 * advantage of this by not rechecking for a duplicate insert during
393 * the slow case. Additionally, if the cache needs to be bumped up to 393 * the slow case. Additionally, if the cache needs to be bumped up to
394 * a tree, the code will not recheck after acquiring the lock -- 394 * a tree, the code will not recheck after acquiring the lock --
@@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
418 (unsigned long long) bh->b_blocknr); 418 (unsigned long long) bh->b_blocknr);
419 419
420 /* No need to recheck under spinlock - insertion is guarded by 420 /* No need to recheck under spinlock - insertion is guarded by
421 * ip_io_sem */ 421 * ip_io_mutex */
422 spin_lock(&oi->ip_lock); 422 spin_lock(&oi->ip_lock);
423 if (ocfs2_insert_can_use_array(oi, ci)) { 423 if (ocfs2_insert_can_use_array(oi, ci)) {
424 /* Fast case - it's an array and there's a free 424 /* Fast case - it's an array and there's a free
@@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
440 440
441/* Called against a newly allocated buffer. Most likely nobody should 441/* Called against a newly allocated buffer. Most likely nobody should
442 * be able to read this sort of metadata while it's still being 442 * be able to read this sort of metadata while it's still being
443 * allocated, but this is careful to take ip_io_sem anyway. */ 443 * allocated, but this is careful to take ip_io_mutex anyway. */
444void ocfs2_set_new_buffer_uptodate(struct inode *inode, 444void ocfs2_set_new_buffer_uptodate(struct inode *inode,
445 struct buffer_head *bh) 445 struct buffer_head *bh)
446{ 446{
@@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
451 451
452 set_buffer_uptodate(bh); 452 set_buffer_uptodate(bh);
453 453
454 down(&oi->ip_io_sem); 454 mutex_lock(&oi->ip_io_mutex);
455 ocfs2_set_buffer_uptodate(inode, bh); 455 ocfs2_set_buffer_uptodate(inode, bh);
456 up(&oi->ip_io_sem); 456 mutex_unlock(&oi->ip_io_mutex);
457} 457}
458 458
459/* Requires ip_lock. */ 459/* Requires ip_lock. */
@@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void)
537 return 0; 537 return 0;
538} 538}
539 539
540void __exit exit_ocfs2_uptodate_cache(void) 540void exit_ocfs2_uptodate_cache(void)
541{ 541{
542 if (ocfs2_uptodate_cachep) 542 if (ocfs2_uptodate_cachep)
543 kmem_cache_destroy(ocfs2_uptodate_cachep); 543 kmem_cache_destroy(ocfs2_uptodate_cachep);
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index e5aacdf4eabf..01cd32d26b06 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -27,7 +27,7 @@
27#define OCFS2_UPTODATE_H 27#define OCFS2_UPTODATE_H
28 28
29int __init init_ocfs2_uptodate_cache(void); 29int __init init_ocfs2_uptodate_cache(void);
30void __exit exit_ocfs2_uptodate_cache(void); 30void exit_ocfs2_uptodate_cache(void);
31 31
32void ocfs2_metadata_cache_init(struct inode *inode); 32void ocfs2_metadata_cache_init(struct inode *inode);
33void ocfs2_metadata_cache_purge(struct inode *inode); 33void ocfs2_metadata_cache_purge(struct inode *inode);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 78010ad60e47..1e4a93835fed 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -52,6 +52,7 @@ int
52ibm_partition(struct parsed_partitions *state, struct block_device *bdev) 52ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
53{ 53{
54 int blocksize, offset, size; 54 int blocksize, offset, size;
55 loff_t i_size;
55 dasd_information_t *info; 56 dasd_information_t *info;
56 struct hd_geometry *geo; 57 struct hd_geometry *geo;
57 char type[5] = {0,}; 58 char type[5] = {0,};
@@ -63,6 +64,13 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
63 unsigned char *data; 64 unsigned char *data;
64 Sector sect; 65 Sector sect;
65 66
67 blocksize = bdev_hardsect_size(bdev);
68 if (blocksize <= 0)
69 return 0;
70 i_size = i_size_read(bdev->bd_inode);
71 if (i_size == 0)
72 return 0;
73
66 if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL) 74 if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL)
67 goto out_noinfo; 75 goto out_noinfo;
68 if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) 76 if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
@@ -73,9 +81,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
73 if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || 81 if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
74 ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) 82 ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
75 goto out_noioctl; 83 goto out_noioctl;
76
77 if ((blocksize = bdev_hardsect_size(bdev)) <= 0)
78 goto out_badsect;
79 84
80 /* 85 /*
81 * Get volume label, extract name and type. 86 * Get volume label, extract name and type.
@@ -111,7 +116,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
111 } else { 116 } else {
112 printk("CMS1/%8s:", name); 117 printk("CMS1/%8s:", name);
113 offset = (info->label_block + 1); 118 offset = (info->label_block + 1);
114 size = bdev->bd_inode->i_size >> 9; 119 size = i_size >> 9;
115 } 120 }
116 put_partition(state, 1, offset*(blocksize >> 9), 121 put_partition(state, 1, offset*(blocksize >> 9),
117 size-offset*(blocksize >> 9)); 122 size-offset*(blocksize >> 9));
@@ -168,7 +173,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
168 else 173 else
169 printk("(nonl)/%8s:", name); 174 printk("(nonl)/%8s:", name);
170 offset = (info->label_block + 1); 175 offset = (info->label_block + 1);
171 size = (bdev->bd_inode->i_size >> 9); 176 size = i_size >> 9;
172 put_partition(state, 1, offset*(blocksize >> 9), 177 put_partition(state, 1, offset*(blocksize >> 9),
173 size-offset*(blocksize >> 9)); 178 size-offset*(blocksize >> 9));
174 } 179 }
@@ -180,7 +185,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
180 return 1; 185 return 1;
181 186
182out_readerr: 187out_readerr:
183out_badsect:
184out_noioctl: 188out_noioctl:
185 kfree(label); 189 kfree(label);
186out_nolab: 190out_nolab:
diff --git a/fs/pipe.c b/fs/pipe.c
index d722579df79a..8aada8e426f4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = {
605 .fasync = pipe_rdwr_fasync, 605 .fasync = pipe_rdwr_fasync,
606}; 606};
607 607
608struct file_operations read_pipe_fops = { 608static struct file_operations read_pipe_fops = {
609 .llseek = no_llseek, 609 .llseek = no_llseek,
610 .read = pipe_read, 610 .read = pipe_read,
611 .readv = pipe_readv, 611 .readv = pipe_readv,
@@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = {
617 .fasync = pipe_read_fasync, 617 .fasync = pipe_read_fasync,
618}; 618};
619 619
620struct file_operations write_pipe_fops = { 620static struct file_operations write_pipe_fops = {
621 .llseek = no_llseek, 621 .llseek = no_llseek,
622 .read = bad_pipe_r, 622 .read = bad_pipe_r,
623 .write = pipe_write, 623 .write = pipe_write,
@@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = {
629 .fasync = pipe_write_fasync, 629 .fasync = pipe_write_fasync,
630}; 630};
631 631
632struct file_operations rdwr_pipe_fops = { 632static struct file_operations rdwr_pipe_fops = {
633 .llseek = no_llseek, 633 .llseek = no_llseek,
634 .read = pipe_read, 634 .read = pipe_read,
635 .readv = pipe_readv, 635 .readv = pipe_readv,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6573f31f1fd9..075d3e945602 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -204,10 +204,6 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
204 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 204 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
205 if (!root_inode) 205 if (!root_inode)
206 goto out_no_root; 206 goto out_no_root;
207 /*
208 * Fixup the root inode's nlink value
209 */
210 root_inode->i_nlink += nr_processes();
211 root_inode->i_uid = 0; 207 root_inode->i_uid = 0;
212 root_inode->i_gid = 0; 208 root_inode->i_gid = 0;
213 s->s_root = d_alloc_root(root_inode); 209 s->s_root = d_alloc_root(root_inode);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8f8014285a34..1d24fead51a6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -548,7 +548,7 @@ static int show_stat(struct seq_file *p, void *v)
548 } 548 }
549 seq_printf(p, "intr %llu", (unsigned long long)sum); 549 seq_printf(p, "intr %llu", (unsigned long long)sum);
550 550
551#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) 551#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64)
552 for (i = 0; i < NR_IRQS; i++) 552 for (i = 0; i < NR_IRQS; i++)
553 seq_printf(p, " %u", kstat_irqs(i)); 553 seq_printf(p, " %u", kstat_irqs(i));
554#endif 554#endif
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68896283c8ae..c3fd3611112f 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -80,16 +80,16 @@ void __init proc_root_init(void)
80 proc_bus = proc_mkdir("bus", NULL); 80 proc_bus = proc_mkdir("bus", NULL);
81} 81}
82 82
83static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) 83static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
84)
84{ 85{
85 /* 86 generic_fillattr(dentry->d_inode, stat);
86 * nr_threads is actually protected by the tasklist_lock; 87 stat->nlink = proc_root.nlink + nr_processes();
87 * however, it's conventional to do reads, especially for 88 return 0;
88 * reporting, without any locking whatsoever. 89}
89 */
90 if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */
91 dir->i_nlink = proc_root.nlink + nr_threads;
92 90
91static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
92{
93 if (!proc_lookup(dir, dentry, nd)) { 93 if (!proc_lookup(dir, dentry, nd)) {
94 return NULL; 94 return NULL;
95 } 95 }
@@ -134,6 +134,7 @@ static struct file_operations proc_root_operations = {
134 */ 134 */
135static struct inode_operations proc_root_inode_operations = { 135static struct inode_operations proc_root_inode_operations = {
136 .lookup = proc_root_lookup, 136 .lookup = proc_root_lookup,
137 .getattr = proc_root_getattr,
137}; 138};
138 139
139/* 140/*
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0eaad41f4658..91b7c15ab373 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,7 +204,6 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
204{ 204{
205 pte_t *pte, ptent; 205 pte_t *pte, ptent;
206 spinlock_t *ptl; 206 spinlock_t *ptl;
207 unsigned long pfn;
208 struct page *page; 207 struct page *page;
209 208
210 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 209 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -214,12 +213,12 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
214 continue; 213 continue;
215 214
216 mss->resident += PAGE_SIZE; 215 mss->resident += PAGE_SIZE;
217 pfn = pte_pfn(ptent); 216
218 if (!pfn_valid(pfn)) 217 page = vm_normal_page(vma, addr, ptent);
218 if (!page)
219 continue; 219 continue;
220 220
221 page = pfn_to_page(pfn); 221 if (page_mapcount(page) >= 2) {
222 if (page_count(page) >= 2) {
223 if (pte_dirty(ptent)) 222 if (pte_dirty(ptent))
224 mss->shared_dirty += PAGE_SIZE; 223 mss->shared_dirty += PAGE_SIZE;
225 else 224 else
@@ -289,7 +288,7 @@ static int show_smap(struct seq_file *m, void *v)
289 struct mem_size_stats mss; 288 struct mem_size_stats mss;
290 289
291 memset(&mss, 0, sizeof mss); 290 memset(&mss, 0, sizeof mss);
292 if (vma->vm_mm) 291 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
293 smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); 292 smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
294 return show_map_internal(m, v, &mss); 293 return show_map_internal(m, v, &mss);
295} 294}
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index a4ef91bb4f3b..b4199ec3ece4 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -35,7 +35,7 @@ static int v2_check_quota_file(struct super_block *sb, int type)
35 35
36 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); 36 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
37 if (size != sizeof(struct v2_disk_dqheader)) { 37 if (size != sizeof(struct v2_disk_dqheader)) {
38 printk("quota_v2: failed read expected=%d got=%d\n", 38 printk("quota_v2: failed read expected=%zd got=%zd\n",
39 sizeof(struct v2_disk_dqheader), size); 39 sizeof(struct v2_disk_dqheader), size);
40 return 0; 40 return 0;
41 } 41 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c66bd5e4c05c..14bd2246fb6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/time.h>
30#include <linux/init.h> 31#include <linux/init.h>
31#include <linux/string.h> 32#include <linux/string.h>
32#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
@@ -104,6 +105,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
104 d_instantiate(dentry, inode); 105 d_instantiate(dentry, inode);
105 dget(dentry); /* Extra count - pin the dentry in core */ 106 dget(dentry); /* Extra count - pin the dentry in core */
106 error = 0; 107 error = 0;
108 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
107 } 109 }
108 return error; 110 return error;
109} 111}
@@ -135,6 +137,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
135 inode->i_gid = dir->i_gid; 137 inode->i_gid = dir->i_gid;
136 d_instantiate(dentry, inode); 138 d_instantiate(dentry, inode);
137 dget(dentry); 139 dget(dentry);
140 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
138 } else 141 } else
139 iput(inode); 142 iput(inode);
140 } 143 }
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 9dd71e807034..d71ac6579289 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -150,18 +150,15 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
150 if (d_reclen <= 32) { 150 if (d_reclen <= 32) {
151 local_buf = small_buf; 151 local_buf = small_buf;
152 } else { 152 } else {
153 local_buf = 153 local_buf = kmalloc(d_reclen,
154 reiserfs_kmalloc(d_reclen, GFP_NOFS, 154 GFP_NOFS);
155 inode->i_sb);
156 if (!local_buf) { 155 if (!local_buf) {
157 pathrelse(&path_to_entry); 156 pathrelse(&path_to_entry);
158 ret = -ENOMEM; 157 ret = -ENOMEM;
159 goto out; 158 goto out;
160 } 159 }
161 if (item_moved(&tmp_ih, &path_to_entry)) { 160 if (item_moved(&tmp_ih, &path_to_entry)) {
162 reiserfs_kfree(local_buf, 161 kfree(local_buf);
163 d_reclen,
164 inode->i_sb);
165 goto research; 162 goto research;
166 } 163 }
167 } 164 }
@@ -174,15 +171,12 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
174 (dirent, local_buf, d_reclen, d_off, d_ino, 171 (dirent, local_buf, d_reclen, d_off, d_ino,
175 DT_UNKNOWN) < 0) { 172 DT_UNKNOWN) < 0) {
176 if (local_buf != small_buf) { 173 if (local_buf != small_buf) {
177 reiserfs_kfree(local_buf, 174 kfree(local_buf);
178 d_reclen,
179 inode->i_sb);
180 } 175 }
181 goto end; 176 goto end;
182 } 177 }
183 if (local_buf != small_buf) { 178 if (local_buf != small_buf) {
184 reiserfs_kfree(local_buf, d_reclen, 179 kfree(local_buf);
185 inode->i_sb);
186 } 180 }
187 // next entry should be looked for with such offset 181 // next entry should be looked for with such offset
188 next_pos = deh_offset(deh) + 1; 182 next_pos = deh_offset(deh) + 1;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ad6fa964b0e7..be12879bb179 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -192,6 +192,8 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl
192 192
193 allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * 193 allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
194 sizeof(b_blocknr_t), GFP_NOFS); 194 sizeof(b_blocknr_t), GFP_NOFS);
195 if (!allocated_blocks)
196 return -ENOMEM;
195 197
196 /* First we compose a key to point at the writing position, we want to do 198 /* First we compose a key to point at the writing position, we want to do
197 that outside of any locking region. */ 199 that outside of any locking region. */
@@ -1285,6 +1287,23 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1285 struct reiserfs_transaction_handle th; 1287 struct reiserfs_transaction_handle th;
1286 th.t_trans_id = 0; 1288 th.t_trans_id = 0;
1287 1289
1290 /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
1291 * lying around (most of the disk, in fact). Despite the filesystem
1292 * now being a v3.6 format, the old items still can't support large
1293 * file sizes. Catch this case here, as the rest of the VFS layer is
1294 * oblivious to the different limitations between old and new items.
1295 * reiserfs_setattr catches this for truncates. This chunk is lifted
1296 * from generic_write_checks. */
1297 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1298 *ppos + count > MAX_NON_LFS) {
1299 if (*ppos >= MAX_NON_LFS) {
1300 send_sig(SIGXFSZ, current, 0);
1301 return -EFBIG;
1302 }
1303 if (count > MAX_NON_LFS - (unsigned long)*ppos)
1304 count = MAX_NON_LFS - (unsigned long)*ppos;
1305 }
1306
1288 if (file->f_flags & O_DIRECT) { // Direct IO needs treatment 1307 if (file->f_flags & O_DIRECT) { // Direct IO needs treatment
1289 ssize_t result, after_file_end = 0; 1308 ssize_t result, after_file_end = 0;
1290 if ((*ppos + count >= inode->i_size) 1309 if ((*ppos + count >= inode->i_size)
@@ -1445,13 +1464,11 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1445 partially overwritten pages, if needed. And lock the pages, 1464 partially overwritten pages, if needed. And lock the pages,
1446 so that nobody else can access these until we are done. 1465 so that nobody else can access these until we are done.
1447 We get number of actual blocks needed as a result. */ 1466 We get number of actual blocks needed as a result. */
1448 blocks_to_allocate = 1467 res = reiserfs_prepare_file_region_for_write(inode, pos,
1449 reiserfs_prepare_file_region_for_write(inode, pos, 1468 num_pages,
1450 num_pages, 1469 write_bytes,
1451 write_bytes, 1470 prepared_pages);
1452 prepared_pages); 1471 if (res < 0) {
1453 if (blocks_to_allocate < 0) {
1454 res = blocks_to_allocate;
1455 reiserfs_release_claimed_blocks(inode->i_sb, 1472 reiserfs_release_claimed_blocks(inode->i_sb,
1456 num_pages << 1473 num_pages <<
1457 (PAGE_CACHE_SHIFT - 1474 (PAGE_CACHE_SHIFT -
@@ -1459,6 +1476,8 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1459 break; 1476 break;
1460 } 1477 }
1461 1478
1479 blocks_to_allocate = res;
1480
1462 /* First we correct our estimate of how many blocks we need */ 1481 /* First we correct our estimate of how many blocks we need */
1463 reiserfs_release_claimed_blocks(inode->i_sb, 1482 reiserfs_release_claimed_blocks(inode->i_sb,
1464 (num_pages << 1483 (num_pages <<
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 45829889dcdc..aa22588019ec 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2021,38 +2021,6 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
2021 return CARRY_ON; 2021 return CARRY_ON;
2022} 2022}
2023 2023
2024#ifdef CONFIG_REISERFS_CHECK
2025void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s)
2026{
2027 void *vp;
2028 static size_t malloced;
2029
2030 vp = kmalloc(size, flags);
2031 if (vp) {
2032 REISERFS_SB(s)->s_kmallocs += size;
2033 if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
2034 reiserfs_warning(s,
2035 "vs-8301: reiserfs_kmalloc: allocated memory %d",
2036 REISERFS_SB(s)->s_kmallocs);
2037 malloced = REISERFS_SB(s)->s_kmallocs;
2038 }
2039 }
2040 return vp;
2041}
2042
2043void reiserfs_kfree(const void *vp, size_t size, struct super_block *s)
2044{
2045 kfree(vp);
2046
2047 REISERFS_SB(s)->s_kmallocs -= size;
2048 if (REISERFS_SB(s)->s_kmallocs < 0)
2049 reiserfs_warning(s,
2050 "vs-8302: reiserfs_kfree: allocated memory %d",
2051 REISERFS_SB(s)->s_kmallocs);
2052
2053}
2054#endif
2055
2056static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) 2024static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2057{ 2025{
2058 int max_num_of_items; 2026 int max_num_of_items;
@@ -2086,7 +2054,7 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
2086 /* we have to allocate more memory for virtual node */ 2054 /* we have to allocate more memory for virtual node */
2087 if (tb->vn_buf) { 2055 if (tb->vn_buf) {
2088 /* free memory allocated before */ 2056 /* free memory allocated before */
2089 reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); 2057 kfree(tb->vn_buf);
2090 /* this is not needed if kfree is atomic */ 2058 /* this is not needed if kfree is atomic */
2091 check_fs = 1; 2059 check_fs = 1;
2092 } 2060 }
@@ -2095,24 +2063,15 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
2095 tb->vn_buf_size = size; 2063 tb->vn_buf_size = size;
2096 2064
2097 /* get memory for virtual item */ 2065 /* get memory for virtual item */
2098 buf = 2066 buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2099 reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN,
2100 tb->tb_sb);
2101 if (!buf) { 2067 if (!buf) {
2102 /* getting memory with GFP_KERNEL priority may involve 2068 /* getting memory with GFP_KERNEL priority may involve
2103 balancing now (due to indirect_to_direct conversion on 2069 balancing now (due to indirect_to_direct conversion on
2104 dcache shrinking). So, release path and collected 2070 dcache shrinking). So, release path and collected
2105 resources here */ 2071 resources here */
2106 free_buffers_in_tb(tb); 2072 free_buffers_in_tb(tb);
2107 buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); 2073 buf = kmalloc(size, GFP_NOFS);
2108 if (!buf) { 2074 if (!buf) {
2109#ifdef CONFIG_REISERFS_CHECK
2110 reiserfs_warning(tb->tb_sb,
2111 "vs-8345: get_mem_for_virtual_node: "
2112 "kmalloc failed. reiserfs kmalloced %d bytes",
2113 REISERFS_SB(tb->tb_sb)->
2114 s_kmallocs);
2115#endif
2116 tb->vn_buf_size = 0; 2075 tb->vn_buf_size = 0;
2117 } 2076 }
2118 tb->vn_buf = buf; 2077 tb->vn_buf = buf;
@@ -2619,7 +2578,6 @@ void unfix_nodes(struct tree_balance *tb)
2619 } 2578 }
2620 } 2579 }
2621 2580
2622 if (tb->vn_buf) 2581 kfree(tb->vn_buf);
2623 reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2624 2582
2625} 2583}
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index a3ec238fd9e0..e664ac16fad9 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -21,7 +21,6 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/reiserfs_fs.h> 22#include <linux/reiserfs_fs.h>
23#include <asm/types.h> 23#include <asm/types.h>
24#include <asm/bug.h>
25 24
26#define DELTA 0x9E3779B9 25#define DELTA 0x9E3779B9
27#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ 26#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ffa34b861bdb..d60f6238c66a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -627,11 +627,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
627 reiserfs_write_lock(inode->i_sb); 627 reiserfs_write_lock(inode->i_sb);
628 version = get_inode_item_key_version(inode); 628 version = get_inode_item_key_version(inode);
629 629
630 if (block < 0) {
631 reiserfs_write_unlock(inode->i_sb);
632 return -EIO;
633 }
634
635 if (!file_capable(inode, block)) { 630 if (!file_capable(inode, block)) {
636 reiserfs_write_unlock(inode->i_sb); 631 reiserfs_write_unlock(inode->i_sb);
637 return -EFBIG; 632 return -EFBIG;
@@ -934,12 +929,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
934 //pos_in_item * inode->i_sb->s_blocksize, 929 //pos_in_item * inode->i_sb->s_blocksize,
935 TYPE_INDIRECT, 3); // key type is unimportant 930 TYPE_INDIRECT, 3); // key type is unimportant
936 931
932 RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
933 "green-805: invalid offset");
937 blocks_needed = 934 blocks_needed =
938 1 + 935 1 +
939 ((cpu_key_k_offset(&key) - 936 ((cpu_key_k_offset(&key) -
940 cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> 937 cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
941 s_blocksize_bits); 938 s_blocksize_bits);
942 RFALSE(blocks_needed < 0, "green-805: invalid offset");
943 939
944 if (blocks_needed == 1) { 940 if (blocks_needed == 1) {
945 un = &unf_single; 941 un = &unf_single;
@@ -2363,6 +2359,13 @@ static int reiserfs_write_full_page(struct page *page,
2363 int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; 2359 int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2364 th.t_trans_id = 0; 2360 th.t_trans_id = 0;
2365 2361
2362 /* no logging allowed when nonblocking or from PF_MEMALLOC */
2363 if (checked && (current->flags & PF_MEMALLOC)) {
2364 redirty_page_for_writepage(wbc, page);
2365 unlock_page(page);
2366 return 0;
2367 }
2368
2366 /* The page dirty bit is cleared before writepage is called, which 2369 /* The page dirty bit is cleared before writepage is called, which
2367 * means we have to tell create_empty_buffers to make dirty buffers 2370 * means we have to tell create_empty_buffers to make dirty buffers
2368 * The page really should be up to date at this point, so tossing 2371 * The page really should be up to date at this point, so tossing
@@ -2743,6 +2746,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2743 int ret = 1; 2746 int ret = 1;
2744 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); 2747 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2745 2748
2749 lock_buffer(bh);
2746 spin_lock(&j->j_dirty_buffers_lock); 2750 spin_lock(&j->j_dirty_buffers_lock);
2747 if (!buffer_mapped(bh)) { 2751 if (!buffer_mapped(bh)) {
2748 goto free_jh; 2752 goto free_jh;
@@ -2758,7 +2762,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2758 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { 2762 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2759 ret = 0; 2763 ret = 0;
2760 } 2764 }
2761 } else if (buffer_dirty(bh) || buffer_locked(bh)) { 2765 } else if (buffer_dirty(bh)) {
2762 struct reiserfs_journal_list *jl; 2766 struct reiserfs_journal_list *jl;
2763 struct reiserfs_jh *jh = bh->b_private; 2767 struct reiserfs_jh *jh = bh->b_private;
2764 2768
@@ -2784,6 +2788,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2784 reiserfs_free_jh(bh); 2788 reiserfs_free_jh(bh);
2785 } 2789 }
2786 spin_unlock(&j->j_dirty_buffers_lock); 2790 spin_unlock(&j->j_dirty_buffers_lock);
2791 unlock_buffer(bh);
2787 return ret; 2792 return ret;
2788} 2793}
2789 2794
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4491fcf2a0e6..5a9d2722fa0a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -152,18 +152,16 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
152 struct reiserfs_bitmap_node *bn; 152 struct reiserfs_bitmap_node *bn;
153 static int id; 153 static int id;
154 154
155 bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, 155 bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
156 p_s_sb);
157 if (!bn) { 156 if (!bn) {
158 return NULL; 157 return NULL;
159 } 158 }
160 bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb); 159 bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS);
161 if (!bn->data) { 160 if (!bn->data) {
162 reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); 161 kfree(bn);
163 return NULL; 162 return NULL;
164 } 163 }
165 bn->id = id++; 164 bn->id = id++;
166 memset(bn->data, 0, p_s_sb->s_blocksize);
167 INIT_LIST_HEAD(&bn->list); 165 INIT_LIST_HEAD(&bn->list);
168 return bn; 166 return bn;
169} 167}
@@ -197,8 +195,8 @@ static inline void free_bitmap_node(struct super_block *p_s_sb,
197 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 195 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
198 journal->j_used_bitmap_nodes--; 196 journal->j_used_bitmap_nodes--;
199 if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { 197 if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
200 reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); 198 kfree(bn->data);
201 reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); 199 kfree(bn);
202 } else { 200 } else {
203 list_add(&bn->list, &journal->j_bitmap_nodes); 201 list_add(&bn->list, &journal->j_bitmap_nodes);
204 journal->j_free_bitmap_nodes++; 202 journal->j_free_bitmap_nodes++;
@@ -276,8 +274,8 @@ static int free_bitmap_nodes(struct super_block *p_s_sb)
276 while (next != &journal->j_bitmap_nodes) { 274 while (next != &journal->j_bitmap_nodes) {
277 bn = list_entry(next, struct reiserfs_bitmap_node, list); 275 bn = list_entry(next, struct reiserfs_bitmap_node, list);
278 list_del(next); 276 list_del(next);
279 reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); 277 kfree(bn->data);
280 reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); 278 kfree(bn);
281 next = journal->j_bitmap_nodes.next; 279 next = journal->j_bitmap_nodes.next;
282 journal->j_free_bitmap_nodes--; 280 journal->j_free_bitmap_nodes--;
283 } 281 }
@@ -581,7 +579,7 @@ static inline void put_journal_list(struct super_block *s,
581 jl->j_trans_id, jl->j_refcount); 579 jl->j_trans_id, jl->j_refcount);
582 } 580 }
583 if (--jl->j_refcount == 0) 581 if (--jl->j_refcount == 0)
584 reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); 582 kfree(jl);
585} 583}
586 584
587/* 585/*
@@ -848,6 +846,14 @@ static int write_ordered_buffers(spinlock_t * lock,
848 spin_lock(lock); 846 spin_lock(lock);
849 goto loop_next; 847 goto loop_next;
850 } 848 }
849 /* in theory, dirty non-uptodate buffers should never get here,
850 * but the upper layer io error paths still have a few quirks.
851 * Handle them here as gracefully as we can
852 */
853 if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
854 clear_buffer_dirty(bh);
855 ret = -EIO;
856 }
851 if (buffer_dirty(bh)) { 857 if (buffer_dirty(bh)) {
852 list_del_init(&jh->list); 858 list_del_init(&jh->list);
853 list_add(&jh->list, &tmp); 859 list_add(&jh->list, &tmp);
@@ -879,6 +885,19 @@ static int write_ordered_buffers(spinlock_t * lock,
879 if (!buffer_uptodate(bh)) { 885 if (!buffer_uptodate(bh)) {
880 ret = -EIO; 886 ret = -EIO;
881 } 887 }
888 /* ugly interaction with invalidatepage here.
889 * reiserfs_invalidate_page will pin any buffer that has a valid
890 * journal head from an older transaction. If someone else sets
891 * our buffer dirty after we write it in the first loop, and
892 * then someone truncates the page away, nobody will ever write
893 * the buffer. We're safe if we write the page one last time
894 * after freeing the journal header.
895 */
896 if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
897 spin_unlock(lock);
898 ll_rw_block(WRITE, 1, &bh);
899 spin_lock(lock);
900 }
882 put_bh(bh); 901 put_bh(bh);
883 cond_resched_lock(lock); 902 cond_resched_lock(lock);
884 } 903 }
@@ -977,6 +996,7 @@ static int flush_commit_list(struct super_block *s,
977 struct reiserfs_journal *journal = SB_JOURNAL(s); 996 struct reiserfs_journal *journal = SB_JOURNAL(s);
978 int barrier = 0; 997 int barrier = 0;
979 int retval = 0; 998 int retval = 0;
999 int write_len;
980 1000
981 reiserfs_check_lock_depth(s, "flush_commit_list"); 1001 reiserfs_check_lock_depth(s, "flush_commit_list");
982 1002
@@ -1018,24 +1038,35 @@ static int flush_commit_list(struct super_block *s,
1018 } 1038 }
1019 1039
1020 if (!list_empty(&jl->j_bh_list)) { 1040 if (!list_empty(&jl->j_bh_list)) {
1041 int ret;
1021 unlock_kernel(); 1042 unlock_kernel();
1022 write_ordered_buffers(&journal->j_dirty_buffers_lock, 1043 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1023 journal, jl, &jl->j_bh_list); 1044 journal, jl, &jl->j_bh_list);
1045 if (ret < 0 && retval == 0)
1046 retval = ret;
1024 lock_kernel(); 1047 lock_kernel();
1025 } 1048 }
1026 BUG_ON(!list_empty(&jl->j_bh_list)); 1049 BUG_ON(!list_empty(&jl->j_bh_list));
1027 /* 1050 /*
1028 * for the description block and all the log blocks, submit any buffers 1051 * for the description block and all the log blocks, submit any buffers
1029 * that haven't already reached the disk 1052 * that haven't already reached the disk. Try to write at least 256
1053 * log blocks. later on, we will only wait on blocks that correspond
1054 * to this transaction, but while we're unplugging we might as well
1055 * get a chunk of data on there.
1030 */ 1056 */
1031 atomic_inc(&journal->j_async_throttle); 1057 atomic_inc(&journal->j_async_throttle);
1032 for (i = 0; i < (jl->j_len + 1); i++) { 1058 write_len = jl->j_len + 1;
1059 if (write_len < 256)
1060 write_len = 256;
1061 for (i = 0 ; i < write_len ; i++) {
1033 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % 1062 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
1034 SB_ONDISK_JOURNAL_SIZE(s); 1063 SB_ONDISK_JOURNAL_SIZE(s);
1035 tbh = journal_find_get_block(s, bn); 1064 tbh = journal_find_get_block(s, bn);
1036 if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ 1065 if (tbh) {
1037 ll_rw_block(SWRITE, 1, &tbh); 1066 if (buffer_dirty(tbh))
1038 put_bh(tbh); 1067 ll_rw_block(WRITE, 1, &tbh) ;
1068 put_bh(tbh) ;
1069 }
1039 } 1070 }
1040 atomic_dec(&journal->j_async_throttle); 1071 atomic_dec(&journal->j_async_throttle);
1041 1072
@@ -1818,8 +1849,7 @@ void remove_journal_hash(struct super_block *sb,
1818static void free_journal_ram(struct super_block *p_s_sb) 1849static void free_journal_ram(struct super_block *p_s_sb)
1819{ 1850{
1820 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 1851 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
1821 reiserfs_kfree(journal->j_current_jl, 1852 kfree(journal->j_current_jl);
1822 sizeof(struct reiserfs_journal_list), p_s_sb);
1823 journal->j_num_lists--; 1853 journal->j_num_lists--;
1824 1854
1825 vfree(journal->j_cnode_free_orig); 1855 vfree(journal->j_cnode_free_orig);
@@ -2093,21 +2123,15 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2093 } 2123 }
2094 trans_id = get_desc_trans_id(desc); 2124 trans_id = get_desc_trans_id(desc);
2095 /* now we know we've got a good transaction, and it was inside the valid time ranges */ 2125 /* now we know we've got a good transaction, and it was inside the valid time ranges */
2096 log_blocks = 2126 log_blocks = kmalloc(get_desc_trans_len(desc) *
2097 reiserfs_kmalloc(get_desc_trans_len(desc) * 2127 sizeof(struct buffer_head *), GFP_NOFS);
2098 sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); 2128 real_blocks = kmalloc(get_desc_trans_len(desc) *
2099 real_blocks = 2129 sizeof(struct buffer_head *), GFP_NOFS);
2100 reiserfs_kmalloc(get_desc_trans_len(desc) *
2101 sizeof(struct buffer_head *), GFP_NOFS, p_s_sb);
2102 if (!log_blocks || !real_blocks) { 2130 if (!log_blocks || !real_blocks) {
2103 brelse(c_bh); 2131 brelse(c_bh);
2104 brelse(d_bh); 2132 brelse(d_bh);
2105 reiserfs_kfree(log_blocks, 2133 kfree(log_blocks);
2106 get_desc_trans_len(desc) * 2134 kfree(real_blocks);
2107 sizeof(struct buffer_head *), p_s_sb);
2108 reiserfs_kfree(real_blocks,
2109 get_desc_trans_len(desc) *
2110 sizeof(struct buffer_head *), p_s_sb);
2111 reiserfs_warning(p_s_sb, 2135 reiserfs_warning(p_s_sb,
2112 "journal-1169: kmalloc failed, unable to mount FS"); 2136 "journal-1169: kmalloc failed, unable to mount FS");
2113 return -1; 2137 return -1;
@@ -2145,12 +2169,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2145 brelse_array(real_blocks, i); 2169 brelse_array(real_blocks, i);
2146 brelse(c_bh); 2170 brelse(c_bh);
2147 brelse(d_bh); 2171 brelse(d_bh);
2148 reiserfs_kfree(log_blocks, 2172 kfree(log_blocks);
2149 get_desc_trans_len(desc) * 2173 kfree(real_blocks);
2150 sizeof(struct buffer_head *), p_s_sb);
2151 reiserfs_kfree(real_blocks,
2152 get_desc_trans_len(desc) *
2153 sizeof(struct buffer_head *), p_s_sb);
2154 return -1; 2174 return -1;
2155 } 2175 }
2156 } 2176 }
@@ -2166,12 +2186,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2166 brelse_array(real_blocks, get_desc_trans_len(desc)); 2186 brelse_array(real_blocks, get_desc_trans_len(desc));
2167 brelse(c_bh); 2187 brelse(c_bh);
2168 brelse(d_bh); 2188 brelse(d_bh);
2169 reiserfs_kfree(log_blocks, 2189 kfree(log_blocks);
2170 get_desc_trans_len(desc) * 2190 kfree(real_blocks);
2171 sizeof(struct buffer_head *), p_s_sb);
2172 reiserfs_kfree(real_blocks,
2173 get_desc_trans_len(desc) *
2174 sizeof(struct buffer_head *), p_s_sb);
2175 return -1; 2191 return -1;
2176 } 2192 }
2177 memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, 2193 memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
@@ -2193,12 +2209,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2193 get_desc_trans_len(desc) - i); 2209 get_desc_trans_len(desc) - i);
2194 brelse(c_bh); 2210 brelse(c_bh);
2195 brelse(d_bh); 2211 brelse(d_bh);
2196 reiserfs_kfree(log_blocks, 2212 kfree(log_blocks);
2197 get_desc_trans_len(desc) * 2213 kfree(real_blocks);
2198 sizeof(struct buffer_head *), p_s_sb);
2199 reiserfs_kfree(real_blocks,
2200 get_desc_trans_len(desc) *
2201 sizeof(struct buffer_head *), p_s_sb);
2202 return -1; 2214 return -1;
2203 } 2215 }
2204 brelse(real_blocks[i]); 2216 brelse(real_blocks[i]);
@@ -2217,12 +2229,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2217 journal->j_trans_id = trans_id + 1; 2229 journal->j_trans_id = trans_id + 1;
2218 brelse(c_bh); 2230 brelse(c_bh);
2219 brelse(d_bh); 2231 brelse(d_bh);
2220 reiserfs_kfree(log_blocks, 2232 kfree(log_blocks);
2221 le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), 2233 kfree(real_blocks);
2222 p_s_sb);
2223 reiserfs_kfree(real_blocks,
2224 le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *),
2225 p_s_sb);
2226 return 0; 2234 return 0;
2227} 2235}
2228 2236
@@ -2311,8 +2319,7 @@ static int journal_read(struct super_block *p_s_sb)
2311 return 1; 2319 return 1;
2312 } 2320 }
2313 jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); 2321 jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
2314 if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && 2322 if (le32_to_cpu(jh->j_first_unflushed_offset) <
2315 le32_to_cpu(jh->j_first_unflushed_offset) <
2316 SB_ONDISK_JOURNAL_SIZE(p_s_sb) 2323 SB_ONDISK_JOURNAL_SIZE(p_s_sb)
2317 && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { 2324 && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
2318 oldest_start = 2325 oldest_start =
@@ -2471,14 +2478,8 @@ static int journal_read(struct super_block *p_s_sb)
2471static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) 2478static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
2472{ 2479{
2473 struct reiserfs_journal_list *jl; 2480 struct reiserfs_journal_list *jl;
2474 retry: 2481 jl = kzalloc(sizeof(struct reiserfs_journal_list),
2475 jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, 2482 GFP_NOFS | __GFP_NOFAIL);
2476 s);
2477 if (!jl) {
2478 yield();
2479 goto retry;
2480 }
2481 memset(jl, 0, sizeof(*jl));
2482 INIT_LIST_HEAD(&jl->j_list); 2483 INIT_LIST_HEAD(&jl->j_list);
2483 INIT_LIST_HEAD(&jl->j_working_list); 2484 INIT_LIST_HEAD(&jl->j_working_list);
2484 INIT_LIST_HEAD(&jl->j_tail_bh_list); 2485 INIT_LIST_HEAD(&jl->j_tail_bh_list);
@@ -2821,6 +2822,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2821 journal->j_cnode_free < (journal->j_trans_max * 3)) { 2822 journal->j_cnode_free < (journal->j_trans_max * 3)) {
2822 return 1; 2823 return 1;
2823 } 2824 }
2825 /* protected by the BKL here */
2826 journal->j_len_alloc += new_alloc;
2827 th->t_blocks_allocated += new_alloc ;
2824 return 0; 2828 return 0;
2825} 2829}
2826 2830
@@ -3042,14 +3046,12 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
3042 } 3046 }
3043 return th; 3047 return th;
3044 } 3048 }
3045 th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), 3049 th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
3046 GFP_NOFS, s);
3047 if (!th) 3050 if (!th)
3048 return NULL; 3051 return NULL;
3049 ret = journal_begin(th, s, nblocks); 3052 ret = journal_begin(th, s, nblocks);
3050 if (ret) { 3053 if (ret) {
3051 reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), 3054 kfree(th);
3052 s);
3053 return NULL; 3055 return NULL;
3054 } 3056 }
3055 3057
@@ -3067,8 +3069,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
3067 ret = -EIO; 3069 ret = -EIO;
3068 if (th->t_refcount == 0) { 3070 if (th->t_refcount == 0) {
3069 SB_JOURNAL(s)->j_persistent_trans--; 3071 SB_JOURNAL(s)->j_persistent_trans--;
3070 reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), 3072 kfree(th);
3071 s);
3072 } 3073 }
3073 return ret; 3074 return ret;
3074} 3075}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 8f8d8d01107c..284f7852de8b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -247,7 +247,7 @@ static int linear_search_in_dir_item(struct cpu_key *key,
247 /* mark, that this generation number is used */ 247 /* mark, that this generation number is used */
248 if (de->de_gen_number_bit_string) 248 if (de->de_gen_number_bit_string)
249 set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), 249 set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
250 (unsigned long *)de->de_gen_number_bit_string); 250 de->de_gen_number_bit_string);
251 251
252 // calculate pointer to name and namelen 252 // calculate pointer to name and namelen
253 de->de_entry_num = i; 253 de->de_entry_num = i;
@@ -431,7 +431,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
431 struct reiserfs_de_head *deh; 431 struct reiserfs_de_head *deh;
432 INITIALIZE_PATH(path); 432 INITIALIZE_PATH(path);
433 struct reiserfs_dir_entry de; 433 struct reiserfs_dir_entry de;
434 int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; 434 DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
435 int gen_number; 435 int gen_number;
436 char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc 436 char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc
437 if we create file with short name */ 437 if we create file with short name */
@@ -456,7 +456,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
456 /* get memory for composing the entry */ 456 /* get memory for composing the entry */
457 buflen = DEH_SIZE + ROUND_UP(namelen); 457 buflen = DEH_SIZE + ROUND_UP(namelen);
458 if (buflen > sizeof(small_buf)) { 458 if (buflen > sizeof(small_buf)) {
459 buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb); 459 buffer = kmalloc(buflen, GFP_NOFS);
460 if (buffer == 0) 460 if (buffer == 0)
461 return -ENOMEM; 461 return -ENOMEM;
462 } else 462 } else
@@ -486,11 +486,11 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
486 486
487 /* find the proper place for the new entry */ 487 /* find the proper place for the new entry */
488 memset(bit_string, 0, sizeof(bit_string)); 488 memset(bit_string, 0, sizeof(bit_string));
489 de.de_gen_number_bit_string = (char *)bit_string; 489 de.de_gen_number_bit_string = bit_string;
490 retval = reiserfs_find_entry(dir, name, namelen, &path, &de); 490 retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
491 if (retval != NAME_NOT_FOUND) { 491 if (retval != NAME_NOT_FOUND) {
492 if (buffer != small_buf) 492 if (buffer != small_buf)
493 reiserfs_kfree(buffer, buflen, dir->i_sb); 493 kfree(buffer);
494 pathrelse(&path); 494 pathrelse(&path);
495 495
496 if (retval == IO_ERROR) { 496 if (retval == IO_ERROR) {
@@ -508,14 +508,14 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
508 } 508 }
509 509
510 gen_number = 510 gen_number =
511 find_first_zero_bit((unsigned long *)bit_string, 511 find_first_zero_bit(bit_string,
512 MAX_GENERATION_NUMBER + 1); 512 MAX_GENERATION_NUMBER + 1);
513 if (gen_number > MAX_GENERATION_NUMBER) { 513 if (gen_number > MAX_GENERATION_NUMBER) {
514 /* there is no free generation number */ 514 /* there is no free generation number */
515 reiserfs_warning(dir->i_sb, 515 reiserfs_warning(dir->i_sb,
516 "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); 516 "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
517 if (buffer != small_buf) 517 if (buffer != small_buf)
518 reiserfs_kfree(buffer, buflen, dir->i_sb); 518 kfree(buffer);
519 pathrelse(&path); 519 pathrelse(&path);
520 return -EBUSY; 520 return -EBUSY;
521 } 521 }
@@ -535,7 +535,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
535 &entry_key); 535 &entry_key);
536 536
537 if (buffer != small_buf) 537 if (buffer != small_buf)
538 reiserfs_kfree(buffer, buflen, dir->i_sb); 538 kfree(buffer);
539 pathrelse(&path); 539 pathrelse(&path);
540 return -EBUSY; 540 return -EBUSY;
541 } 541 }
@@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
546 reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, 546 reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
547 paste_size); 547 paste_size);
548 if (buffer != small_buf) 548 if (buffer != small_buf)
549 reiserfs_kfree(buffer, buflen, dir->i_sb); 549 kfree(buffer);
550 if (retval) { 550 if (retval) {
551 reiserfs_check_path(&path); 551 reiserfs_check_path(&path);
552 return retval; 552 return retval;
@@ -1065,7 +1065,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
1065 goto out_failed; 1065 goto out_failed;
1066 } 1066 }
1067 1067
1068 name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb); 1068 name = kmalloc(item_len, GFP_NOFS);
1069 if (!name) { 1069 if (!name) {
1070 drop_new_inode(inode); 1070 drop_new_inode(inode);
1071 retval = -ENOMEM; 1071 retval = -ENOMEM;
@@ -1079,14 +1079,14 @@ static int reiserfs_symlink(struct inode *parent_dir,
1079 retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); 1079 retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
1080 if (retval) { 1080 if (retval) {
1081 drop_new_inode(inode); 1081 drop_new_inode(inode);
1082 reiserfs_kfree(name, item_len, parent_dir->i_sb); 1082 kfree(name);
1083 goto out_failed; 1083 goto out_failed;
1084 } 1084 }
1085 1085
1086 retval = 1086 retval =
1087 reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), 1087 reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
1088 dentry, inode); 1088 dentry, inode);
1089 reiserfs_kfree(name, item_len, parent_dir->i_sb); 1089 kfree(name);
1090 if (retval) { /* reiserfs_new_inode iputs for us */ 1090 if (retval) { /* reiserfs_new_inode iputs for us */
1091 goto out_failed; 1091 goto out_failed;
1092 } 1092 }
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index fc2f43c75df4..ef6caed9336b 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -88,7 +88,6 @@ static int show_super(struct seq_file *m, struct super_block *sb)
88 seq_printf(m, "state: \t%s\n" 88 seq_printf(m, "state: \t%s\n"
89 "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" 89 "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
90 "gen. counter: \t%i\n" 90 "gen. counter: \t%i\n"
91 "s_kmallocs: \t%i\n"
92 "s_disk_reads: \t%i\n" 91 "s_disk_reads: \t%i\n"
93 "s_disk_writes: \t%i\n" 92 "s_disk_writes: \t%i\n"
94 "s_fix_nodes: \t%i\n" 93 "s_fix_nodes: \t%i\n"
@@ -128,7 +127,7 @@ static int show_super(struct seq_file *m, struct super_block *sb)
128 "SMALL_TAILS " : "NO_TAILS ", 127 "SMALL_TAILS " : "NO_TAILS ",
129 replay_only(sb) ? "REPLAY_ONLY " : "", 128 replay_only(sb) ? "REPLAY_ONLY " : "",
130 convert_reiserfs(sb) ? "CONV " : "", 129 convert_reiserfs(sb) ? "CONV " : "",
131 atomic_read(&r->s_generation_counter), SF(s_kmallocs), 130 atomic_read(&r->s_generation_counter),
132 SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), 131 SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
133 SF(s_do_balance), SF(s_unneeded_left_neighbor), 132 SF(s_do_balance), SF(s_unneeded_left_neighbor),
134 SF(s_good_search_by_key_reada), SF(s_bmaps), 133 SF(s_good_search_by_key_reada), SF(s_bmaps),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 397d9590c8f2..d63da756eb49 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -472,12 +472,6 @@ static void reiserfs_put_super(struct super_block *s)
472 472
473 print_statistics(s); 473 print_statistics(s);
474 474
475 if (REISERFS_SB(s)->s_kmallocs != 0) {
476 reiserfs_warning(s,
477 "vs-2004: reiserfs_put_super: allocated memory left %d",
478 REISERFS_SB(s)->s_kmallocs);
479 }
480
481 if (REISERFS_SB(s)->reserved_blocks != 0) { 475 if (REISERFS_SB(s)->reserved_blocks != 0) {
482 reiserfs_warning(s, 476 reiserfs_warning(s,
483 "green-2005: reiserfs_put_super: reserved blocks left %d", 477 "green-2005: reiserfs_put_super: reserved blocks left %d",
@@ -1130,8 +1124,6 @@ static void handle_attrs(struct super_block *s)
1130 "reiserfs: cannot support attributes until flag is set in super-block"); 1124 "reiserfs: cannot support attributes until flag is set in super-block");
1131 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); 1125 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
1132 } 1126 }
1133 } else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) {
1134 REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS;
1135 } 1127 }
1136} 1128}
1137 1129
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index cc061bfd437b..ffb79c48c5bf 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -368,15 +368,13 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
368 if (d_reclen <= 32) { 368 if (d_reclen <= 32) {
369 local_buf = small_buf; 369 local_buf = small_buf;
370 } else { 370 } else {
371 local_buf = 371 local_buf = kmalloc(d_reclen, GFP_NOFS);
372 reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb);
373 if (!local_buf) { 372 if (!local_buf) {
374 pathrelse(&path_to_entry); 373 pathrelse(&path_to_entry);
375 return -ENOMEM; 374 return -ENOMEM;
376 } 375 }
377 if (item_moved(&tmp_ih, &path_to_entry)) { 376 if (item_moved(&tmp_ih, &path_to_entry)) {
378 reiserfs_kfree(local_buf, d_reclen, 377 kfree(local_buf);
379 inode->i_sb);
380 378
381 /* sigh, must retry. Do this same offset again */ 379 /* sigh, must retry. Do this same offset again */
382 next_pos = d_off; 380 next_pos = d_off;
@@ -399,13 +397,12 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
399 if (filldir(dirent, local_buf, d_reclen, d_off, d_ino, 397 if (filldir(dirent, local_buf, d_reclen, d_off, d_ino,
400 DT_UNKNOWN) < 0) { 398 DT_UNKNOWN) < 0) {
401 if (local_buf != small_buf) { 399 if (local_buf != small_buf) {
402 reiserfs_kfree(local_buf, d_reclen, 400 kfree(local_buf);
403 inode->i_sb);
404 } 401 }
405 goto end; 402 goto end;
406 } 403 }
407 if (local_buf != small_buf) { 404 if (local_buf != small_buf) {
408 reiserfs_kfree(local_buf, d_reclen, inode->i_sb); 405 kfree(local_buf);
409 } 406 }
410 } /* while */ 407 } /* while */
411 408
@@ -1322,109 +1319,44 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1322 return err; 1319 return err;
1323} 1320}
1324 1321
1325static int 1322static int reiserfs_check_acl(struct inode *inode, int mask)
1326__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
1327 int need_lock)
1328{ 1323{
1329 umode_t mode = inode->i_mode; 1324 struct posix_acl *acl;
1330 1325 int error = -EAGAIN; /* do regular unix permission checks by default */
1331 if (mask & MAY_WRITE) {
1332 /*
1333 * Nobody gets write access to a read-only fs.
1334 */
1335 if (IS_RDONLY(inode) &&
1336 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1337 return -EROFS;
1338 1326
1339 /* 1327 reiserfs_read_lock_xattr_i(inode);
1340 * Nobody gets write access to an immutable file. 1328 reiserfs_read_lock_xattrs(inode->i_sb);
1341 */
1342 if (IS_IMMUTABLE(inode))
1343 return -EACCES;
1344 }
1345 1329
1346 /* We don't do permission checks on the internal objects. 1330 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
1347 * Permissions are determined by the "owning" object. */
1348 if (is_reiserfs_priv_object(inode))
1349 return 0;
1350 1331
1351 if (current->fsuid == inode->i_uid) { 1332 reiserfs_read_unlock_xattrs(inode->i_sb);
1352 mode >>= 6; 1333 reiserfs_read_unlock_xattr_i(inode);
1353#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1354 } else if (reiserfs_posixacl(inode->i_sb) &&
1355 get_inode_sd_version(inode) != STAT_DATA_V1) {
1356 struct posix_acl *acl;
1357
1358 /* ACL can't contain additional permissions if
1359 the ACL_MASK entry is 0 */
1360 if (!(mode & S_IRWXG))
1361 goto check_groups;
1362
1363 if (need_lock) {
1364 reiserfs_read_lock_xattr_i(inode);
1365 reiserfs_read_lock_xattrs(inode->i_sb);
1366 }
1367 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
1368 if (need_lock) {
1369 reiserfs_read_unlock_xattrs(inode->i_sb);
1370 reiserfs_read_unlock_xattr_i(inode);
1371 }
1372 if (IS_ERR(acl)) {
1373 if (PTR_ERR(acl) == -ENODATA)
1374 goto check_groups;
1375 return PTR_ERR(acl);
1376 }
1377 1334
1378 if (acl) { 1335 if (acl) {
1379 int err = posix_acl_permission(inode, acl, mask); 1336 if (!IS_ERR(acl)) {
1337 error = posix_acl_permission(inode, acl, mask);
1380 posix_acl_release(acl); 1338 posix_acl_release(acl);
1381 if (err == -EACCES) { 1339 } else if (PTR_ERR(acl) != -ENODATA)
1382 goto check_capabilities; 1340 error = PTR_ERR(acl);
1383 }
1384 return err;
1385 } else {
1386 goto check_groups;
1387 }
1388#endif
1389 } else {
1390 check_groups:
1391 if (in_group_p(inode->i_gid))
1392 mode >>= 3;
1393 } 1341 }
1394 1342
1395 /* 1343 return error;
1396 * If the DACs are ok we don't need any capability check. 1344}
1397 */
1398 if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask))
1399 return 0;
1400 1345
1401 check_capabilities: 1346int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
1347{
1402 /* 1348 /*
1403 * Read/write DACs are always overridable. 1349 * We don't do permission checks on the internal objects.
1404 * Executable DACs are overridable if at least one exec bit is set. 1350 * Permissions are determined by the "owning" object.
1405 */ 1351 */
1406 if (!(mask & MAY_EXEC) || 1352 if (is_reiserfs_priv_object(inode))
1407 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) 1353 return 0;
1408 if (capable(CAP_DAC_OVERRIDE))
1409 return 0;
1410 1354
1411 /* 1355 /*
1412 * Searching includes executable on directories, else just read. 1356 * Stat data v1 doesn't support ACLs.
1413 */ 1357 */
1414 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 1358 if (get_inode_sd_version(inode) == STAT_DATA_V1)
1415 if (capable(CAP_DAC_READ_SEARCH)) 1359 return generic_permission(inode, mask, NULL);
1416 return 0; 1360 else
1417 1361 return generic_permission(inode, mask, reiserfs_check_acl);
1418 return -EACCES;
1419}
1420
1421int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
1422{
1423 return __reiserfs_permission(inode, mask, nd, 1);
1424}
1425
1426int
1427reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd)
1428{
1429 return __reiserfs_permission(inode, mask, nd, 0);
1430} 1362}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 43de3ba83332..ab8894c3b9e5 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -228,7 +228,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
228 acl = ERR_PTR(retval); 228 acl = ERR_PTR(retval);
229 } else { 229 } else {
230 acl = posix_acl_from_disk(value, retval); 230 acl = posix_acl_from_disk(value, retval);
231 *p_acl = posix_acl_dup(acl); 231 if (!IS_ERR(acl))
232 *p_acl = posix_acl_dup(acl);
232 } 233 }
233 234
234 kfree(value); 235 kfree(value);
diff --git a/fs/select.c b/fs/select.c
index c0f02d36c60e..1815a57d2255 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
398 ret = core_sys_select(n, inp, outp, exp, &timeout); 398 ret = core_sys_select(n, inp, outp, exp, &timeout);
399 399
400 if (tvp) { 400 if (tvp) {
401 struct timeval rtv;
402
401 if (current->personality & STICKY_TIMEOUTS) 403 if (current->personality & STICKY_TIMEOUTS)
402 goto sticky; 404 goto sticky;
403 tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); 405 rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
404 tv.tv_sec = timeout; 406 rtv.tv_sec = timeout;
405 if (copy_to_user(tvp, &tv, sizeof(tv))) { 407 if (timeval_compare(&rtv, &tv) >= 0)
408 rtv = tv;
409 if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
406sticky: 410sticky:
407 /* 411 /*
408 * If an application puts its timeval in read-only 412 * If an application puts its timeval in read-only
@@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
460 ret = core_sys_select(n, inp, outp, exp, &timeout); 464 ret = core_sys_select(n, inp, outp, exp, &timeout);
461 465
462 if (tsp) { 466 if (tsp) {
467 struct timespec rts;
468
463 if (current->personality & STICKY_TIMEOUTS) 469 if (current->personality & STICKY_TIMEOUTS)
464 goto sticky; 470 goto sticky;
465 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; 471 rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
466 ts.tv_sec = timeout; 472 1000;
467 if (copy_to_user(tsp, &ts, sizeof(ts))) { 473 rts.tv_sec = timeout;
474 if (timespec_compare(&rts, &ts) >= 0)
475 rts = ts;
476 if (copy_to_user(tsp, &rts, sizeof(rts))) {
468sticky: 477sticky:
469 /* 478 /*
470 * If an application puts its timeval in read-only 479 * If an application puts its timeval in read-only
@@ -510,9 +519,9 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
510 519
511 if (sig) { 520 if (sig) {
512 if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) 521 if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
513 || __get_user(up, (sigset_t * __user *)sig) 522 || __get_user(up, (sigset_t __user * __user *)sig)
514 || __get_user(sigsetsize, 523 || __get_user(sigsetsize,
515 (size_t * __user)(sig+sizeof(void *)))) 524 (size_t __user *)(sig+sizeof(void *))))
516 return -EFAULT; 525 return -EFAULT;
517 } 526 }
518 527
@@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
758 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 767 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
759 768
760 if (tsp && timeout >= 0) { 769 if (tsp && timeout >= 0) {
770 struct timespec rts;
771
761 if (current->personality & STICKY_TIMEOUTS) 772 if (current->personality & STICKY_TIMEOUTS)
762 goto sticky; 773 goto sticky;
763 /* Yes, we know it's actually an s64, but it's also positive. */ 774 /* Yes, we know it's actually an s64, but it's also positive. */
764 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; 775 rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
765 ts.tv_sec = timeout; 776 1000;
766 if (copy_to_user(tsp, &ts, sizeof(ts))) { 777 rts.tv_sec = timeout;
778 if (timespec_compare(&rts, &ts) >= 0)
779 rts = ts;
780 if (copy_to_user(tsp, &rts, sizeof(rts))) {
767 sticky: 781 sticky:
768 /* 782 /*
769 * If an application puts its timeval in read-only 783 * If an application puts its timeval in read-only
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index c6c33e15143a..0424d06b147e 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -209,6 +209,8 @@ init_cache:
209 ctl.valid = 1; 209 ctl.valid = 1;
210read_really: 210read_really:
211 result = server->ops->readdir(filp, dirent, filldir, &ctl); 211 result = server->ops->readdir(filp, dirent, filldir, &ctl);
212 if (result == -ERESTARTSYS && page)
213 ClearPageUptodate(page);
212 if (ctl.idx == -1) 214 if (ctl.idx == -1)
213 goto invalid_cache; /* retry */ 215 goto invalid_cache; /* retry */
214 ctl.head.end = ctl.fpos - 1; 216 ctl.head.end = ctl.fpos - 1;
@@ -217,7 +219,8 @@ finished:
217 if (page) { 219 if (page) {
218 cache->head = ctl.head; 220 cache->head = ctl.head;
219 kunmap(page); 221 kunmap(page);
220 SetPageUptodate(page); 222 if (result != -ERESTARTSYS)
223 SetPageUptodate(page);
221 unlock_page(page); 224 unlock_page(page);
222 page_cache_release(page); 225 page_cache_release(page);
223 } 226 }
diff --git a/fs/stat.c b/fs/stat.c
index 24211b030f39..9948cc1685a4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -261,6 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
261 return error; 261 return error;
262} 262}
263 263
264#ifndef __ARCH_WANT_STAT64
264asmlinkage long sys_newfstatat(int dfd, char __user *filename, 265asmlinkage long sys_newfstatat(int dfd, char __user *filename,
265 struct stat __user *statbuf, int flag) 266 struct stat __user *statbuf, int flag)
266{ 267{
@@ -281,6 +282,7 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename,
281out: 282out:
282 return error; 283 return error;
283} 284}
285#endif
284 286
285asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) 287asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
286{ 288{
@@ -395,6 +397,26 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
395 return error; 397 return error;
396} 398}
397 399
400asmlinkage long sys_fstatat64(int dfd, char __user *filename,
401 struct stat64 __user *statbuf, int flag)
402{
403 struct kstat stat;
404 int error = -EINVAL;
405
406 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
407 goto out;
408
409 if (flag & AT_SYMLINK_NOFOLLOW)
410 error = vfs_lstat_fd(dfd, filename, &stat);
411 else
412 error = vfs_stat_fd(dfd, filename, &stat);
413
414 if (!error)
415 error = cp_new_stat64(&stat, statbuf);
416
417out:
418 return error;
419}
398#endif /* __ARCH_WANT_STAT64 */ 420#endif /* __ARCH_WANT_STAT64 */
399 421
400void inode_add_bytes(struct inode *inode, loff_t bytes) 422void inode_add_bytes(struct inode *inode, loff_t bytes)
diff --git a/fs/super.c b/fs/super.c
index c177b92419c5..e20b5580afd5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -247,8 +247,9 @@ void generic_shutdown_super(struct super_block *sb)
247 247
248 /* Forget any remaining inodes */ 248 /* Forget any remaining inodes */
249 if (invalidate_inodes(sb)) { 249 if (invalidate_inodes(sb)) {
250 printk("VFS: Busy inodes after unmount. " 250 printk("VFS: Busy inodes after unmount of %s. "
251 "Self-destruct in 5 seconds. Have a nice day...\n"); 251 "Self-destruct in 5 seconds. Have a nice day...\n",
252 sb->s_id);
252 } 253 }
253 254
254 unlock_kernel(); 255 unlock_kernel();
@@ -665,6 +666,16 @@ static int test_bdev_super(struct super_block *s, void *data)
665 return (void *)s->s_bdev == data; 666 return (void *)s->s_bdev == data;
666} 667}
667 668
669static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
670{
671 if (bdev->bd_disk) {
672 if (bdev->bd_part)
673 kobject_uevent(&bdev->bd_part->kobj, action);
674 else
675 kobject_uevent(&bdev->bd_disk->kobj, action);
676 }
677}
678
668struct super_block *get_sb_bdev(struct file_system_type *fs_type, 679struct super_block *get_sb_bdev(struct file_system_type *fs_type,
669 int flags, const char *dev_name, void *data, 680 int flags, const char *dev_name, void *data,
670 int (*fill_super)(struct super_block *, void *, int)) 681 int (*fill_super)(struct super_block *, void *, int))
@@ -706,8 +717,10 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
706 up_write(&s->s_umount); 717 up_write(&s->s_umount);
707 deactivate_super(s); 718 deactivate_super(s);
708 s = ERR_PTR(error); 719 s = ERR_PTR(error);
709 } else 720 } else {
710 s->s_flags |= MS_ACTIVE; 721 s->s_flags |= MS_ACTIVE;
722 bdev_uevent(bdev, KOBJ_MOUNT);
723 }
711 } 724 }
712 725
713 return s; 726 return s;
@@ -723,6 +736,7 @@ void kill_block_super(struct super_block *sb)
723{ 736{
724 struct block_device *bdev = sb->s_bdev; 737 struct block_device *bdev = sb->s_bdev;
725 738
739 bdev_uevent(bdev, KOBJ_UMOUNT);
726 generic_shutdown_super(sb); 740 generic_shutdown_super(sb);
727 sync_blockdev(bdev); 741 sync_blockdev(bdev);
728 close_bdev_excl(bdev); 742 close_bdev_excl(bdev);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 4fae57d9d115..201049ac8a96 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -579,10 +579,9 @@ static void udf_table_free_blocks(struct super_block * sb,
579 { 579 {
580 loffset = nextoffset; 580 loffset = nextoffset;
581 aed->lengthAllocDescs = cpu_to_le32(adsize); 581 aed->lengthAllocDescs = cpu_to_le32(adsize);
582 if (obh) 582 sptr = UDF_I_DATA(inode) + nextoffset -
583 sptr = UDF_I_DATA(inode) + nextoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode) - adsize; 583 udf_file_entry_alloc_offset(inode) +
584 else 584 UDF_I_LENEATTR(inode) - adsize;
585 sptr = obh->b_data + nextoffset - adsize;
586 dptr = nbh->b_data + sizeof(struct allocExtDesc); 585 dptr = nbh->b_data + sizeof(struct allocExtDesc);
587 memcpy(dptr, sptr, adsize); 586 memcpy(dptr, sptr, adsize);
588 nextoffset = sizeof(struct allocExtDesc) + adsize; 587 nextoffset = sizeof(struct allocExtDesc) + adsize;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 395e582ee542..d04cff2273b6 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1045,10 +1045,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1045 } 1045 }
1046 1046
1047 inode->i_uid = le32_to_cpu(fe->uid); 1047 inode->i_uid = le32_to_cpu(fe->uid);
1048 if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; 1048 if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb,
1049 UDF_FLAG_UID_IGNORE))
1050 inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
1049 1051
1050 inode->i_gid = le32_to_cpu(fe->gid); 1052 inode->i_gid = le32_to_cpu(fe->gid);
1051 if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1053 if (inode->i_gid == -1 || UDF_QUERY_FLAG(inode->i_sb,
1054 UDF_FLAG_GID_IGNORE))
1055 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1052 1056
1053 inode->i_nlink = le16_to_cpu(fe->fileLinkCount); 1057 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1054 if (!inode->i_nlink) 1058 if (!inode->i_nlink)
@@ -1335,10 +1339,14 @@ udf_update_inode(struct inode *inode, int do_sync)
1335 return err; 1339 return err;
1336 } 1340 }
1337 1341
1338 if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) 1342 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
1343 fe->uid = cpu_to_le32(-1);
1344 else if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid)
1339 fe->uid = cpu_to_le32(inode->i_uid); 1345 fe->uid = cpu_to_le32(inode->i_uid);
1340 1346
1341 if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) 1347 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
1348 fe->gid = cpu_to_le32(-1);
1349 else if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid)
1342 fe->gid = cpu_to_le32(inode->i_gid); 1350 fe->gid = cpu_to_le32(inode->i_gid);
1343 1351
1344 udfperms = ((inode->i_mode & S_IRWXO) ) | 1352 udfperms = ((inode->i_mode & S_IRWXO) ) |
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ca732e79c48b..ab9a7629d23e 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -296,7 +296,7 @@ static struct dentry *
296udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 296udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
297{ 297{
298 struct inode *inode = NULL; 298 struct inode *inode = NULL;
299 struct fileIdentDesc cfi, *fi; 299 struct fileIdentDesc cfi;
300 struct udf_fileident_bh fibh; 300 struct udf_fileident_bh fibh;
301 301
302 if (dentry->d_name.len > UDF_NAME_LEN-2) 302 if (dentry->d_name.len > UDF_NAME_LEN-2)
@@ -318,7 +318,7 @@ udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
318 else 318 else
319#endif /* UDF_RECOVERY */ 319#endif /* UDF_RECOVERY */
320 320
321 if ((fi = udf_find_entry(dir, dentry, &fibh, &cfi))) 321 if (udf_find_entry(dir, dentry, &fibh, &cfi))
322 { 322 {
323 if (fibh.sbh != fibh.ebh) 323 if (fibh.sbh != fibh.ebh)
324 udf_release_data(fibh.ebh); 324 udf_release_data(fibh.ebh);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a6f49adc609..368d8f81fe54 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -269,7 +269,7 @@ enum {
269 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 269 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
270 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 270 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
271 Opt_rootdir, Opt_utf8, Opt_iocharset, 271 Opt_rootdir, Opt_utf8, Opt_iocharset,
272 Opt_err 272 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
273}; 273};
274 274
275static match_table_t tokens = { 275static match_table_t tokens = {
@@ -282,6 +282,10 @@ static match_table_t tokens = {
282 {Opt_adinicb, "adinicb"}, 282 {Opt_adinicb, "adinicb"},
283 {Opt_shortad, "shortad"}, 283 {Opt_shortad, "shortad"},
284 {Opt_longad, "longad"}, 284 {Opt_longad, "longad"},
285 {Opt_uforget, "uid=forget"},
286 {Opt_uignore, "uid=ignore"},
287 {Opt_gforget, "gid=forget"},
288 {Opt_gignore, "gid=ignore"},
285 {Opt_gid, "gid=%u"}, 289 {Opt_gid, "gid=%u"},
286 {Opt_uid, "uid=%u"}, 290 {Opt_uid, "uid=%u"},
287 {Opt_umask, "umask=%o"}, 291 {Opt_umask, "umask=%o"},
@@ -414,6 +418,18 @@ udf_parse_options(char *options, struct udf_options *uopt)
414 uopt->flags |= (1 << UDF_FLAG_NLS_MAP); 418 uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
415 break; 419 break;
416#endif 420#endif
421 case Opt_uignore:
422 uopt->flags |= (1 << UDF_FLAG_UID_IGNORE);
423 break;
424 case Opt_uforget:
425 uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
426 break;
427 case Opt_gignore:
428 uopt->flags |= (1 << UDF_FLAG_GID_IGNORE);
429 break;
430 case Opt_gforget:
431 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
432 break;
417 default: 433 default:
418 printk(KERN_ERR "udf: bad mount option \"%s\" " 434 printk(KERN_ERR "udf: bad mount option \"%s\" "
419 "or missing value\n", p); 435 "or missing value\n", p);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 663669810be6..110f8d62616f 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -20,6 +20,10 @@
20#define UDF_FLAG_VARCONV 8 20#define UDF_FLAG_VARCONV 8
21#define UDF_FLAG_NLS_MAP 9 21#define UDF_FLAG_NLS_MAP 9
22#define UDF_FLAG_UTF8 10 22#define UDF_FLAG_UTF8 10
23#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
24#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */
25#define UDF_FLAG_GID_FORGET 13
26#define UDF_FLAG_GID_IGNORE 14
23 27
24#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 28#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
25#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 29#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e0c04e36a051..3c3f62ce2ad9 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -376,7 +376,7 @@ out:
376 * This function gets the block which contains the fragment. 376 * This function gets the block which contains the fragment.
377 */ 377 */
378 378
379static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 379int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
380{ 380{
381 struct super_block * sb = inode->i_sb; 381 struct super_block * sb = inode->i_sb;
382 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 382 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d4aacee593ff..e9055ef7f5ac 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -388,7 +388,8 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
388/* 388/*
389 * Read on-disk structures associated with cylinder groups 389 * Read on-disk structures associated with cylinder groups
390 */ 390 */
391static int ufs_read_cylinder_structures (struct super_block *sb) { 391static int ufs_read_cylinder_structures (struct super_block *sb)
392{
392 struct ufs_sb_info * sbi = UFS_SB(sb); 393 struct ufs_sb_info * sbi = UFS_SB(sb);
393 struct ufs_sb_private_info * uspi; 394 struct ufs_sb_private_info * uspi;
394 struct ufs_super_block *usb; 395 struct ufs_super_block *usb;
@@ -415,6 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) {
415 base = space = kmalloc(size, GFP_KERNEL); 416 base = space = kmalloc(size, GFP_KERNEL);
416 if (!base) 417 if (!base)
417 goto failed; 418 goto failed;
419 sbi->s_csp = (struct ufs_csum *)space;
418 for (i = 0; i < blks; i += uspi->s_fpb) { 420 for (i = 0; i < blks; i += uspi->s_fpb) {
419 size = uspi->s_bsize; 421 size = uspi->s_bsize;
420 if (i + uspi->s_fpb > blks) 422 if (i + uspi->s_fpb > blks)
@@ -430,7 +432,6 @@ static int ufs_read_cylinder_structures (struct super_block *sb) {
430 goto failed; 432 goto failed;
431 433
432 ubh_ubhcpymem (space, ubh, size); 434 ubh_ubhcpymem (space, ubh, size);
433 sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space;
434 435
435 space += size; 436 space += size;
436 ubh_brelse (ubh); 437 ubh_brelse (ubh);
@@ -486,7 +487,8 @@ failed:
486 * Put on-disk structures associated with cylinder groups and 487 * Put on-disk structures associated with cylinder groups and
487 * write them back to disk 488 * write them back to disk
488 */ 489 */
489static void ufs_put_cylinder_structures (struct super_block *sb) { 490static void ufs_put_cylinder_structures (struct super_block *sb)
491{
490 struct ufs_sb_info * sbi = UFS_SB(sb); 492 struct ufs_sb_info * sbi = UFS_SB(sb);
491 struct ufs_sb_private_info * uspi; 493 struct ufs_sb_private_info * uspi;
492 struct ufs_buffer_head * ubh; 494 struct ufs_buffer_head * ubh;
@@ -499,7 +501,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb) {
499 501
500 size = uspi->s_cssize; 502 size = uspi->s_cssize;
501 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 503 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
502 base = space = (char*) sbi->s_csp[0]; 504 base = space = (char*) sbi->s_csp;
503 for (i = 0; i < blks; i += uspi->s_fpb) { 505 for (i = 0; i < blks; i += uspi->s_fpb) {
504 size = uspi->s_bsize; 506 size = uspi->s_bsize;
505 if (i + uspi->s_fpb > blks) 507 if (i + uspi->s_fpb > blks)
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 61d2e35012a4..02e86291ef8a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -29,6 +29,11 @@
29 * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr> 29 * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr>
30 */ 30 */
31 31
32/*
33 * Modified to avoid infinite loop on 2006 by
34 * Evgeniy Dushistov <dushistov@mail.ru>
35 */
36
32#include <linux/errno.h> 37#include <linux/errno.h>
33#include <linux/fs.h> 38#include <linux/fs.h>
34#include <linux/ufs_fs.h> 39#include <linux/ufs_fs.h>
@@ -65,19 +70,16 @@
65#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) 70#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
66#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) 71#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
67 72
68#define DATA_BUFFER_USED(bh) \
69 (atomic_read(&bh->b_count)>1 || buffer_locked(bh))
70 73
71static int ufs_trunc_direct (struct inode * inode) 74static int ufs_trunc_direct (struct inode * inode)
72{ 75{
73 struct ufs_inode_info *ufsi = UFS_I(inode); 76 struct ufs_inode_info *ufsi = UFS_I(inode);
74 struct super_block * sb; 77 struct super_block * sb;
75 struct ufs_sb_private_info * uspi; 78 struct ufs_sb_private_info * uspi;
76 struct buffer_head * bh;
77 __fs32 * p; 79 __fs32 * p;
78 unsigned frag1, frag2, frag3, frag4, block1, block2; 80 unsigned frag1, frag2, frag3, frag4, block1, block2;
79 unsigned frag_to_free, free_count; 81 unsigned frag_to_free, free_count;
80 unsigned i, j, tmp; 82 unsigned i, tmp;
81 int retry; 83 int retry;
82 84
83 UFSD(("ENTER\n")) 85 UFSD(("ENTER\n"))
@@ -117,15 +119,7 @@ static int ufs_trunc_direct (struct inode * inode)
117 ufs_panic (sb, "ufs_trunc_direct", "internal error"); 119 ufs_panic (sb, "ufs_trunc_direct", "internal error");
118 frag1 = ufs_fragnum (frag1); 120 frag1 = ufs_fragnum (frag1);
119 frag2 = ufs_fragnum (frag2); 121 frag2 = ufs_fragnum (frag2);
120 for (j = frag1; j < frag2; j++) { 122
121 bh = sb_find_get_block (sb, tmp + j);
122 if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
123 retry = 1;
124 brelse (bh);
125 goto next1;
126 }
127 bforget (bh);
128 }
129 inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift; 123 inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
130 mark_inode_dirty(inode); 124 mark_inode_dirty(inode);
131 ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); 125 ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
@@ -140,15 +134,7 @@ next1:
140 tmp = fs32_to_cpu(sb, *p); 134 tmp = fs32_to_cpu(sb, *p);
141 if (!tmp) 135 if (!tmp)
142 continue; 136 continue;
143 for (j = 0; j < uspi->s_fpb; j++) { 137
144 bh = sb_find_get_block(sb, tmp + j);
145 if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
146 retry = 1;
147 brelse (bh);
148 goto next2;
149 }
150 bforget (bh);
151 }
152 *p = 0; 138 *p = 0;
153 inode->i_blocks -= uspi->s_nspb; 139 inode->i_blocks -= uspi->s_nspb;
154 mark_inode_dirty(inode); 140 mark_inode_dirty(inode);
@@ -162,7 +148,6 @@ next1:
162 frag_to_free = tmp; 148 frag_to_free = tmp;
163 free_count = uspi->s_fpb; 149 free_count = uspi->s_fpb;
164 } 150 }
165next2:;
166 } 151 }
167 152
168 if (free_count > 0) 153 if (free_count > 0)
@@ -179,15 +164,7 @@ next2:;
179 if (!tmp ) 164 if (!tmp )
180 ufs_panic(sb, "ufs_truncate_direct", "internal error"); 165 ufs_panic(sb, "ufs_truncate_direct", "internal error");
181 frag4 = ufs_fragnum (frag4); 166 frag4 = ufs_fragnum (frag4);
182 for (j = 0; j < frag4; j++) { 167
183 bh = sb_find_get_block (sb, tmp + j);
184 if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
185 retry = 1;
186 brelse (bh);
187 goto next1;
188 }
189 bforget (bh);
190 }
191 *p = 0; 168 *p = 0;
192 inode->i_blocks -= frag4 << uspi->s_nspfshift; 169 inode->i_blocks -= frag4 << uspi->s_nspfshift;
193 mark_inode_dirty(inode); 170 mark_inode_dirty(inode);
@@ -204,9 +181,8 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
204 struct super_block * sb; 181 struct super_block * sb;
205 struct ufs_sb_private_info * uspi; 182 struct ufs_sb_private_info * uspi;
206 struct ufs_buffer_head * ind_ubh; 183 struct ufs_buffer_head * ind_ubh;
207 struct buffer_head * bh;
208 __fs32 * ind; 184 __fs32 * ind;
209 unsigned indirect_block, i, j, tmp; 185 unsigned indirect_block, i, tmp;
210 unsigned frag_to_free, free_count; 186 unsigned frag_to_free, free_count;
211 int retry; 187 int retry;
212 188
@@ -238,15 +214,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
238 tmp = fs32_to_cpu(sb, *ind); 214 tmp = fs32_to_cpu(sb, *ind);
239 if (!tmp) 215 if (!tmp)
240 continue; 216 continue;
241 for (j = 0; j < uspi->s_fpb; j++) { 217
242 bh = sb_find_get_block(sb, tmp + j);
243 if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) {
244 retry = 1;
245 brelse (bh);
246 goto next;
247 }
248 bforget (bh);
249 }
250 *ind = 0; 218 *ind = 0;
251 ubh_mark_buffer_dirty(ind_ubh); 219 ubh_mark_buffer_dirty(ind_ubh);
252 if (free_count == 0) { 220 if (free_count == 0) {
@@ -261,7 +229,6 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
261 } 229 }
262 inode->i_blocks -= uspi->s_nspb; 230 inode->i_blocks -= uspi->s_nspb;
263 mark_inode_dirty(inode); 231 mark_inode_dirty(inode);
264next:;
265 } 232 }
266 233
267 if (free_count > 0) { 234 if (free_count > 0) {
@@ -430,9 +397,7 @@ void ufs_truncate (struct inode * inode)
430 struct ufs_inode_info *ufsi = UFS_I(inode); 397 struct ufs_inode_info *ufsi = UFS_I(inode);
431 struct super_block * sb; 398 struct super_block * sb;
432 struct ufs_sb_private_info * uspi; 399 struct ufs_sb_private_info * uspi;
433 struct buffer_head * bh; 400 int retry;
434 unsigned offset;
435 int err, retry;
436 401
437 UFSD(("ENTER\n")) 402 UFSD(("ENTER\n"))
438 sb = inode->i_sb; 403 sb = inode->i_sb;
@@ -442,6 +407,9 @@ void ufs_truncate (struct inode * inode)
442 return; 407 return;
443 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 408 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
444 return; 409 return;
410
411 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
412
445 lock_kernel(); 413 lock_kernel();
446 while (1) { 414 while (1) {
447 retry = ufs_trunc_direct(inode); 415 retry = ufs_trunc_direct(inode);
@@ -457,15 +425,7 @@ void ufs_truncate (struct inode * inode)
457 blk_run_address_space(inode->i_mapping); 425 blk_run_address_space(inode->i_mapping);
458 yield(); 426 yield();
459 } 427 }
460 offset = inode->i_size & uspi->s_fshift; 428
461 if (offset) {
462 bh = ufs_bread (inode, inode->i_size >> uspi->s_fshift, 0, &err);
463 if (bh) {
464 memset (bh->b_data + offset, 0, uspi->s_fsize - offset);
465 mark_buffer_dirty (bh);
466 brelse (bh);
467 }
468 }
469 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 429 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
470 ufsi->i_lastfrag = DIRECT_FRAGMENT; 430 ufsi->i_lastfrag = DIRECT_FRAGMENT;
471 unlock_kernel(); 431 unlock_kernel();
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 120626789406..74d8be87f983 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -540,7 +540,7 @@ xfs_probe_cluster(
540 540
541 /* First sum forwards in this page */ 541 /* First sum forwards in this page */
542 do { 542 do {
543 if (mapped != buffer_mapped(bh)) 543 if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
544 return total; 544 return total;
545 total += bh->b_size; 545 total += bh->b_size;
546 } while ((bh = bh->b_this_page) != head); 546 } while ((bh = bh->b_this_page) != head);
@@ -747,10 +747,11 @@ xfs_convert_page(
747 struct backing_dev_info *bdi; 747 struct backing_dev_info *bdi;
748 748
749 bdi = inode->i_mapping->backing_dev_info; 749 bdi = inode->i_mapping->backing_dev_info;
750 wbc->nr_to_write--;
750 if (bdi_write_congested(bdi)) { 751 if (bdi_write_congested(bdi)) {
751 wbc->encountered_congestion = 1; 752 wbc->encountered_congestion = 1;
752 done = 1; 753 done = 1;
753 } else if (--wbc->nr_to_write <= 0) { 754 } else if (wbc->nr_to_write <= 0) {
754 done = 1; 755 done = 1;
755 } 756 }
756 } 757 }
@@ -1462,4 +1463,5 @@ struct address_space_operations linvfs_aops = {
1462 .commit_write = generic_commit_write, 1463 .commit_write = generic_commit_write,
1463 .bmap = linvfs_bmap, 1464 .bmap = linvfs_bmap,
1464 .direct_IO = linvfs_direct_IO, 1465 .direct_IO = linvfs_direct_IO,
1466 .migratepage = buffer_migrate_page,
1465}; 1467};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e44b7c1a3a36..bfb4f2917bb6 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -822,6 +822,13 @@ xfs_buf_rele(
822 822
823 XB_TRACE(bp, "rele", bp->b_relse); 823 XB_TRACE(bp, "rele", bp->b_relse);
824 824
825 if (unlikely(!hash)) {
826 ASSERT(!bp->b_relse);
827 if (atomic_dec_and_test(&bp->b_hold))
828 xfs_buf_free(bp);
829 return;
830 }
831
825 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 832 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
826 if (bp->b_relse) { 833 if (bp->b_relse) {
827 atomic_inc(&bp->b_hold); 834 atomic_inc(&bp->b_hold);
@@ -1514,6 +1521,7 @@ xfs_mapping_buftarg(
1514 struct address_space *mapping; 1521 struct address_space *mapping;
1515 static struct address_space_operations mapping_aops = { 1522 static struct address_space_operations mapping_aops = {
1516 .sync_page = block_sync_page, 1523 .sync_page = block_sync_page,
1524 .migratepage = fail_migrate_page,
1517 }; 1525 };
1518 1526
1519 inode = new_inode(bdev->bd_inode->i_sb); 1527 inode = new_inode(bdev->bd_inode->i_sb);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 76c6df34d0db..d7f6f2d8ac8e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -262,6 +262,31 @@ has_fs_struct(struct task_struct *task)
262 return (task->fs != init_task.fs); 262 return (task->fs != init_task.fs);
263} 263}
264 264
265STATIC inline void
266cleanup_inode(
267 vnode_t *dvp,
268 vnode_t *vp,
269 struct dentry *dentry,
270 int mode)
271{
272 struct dentry teardown = {};
273 int err2;
274
275 /* Oh, the horror.
276 * If we can't add the ACL or we fail in
277 * linvfs_init_security we must back out.
278 * ENOSPC can hit here, among other things.
279 */
280 teardown.d_inode = LINVFS_GET_IP(vp);
281 teardown.d_name = dentry->d_name;
282
283 if (S_ISDIR(mode))
284 VOP_RMDIR(dvp, &teardown, NULL, err2);
285 else
286 VOP_REMOVE(dvp, &teardown, NULL, err2);
287 VN_RELE(vp);
288}
289
265STATIC int 290STATIC int
266linvfs_mknod( 291linvfs_mknod(
267 struct inode *dir, 292 struct inode *dir,
@@ -316,30 +341,19 @@ linvfs_mknod(
316 } 341 }
317 342
318 if (!error) 343 if (!error)
344 {
319 error = linvfs_init_security(vp, dir); 345 error = linvfs_init_security(vp, dir);
346 if (error)
347 cleanup_inode(dvp, vp, dentry, mode);
348 }
320 349
321 if (default_acl) { 350 if (default_acl) {
322 if (!error) { 351 if (!error) {
323 error = _ACL_INHERIT(vp, &va, default_acl); 352 error = _ACL_INHERIT(vp, &va, default_acl);
324 if (!error) { 353 if (!error)
325 VMODIFY(vp); 354 VMODIFY(vp);
326 } else { 355 else
327 struct dentry teardown = {}; 356 cleanup_inode(dvp, vp, dentry, mode);
328 int err2;
329
330 /* Oh, the horror.
331 * If we can't add the ACL we must back out.
332 * ENOSPC can hit here, among other things.
333 */
334 teardown.d_inode = ip = LINVFS_GET_IP(vp);
335 teardown.d_name = dentry->d_name;
336
337 if (S_ISDIR(mode))
338 VOP_RMDIR(dvp, &teardown, NULL, err2);
339 else
340 VOP_REMOVE(dvp, &teardown, NULL, err2);
341 VN_RELE(vp);
342 }
343 } 357 }
344 _ACL_FREE(default_acl); 358 _ACL_FREE(default_acl);
345 } 359 }
@@ -659,6 +673,8 @@ linvfs_setattr(
659 if (ia_valid & ATTR_ATIME) { 673 if (ia_valid & ATTR_ATIME) {
660 vattr.va_mask |= XFS_AT_ATIME; 674 vattr.va_mask |= XFS_AT_ATIME;
661 vattr.va_atime = attr->ia_atime; 675 vattr.va_atime = attr->ia_atime;
676 if (ia_valid & ATTR_ATIME_SET)
677 inode->i_atime = attr->ia_atime;
662 } 678 }
663 if (ia_valid & ATTR_MTIME) { 679 if (ia_valid & ATTR_MTIME) {
664 vattr.va_mask |= XFS_AT_MTIME; 680 vattr.va_mask |= XFS_AT_MTIME;
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 53a00fb217fa..7c0e39dc6189 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -68,6 +68,9 @@ kmem_zone_t *qm_dqzone;
68kmem_zone_t *qm_dqtrxzone; 68kmem_zone_t *qm_dqtrxzone;
69STATIC kmem_shaker_t xfs_qm_shaker; 69STATIC kmem_shaker_t xfs_qm_shaker;
70 70
71STATIC cred_t xfs_zerocr;
72STATIC xfs_inode_t xfs_zeroino;
73
71STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 74STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
72STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 75STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
73 76
@@ -1393,8 +1396,6 @@ xfs_qm_qino_alloc(
1393 xfs_trans_t *tp; 1396 xfs_trans_t *tp;
1394 int error; 1397 int error;
1395 unsigned long s; 1398 unsigned long s;
1396 cred_t zerocr;
1397 xfs_inode_t zeroino;
1398 int committed; 1399 int committed;
1399 1400
1400 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); 1401 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
@@ -1406,11 +1407,9 @@ xfs_qm_qino_alloc(
1406 xfs_trans_cancel(tp, 0); 1407 xfs_trans_cancel(tp, 0);
1407 return error; 1408 return error;
1408 } 1409 }
1409 memset(&zerocr, 0, sizeof(zerocr));
1410 memset(&zeroino, 0, sizeof(zeroino));
1411 1410
1412 if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0, 1411 if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0,
1413 &zerocr, 0, 1, ip, &committed))) { 1412 &xfs_zerocr, 0, 1, ip, &committed))) {
1414 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1413 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1415 XFS_TRANS_ABORT); 1414 XFS_TRANS_ABORT);
1416 return error; 1415 return error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 06fc061c50fc..5b413946b1c5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -130,7 +130,8 @@ xfs_growfs_rt_alloc(
130 /* 130 /*
131 * Lock the inode. 131 * Lock the inode.
132 */ 132 */
133 if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) 133 if ((error = xfs_trans_iget(mp, tp, ino, 0,
134 XFS_ILOCK_EXCL, &ip)))
134 goto error_exit; 135 goto error_exit;
135 XFS_BMAP_INIT(&flist, &firstblock); 136 XFS_BMAP_INIT(&flist, &firstblock);
136 /* 137 /*
@@ -170,8 +171,8 @@ xfs_growfs_rt_alloc(
170 /* 171 /*
171 * Lock the bitmap inode. 172 * Lock the bitmap inode.
172 */ 173 */
173 if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, 174 if ((error = xfs_trans_iget(mp, tp, ino, 0,
174 &ip))) 175 XFS_ILOCK_EXCL, &ip)))
175 goto error_exit; 176 goto error_exit;
176 /* 177 /*
177 * Get a buffer for the block. 178 * Get a buffer for the block.
@@ -2023,8 +2024,8 @@ xfs_growfs_rt(
2023 /* 2024 /*
2024 * Lock out other callers by grabbing the bitmap inode lock. 2025 * Lock out other callers by grabbing the bitmap inode lock.
2025 */ 2026 */
2026 if ((error = xfs_trans_iget(mp, tp, 0, mp->m_sb.sb_rbmino, 2027 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2027 XFS_ILOCK_EXCL, &ip))) 2028 XFS_ILOCK_EXCL, &ip)))
2028 goto error_exit; 2029 goto error_exit;
2029 ASSERT(ip == mp->m_rbmip); 2030 ASSERT(ip == mp->m_rbmip);
2030 /* 2031 /*
@@ -2037,8 +2038,8 @@ xfs_growfs_rt(
2037 /* 2038 /*
2038 * Get the summary inode into the transaction. 2039 * Get the summary inode into the transaction.
2039 */ 2040 */
2040 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 2041 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
2041 0, XFS_ILOCK_EXCL, &ip))) 2042 XFS_ILOCK_EXCL, &ip)))
2042 goto error_exit; 2043 goto error_exit;
2043 ASSERT(ip == mp->m_rsumip); 2044 ASSERT(ip == mp->m_rsumip);
2044 /* 2045 /*
@@ -2158,10 +2159,9 @@ xfs_rtallocate_extent(
2158 /* 2159 /*
2159 * Lock out other callers by grabbing the bitmap inode lock. 2160 * Lock out other callers by grabbing the bitmap inode lock.
2160 */ 2161 */
2161 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); 2162 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2162 if (error) { 2163 XFS_ILOCK_EXCL, &ip)))
2163 return error; 2164 return error;
2164 }
2165 sumbp = NULL; 2165 sumbp = NULL;
2166 /* 2166 /*
2167 * Allocate by size, or near another block, or exactly at some block. 2167 * Allocate by size, or near another block, or exactly at some block.
@@ -2221,10 +2221,9 @@ xfs_rtfree_extent(
2221 /* 2221 /*
2222 * Synchronize by locking the bitmap inode. 2222 * Synchronize by locking the bitmap inode.
2223 */ 2223 */
2224 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); 2224 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2225 if (error) { 2225 XFS_ILOCK_EXCL, &ip)))
2226 return error; 2226 return error;
2227 }
2228#if defined(__KERNEL__) && defined(DEBUG) 2227#if defined(__KERNEL__) && defined(DEBUG)
2229 /* 2228 /*
2230 * Check to see that this whole range is currently allocated. 2229 * Check to see that this whole range is currently allocated.
@@ -2365,8 +2364,8 @@ xfs_rtpick_extent(
2365 __uint64_t seq; /* sequence number of file creation */ 2364 __uint64_t seq; /* sequence number of file creation */
2366 __uint64_t *seqp; /* pointer to seqno in inode */ 2365 __uint64_t *seqp; /* pointer to seqno in inode */
2367 2366
2368 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); 2367 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2369 if (error) 2368 XFS_ILOCK_EXCL, &ip)))
2370 return error; 2369 return error;
2371 ASSERT(ip == mp->m_rbmip); 2370 ASSERT(ip == mp->m_rbmip);
2372 seqp = (__uint64_t *)&ip->i_d.di_atime; 2371 seqp = (__uint64_t *)&ip->i_d.di_atime;