path: root/fs
Diffstat (limited to 'fs')
-rw-r--r--  fs/bio.c | 9
-rw-r--r--  fs/cifs/CHANGES | 3
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifspdu.h | 6
-rw-r--r--  fs/cifs/cifsproto.h | 7
-rw-r--r--  fs/cifs/cifssmb.c | 360
-rw-r--r--  fs/cifs/connect.c | 8
-rw-r--r--  fs/cifs/inode.c | 3
-rw-r--r--  fs/cifs/misc.c | 2
-rw-r--r--  fs/cifs/xattr.c | 8
-rw-r--r--  fs/dlm/ast.c | 74
-rw-r--r--  fs/dlm/ast.h | 4
-rw-r--r--  fs/dlm/debug_fs.c | 2
-rw-r--r--  fs/dlm/dlm_internal.h | 10
-rw-r--r--  fs/dlm/lock.c | 120
-rw-r--r--  fs/dlm/lockspace.c | 14
-rw-r--r--  fs/dlm/user.c | 10
-rw-r--r--  fs/dlm/user.h | 4
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/fuse/dev.c | 30
-rw-r--r--  fs/gfs2/aops.c | 4
-rw-r--r--  fs/gfs2/glock.c | 75
-rw-r--r--  fs/gfs2/glock.h | 7
-rw-r--r--  fs/gfs2/glops.c | 16
-rw-r--r--  fs/gfs2/incore.h | 5
-rw-r--r--  fs/gfs2/inode.c | 6
-rw-r--r--  fs/gfs2/lock_dlm.c | 5
-rw-r--r--  fs/gfs2/lops.c | 4
-rw-r--r--  fs/gfs2/main.c | 28
-rw-r--r--  fs/gfs2/meta_io.c | 46
-rw-r--r--  fs/gfs2/meta_io.h | 12
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/gfs2/super.c | 27
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/gfs2/util.h | 1
-rw-r--r--  fs/nfs/Kconfig | 3
-rw-r--r--  fs/nfs/iostat.h | 4
-rw-r--r--  fs/nilfs2/dat.c | 3
-rw-r--r--  fs/nilfs2/ioctl.c | 66
-rw-r--r--  fs/nilfs2/recovery.c | 41
-rw-r--r--  fs/nilfs2/segbuf.c | 18
-rw-r--r--  fs/nilfs2/segbuf.h | 5
-rw-r--r--  fs/nilfs2/segment.c | 120
-rw-r--r--  fs/nilfs2/segment.h | 2
-rw-r--r--  fs/nilfs2/super.c | 15
-rw-r--r--  fs/nilfs2/the_nilfs.c | 38
-rw-r--r--  fs/nilfs2/the_nilfs.h | 3
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/alloc.c | 5
-rw-r--r--  fs/ocfs2/aops.c | 5
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 7
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/dlm/Makefile | 3
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 5
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c) | 127
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c) | 0
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h) | 0
-rw-r--r--  fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c) | 308
-rw-r--r--  fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h) | 16
-rw-r--r--  fs/ocfs2/dlmglue.c | 284
-rw-r--r--  fs/ocfs2/file.c | 13
-rw-r--r--  fs/ocfs2/ioctl.h | 6
-rw-r--r--  fs/ocfs2/localalloc.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 32
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 57
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h | 79
-rw-r--r--  fs/ocfs2/ocfs2_lockingver.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 6
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 37
-rw-r--r--  fs/ocfs2/stack_user.c | 49
-rw-r--r--  fs/ocfs2/stackglue.c | 98
-rw-r--r--  fs/ocfs2/stackglue.h | 95
-rw-r--r--  fs/ocfs2/suballoc.c | 171
-rw-r--r--  fs/ocfs2/suballoc.h | 1
-rw-r--r--  fs/ocfs2/super.c | 10
-rw-r--r--  fs/ocfs2/xattr.c | 2182
-rw-r--r--  fs/partitions/check.c | 7
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  fs/proc/base.c | 6
-rw-r--r--  fs/proc/kmsg.c | 14
-rw-r--r--  fs/proc/proc_devtree.c | 7
-rw-r--r--  fs/seq_file.c | 130
-rw-r--r--  fs/xfs/Makefile | 2
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 56
-rw-r--r--  fs/xfs/linux-2.6/kmem.h | 21
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 320
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 52
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 21
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.h | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 62
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.h | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 169
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 186
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 81
-rw-r--r--  fs/xfs/linux-2.6/xfs_xattr.c | 27
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 47
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 99
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.h | 4
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 40
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 4
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 49
-rw-r--r--  fs/xfs/xfs_acl.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 16
-rw-r--r--  fs/xfs/xfs_alloc.c | 96
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 9
-rw-r--r--  fs/xfs/xfs_attr.c | 52
-rw-r--r--  fs/xfs/xfs_attr.h | 3
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 30
-rw-r--r--  fs/xfs/xfs_attr_sf.h | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 17
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 2
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 1
-rw-r--r--  fs/xfs/xfs_btree.c | 4
-rw-r--r--  fs/xfs/xfs_buf_item.c | 72
-rw-r--r--  fs/xfs/xfs_da_btree.c | 4
-rw-r--r--  fs/xfs/xfs_da_btree.h | 5
-rw-r--r--  fs/xfs/xfs_dfrag.c | 43
-rw-r--r--  fs/xfs/xfs_dfrag.h | 3
-rw-r--r--  fs/xfs/xfs_dir2.c | 8
-rw-r--r--  fs/xfs/xfs_dir2.h | 4
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 9
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_node.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 2
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 4
-rw-r--r--  fs/xfs/xfs_filestream.c | 42
-rw-r--r--  fs/xfs/xfs_filestream.h | 28
-rw-r--r--  fs/xfs/xfs_fsops.c | 42
-rw-r--r--  fs/xfs/xfs_ialloc.c | 62
-rw-r--r--  fs/xfs/xfs_iget.c | 10
-rw-r--r--  fs/xfs/xfs_inode.c | 126
-rw-r--r--  fs/xfs/xfs_inode.h | 11
-rw-r--r--  fs/xfs/xfs_inode_item.c | 129
-rw-r--r--  fs/xfs/xfs_inode_item.h | 6
-rw-r--r--  fs/xfs/xfs_itable.c | 12
-rw-r--r--  fs/xfs/xfs_log.c | 383
-rw-r--r--  fs/xfs/xfs_log.h | 19
-rw-r--r--  fs/xfs/xfs_log_priv.h | 5
-rw-r--r--  fs/xfs/xfs_log_recover.c | 222
-rw-r--r--  fs/xfs/xfs_log_recover.h | 23
-rw-r--r--  fs/xfs/xfs_mount.c | 181
-rw-r--r--  fs/xfs/xfs_mount.h | 29
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_mru_cache.h | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 9
-rw-r--r--  fs/xfs/xfs_rw.c | 155
-rw-r--r--  fs/xfs/xfs_rw.h | 4
-rw-r--r--  fs/xfs/xfs_trans.c | 7
-rw-r--r--  fs/xfs/xfs_trans.h | 3
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 34
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 27
-rw-r--r--  fs/xfs/xfs_types.h | 4
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 33
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 10
165 files changed, 4511 insertions, 3796 deletions
diff --git a/fs/bio.c b/fs/bio.c
index 88094afc29ea..dc17afd672e3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -507,10 +507,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
 	int nr_pages;
 
 	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > queue_max_phys_segments(q))
-		nr_pages = queue_max_phys_segments(q);
-	if (nr_pages > queue_max_hw_segments(q))
-		nr_pages = queue_max_hw_segments(q);
+	if (nr_pages > queue_max_segments(q))
+		nr_pages = queue_max_segments(q);
 
 	return nr_pages;
 }
@@ -575,8 +573,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments >= queue_max_phys_segments(q)
-	    || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
+	while (bio->bi_phys_segments >= queue_max_segments(q)) {
 
 		if (retried_segments)
 			return 0;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 49503d2edc7e..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,6 +1,7 @@
 Version 1.62
 ------------
-Add sockopt=TCP_NODELAY mount option.
+Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
+to more strictly handle corrupt frames.
 
 Version 1.61
 ------------
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ed751bb657db..a1c817eb291a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -205,7 +205,7 @@ struct cifsUidInfo {
 struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
-	struct semaphore sesSem;
+	struct mutex session_mutex;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3877737f96a6..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
 	__u8 WordCount;
 } __attribute__((packed));
 /* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
-#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
-#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2)
+#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
 
 /*
  * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..88e2bc44ac58 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -363,13 +363,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 		__u32 filter, struct file *file, int multishot,
 		const struct nls_table *nls_codepage);
 extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, char *EAData,
+		const unsigned char *searchName,
+		const unsigned char *ea_name, char *EAData,
 		size_t bufsize, const struct nls_table *nls_codepage,
 		int remap_special_chars);
-extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
 		const char *fileName, const char *ea_name,
 		const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..9d17df3e0768 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -170,19 +170,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * need to prevent multiple threads trying to simultaneously
 	 * reconnect the same SMB session
 	 */
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
 	if (rc || !tcon->need_reconnect) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		goto out;
 	}
 
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 	cFYI(1, ("reconnect tcon rc = %d", rc));
 
 	if (rc)
@@ -700,13 +700,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	if (!ses || !ses->server)
 		return -EIO;
 
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		goto session_already_dead; /* no need to send SMBlogoff if uid
 					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		return rc;
 	}
 
@@ -721,7 +721,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 
 	/* if session dead then we do not need to do ulogoff,
 		since server closed smb session, no sense reporting
@@ -5269,22 +5269,34 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	cifs_buf_release(pSMB);
 	return rc;
 }
+
 #ifdef CONFIG_CIFS_XATTR
+/*
+ * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
+ * function used by listxattr and getxattr type calls. When ea_name is set,
+ * it looks for that attribute name and stuffs that value into the EAData
+ * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
+ * buffer. In both cases, the return value is either the length of the
+ * resulting data or a negative error code. If EAData is a NULL pointer then
+ * the data isn't copied to it, but the length is returned.
+ */
 ssize_t
 CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-	const unsigned char *searchName,
+	const unsigned char *searchName, const unsigned char *ea_name,
 	char *EAData, size_t buf_size,
 	const struct nls_table *nls_codepage, int remap)
 {
 	/* BB assumes one setup word */
 	TRANSACTION2_QPI_REQ *pSMB = NULL;
 	TRANSACTION2_QPI_RSP *pSMBr = NULL;
 	int rc = 0;
 	int bytes_returned;
-	int name_len;
+	int list_len;
+	struct fealist *ea_response_data;
 	struct fea *temp_fea;
 	char *temp_ptr;
-	__u16 params, byte_count;
+	char *end_of_smb;
+	__u16 params, byte_count, data_offset;
 
 	cFYI(1, ("In Query All EAs path %s", searchName));
 QAllEAsRetry:
@@ -5294,22 +5306,22 @@ QAllEAsRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
+		list_len =
 		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
 				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
+		list_len++;	/* trailing null */
+		list_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+		list_len = strnlen(searchName, PATH_MAX);
+		list_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, list_len);
 	}
 
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
+	params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
@@ -5334,237 +5346,117 @@ QAllEAsRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
-	} else {		/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		   of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-		memcpy((char *) pFindData,
-		       (char *) &pSMBr->hdr.Protocol +
-		       data_offset, kl);
-	    }*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = 0;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc */
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					rc += temp_fea->name_len;
-				/* account for prefix user. and trailing null */
-					rc = rc + 5 + 1;
-					if (rc < (int)buf_size) {
-						memcpy(EAData, "user.", 5);
-						EAData += 5;
-						memcpy(EAData, temp_ptr,
-						       temp_fea->name_len);
-						EAData += temp_fea->name_len;
-						/* null terminate name */
-						*EAData = 0;
-						EAData = EAData + 1;
-					} else if (buf_size == 0) {
-						/* skip copy - calc size only */
-					} else {
-						/* stop before overrun buffer */
-						rc = -ERANGE;
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					value_len =
-						le16_to_cpu(temp_fea->value_len);
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* BB check that temp_ptr is still
-					      within the SMB BB*/
-
-					/* no trailing null to account for
-					   in value len */
-					/* go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
-		}
-	}
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto QAllEAsRetry;
-
-	return (ssize_t)rc;
-}
-
-ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap)
-{
-	TRANSACTION2_QPI_REQ *pSMB = NULL;
-	TRANSACTION2_QPI_RSP *pSMBr = NULL;
-	int rc = 0;
-	int bytes_returned;
-	int name_len;
-	struct fea *temp_fea;
-	char *temp_ptr;
-	__u16 params, byte_count;
-
-	cFYI(1, ("In Query EA path %s", searchName));
-QEARetry:
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
-	if (rc)
-		return rc;
-
-	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
-	}
-
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
-	pSMB->TotalDataCount = 0;
-	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
-	pSMB->MaxSetupCount = 0;
-	pSMB->Reserved = 0;
-	pSMB->Flags = 0;
-	pSMB->Timeout = 0;
-	pSMB->Reserved2 = 0;
-	pSMB->ParameterOffset = cpu_to_le16(offsetof(
-		struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
-	pSMB->DataCount = 0;
-	pSMB->DataOffset = 0;
-	pSMB->SetupCount = 1;
-	pSMB->Reserved3 = 0;
-	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
-	byte_count = params + 1 /* pad */ ;
-	pSMB->TotalParameterCount = cpu_to_le16(params);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
-	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
-	pSMB->ByteCount = cpu_to_le16(byte_count);
-
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-	if (rc) {
-		cFYI(1, ("Send error in Query EA = %d", rc));
-	} else {	/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		   of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-		memcpy((char *) pFindData,
-		       (char *) &pSMBr->hdr.Protocol +
-		       data_offset, kl);
-	    }*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = -ENODATA;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc*/
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				/* loop through checking if we have a matching
-				   name and then return the associated value */
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					value_len =
-						le16_to_cpu(temp_fea->value_len);
-					/* BB validate that value_len falls within SMB,
-					   even though maximum for name_len is 255 */
-					if (memcmp(temp_fea->name, ea_name,
-						   temp_fea->name_len) == 0) {
-						/* found a match */
-						rc = value_len;
-				/* account for prefix user. and trailing null */
-						if (rc <= (int)buf_size) {
-							memcpy(ea_value,
-								temp_fea->name+temp_fea->name_len+1,
-								rc);
-							/* ea values, unlike ea
-							   names, are not null
-							   terminated */
-						} else if (buf_size == 0) {
-							/* skip copy - calc size only */
-						} else {
-							/* stop before overrun buffer */
-							rc = -ERANGE;
-						}
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* No trailing null to account for in
-					   value_len. Go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
-		}
-	}
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto QEARetry;
-
-	return (ssize_t)rc;
-}
+		goto QAllEAsOut;
+	}
+
+
+	/* BB also check enough total bytes returned */
+	/* BB we need to improve the validity checking
+	   of these trans2 responses */
+
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+	if (rc || (pSMBr->ByteCount < 4)) {
+		rc = -EIO;	/* bad smb */
+		goto QAllEAsOut;
+	}
+
+	/* check that length of list is not more than bcc */
+	/* check that each entry does not go beyond length
+	   of list */
+	/* check that each element of each entry does not
+	   go beyond end of list */
+	/* validate_trans2_offsets() */
+	/* BB check if start of smb + data_offset > &bcc+ bcc */
+
+	data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+	ea_response_data = (struct fealist *)
+				(((char *) &pSMBr->hdr.Protocol) + data_offset);
+
+	list_len = le32_to_cpu(ea_response_data->list_len);
+	cFYI(1, ("ea length %d", list_len));
+	if (list_len <= 8) {
+		cFYI(1, ("empty EA list returned from server"));
+		goto QAllEAsOut;
+	}
+
+	/* make sure list_len doesn't go past end of SMB */
+	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
+	if ((char *)ea_response_data + list_len > end_of_smb) {
+		cFYI(1, ("EA list appears to go beyond SMB"));
+		rc = -EIO;
+		goto QAllEAsOut;
+	}
+
+	/* account for ea list len */
+	list_len -= 4;
+	temp_fea = ea_response_data->list;
+	temp_ptr = (char *)temp_fea;
+	while (list_len > 0) {
+		unsigned int name_len;
+		__u16 value_len;
+
+		list_len -= 4;
+		temp_ptr += 4;
+		/* make sure we can read name_len and value_len */
+		if (list_len < 0) {
+			cFYI(1, ("EA entry goes beyond length of list"));
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		name_len = temp_fea->name_len;
+		value_len = le16_to_cpu(temp_fea->value_len);
+		list_len -= name_len + 1 + value_len;
+		if (list_len < 0) {
+			cFYI(1, ("EA entry goes beyond length of list"));
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		if (ea_name) {
+			if (strncmp(ea_name, temp_ptr, name_len) == 0) {
+				temp_ptr += name_len + 1;
+				rc = value_len;
+				if (buf_size == 0)
+					goto QAllEAsOut;
+				if ((size_t)value_len > buf_size) {
+					rc = -ERANGE;
+					goto QAllEAsOut;
+				}
+				memcpy(EAData, temp_ptr, value_len);
+				goto QAllEAsOut;
+			}
+		} else {
+			/* account for prefix user. and trailing null */
+			rc += (5 + 1 + name_len);
+			if (rc < (int) buf_size) {
+				memcpy(EAData, "user.", 5);
+				EAData += 5;
+				memcpy(EAData, temp_ptr, name_len);
+				EAData += name_len;
+				/* null terminate name */
+				*EAData = 0;
+				++EAData;
+			} else if (buf_size == 0) {
+				/* skip copy - calc size only */
+			} else {
+				/* stop before overrun buffer */
+				rc = -ERANGE;
+				break;
+			}
+		}
+		temp_ptr += name_len + 1 + value_len;
+		temp_fea = (struct fea *)temp_ptr;
+	}
+
+	/* didn't find the named attribute */
+	if (ea_name)
+		rc = -ENODATA;
+
+QAllEAsOut:
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QAllEAsRetry;
+
+	return (ssize_t)rc;
+}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2e9e09ca0e30..45eb6cba793f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2388,13 +2388,13 @@ try_mount_again:
 		 */
 		cifs_put_tcp_session(srvTcp);
 
-		down(&pSesInfo->sesSem);
+		mutex_lock(&pSesInfo->session_mutex);
 		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
 		}
-		up(&pSesInfo->sesSem);
+		mutex_unlock(&pSesInfo->session_mutex);
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
@@ -2437,12 +2437,12 @@ try_mount_again:
 		}
 		pSesInfo->linux_uid = volume_info->linux_uid;
 		pSesInfo->overrideSecFlg = volume_info->secFlg;
-		down(&pSesInfo->sesSem);
+		mutex_lock(&pSesInfo->session_mutex);
 
 		/* BB FIXME need to pass vol->secFlgs BB */
 		rc = cifs_setup_session(xid, pSesInfo,
 					cifs_sb->local_nls);
-		up(&pSesInfo->sesSem);
+		mutex_unlock(&pSesInfo->session_mutex);
 	}
 
 	/* search for existing tcon to this server share */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e3fda978f481..8bdbc818164c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -111,6 +111,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
+	cifs_i->server_eof = fattr->cf_eof;
 	/*
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
@@ -366,7 +367,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	char ea_value[4];
 	__u32 mode;
 
-	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
79 ++ret_buf->ses_count; 79 ++ret_buf->ses_count;
80 INIT_LIST_HEAD(&ret_buf->smb_ses_list); 80 INIT_LIST_HEAD(&ret_buf->smb_ses_list);
81 INIT_LIST_HEAD(&ret_buf->tcon_list); 81 INIT_LIST_HEAD(&ret_buf->tcon_list);
82 init_MUTEX(&ret_buf->sesSem); 82 mutex_init(&ret_buf->session_mutex);
83 } 83 }
84 return ret_buf; 84 return ret_buf;
85} 85}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..3e2ef0de1209 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -244,7 +244,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		/* revalidate/getattr then populate from inode */
 	} /* BB add else when above is implemented */
 	ea_name += 5; /* skip past user. prefix */
-	rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value,
+	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 		buf_size, cifs_sb->local_nls,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +252,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		goto get_ea_exit;
 
 	ea_name += 4; /* skip past os2. prefix */
-	rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value,
+	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 		buf_size, cifs_sb->local_nls,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +364,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 	/* if proc/fs/cifs/streamstoxattr is set then
 		search server for EAs or streams to
 		returns as xattrs */
-	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size,
-			    cifs_sb->local_nls,
+	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
+			    buf_size, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type, bastmode);
+		dlm_user_add_ast(lkb, type, mode);
 		return;
 	}
 
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+		lkb->lkb_ast_first = type;
 	}
+
+	/* sanity check, this should not happen */
+
+	if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
+		log_print("repeat cast %d castmode %d lock %x %s",
+			  mode, lkb->lkb_castmode,
+			  lkb->lkb_id, lkb->lkb_resource->res_name);
+
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
 	struct dlm_ls *ls = NULL;
 	struct dlm_rsb *r = NULL;
 	struct dlm_lkb *lkb;
-	void (*cast) (void *astparam);
-	void (*bast) (void *astparam, int mode);
-	int type = 0, bastmode;
+	void (*castfn) (void *astparam);
+	void (*bastfn) (void *astparam, int mode);
+	int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
 
 repeat:
 	spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
 		list_del(&lkb->lkb_astqueue);
 		type = lkb->lkb_ast_type;
 		lkb->lkb_ast_type = 0;
+		first = lkb->lkb_ast_first;
+		lkb->lkb_ast_first = 0;
 		bastmode = lkb->lkb_bastmode;
-
+		castmode = lkb->lkb_castmode;
+		castfn = lkb->lkb_astfn;
+		bastfn = lkb->lkb_bastfn;
 		spin_unlock(&ast_queue_lock);
-		cast = lkb->lkb_astfn;
-		bast = lkb->lkb_bastfn;
-
-		if ((type & AST_COMP) && cast)
-			cast(lkb->lkb_astparam);
 
-		if ((type & AST_BAST) && bast)
-			bast(lkb->lkb_astparam, bastmode);
+		do_cast = (type & AST_COMP) && castfn;
+		do_bast = (type & AST_BAST) && bastfn;
+
+		/* Skip a bast if its blocking mode is compatible with the
+		   granted mode of the preceding cast. */
+
+		if (do_bast) {
+			if (first == AST_COMP)
+				last_castmode = castmode;
+			else
+				last_castmode = lkb->lkb_castmode_done;
+			if (dlm_modes_compat(bastmode, last_castmode))
+				do_bast = 0;
+		}
+
+		if (first == AST_COMP) {
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+		} else if (first == AST_BAST) {
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+		} else {
+			log_error(ls, "bad ast_first %d ast_type %d",
+				  first, type);
+		}
+
+		if (do_cast)
+			lkb->lkb_castmode_done = castmode;
+		if (do_bast)
+			lkb->lkb_bastmode_done = bastmode;
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 
 void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 375a2359b3bf..29d6139c35fc 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -256,7 +256,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
 		   lkb->lkb_status,
 		   lkb->lkb_grmode,
 		   lkb->lkb_rqmode,
-		   lkb->lkb_highbast,
+		   lkb->lkb_bastmode,
 		   rsb_lookup,
 		   lkb->lkb_wait_type,
 		   lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 826d3dc6e0ab..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
 	int8_t			lkb_status;	/* granted, waiting, convert */
 	int8_t			lkb_rqmode;	/* requested lock mode */
 	int8_t			lkb_grmode;	/* granted lock mode */
-	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
+
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
+	int8_t			lkb_ast_first;	/* type of first ast queued */
+
+	int8_t			lkb_bastmode;	/* req mode of queued bast */
+	int8_t			lkb_castmode;	/* gr mode of queued cast */
+	int8_t			lkb_bastmode_done; /* last delivered bastmode */
+	int8_t			lkb_castmode_done; /* last delivered castmode */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9c0c1db1e105..46ffd3eeaaf7 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	lkb->lkb_lksb->sb_status = rv;
 	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
 
-	dlm_add_ast(lkb, AST_COMP, 0);
+	dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +320,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
 	lkb->lkb_time_bast = ktime_get();
 
-	if (is_master_copy(lkb))
+	if (is_master_copy(lkb)) {
+		lkb->lkb_bastmode = rqmode; /* printed by debugfs */
 		send_bast(r, lkb, rqmode);
-	else
+	} else {
 		dlm_add_ast(lkb, AST_BAST, rqmode);
+	}
 }
 
 /*
@@ -2280,20 +2282,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error = 0;
@@ -2304,7 +2316,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_granted(r, lkb, 1, &deadlk)) {
 		grant_lock(r, lkb);
 		queue_cast(r, lkb, 0);
-		grant_pending_locks(r);
 		goto out;
 	}
 
@@ -2334,7 +2345,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (_can_be_granted(r, lkb, 1)) {
 		grant_lock(r, lkb);
 		queue_cast(r, lkb, 0);
-		grant_pending_locks(r);
 		goto out;
 	}
 	/* else fall through and move to convert queue */
@@ -2344,28 +2354,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case 0:
+		grant_pending_locks(r);
+		/* grant_pending_locks also sends basts */
+		break;
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	remove_lock(r, lkb);
 	queue_cast(r, lkb, -DLM_EUNLOCK);
-	grant_pending_locks(r);
 	return -DLM_EUNLOCK;
 }
 
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	grant_pending_locks(r);
+}
+
 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
 
 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2404,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	error = revert_lock(r, lkb);
 	if (error) {
 		queue_cast(r, lkb, -DLM_ECANCEL);
-		grant_pending_locks(r);
 		return -DLM_ECANCEL;
 	}
 	return 0;
 }
 
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	if (error)
+		grant_pending_locks(r);
+}
+
 /*
  * Four stage 3 varieties:
  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2437,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		goto out;
 	}
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_request() calls do_request() on remote node */
 		error = send_request(r, lkb);
-	else
+	} else {
 		error = do_request(r, lkb);
+		/* for remote locks the request_reply is sent
+		   between do_request and do_request_effects */
+		do_request_effects(r, lkb, error);
+	}
  out:
 	return error;
 }
@@ -2417,11 +2456,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_convert() calls do_convert() on remote node */
 		error = send_convert(r, lkb);
-	else
+	} else {
 		error = do_convert(r, lkb);
+		/* for remote locks the convert_reply is sent
+		   between do_convert and do_convert_effects */
+		do_convert_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2432,11 +2475,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_unlock() calls do_unlock() on remote node */
 		error = send_unlock(r, lkb);
-	else
+	} else {
 		error = do_unlock(r, lkb);
+		/* for remote locks the unlock_reply is sent
+		   between do_unlock and do_unlock_effects */
+		do_unlock_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2447,11 +2494,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_cancel() calls do_cancel() on remote node */
 		error = send_cancel(r, lkb);
-	else
+	} else {
 		error = do_cancel(r, lkb);
+		/* for remote locks the cancel_reply is sent
+		   between do_cancel and do_cancel_effects */
+		do_cancel_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -3191,6 +3242,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 	attach_lkb(r, lkb);
 	error = do_request(r, lkb);
 	send_request_reply(r, lkb, error);
+	do_request_effects(r, lkb, error);
 
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3226,15 +3278,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_convert_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_convert_reply(r, lkb, error);
+		goto out;
+	}
+
 	reply = !down_conversion(lkb);
 
 	error = do_convert(r, lkb);
- out_reply:
 	if (reply)
 		send_convert_reply(r, lkb, error);
+	do_convert_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3266,13 +3322,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_unlock_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_unlock_reply(r, lkb, error);
+		goto out;
+	}
 
 	error = do_unlock(r, lkb);
- out_reply:
 	send_unlock_reply(r, lkb, error);
+	do_unlock_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3307,6 +3366,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = do_cancel(r, lkb);
 	send_cancel_reply(r, lkb, error);
+	do_cancel_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c010ecfc0d29..26a8bd40400a 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
191 return error; 191 return error;
192} 192}
193 193
194static int dlm_uevent(struct kset *kset, struct kobject *kobj,
195 struct kobj_uevent_env *env)
196{
197 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
198
199 add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
200 return 0;
201}
202
203static struct kset_uevent_ops dlm_uevent_ops = {
204 .uevent = dlm_uevent,
205};
194 206
195int __init dlm_lockspace_init(void) 207int __init dlm_lockspace_init(void)
196{ 208{
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
199 INIT_LIST_HEAD(&lslist); 211 INIT_LIST_HEAD(&lslist);
200 spin_lock_init(&lslist_lock); 212 spin_lock_init(&lslist_lock);
201 213
202 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); 214 dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
203 if (!dlm_kset) { 215 if (!dlm_kset) {
204 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
205 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e73a4bb572aa..a4bfd31ac45b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 
 	ast_type = lkb->lkb_ast_type;
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 874d169a193e..4cedc91ec59d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1014,7 +1014,7 @@ struct ext4_sb_info {
 	atomic_t s_lock_busy;
 
 	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
+	struct ext4_locality_group __percpu *s_locality_groups;
 
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..38039af67663 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -478,7 +478,7 @@ repeat:
 	error = fd;
 #if 1
 	/* Sanity check */
-	if (rcu_dereference(fdt->fd[fd]) != NULL) {
+	if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
 		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
 		rcu_assign_pointer(fdt->fd[fd], NULL);
 	}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 	err = -ENOENT;
-	if (!fc->sb)
-		goto err_unlock;
-
-	err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
-				       outarg.off, outarg.len);
-
-err_unlock:
+	if (fc->sb) {
+		err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+					       outarg.off, outarg.len);
+	}
 	up_read(&fc->killsb);
 	return err;
 
@@ -884,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 				   struct fuse_copy_state *cs)
 {
 	struct fuse_notify_inval_entry_out outarg;
-	int err = -EINVAL;
-	char buf[FUSE_NAME_MAX+1];
+	int err = -ENOMEM;
+	char *buf;
 	struct qstr name;
 
+	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	err = -EINVAL;
 	if (size < sizeof(outarg))
 		goto err;
 
@@ -910,16 +912,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 	err = -ENOENT;
-	if (!fc->sb)
-		goto err_unlock;
-
-	err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
-
-err_unlock:
+	if (fc->sb)
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
 	up_read(&fc->killsb);
+	kfree(buf);
 	return err;
 
 err:
+	kfree(buf);
 	fuse_copy_finish(cs);
 	return err;
 }
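The fuse_notify_inval_entry() hunks above move the FUSE_NAME_MAX-sized name buffer (over a kilobyte) off the kernel stack onto the heap, which in turn obliges both exit paths to free it. Below is a standalone sketch of the shape of that change, with libc calloc()/free() standing in for kzalloc()/kfree() and an illustrative payload check; names and sizes here are stand-ins, not the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_MAX_LEN 1024		/* stands in for FUSE_NAME_MAX */

/* Analogue of the change: heap buffer instead of a large stack array,
 * with a single error label that always frees it. */
static int handle_notify(const char *payload)
{
	char *buf;
	int err = -12;			/* -ENOMEM */

	buf = calloc(1, NAME_MAX_LEN + 1);	/* was: char buf[...] on the stack */
	if (!buf)
		goto err;

	err = -22;			/* -EINVAL */
	if (!payload || strlen(payload) > NAME_MAX_LEN)
		goto err;

	memcpy(buf, payload, strlen(payload) + 1);
	printf("invalidate entry: %s\n", buf);

	free(buf);			/* success path frees */
	return 0;

err:
	free(buf);			/* error path frees too; free(NULL) is a no-op */
	return err;
}

int main(void)
{
	return handle_notify("victim") ? 1 : 0;
}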
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da9415267..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1061,8 +1061,8 @@ out:
 
 int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 {
-	struct inode *aspace = page->mapping->host;
-	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+	struct address_space *mapping = page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct buffer_head *bh, *head;
 	struct gfs2_bufdata *bd;
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42663325931..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/module.h>
-#include <linux/rwsem.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
 
-static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
 struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
 static void glock_free(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct kmem_cache *cachep = gfs2_glock_cachep;
 
-	if (aspace)
-		gfs2_aspace_put(aspace);
+	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
 	trace_gfs2_glock_put(gl);
-	sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
+	if (mapping)
+		cachep = gfs2_glock_aspace_cachep;
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
 }
 
 /**
@@ -712,7 +712,6 @@ static void glock_work_func(struct work_struct *work)
 		finish_xmote(gl, gl->gl_reply);
 		drop_ref = 1;
 	}
-	down_read(&gfs2_umount_flush_sem);
 	spin_lock(&gl->gl_spin);
 	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_state != LM_ST_UNLOCKED &&
@@ -725,7 +724,6 @@ static void glock_work_func(struct work_struct *work)
 	}
 	run_queue(gl, 0);
 	spin_unlock(&gl->gl_spin);
-	up_read(&gfs2_umount_flush_sem);
 	if (!delay ||
 	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
@@ -750,10 +748,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		   const struct gfs2_glock_operations *glops, int create,
 		   struct gfs2_glock **glp)
 {
+	struct super_block *s = sdp->sd_vfs;
 	struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
 	struct gfs2_glock *gl, *tmp;
 	unsigned int hash = gl_hash(sdp, &name);
-	int error;
+	struct address_space *mapping;
 
 	read_lock(gl_lock_addr(hash));
 	gl = search_bucket(hash, sdp, &name);
@@ -765,7 +764,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	if (!create)
 		return -ENOENT;
 
-	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+	if (glops->go_flags & GLOF_ASPACE)
+		gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
+	else
+		gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
 	if (!gl)
 		return -ENOMEM;
 
@@ -784,18 +786,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
-	gl->gl_aspace = NULL;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 	INIT_WORK(&gl->gl_delete, delete_work_func);
 
-	/* If this glock protects actual on-disk data or metadata blocks,
-	   create a VFS inode to manage the pages/buffers holding them. */
-	if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
-		gl->gl_aspace = gfs2_aspace_get(sdp);
-		if (!gl->gl_aspace) {
-			error = -ENOMEM;
-			goto fail;
-		}
+	mapping = gfs2_glock2aspace(gl);
+	if (mapping) {
+		mapping->a_ops = &gfs2_meta_aops;
+		mapping->host = s->s_bdev->bd_inode;
+		mapping->flags = 0;
+		mapping_set_gfp_mask(mapping, GFP_NOFS);
+		mapping->assoc_mapping = NULL;
+		mapping->backing_dev_info = s->s_bdi;
+		mapping->writeback_index = 0;
 	}
 
 	write_lock(gl_lock_addr(hash));
@@ -812,10 +814,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	*glp = gl;
 
 	return 0;
-
-fail:
-	kmem_cache_free(gfs2_glock_cachep, gl);
-	return error;
 }
 
 /**
@@ -1510,35 +1508,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
-	unsigned long t;
 	unsigned int x;
-	int cont;
 
-	t = jiffies;
-
-	for (;;) {
-		cont = 0;
-		for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
-			if (examine_bucket(clear_glock, sdp, x))
-				cont = 1;
-		}
-
-		if (!cont)
-			break;
-
-		if (time_after_eq(jiffies,
-				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
-			fs_warn(sdp, "Unmount seems to be stalled. "
-				     "Dumping lock state...\n");
-			gfs2_dump_lockstate(sdp);
-			t = jiffies;
-		}
-
-		down_write(&gfs2_umount_flush_sem);
-		invalidate_inodes(sdp->sd_vfs);
-		up_write(&gfs2_umount_flush_sem);
-		msleep(10);
-	}
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(clear_glock, sdp, x);
 	flush_workqueue(glock_workqueue);
 	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
 	gfs2_dump_lockstate(sdp);
@@ -1685,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c0262faf4725..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
 	return gl->gl_state == LM_ST_SHARED;
 }
 
+static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
+{
+	if (gl->gl_ops->go_flags & GLOF_ASPACE)
+		return (struct address_space *)(gl + 1);
+	return NULL;
+}
+
 int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   u64 number, const struct gfs2_glock_operations *glops,
 		   int create, struct gfs2_glock **glp);
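The GLOF_ASPACE hunks above and in glock.c/main.c co-allocate the metadata address_space immediately behind the glock in a single slab object, so each structure can be recovered from the other by plain pointer arithmetic instead of keeping a separate VFS inode. Below is a standalone sketch of that layout trick; the struct definitions are illustrative stand-ins (not the kernel's) so the example compiles on its own:

#include <stdlib.h>

/* Stand-ins for the kernel structures; field sets are illustrative only. */
struct address_space { unsigned long nrpages; };
struct gfs2_glock_like { int gl_state; };

/* One allocation holds glock + mapping, mirroring the
 * "gfs2_glock (aspace)" slab cache created in main.c. */
static struct gfs2_glock_like *alloc_glock_with_aspace(void)
{
	return calloc(1, sizeof(struct gfs2_glock_like) +
			 sizeof(struct address_space));
}

/* Forward lookup, as in gfs2_glock2aspace(): the mapping sits at (gl + 1). */
static struct address_space *glock2aspace(struct gfs2_glock_like *gl)
{
	return (struct address_space *)(gl + 1);
}

/* Reverse lookup, as in gfs2_mapping2sbd(): step back one glock. */
static struct gfs2_glock_like *aspace2glock(struct address_space *mapping)
{
	return ((struct gfs2_glock_like *)mapping) - 1;
}

int main(void)
{
	struct gfs2_glock_like *gl = alloc_glock_with_aspace();
	int ok = gl && aspace2glock(glock2aspace(gl)) == gl;

	free(gl);
	return ok ? 0 : 1;	/* the round trip must recover gl */
}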
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554acc0605..38e3749d476c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -87,7 +87,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
 	int error;
 
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -113,7 +113,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 
 	BUG_ON(!(flags & DIO_METADATA));
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -134,7 +134,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 static void inode_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_inode *ip = gl->gl_object;
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
 	int error;
 
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -183,7 +183,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
 
 	if (flags & DIO_METADATA) {
-		struct address_space *mapping = gl->gl_aspace->i_mapping;
+		struct address_space *mapping = gfs2_glock2aspace(gl);
 		truncate_inode_pages(mapping, 0);
 		if (ip) {
 			set_bit(GIF_INVALID, &ip->i_flags);
@@ -282,7 +282,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 
 static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
 {
-	return !gl->gl_aspace->i_mapping->nrpages;
+	const struct address_space *mapping = (const struct address_space *)(gl + 1);
+	return !mapping->nrpages;
 }
 
 /**
@@ -387,8 +388,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
 	struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
 
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
-	    gl->gl_state == LM_ST_SHARED &&
-	    ip && test_bit(GIF_USER, &ip->i_flags)) {
+	    gl->gl_state == LM_ST_SHARED && ip) {
 		gfs2_glock_hold(gl);
 		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
 			gfs2_glock_put_nolock(gl);
@@ -407,6 +407,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_dump = inode_go_dump,
 	.go_type = LM_TYPE_INODE,
 	.go_min_hold_time = HZ / 5,
+	.go_flags = GLOF_ASPACE,
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -418,6 +419,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
 	.go_min_hold_time = HZ / 5,
+	.go_flags = GLOF_ASPACE,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bc0ad158e6b4..b8025e51cabf 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
 	void (*go_callback) (struct gfs2_glock *gl);
 	const int go_type;
 	const unsigned long go_min_hold_time;
+	const unsigned long go_flags;
+#define GLOF_ASPACE 1
 };
 
 enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
 
 	struct gfs2_sbd *gl_sbd;
 
-	struct inode *gl_aspace;
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
 	GIF_INVALID = 0,
 	GIF_QD_LOCKED = 1,
 	GIF_SW_PAGED = 3,
-	GIF_USER = 4, /* user inode, not metadata addr space */
 };
 
 
@@ -451,7 +451,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
-	unsigned int gt_stall_secs; /* Detects trouble! */
 	unsigned int gt_complain_secs;
 	unsigned int gt_statfs_quantum;
 	unsigned int gt_statfs_slow;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e220f4eee7d..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	u64 *no_addr = opaque;
 
-	if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags))
+	if (ip->i_no_addr == *no_addr)
 		return 1;
 
 	return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
 
 	inode->i_ino = (unsigned long)*no_addr;
 	ip->i_no_addr = *no_addr;
-	set_bit(GIF_USER, &ip->i_flags);
 	return 0;
 }
 
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_skip_data *data = opaque;
 
-	if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){
+	if (ip->i_no_addr == data->no_addr) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
 			data->skipped = 1;
 			return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
 		return 1;
 	inode->i_ino = (unsigned long)(data->no_addr);
 	ip->i_no_addr = data->no_addr;
-	set_bit(GIF_USER, &ip->i_flags);
 	return 0;
 }
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e5e0e7022e5..569b46240f61 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -30,7 +30,10 @@ static void gdlm_ast(void *arg)
 
 	switch (gl->gl_lksb.sb_status) {
 	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
-		kmem_cache_free(gfs2_glock_cachep, gl);
+		if (gl->gl_ops->go_flags & GLOF_ASPACE)
+			kmem_cache_free(gfs2_glock_aspace_cachep, gl);
+		else
+			kmem_cache_free(gfs2_glock_cachep, gl);
 		if (atomic_dec_and_test(&sdp->sd_glock_disposal))
 			wake_up(&sdp->sd_glock_wait);
 		return;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632ba32f..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -528,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
 		sdp->sd_log_num_databuf++;
-		list_add(&le->le_list, &sdp->sd_log_le_databuf);
+		list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
 	} else {
-		list_add(&le->le_list, &sdp->sd_log_le_ordered);
+		list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
 	}
 out:
 	gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
 	atomic_set(&gl->gl_ail_count, 0);
 }
 
+static void gfs2_init_gl_aspace_once(void *foo)
+{
+	struct gfs2_glock *gl = foo;
+	struct address_space *mapping = (struct address_space *)(gl + 1);
+
+	gfs2_init_glock_once(gl);
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
 /**
  * init_gfs2_fs - Register GFS2 as a filesystem
  *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_glock_cachep)
 		goto fail;
 
+	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
+					sizeof(struct gfs2_glock) +
+					sizeof(struct address_space),
+					0, 0, gfs2_init_gl_aspace_once);
+
+	if (!gfs2_glock_aspace_cachep)
+		goto fail;
+
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0, SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
 	if (gfs2_inode_cachep)
 		kmem_cache_destroy(gfs2_inode_cachep);
 
+	if (gfs2_glock_aspace_cachep)
+		kmem_cache_destroy(gfs2_glock_aspace_cachep);
+
 	if (gfs2_glock_cachep)
 		kmem_cache_destroy(gfs2_glock_cachep);
 
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
 	kmem_cache_destroy(gfs2_inode_cachep);
+	kmem_cache_destroy(gfs2_glock_aspace_cachep);
 	kmem_cache_destroy(gfs2_glock_cachep);
 
 	gfs2_sys_uninit();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6f68a5f18eb8..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,49 +93,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	return err;
 }
 
-static const struct address_space_operations aspace_aops = {
+const struct address_space_operations gfs2_meta_aops = {
 	.writepage = gfs2_aspace_writepage,
 	.releasepage = gfs2_releasepage,
 	.sync_page = block_sync_page,
 };
 
 /**
- * gfs2_aspace_get - Create and initialize a struct inode structure
- * @sdp: the filesystem the aspace is in
- *
- * Right now a struct inode is just a struct inode. Maybe Linux
- * will supply a more lightweight address space construct (that works)
- * in the future.
- *
- * Make sure pages/buffers in this aspace aren't in high memory.
- *
- * Returns: the aspace
- */
-
-struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
-{
-	struct inode *aspace;
-	struct gfs2_inode *ip;
-
-	aspace = new_inode(sdp->sd_vfs);
-	if (aspace) {
-		mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
-		aspace->i_mapping->a_ops = &aspace_aops;
-		aspace->i_size = MAX_LFS_FILESIZE;
-		ip = GFS2_I(aspace);
-		clear_bit(GIF_USER, &ip->i_flags);
-		insert_inode_hash(aspace);
-	}
-	return aspace;
-}
-
-void gfs2_aspace_put(struct inode *aspace)
-{
-	remove_inode_hash(aspace);
-	iput(aspace);
-}
-
-/**
  * gfs2_meta_sync - Sync all buffers associated with a glock
  * @gl: The glock
  *
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
 
 void gfs2_meta_sync(struct gfs2_glock *gl)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 	int error;
 
 	filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct page *page;
 	struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 
 void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host);
+	struct address_space *mapping = bh->b_page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct gfs2_bufdata *bd = bh->b_private;
+
 	if (test_clear_buffer_pinned(bh)) {
 		list_del_init(&bd->bd_le.le_list);
 		if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
 			    0, from_head - to_head);
 }
 
-struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
-void gfs2_aspace_put(struct inode *aspace);
+extern const struct address_space_operations gfs2_meta_aops;
+
+static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	if (mapping->a_ops == &gfs2_meta_aops)
+		return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+	else
+		return inode->i_sb->s_fs_info;
+}
 
 void gfs2_meta_sync(struct gfs2_glock *gl);
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a86ed6381566..a054b526dc08 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -65,7 +65,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_scale_den = 1;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
-	gt->gt_stall_secs = 600;
 	gt->gt_complain_secs = 10;
 }
 
@@ -1241,10 +1240,9 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
+	invalidate_inodes(sb);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
-	while (invalidate_inodes(sb))
-		yield();
 fail_sys:
 	gfs2_sys_fs_del(sdp);
 fail:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9dd3da22c0a..e5e22629da67 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -722,8 +722,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
 	int ret = 0;
 
 	/* Check this is a "normal" inode, etc */
-	if (!test_bit(GIF_USER, &ip->i_flags) ||
-	    (current->flags & PF_MEMALLOC))
+	if (current->flags & PF_MEMALLOC)
 		return 0;
 	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (ret)
@@ -860,6 +859,7 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/* Take apart glock structures and buffer lists */
+	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/* Unmount the locking protocol */
 	gfs2_lm_unmount(sdp);
@@ -1194,7 +1194,7 @@ static void gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
+	if (inode->i_nlink) {
 		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
@@ -1212,18 +1212,12 @@ static void gfs2_clear_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	/* This tells us its a "real" inode and not one which only
-	 * serves to contain an address space (see rgrp.c, meta_io.c)
-	 * which therefore doesn't have its own glocks.
-	 */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		ip->i_gl->gl_object = NULL;
-		gfs2_glock_put(ip->i_gl);
-		ip->i_gl = NULL;
-		if (ip->i_iopen_gh.gh_gl) {
-			ip->i_iopen_gh.gh_gl->gl_object = NULL;
-			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		}
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }
 
@@ -1358,9 +1352,6 @@ static void gfs2_delete_inode(struct inode *inode)
 	struct gfs2_holder gh;
 	int error;
 
-	if (!test_bit(GIF_USER, &ip->i_flags))
-		goto out;
-
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0dc34621f6a6..a0db1c94317d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -478,7 +478,6 @@ TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
-TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
@@ -491,7 +490,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
 	&tune_attr_quota_simul_sync.attr,
-	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..226f2bfbf16a 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -21,6 +21,7 @@
 #include "util.h"
 
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
+struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
 
 
 extern struct kmem_cache *gfs2_glock_cachep;
+extern struct kmem_cache *gfs2_glock_aspace_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 59e5673b4597..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,7 @@ config ROOT_NFS
 	  Most people say N here.
 
 config NFS_FSCACHE
-	bool "Provide NFS client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Provide NFS client caching support"
 	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
 	help
 	  Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 46d779abafd3..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -57,12 +57,12 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
 }
 #endif
 
-static inline struct nfs_iostats *nfs_alloc_iostats(void)
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
 {
 	return alloc_percpu(struct nfs_iostats);
 }
 
-static inline void nfs_free_iostats(struct nfs_iostats *stats)
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
 {
 	if (stats != NULL)
 		free_percpu(stats);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 187dd07ba86c..9d1e5de91afb 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 		ret = -ENOENT;
 		goto out;
 	}
-	if (blocknrp != NULL)
-		*blocknrp = blocknr;
+	*blocknrp = blocknr;
 
  out:
 	kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6b2b83de363..313d0a21da48 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,6 +26,7 @@
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
+#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -107,20 +108,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
-		return -EFAULT;
+		goto out;
 
 	mutex_lock(&nilfs->ns_mount_mutex);
+
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_change_cpmode(
 		cpfile, cpmode.cm_cno, cpmode.cm_mode);
-	if (unlikely(ret < 0)) {
+	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
-		mutex_unlock(&nilfs->ns_mount_mutex);
-		return ret;
-	}
-	nilfs_transaction_commit(inode->i_sb); /* never fails */
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+
 	mutex_unlock(&nilfs->ns_mount_mutex);
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -135,16 +144,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(&cno, argp, sizeof(cno)))
-		return -EFAULT;
+		goto out;
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
-	if (unlikely(ret < 0)) {
+	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
-		return ret;
-	}
-	nilfs_transaction_commit(inode->i_sb); /* never fails */
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -496,12 +512,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(argv, argp, sizeof(argv)))
-		return -EFAULT;
+		goto out;
 
+	ret = -EINVAL;
 	nsegs = argv[4].v_nmembs;
 	if (argv[4].v_size != argsz[4])
-		return -EINVAL;
+		goto out;
+
 	/*
 	 * argv[4] points to segment numbers this ioctl cleans. We
 	 * use kmalloc() for its buffer because memory used for the
@@ -509,9 +532,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	 */
 	kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
 			       nsegs * sizeof(__u64));
-	if (IS_ERR(kbufs[4]))
-		return PTR_ERR(kbufs[4]);
-
+	if (IS_ERR(kbufs[4])) {
+		ret = PTR_ERR(kbufs[4]);
+		goto out;
+	}
 	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 
 	for (n = 0; n < 4; n++) {
@@ -563,10 +587,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	nilfs_remove_all_gcinode(nilfs);
 	clear_nilfs_gc_running(nilfs);
 
- out_free:
+out_free:
 	while (--n >= 0)
 		vfree(kbufs[n]);
 	kfree(kbufs[4]);
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -575,13 +601,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 {
 	__u64 cno;
 	int ret;
+	struct the_nilfs *nilfs;
 
 	ret = nilfs_construct_segment(inode->i_sb);
 	if (ret < 0)
 		return ret;
 
 	if (argp != NULL) {
-		cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
+		nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+		down_read(&nilfs->ns_segctor_sem);
+		cno = nilfs->ns_cno - 1;
+		up_read(&nilfs->ns_segctor_sem);
 		if (copy_to_user(argp, &cno, sizeof(cno)))
 			return -EFAULT;
 	}
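Each nilfs2 ioctl hunk above applies the same pattern: take write access on the mount with mnt_want_write() before doing anything else, and convert every early return into a goto so mnt_drop_write() always runs, with the error code set just before the check it belongs to. Below is a standalone sketch of that acquire/goto-cleanup shape; want_write()/drop_write() are stand-ins for the kernel mount API, and the body is illustrative:

#include <stdio.h>

/* Stand-ins for mnt_want_write()/mnt_drop_write(); illustrative only. */
static int want_write(void)  { return 0; }	/* 0 on success */
static void drop_write(void) { }

static int sample_ioctl(const long *user_arg)
{
	int ret;

	ret = want_write();		/* acquire before any other work */
	if (ret)
		return ret;		/* nothing to undo yet */

	ret = -14;			/* -EFAULT, set before the check */
	if (user_arg == NULL)
		goto out;		/* early exit still drops write access */

	printf("operating on %ld\n", *user_arg);
	ret = 0;
out:
	drop_write();			/* always paired with want_write() */
	return ret;
}

int main(void)
{
	long arg = 42;
	return sample_ioctl(&arg);
}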
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index c9c96c7825dc..017bedc761a0 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -39,7 +39,6 @@ enum {
 	NILFS_SEG_FAIL_IO,
 	NILFS_SEG_FAIL_MAGIC,
 	NILFS_SEG_FAIL_SEQ,
-	NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
 	NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
 	NILFS_SEG_FAIL_CHECKSUM_FULL,
 	NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +70,6 @@ static int nilfs_warn_segment_error(int err)
 		printk(KERN_WARNING
 		       "NILFS warning: Sequence number mismatch\n");
 		break;
-	case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
-		printk(KERN_WARNING
-		       "NILFS warning: Checksum error in segment summary\n");
-		break;
 	case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
 		printk(KERN_WARNING
 		       "NILFS warning: Checksum error in super root\n");
@@ -206,19 +201,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
  * @pseg_start: start disk block number of partial segment
  * @seg_seq: sequence number requested
  * @ssi: pointer to nilfs_segsum_info struct to store information
- * @full_check: full check flag
- *             (0: only checks segment summary CRC, 1: data CRC)
  */
 static int
 load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi,
-		     int full_check)
+		     u64 seg_seq, struct nilfs_segsum_info *ssi)
 {
 	struct buffer_head *bh_sum;
 	struct nilfs_segment_summary *sum;
-	unsigned long offset, nblock;
-	u64 check_bytes;
-	u32 crc, crc_sum;
+	unsigned long nblock;
+	u32 crc;
 	int ret = NILFS_SEG_FAIL_IO;
 
 	bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +228,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
 		ret = NILFS_SEG_FAIL_SEQ;
 		goto failed;
 	}
-	if (full_check) {
-		offset = sizeof(sum->ss_datasum);
-		check_bytes =
-			((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
-		nblock = ssi->nblocks;
-		crc_sum = le32_to_cpu(sum->ss_datasum);
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
-	} else { /* only checks segment summary */
-		offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
-		check_bytes = ssi->sumbytes;
-		nblock = ssi->nsumblk;
-		crc_sum = le32_to_cpu(sum->ss_sumsum);
-		ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
-	}
 
+	nblock = ssi->nblocks;
 	if (unlikely(nblock == 0 ||
 		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
 		/* This limits the number of blocks read in the CRC check */
 		ret = NILFS_SEG_FAIL_CONSISTENCY;
 		goto failed;
 	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
+	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
+			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
 			  pseg_start, nblock)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
 	}
-	if (crc == crc_sum)
+	if (crc == le32_to_cpu(sum->ss_datasum))
 		ret = 0;
+	else
+		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
  failed:
 	brelse(bh_sum);
  out:
@@ -598,7 +579,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -821,7 +802,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 
 	for (;;) {
 		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 645c78656aa0..ab56fe44e377 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,6 +40,11 @@ struct nilfs_write_info {
 };
 
 
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs);
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
+
+
 static struct kmem_cache *nilfs_segbuf_cachep;
 
 static void nilfs_segbuf_init_once(void *obj)
@@ -302,6 +307,19 @@ void nilfs_truncate_logs(struct list_head *logs,
 	}
 }
 
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret = 0;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		ret = nilfs_segbuf_write(segbuf, nilfs);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 6af1630fb401..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -166,13 +166,10 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
 	segbuf->sb_sum.nfileblk++;
 }
 
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs);
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
-
 void nilfs_clear_logs(struct list_head *logs);
 void nilfs_truncate_logs(struct list_head *logs,
 			 struct nilfs_segment_buffer *last);
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 105b508b47a8..ada2f1b947a3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1764,14 +1764,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1764static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1764static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1765 struct the_nilfs *nilfs) 1765 struct the_nilfs *nilfs)
1766{ 1766{
1767 struct nilfs_segment_buffer *segbuf; 1767 int ret;
1768 int ret = 0;
1769 1768
1770 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1769 ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
1771 ret = nilfs_segbuf_write(segbuf, nilfs);
1772 if (ret)
1773 break;
1774 }
1775 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); 1770 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
1776 return ret; 1771 return ret;
1777} 1772}
@@ -1937,8 +1932,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1937{ 1932{
1938 struct nilfs_segment_buffer *segbuf; 1933 struct nilfs_segment_buffer *segbuf;
1939 struct page *bd_page = NULL, *fs_page = NULL; 1934 struct page *bd_page = NULL, *fs_page = NULL;
1940 struct nilfs_sb_info *sbi = sci->sc_sbi; 1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1941 struct the_nilfs *nilfs = sbi->s_nilfs;
1942 int update_sr = (sci->sc_super_root != NULL); 1936 int update_sr = (sci->sc_super_root != NULL);
1943 1937
1944 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2020,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
2020 if (update_sr) { 2014 if (update_sr) {
2021 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 2015 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2022 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 2016 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2023 sbi->s_super->s_dirt = 1; 2017 set_nilfs_sb_dirty(nilfs);
2024 2018
2025 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 2019 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2026 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2020 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2425,43 +2419,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2425 return err; 2419 return err;
2426} 2420}
2427 2421
2428struct nilfs_segctor_req {
2429 int mode;
2430 __u32 seq_accepted;
2431 int sc_err; /* construction failure */
2432 int sb_err; /* super block writeback failure */
2433};
2434
2435#define FLUSH_FILE_BIT (0x1) /* data file only */ 2422#define FLUSH_FILE_BIT (0x1) /* data file only */
2436#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ 2423#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2437 2424
2438static void nilfs_segctor_accept(struct nilfs_sc_info *sci, 2425/**
2439 struct nilfs_segctor_req *req) 2426 * nilfs_segctor_accept - record accepted sequence count of log-write requests
2427 * @sci: segment constructor object
2428 */
2429static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2440{ 2430{
2441 req->sc_err = req->sb_err = 0;
2442 spin_lock(&sci->sc_state_lock); 2431 spin_lock(&sci->sc_state_lock);
2443 req->seq_accepted = sci->sc_seq_request; 2432 sci->sc_seq_accepted = sci->sc_seq_request;
2444 spin_unlock(&sci->sc_state_lock); 2433 spin_unlock(&sci->sc_state_lock);
2445 2434
2446 if (sci->sc_timer) 2435 if (sci->sc_timer)
2447 del_timer_sync(sci->sc_timer); 2436 del_timer_sync(sci->sc_timer);
2448} 2437}
2449 2438
2450static void nilfs_segctor_notify(struct nilfs_sc_info *sci, 2439/**
2451 struct nilfs_segctor_req *req) 2440 * nilfs_segctor_notify - notify the result of request to caller threads
2441 * @sci: segment constructor object
2442 * @mode: mode of log forming
2443 * @err: error code to be notified
2444 */
2445static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2452{ 2446{
2453 /* Clear requests (even when the construction failed) */ 2447 /* Clear requests (even when the construction failed) */
2454 spin_lock(&sci->sc_state_lock); 2448 spin_lock(&sci->sc_state_lock);
2455 2449
2456 if (req->mode == SC_LSEG_SR) { 2450 if (mode == SC_LSEG_SR) {
2457 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; 2451 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2458 sci->sc_seq_done = req->seq_accepted; 2452 sci->sc_seq_done = sci->sc_seq_accepted;
2459 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2453 nilfs_segctor_wakeup(sci, err);
2460 sci->sc_flush_request = 0; 2454 sci->sc_flush_request = 0;
2461 } else { 2455 } else {
2462 if (req->mode == SC_FLUSH_FILE) 2456 if (mode == SC_FLUSH_FILE)
2463 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2457 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2464 else if (req->mode == SC_FLUSH_DAT) 2458 else if (mode == SC_FLUSH_DAT)
2465 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2466 2460
2467 /* re-enable timer if checkpoint creation was not done */ 2461 /* re-enable timer if checkpoint creation was not done */
@@ -2472,30 +2466,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2472 spin_unlock(&sci->sc_state_lock); 2466 spin_unlock(&sci->sc_state_lock);
2473} 2467}
2474 2468
2475static int nilfs_segctor_construct(struct nilfs_sc_info *sci, 2469/**
2476 struct nilfs_segctor_req *req) 2470 * nilfs_segctor_construct - form logs and write them to disk
2471 * @sci: segment constructor object
2472 * @mode: mode of log forming
2473 */
2474static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2477{ 2475{
2478 struct nilfs_sb_info *sbi = sci->sc_sbi; 2476 struct nilfs_sb_info *sbi = sci->sc_sbi;
2479 struct the_nilfs *nilfs = sbi->s_nilfs; 2477 struct the_nilfs *nilfs = sbi->s_nilfs;
2480 int err = 0; 2478 int err = 0;
2481 2479
2480 nilfs_segctor_accept(sci);
2481
2482 if (nilfs_discontinued(nilfs)) 2482 if (nilfs_discontinued(nilfs))
2483 req->mode = SC_LSEG_SR; 2483 mode = SC_LSEG_SR;
2484 if (!nilfs_segctor_confirm(sci)) { 2484 if (!nilfs_segctor_confirm(sci))
2485 err = nilfs_segctor_do_construct(sci, req->mode); 2485 err = nilfs_segctor_do_construct(sci, mode);
2486 req->sc_err = err; 2486
2487 }
2488 if (likely(!err)) { 2487 if (likely(!err)) {
2489 if (req->mode != SC_FLUSH_DAT) 2488 if (mode != SC_FLUSH_DAT)
2490 atomic_set(&nilfs->ns_ndirtyblks, 0); 2489 atomic_set(&nilfs->ns_ndirtyblks, 0);
2491 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2492 nilfs_discontinued(nilfs)) { 2491 nilfs_discontinued(nilfs)) {
2493 down_write(&nilfs->ns_sem); 2492 down_write(&nilfs->ns_sem);
2494 req->sb_err = nilfs_commit_super(sbi, 2493 err = nilfs_commit_super(
2495 nilfs_altsb_need_update(nilfs)); 2494 sbi, nilfs_altsb_need_update(nilfs));
2496 up_write(&nilfs->ns_sem); 2495 up_write(&nilfs->ns_sem);
2497 } 2496 }
2498 } 2497 }
2498
2499 nilfs_segctor_notify(sci, mode, err);
2499 return err; 2500 return err;
2500} 2501}
2501 2502
@@ -2526,7 +2527,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2526 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2527 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2527 struct the_nilfs *nilfs = sbi->s_nilfs; 2528 struct the_nilfs *nilfs = sbi->s_nilfs;
2528 struct nilfs_transaction_info ti; 2529 struct nilfs_transaction_info ti;
2529 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2530 int err; 2530 int err;
2531 2531
2532 if (unlikely(!sci)) 2532 if (unlikely(!sci))
@@ -2547,10 +2547,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); 2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
2548 2548
2549 for (;;) { 2549 for (;;) {
2550 nilfs_segctor_accept(sci, &req); 2550 err = nilfs_segctor_construct(sci, SC_LSEG_SR);
2551 err = nilfs_segctor_construct(sci, &req);
2552 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); 2551 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2553 nilfs_segctor_notify(sci, &req);
2554 2552
2555 if (likely(!err)) 2553 if (likely(!err))
2556 break; 2554 break;
@@ -2560,6 +2558,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2560 set_current_state(TASK_INTERRUPTIBLE); 2558 set_current_state(TASK_INTERRUPTIBLE);
2561 schedule_timeout(sci->sc_interval); 2559 schedule_timeout(sci->sc_interval);
2562 } 2560 }
2561 if (nilfs_test_opt(sbi, DISCARD)) {
2562 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2563 sci->sc_nfreesegs);
2564 if (ret) {
2565 printk(KERN_WARNING
2566 "NILFS warning: error %d on discard request, "
2567 "turning discards off for the device\n", ret);
2568 nilfs_clear_opt(sbi, DISCARD);
2569 }
2570 }
2563 2571
2564 out_unlock: 2572 out_unlock:
2565 sci->sc_freesegs = NULL; 2573 sci->sc_freesegs = NULL;
@@ -2573,13 +2581,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2573{ 2581{
2574 struct nilfs_sb_info *sbi = sci->sc_sbi; 2582 struct nilfs_sb_info *sbi = sci->sc_sbi;
2575 struct nilfs_transaction_info ti; 2583 struct nilfs_transaction_info ti;
2576 struct nilfs_segctor_req req = { .mode = mode };
2577 2584
2578 nilfs_transaction_lock(sbi, &ti, 0); 2585 nilfs_transaction_lock(sbi, &ti, 0);
2579 2586 nilfs_segctor_construct(sci, mode);
2580 nilfs_segctor_accept(sci, &req);
2581 nilfs_segctor_construct(sci, &req);
2582 nilfs_segctor_notify(sci, &req);
2583 2587
2584 /* 2588 /*
2585 * Unclosed segment should be retried. We do this using sc_timer. 2589 * Unclosed segment should be retried. We do this using sc_timer.
@@ -2635,6 +2639,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2635static int nilfs_segctor_thread(void *arg) 2639static int nilfs_segctor_thread(void *arg)
2636{ 2640{
2637 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2638 struct timer_list timer; 2643 struct timer_list timer;
2639 int timeout = 0; 2644 int timeout = 0;
2640 2645
@@ -2680,7 +2685,6 @@ static int nilfs_segctor_thread(void *arg)
2680 } else { 2685 } else {
2681 DEFINE_WAIT(wait); 2686 DEFINE_WAIT(wait);
2682 int should_sleep = 1; 2687 int should_sleep = 1;
2683 struct the_nilfs *nilfs;
2684 2688
2685 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2689 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2686 TASK_INTERRUPTIBLE); 2690 TASK_INTERRUPTIBLE);
@@ -2701,8 +2705,8 @@ static int nilfs_segctor_thread(void *arg)
2701 finish_wait(&sci->sc_wait_daemon, &wait); 2705 finish_wait(&sci->sc_wait_daemon, &wait);
2702 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2703 time_after_eq(jiffies, sci->sc_timer->expires)); 2707 time_after_eq(jiffies, sci->sc_timer->expires));
2704 nilfs = sci->sc_sbi->s_nilfs; 2708
2705 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) 2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2706 set_nilfs_discontinued(nilfs); 2710 set_nilfs_discontinued(nilfs);
2707 } 2711 }
2708 goto loop; 2712 goto loop;
@@ -2797,12 +2801,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2797 do { 2801 do {
2798 struct nilfs_sb_info *sbi = sci->sc_sbi; 2802 struct nilfs_sb_info *sbi = sci->sc_sbi;
2799 struct nilfs_transaction_info ti; 2803 struct nilfs_transaction_info ti;
2800 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2801 2804
2802 nilfs_transaction_lock(sbi, &ti, 0); 2805 nilfs_transaction_lock(sbi, &ti, 0);
2803 nilfs_segctor_accept(sci, &req); 2806 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2804 ret = nilfs_segctor_construct(sci, &req);
2805 nilfs_segctor_notify(sci, &req);
2806 nilfs_transaction_unlock(sbi); 2807 nilfs_transaction_unlock(sbi);
2807 2808
2808 } while (ret && retrycount-- > 0); 2809 } while (ret && retrycount-- > 0);
@@ -2865,8 +2866,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2865 struct the_nilfs *nilfs = sbi->s_nilfs; 2866 struct the_nilfs *nilfs = sbi->s_nilfs;
2866 int err; 2867 int err;
2867 2868
2868 /* Each field of nilfs_segctor is cleared through the initialization 2869 if (NILFS_SC(sbi)) {
2869 of super-block info */ 2870 /*
2871 * This happens if the filesystem was remounted
2872 * read/write after nilfs_error degenerated it into a
2873 * read-only mount.
2874 */
2875 nilfs_detach_segment_constructor(sbi);
2876 }
2877
2870 sbi->s_sc_info = nilfs_segctor_new(sbi); 2878 sbi->s_sc_info = nilfs_segctor_new(sbi);
2871 if (!sbi->s_sc_info) 2879 if (!sbi->s_sc_info)
2872 return -ENOMEM; 2880 return -ENOMEM;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3d3ab2f9864c..3155e0c7f415 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -116,6 +116,7 @@ struct nilfs_segsum_pointer {
116 * @sc_wait_daemon: Daemon wait queue 116 * @sc_wait_daemon: Daemon wait queue
117 * @sc_wait_task: Start/end wait queue to control segctord task 117 * @sc_wait_task: Start/end wait queue to control segctord task
118 * @sc_seq_request: Request counter 118 * @sc_seq_request: Request counter
119 * @sc_seq_accept: Accepted request count
119 * @sc_seq_done: Completion counter 120 * @sc_seq_done: Completion counter
120 * @sc_sync: Request of explicit sync operation 121 * @sc_sync: Request of explicit sync operation
121 * @sc_interval: Timeout value of background construction 122 * @sc_interval: Timeout value of background construction
@@ -169,6 +170,7 @@ struct nilfs_sc_info {
169 wait_queue_head_t sc_wait_task; 170 wait_queue_head_t sc_wait_task;
170 171
171 __u32 sc_seq_request; 172 __u32 sc_seq_request;
173 __u32 sc_seq_accepted;
172 __u32 sc_seq_done; 174 __u32 sc_seq_done;
173 175
174 int sc_sync; 176 int sc_sync;
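
The new sc_seq_accepted field makes the counters a three-stage ticket: submitters bump sc_seq_request, the constructor snapshots that into sc_seq_accepted on entry, and copies it to sc_seq_done on completion. A hedged sketch of the wraparound-safe "is my ticket done" test such counters support; the predicate is illustrative, the kernel's waiters use their own logic:

#include <stdbool.h>
#include <stdint.h>

struct segctor { uint32_t seq_request, seq_accepted, seq_done; };

static uint32_t submit(struct segctor *s)
{
        return ++s->seq_request;                /* take a ticket */
}

static bool request_done(const struct segctor *s, uint32_t ticket)
{
        /* unsigned subtraction keeps this correct across wraparound */
        return (int32_t)(s->seq_done - ticket) >= 0;
}

int main(void)
{
        struct segctor s = { 0 };
        uint32_t t = submit(&s);

        s.seq_accepted = s.seq_request;         /* constructor starts */
        s.seq_done = s.seq_accepted;            /* log write finished */
        return request_done(&s, t) ? 0 : 1;
}
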
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8173faee31e6..92579cc4c935 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
96 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs; 97 struct the_nilfs *nilfs = sbi->s_nilfs;
98 98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem); 99 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 100 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS; 101 nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -301,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
301 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 298 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
302 nilfs->ns_sbwtime[1] = t; 299 nilfs->ns_sbwtime[1] = t;
303 } 300 }
304 sbi->s_super->s_dirt = 0; 301 clear_nilfs_sb_dirty(nilfs);
305 return nilfs_sync_super(sbi, dupsb); 302 return nilfs_sync_super(sbi, dupsb);
306} 303}
307 304
@@ -345,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
345 err = nilfs_construct_segment(sb); 342 err = nilfs_construct_segment(sb);
346 343
347 down_write(&nilfs->ns_sem); 344 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt) 345 if (nilfs_sb_dirty(nilfs))
349 nilfs_commit_super(sbi, 1); 346 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem); 347 up_write(&nilfs->ns_sem);
351 348
@@ -481,6 +478,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
481 seq_printf(seq, ",order=strict"); 478 seq_printf(seq, ",order=strict");
482 if (nilfs_test_opt(sbi, NORECOVERY)) 479 if (nilfs_test_opt(sbi, NORECOVERY))
483 seq_printf(seq, ",norecovery"); 480 seq_printf(seq, ",norecovery");
481 if (nilfs_test_opt(sbi, DISCARD))
482 seq_printf(seq, ",discard");
484 483
485 return 0; 484 return 0;
486} 485}
@@ -550,7 +549,7 @@ static const struct export_operations nilfs_export_ops = {
550enum { 549enum {
551 Opt_err_cont, Opt_err_panic, Opt_err_ro, 550 Opt_err_cont, Opt_err_panic, Opt_err_ro,
552 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 551 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
553 Opt_err, 552 Opt_discard, Opt_err,
554}; 553};
555 554
556static match_table_t tokens = { 555static match_table_t tokens = {
@@ -561,6 +560,7 @@ static match_table_t tokens = {
561 {Opt_snapshot, "cp=%u"}, 560 {Opt_snapshot, "cp=%u"},
562 {Opt_order, "order=%s"}, 561 {Opt_order, "order=%s"},
563 {Opt_norecovery, "norecovery"}, 562 {Opt_norecovery, "norecovery"},
563 {Opt_discard, "discard"},
564 {Opt_err, NULL} 564 {Opt_err, NULL}
565}; 565};
566 566
@@ -614,6 +614,9 @@ static int parse_options(char *options, struct super_block *sb)
614 case Opt_norecovery: 614 case Opt_norecovery:
615 nilfs_set_opt(sbi, NORECOVERY); 615 nilfs_set_opt(sbi, NORECOVERY);
616 break; 616 break;
617 case Opt_discard:
618 nilfs_set_opt(sbi, DISCARD);
619 break;
617 default: 620 default:
618 printk(KERN_ERR 621 printk(KERN_ERR
619 "NILFS: Unrecognized mount option \"%s\"\n", p); 622 "NILFS: Unrecognized mount option \"%s\"\n", p);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6241e1722efc..92733d5651d2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
646 goto out; 646 goto out;
647} 647}
648 648
649int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
650 size_t nsegs)
651{
652 sector_t seg_start, seg_end;
653 sector_t start = 0, nblocks = 0;
654 unsigned int sects_per_block;
655 __u64 *sn;
656 int ret = 0;
657
658 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
659 bdev_logical_block_size(nilfs->ns_bdev);
660 for (sn = segnump; sn < segnump + nsegs; sn++) {
661 nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
662
663 if (!nblocks) {
664 start = seg_start;
665 nblocks = seg_end - seg_start + 1;
666 } else if (start + nblocks == seg_start) {
667 nblocks += seg_end - seg_start + 1;
668 } else {
669 ret = blkdev_issue_discard(nilfs->ns_bdev,
670 start * sects_per_block,
671 nblocks * sects_per_block,
672 GFP_NOFS,
673 DISCARD_FL_BARRIER);
674 if (ret < 0)
675 return ret;
676 nblocks = 0;
677 }
678 }
679 if (nblocks)
680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block,
682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER);
684 return ret;
685}
686
649int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 687int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
650{ 688{
651 struct inode *dat = nilfs_dat_inode(nilfs); 689 struct inode *dat = nilfs_dat_inode(nilfs);
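
nilfs_discard_segments() walks the segment list and coalesces physically adjacent segments into a single blkdev_issue_discard() call per contiguous run. A self-contained userspace sketch of that coalescing; note that, unlike the kernel hunk above (which simply resets nblocks after flushing), this sketch restarts the run with the segment that broke contiguity:

#include <stdio.h>

/* fake layout: segment n occupies blocks [n*8, n*8+7] */
static void get_range(unsigned seg, unsigned long *start, unsigned long *end)
{
        *start = (unsigned long)seg * 8;
        *end = *start + 7;
}

static void issue_discard(unsigned long start, unsigned long nblocks)
{
        printf("discard blocks [%lu, %lu)\n", start, start + nblocks);
}

int main(void)
{
        unsigned segs[] = { 2, 3, 4, 9 };       /* 2..4 coalesce, 9 is separate */
        unsigned long start = 0, nblocks = 0, s, e;
        size_t i;

        for (i = 0; i < sizeof(segs) / sizeof(segs[0]); i++) {
                get_range(segs[i], &s, &e);
                if (!nblocks) {                         /* open a new run */
                        start = s;
                        nblocks = e - s + 1;
                } else if (start + nblocks == s) {      /* extend the run */
                        nblocks += e - s + 1;
                } else {                                /* flush, restart */
                        issue_discard(start, nblocks);
                        start = s;
                        nblocks = e - s + 1;
                }
        }
        if (nblocks)
                issue_discard(start, nblocks);          /* trailing run */
        return 0;
}
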
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 589786e33464..e9795f1724d7 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -38,6 +38,7 @@ enum {
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
41 THE_NILFS_SB_DIRTY, /* super block is dirty */
41}; 42};
42 43
43/** 44/**
@@ -197,6 +198,7 @@ THE_NILFS_FNS(INIT, init)
197THE_NILFS_FNS(LOADED, loaded) 198THE_NILFS_FNS(LOADED, loaded)
198THE_NILFS_FNS(DISCONTINUED, discontinued) 199THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running) 200THE_NILFS_FNS(GC_RUNNING, gc_running)
201THE_NILFS_FNS(SB_DIRTY, sb_dirty)
200 202
201/* Minimum interval of periodical update of superblocks (in seconds) */ 203/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 204#define NILFS_SB_FREQ 10
@@ -221,6 +223,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
221void put_nilfs(struct the_nilfs *); 223void put_nilfs(struct the_nilfs *);
222int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 224int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
223int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 225int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
226int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
224int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 227int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
225struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 228struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
226int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 229int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 600d2d2ade11..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -46,6 +46,7 @@ ocfs2_stackglue-objs := stackglue.o
46ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
47ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
48 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
49# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
50obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
51obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d17bdc718f74..2bbe1ecc08c0 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7e9df11260f4..4c2a6d282c4d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -577,8 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
577 goto bail; 577 goto bail;
578 } 578 }
579 579
580 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent in case of create. */
581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
582
582 /* 583 /*
583 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
584 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..b39da877b12f 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
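
The __mlog_printk() change widens the log prefix from (pid,cpu) to (comm,pid,cpu). A hedged userspace approximation of the resulting line format; the task name, pid, and call site are made-up values:

#include <stdio.h>

int main(void)
{
        const char *comm = "dlm_recoveryd";     /* current->comm */
        unsigned pid = 2481;                    /* task_pid_nr(current) */
        unsigned long cpu = 1;                  /* __mlog_cpu_guess */

        /* (comm,pid,cpu):function:line message */
        printf("(%s,%u,%lu):%s:%d %s\n", comm, pid, cpu,
               "dlm_do_recovery", 310, "example message");
        return 0;
}
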
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..765d66c70989 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 344bcf90cbf4..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..1b0de157a08c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
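
Because capabilities is a 0444 module parameter, userspace can probe for features without opening any lock file. A hedged sketch assuming the standard sysfs parameter layout and the ocfs2_dlmfs module name from the new Makefile:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[128] = "";
        FILE *f = fopen("/sys/module/ocfs2_dlmfs/parameters/capabilities", "r");

        if (!f)
                return 1;       /* dlmfs absent, or too old to advertise */
        if (fgets(buf, sizeof(buf), f))
                /* e.g. "bast stackglue" */
                printf("poll-for-bast supported: %s\n",
                       strstr(buf, "bast") ? "yes" : "no");
        fclose(f);
        return 0;
}
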
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
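
The bast capability means poll(2) on a held dlmfs lock file now reports POLLIN when another node is blocked on the lock. A hedged userspace sketch; the dlmfs mount point, domain, and lock names are placeholders, and the open-flags-to-lock-level mapping follows the decode comment above:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* O_RDONLY takes the lock shared; O_RDWR takes it exclusive */
        int fd = open("/dlm/mydomain/mylock", O_RDONLY);

        if (fd < 0)
                return 1;

        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        /* POLLIN here signals a bast: some other node is blocked on us */
        if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
                printf("lock is contended, consider releasing it\n");
        close(fd);      /* dropping the fd releases the lock */
        return 0;
}
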
@@ -211,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, let's unregister the 379 /* we must be a directory. If required, let's unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 106#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 107 mlog(ML_ERROR, "Dlm error %d while calling %s on " \
107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \ 108 "resource %.*s\n", _stat, _func, \
108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ 109 _lockres->l_namelen, _lockres->l_name); \
109} while (0) 110} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
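
user_highest_compat_lock_level() answers the downconvert question: given the mode a blocked node wants, what is the highest mode we may keep? An EX blocker is compatible with nothing, so we fall to NL; a PR blocker tolerates other readers, so PR survives. A compile-checkable restatement under the same three-level assumption; the DLM_LOCK_* values are the conventional ones, used here as stand-ins:

enum { DLM_LOCK_NL = 0, DLM_LOCK_PR = 3, DLM_LOCK_EX = 5 };

static int highest_compat_level(int blocking)
{
        if (blocking == DLM_LOCK_EX)
                return DLM_LOCK_NL;     /* EX is compatible with nothing */
        if (blocking == DLM_LOCK_PR)
                return DLM_LOCK_PR;     /* readers coexist with readers */
        return DLM_LOCK_EX;             /* NL blocks nobody: keep EX */
}

int main(void)
{
        /* EX blocker forces NL; PR blocker allows keeping PR */
        return (highest_compat_level(DLM_LOCK_EX) == DLM_LOCK_NL &&
                highest_compat_level(DLM_LOCK_PR) == DLM_LOCK_PR) ? 0 : 1;
}
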
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
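
user_dlm_read_lvb() now returns the byte count actually copied, and 0 when ocfs2_dlm_lvb_valid() says the value block is stale; dlmfs_file_read() above honors that by skipping the copy_to_user(). A hedged userspace sketch of the same contract with a stub standing in for the kernel function:

#include <stdio.h>
#include <string.h>

/* Stub with the same contract as user_dlm_read_lvb(): returns len when
 * the LVB is valid, 0 when it is not (e.g. after an unclean recovery). */
static long read_lvb_stub(char *val, unsigned int len, int lvb_valid)
{
        if (!lvb_valid)
                return 0;
        memset(val, 'A', len);
        return len;
}

int main(void)
{
        char buf[8];
        long got = read_lvb_stub(buf, sizeof(buf), 0);

        /* A zero return must not be treated as sizeof(buf) bytes of data. */
        if (got)
                printf("copied %ld bytes\n", got);
        else
                printf("LVB invalid, skip copy\n");
        return 0;
}
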
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
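With the register/unregister pair above, dlmfs no longer builds an o2dlm domain
name by hand; the stack glue owns the naming and the stack choice. A hedged usage
sketch (illustrative; the real callers live in dlmfs.c):

	struct ocfs2_cluster_connection *conn;

	conn = user_dlm_register(&dentry->d_name);
	if (IS_ERR(conn))
		return PTR_ERR(conn);
	/* ... issue user_dlm_cluster_lock()/unlock() against locks on conn ... */
	user_dlm_unregister(conn);
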
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e044019cb3b1..8298608d4165 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
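The new ocfs2_lksb_to_lock_res() helper is what lets the astarg parameter
disappear from the stack API: the lksb is embedded in the lock resource, so every
callback can recover its context from the lksb alone via container_of(). The same
embedding works for any wrapper; a sketch with a hypothetical structure:

	/* Illustrative only -- mirrors the ocfs2_lock_res embedding. */
	struct my_lock {
		struct ocfs2_dlm_lksb ml_lksb;	/* handed to ocfs2_dlm_lock() */
		void *ml_private;		/* caller state */
	};

	static struct my_lock *my_lock_from_lksb(struct ocfs2_dlm_lksb *lksb)
	{
		return container_of(lksb, struct my_lock, ml_lksb);
	}
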
@@ -927,6 +932,10 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
927 lockres->l_blocking = level; 932 lockres->l_blocking = level;
928 } 933 }
929 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
930 if (needs_downconvert) 939 if (needs_downconvert)
931 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
932 941
@@ -1040,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1040 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1041} 1050}
1042 1051
1043 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1044static void ocfs2_blocking_ast(void *opaque, int level)
1045{ 1053{
1046 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1047 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1048 int needs_downconvert; 1056 int needs_downconvert;
1049 unsigned long flags; 1057 unsigned long flags;
1050 1058
1051 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1052 1060
1053 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1054 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1055 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1056 1064
1057 /* 1065 /*
@@ -1072,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1072 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1073} 1081}
1074 1082
1075static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1076{ 1084{
1077 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1078 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1079 unsigned long flags; 1087 unsigned long flags;
1080 int status; 1088 int status;
@@ -1095,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1095 return; 1103 return;
1096 } 1104 }
1097 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1098 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1099 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1100 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1107,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1107 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1108 break; 1120 break;
1109 default: 1121 default:
1110 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1111 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1112 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1113 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1114 BUG(); 1126 BUG();
@@ -1134,6 +1146,88 @@ out:
1134 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1135} 1147}
1136 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
1137static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1138 int convert) 1232 int convert)
1139{ 1233{
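Concretely, the negotiation rule in the comment above boils down to: refuse a
peer with a different major version, otherwise run at the smaller of the two
minors. A hedged sketch of that rule with a hypothetical helper name (not code
from this patch):

	/* Illustrative only: fold a peer's version into the running one. */
	static int ocfs2_negotiate_version(struct ocfs2_protocol_version *run,
				const struct ocfs2_protocol_version *peer)
	{
		if (peer->pv_major != run->pv_major)
			return -EPROTO;		/* may not join the domain */
		if (peer->pv_minor < run->pv_minor)
			run->pv_minor = peer->pv_minor;
		return 0;
	}
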
@@ -1189,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1189 &lockres->l_lksb, 1283 &lockres->l_lksb,
1190 dlm_flags, 1284 dlm_flags,
1191 lockres->l_name, 1285 lockres->l_name,
1192 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1193 lockres);
1194 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1195 if (ret) { 1288 if (ret) {
1196 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1412,7 +1505,7 @@ again:
1412 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1413 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1414 1507
1415 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1416 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1417 1510
1418 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1421,8 +1514,7 @@ again:
1421 &lockres->l_lksb, 1514 &lockres->l_lksb,
1422 lkm_flags, 1515 lkm_flags,
1423 lockres->l_name, 1516 lockres->l_name,
1424 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1425 lockres);
1426 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1427 if (ret) { 1519 if (ret) {
1428 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1859,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1859 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1860 1952
1861 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1862 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1863 lockres);
1864 if (ret) { 1955 if (ret) {
1865 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1866 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -2989,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2989 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2990 osb->uuid_str, 3081 osb->uuid_str,
2991 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2992 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2993 &conn); 3084 &conn);
2994 if (status) { 3085 if (status) {
2995 mlog_errno(status); 3086 mlog_errno(status);
@@ -3056,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3056 mlog_exit_void(); 3147 mlog_exit_void();
3057} 3148}
3058 3149
3059static void ocfs2_unlock_ast(void *opaque, int error)
3060{
3061 struct ocfs2_lock_res *lockres = opaque;
3062 unsigned long flags;
3063
3064 mlog_entry_void();
3065
3066 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3067 lockres->l_unlock_action);
3068
3069 spin_lock_irqsave(&lockres->l_lock, flags);
3070 if (error) {
3071 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3072 "unlock_action %d\n", error, lockres->l_name,
3073 lockres->l_unlock_action);
3074 spin_unlock_irqrestore(&lockres->l_lock, flags);
3075 mlog_exit_void();
3076 return;
3077 }
3078
3079 switch(lockres->l_unlock_action) {
3080 case OCFS2_UNLOCK_CANCEL_CONVERT:
3081 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3082 lockres->l_action = OCFS2_AST_INVALID;
3083 /* Downconvert thread may have requeued this lock, we
3084 * need to wake it. */
3085 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3086 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3087 break;
3088 case OCFS2_UNLOCK_DROP_LOCK:
3089 lockres->l_level = DLM_LOCK_IV;
3090 break;
3091 default:
3092 BUG();
3093 }
3094
3095 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3096 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3097 wake_up(&lockres->l_event);
3098 spin_unlock_irqrestore(&lockres->l_lock, flags);
3099
3100 mlog_exit_void();
3101}
3102
3103static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3104 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3105{ 3152{
@@ -3167,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3167 3214
3168 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3169 3216
3170 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3171 lockres);
3172 if (ret) { 3218 if (ret) {
3173 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3174 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3276,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3276 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3277 3323
3278 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3279 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3280 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3281 BUG(); 3334 BUG();
3282 } 3335 }
3283 3336
3284 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3285 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3286 3339
3287 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3288 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3301,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3301 3354
3302 mlog_entry_void(); 3355 mlog_entry_void();
3303 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3304 if (lvb) 3360 if (lvb)
3305 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3306 3362
@@ -3309,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3309 &lockres->l_lksb, 3365 &lockres->l_lksb,
3310 dlm_flags, 3366 dlm_flags,
3311 lockres->l_name, 3367 lockres->l_name,
3312 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3313 lockres);
3314 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3315 if (ret) { 3370 if (ret) {
3316 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3331,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3331 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3332 3387
3333 mlog_entry_void(); 3388 mlog_entry_void();
3334 mlog(0, "lock %s\n", lockres->l_name);
3335 3389
3336 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3337 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3338 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3339 * requeue this lock. */ 3393 * requeue this lock. */
3340 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3341 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3342 return 0; 3395 return 0;
3343 } 3396 }
3344 3397
@@ -3353,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3353 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3354 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3355 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3356 return 1; 3411 return 1;
3357} 3412}
3358 3413
@@ -3362,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3362 int ret; 3417 int ret;
3363 3418
3364 mlog_entry_void(); 3419 mlog_entry_void();
3365 mlog(0, "lock %s\n", lockres->l_name);
3366 3420
3367 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3368 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3369 if (ret) { 3423 if (ret) {
3370 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3371 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3372 } 3426 }
3373 3427
3374 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3375 3429
3376 mlog_exit(ret); 3430 mlog_exit(ret);
3377 return ret; 3431 return ret;
@@ -3428,8 +3482,11 @@ recheck:
3428 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3429 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3430 */ 3484 */
3431 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3432 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3433 3490
3434 ctl->requeue = 1; 3491 ctl->requeue = 1;
3435 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3461,6 +3518,7 @@ recheck:
3461 */ 3518 */
3462 if (lockres->l_level == DLM_LOCK_NL) { 3519 if (lockres->l_level == DLM_LOCK_NL) {
3463 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); 3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3464 lockres->l_blocking = DLM_LOCK_NL; 3522 lockres->l_blocking = DLM_LOCK_NL;
3465 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3466 spin_unlock_irqrestore(&lockres->l_lock, flags); 3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3470,28 +3528,41 @@ recheck:
3470 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3471 * then requeue. */ 3529 * then requeue. */
3472 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3473 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3474 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3475 3537
3476 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3477 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3478 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3479 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3480 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3481 3546
3482 /* 3547 /*
3483 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3484 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3485 */ 3550 */
3486 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3487 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3488 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3489 3557
3490 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3491 3559
3492 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3493 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3494 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3495 3566
3496 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3497 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3509,13 +3580,19 @@ recheck:
3509 3580
3510 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3511 3582
3512 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3513 goto leave; 3586 goto leave;
3587 }
3514 3588
3515 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3516 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3517 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3518 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3519 goto recheck; 3596 goto recheck;
3520 } 3597 }
3521 3598
@@ -3910,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3910 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3911} 3988}
3912 3989
3913/*
3914 * This is the filesystem locking protocol. It provides the lock handling
3915 * hooks for the underlying DLM. It has a maximum version number.
3916 * The version number allows interoperability with systems running at
3917 * the same major number and an equal or smaller minor number.
3918 *
3919 * Whenever the filesystem does new things with locks (adds or removes a
3920 * lock, orders them differently, does different things underneath a lock),
3921 * the version must be changed. The protocol is negotiated when joining
3922 * the dlm domain. A node may join the domain if its major version is
3923 * identical to all other nodes and its minor version is greater than
3924 * or equal to all other nodes. When its minor version is greater than
3925 * the other nodes, it will run at the minor version specified by the
3926 * other nodes.
3927 *
3928 * If a locking change is made that will not be compatible with older
3929 * versions, the major number must be increased and the minor version set
3930 * to zero. If a change merely adds a behavior that can be disabled when
3931 * speaking to older versions, the minor version must be increased. If a
3932 * change adds a fully backwards compatible change (eg, LVB changes that
3933 * are just ignored by older versions), the version does not need to be
3934 * updated.
3935 */
3936static struct ocfs2_locking_protocol lproto = {
3937 .lp_max_version = {
3938 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3939 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3940 },
3941 .lp_lock_ast = ocfs2_locking_ast,
3942 .lp_blocking_ast = ocfs2_blocking_ast,
3943 .lp_unlock_ast = ocfs2_unlock_ast,
3944};
3945
3946void ocfs2_set_locking_protocol(void)
3947{
3948 ocfs2_stack_glue_set_locking_protocol(&lproto);
3949}
3950
3951
3952static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3953 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3954{ 3992{
@@ -3965,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3965 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3966 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3967 4005
3968 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3969 4007
3970 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3971 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3988,7 +4026,7 @@ unqueue:
3988 } else 4026 } else
3989 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3990 4028
3991 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3992 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3993 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3994 4032
@@ -4010,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4010 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
4011 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
4012 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
4013 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
4014 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
4015 return; 4053 return;
4016 } 4054 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 558ce0312421..5b52547d6299 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -993,10 +993,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 993 }
994 994
995 if (size_change && attr->ia_size != i_size_read(inode)) { 995 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 996 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 997 if (status)
998 goto bail_unlock; 998 goto bail_unlock;
999 }
1000 999
1001 if (i_size_read(inode) > attr->ia_size) { 1000 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1001 if (ocfs2_should_order_data(inode)) {
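inode_newsize_ok() is the generic VFS check that the open-coded s_maxbytes test
duplicated; it additionally honors RLIMIT_FSIZE, raising SIGXFSZ when a grow
would exceed the limit. Roughly, paraphrasing fs/attr.c (not part of this patch):

	/* Paraphrase of the growth path: */
	if (inode->i_size < offset) {
		unsigned long limit = rlimit(RLIMIT_FSIZE);

		if (limit != RLIM_INFINITY && offset > limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		if (offset > inode->i_sb->s_maxbytes)
			return -EFBIG;
	}
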
@@ -1836,6 +1835,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1836 &meta_level); 1835 &meta_level);
1837 if (has_refcount) 1836 if (has_refcount)
1838 *has_refcount = 1; 1837 *has_refcount = 1;
1838 if (direct_io)
1839 *direct_io = 0;
1839 } 1840 }
1840 1841
1841 if (ret < 0) { 1842 if (ret < 0) {
@@ -1859,10 +1860,6 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1859 break; 1860 break;
1860 } 1861 }
1861 1862
1862 if (has_refcount && *has_refcount == 1) {
1863 *direct_io = 0;
1864 break;
1865 }
1866 /* 1863 /*
1867 * Allowing concurrent direct writes means 1864 * Allowing concurrent direct writes means
1868 * i_size changes wouldn't be synchronized, so 1865 * i_size changes wouldn't be synchronized, so
@@ -2043,7 +2040,7 @@ out_dio:
2043 * async dio is going to do it in the future or an end_io after an 2040 * async dio is going to do it in the future or an end_io after an
2044 * error has already done it. 2041 * error has already done it.
2045 */ 2042 */
2046 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2043 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2047 rw_level = -1; 2044 rw_level = -1;
2048 have_alloc_sem = 0; 2045 have_alloc_sem = 0;
2049 } 2046 }
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..ca992d91f511 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740f448041e2..1238b491db90 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -159,7 +160,7 @@ struct ocfs2_lock_res {
159 int l_level; 160 int l_level;
160 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
162 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
163 164
164 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
165 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -305,7 +306,9 @@ struct ocfs2_super
305 u32 s_next_generation; 306 u32 s_next_generation;
306 unsigned long osb_flags; 307 unsigned long osb_flags;
307 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
308 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
309 312
310 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
311 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -760,33 +763,6 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
760 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
761} 764}
762 765
763static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
764{
765 spin_lock(&osb->osb_lock);
766 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
767 spin_unlock(&osb->osb_lock);
768 atomic_set(&osb->s_num_inodes_stolen, 0);
769}
770
771static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
772 s16 slot)
773{
774 spin_lock(&osb->osb_lock);
775 osb->s_inode_steal_slot = slot;
776 spin_unlock(&osb->osb_lock);
777}
778
779static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
780{
781 s16 slot;
782
783 spin_lock(&osb->osb_lock);
784 slot = osb->s_inode_steal_slot;
785 spin_unlock(&osb->osb_lock);
786
787 return slot;
788}
789
790#define ocfs2_set_bit ext2_set_bit 766#define ocfs2_set_bit ext2_set_bit
791#define ocfs2_clear_bit ext2_clear_bit 767#define ocfs2_clear_bit ext2_clear_bit
792#define ocfs2_test_bit ext2_test_bit 768#define ocfs2_test_bit ext2_test_bit
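The removed inline helpers are generalized rather than dropped: with the new
s_meta_steal_slot/s_num_meta_stolen fields, one set of slot-stealing helpers now
serves both inode and metadata allocation. A sketch of the replacement
initializer called from localalloc.c above (assumed shape; the real
ocfs2_init_steal_slots() is defined elsewhere in this series, in suballoc.c):

	void ocfs2_init_steal_slots(struct ocfs2_super *osb)
	{
		spin_lock(&osb->osb_lock);
		osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
		osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
		spin_unlock(&osb->osb_lock);
		atomic_set(&osb->s_num_inodes_stolen, 0);
		atomic_set(&osb->s_num_meta_stolen, 0);
	}
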
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7638a38c32bc..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv {
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start;
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1;
64 __u32 reserved2;
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
 68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2, struct ocfs2_new_group_input)
 69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3, struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink. */
72struct reflink_arguments {
73 __u64 old_path;
74 __u64 new_path;
75 __u64 preserve;
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
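With the definitions split into a standalone header, userspace can include it
directly. A minimal sketch of the XFS-compatible space reservation call
(illustrative only; path and sizes are made up):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include "ocfs2_ioctl.h"

	int main(void)
	{
		struct ocfs2_space_resv sr = {
			.l_whence = 0,			/* SEEK_SET */
			.l_start = 0,
			.l_len = 1024 * 1024,		/* reserve 1MB at offset 0 */
		};
		int fd = open("/mnt/ocfs2/file", O_RDWR);

		if (fd < 0 || ioctl(fd, OCFS2_IOC_RESVSP64, &sr) < 0)
			perror("OCFS2_IOC_RESVSP64");
		return 0;
	}
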
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8ae65c9c020c..fb6aa7acf54b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -626,7 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 626 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627 memset(rb, 0, inode->i_sb->s_blocksize); 627 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -1330,7 +1330,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1331 1331
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1335 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1336 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1576,7 +1576,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1577 memset(new_rb, 0, sb->s_blocksize); 1577 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1582 new_rb->rf_blkno = cpu_to_le64(blkno);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3038c92af493..7020e1253ffa 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -161,24 +161,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 161
162static void o2dlm_lock_ast_wrapper(void *astarg) 162static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 163{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 164 struct ocfs2_dlm_lksb *lksb = astarg;
165 165
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 166 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 167}
168 168
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 169static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 170{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 171 struct ocfs2_dlm_lksb *lksb = astarg;
172 172
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 173 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 174}
175 175
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 177{
178 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 179 int error = dlm_status_to_errno(status);
179 180
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 181 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 182 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 183 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +192,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 192 if (status == DLM_CANCELGRANT)
194 return; 193 return;
195 194
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 195 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 196}
198 197
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 198static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 199 int mode,
201 union ocfs2_dlm_lksb *lksb, 200 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 201 u32 flags,
203 void *name, 202 void *name,
204 unsigned int namelen, 203 unsigned int namelen)
205 void *astarg)
206{ 204{
207 enum dlm_status status; 205 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 206 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +209,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 209
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 210 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 211 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 212 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 213 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 214 ret = dlm_status_to_errno(status);
217 return ret; 215 return ret;
218} 216}
219 217
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 218static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 219 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 220 u32 flags)
223 void *astarg)
224{ 221{
225 enum dlm_status status; 222 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 223 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 224 int ret;
228 225
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 226 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 227 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 228 ret = dlm_status_to_errno(status);
232 return ret; 229 return ret;
233} 230}
234 231
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 232static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 233{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 234 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 235}
@@ -242,17 +239,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 239 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 240 * the contents.
244 */ 241 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 242static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 243{
247 return 1; 244 return 1;
248} 245}
249 246
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 247static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 248{
252 return (void *)(lksb->lksb_o2dlm.lvb); 249 return (void *)(lksb->lksb_o2dlm.lvb);
253} 250}
254 251
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 252static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 253{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 254 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 255}
@@ -280,7 +277,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 struct dlm_protocol_version fs_version; 277 struct dlm_protocol_version fs_version;
281 278
282 BUG_ON(conn == NULL); 279 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 280 BUG_ON(conn->cc_proto == NULL);
284 281
285 /* for now we only have one cluster/node, make sure we see it 282 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 283 * in the heartbeat universe */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index da78a2a334fd..5ae8812b2864 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -25,7 +25,6 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 28#include "stackglue.h"
30 29
31#include <linux/dlm_plock.h> 30#include <linux/dlm_plock.h>
@@ -63,8 +62,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 62 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 63 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 64 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 65 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
 67 * must be less than or equal to ...->lp_max_version.pv_minor. 66 * must be less than or equal to ...sp_max_proto.pv_minor.
68 * 67 *
69 * Once this information has been set, mounts will be allowed. From this 68 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 69 * point on, the "DOWN" message can be sent for node down notification.
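In code, the handshake rule above is a two-part test; a hedged paraphrase of the
check inside ocfs2_control_do_setversion_msg() (only partially visible in this
diff):

	/* Paraphrase: the client's major must match exactly, and its
	 * minor may not exceed what the plugin advertises. */
	if (major != max->pv_major || minor > max->pv_minor)
		return -EINVAL;
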
@@ -401,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 400 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 401 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 402 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 403 &ocfs2_user_plugin.sp_max_proto;
405 404
406 if (ocfs2_control_get_handshake_state(file) != 405 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 406 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +663,10 @@ static void ocfs2_control_exit(void)
664 -rc); 663 -rc);
665} 664}
666 665
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 666static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 667{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 668 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 669 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 670
680 /* 671 /*
681 * For now we're punting on the issue of other non-standard errors 672 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +679,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 679 */
689 680
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 681 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 682 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 683 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 684 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 685}
695 686
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 687static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 688{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 689 struct ocfs2_dlm_lksb *lksb = astarg;
699 690
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 691 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 692}
702 693
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 694static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 695 int mode,
705 union ocfs2_dlm_lksb *lksb, 696 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 697 u32 flags,
707 void *name, 698 void *name,
708 unsigned int namelen, 699 unsigned int namelen)
709 void *astarg)
710{ 700{
711 int ret; 701 int ret;
712 702
@@ -716,36 +706,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 706
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 707 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 708 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 709 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 710 fsdlm_blocking_ast_wrapper);
721 return ret; 711 return ret;
722} 712}
723 713
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 714static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 715 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 716 u32 flags)
727 void *astarg)
728{ 717{
729 int ret; 718 int ret;
730 719
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 720 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 721 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 722 return ret;
734} 723}
735 724
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 725static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 726{
738 return lksb->lksb_fsdlm.sb_status; 727 return lksb->lksb_fsdlm.sb_status;
739} 728}
740 729
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 730static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 731{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 732 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 733
745 return !invalid; 734 return !invalid;
746} 735}
747 736
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 737static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 738{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 739 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 740 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +742,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 742 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 743}
755 744
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 745static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 746{
758} 747}
759 748
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f3df0baa9a48..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no astarg
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions; the lksb itself is passed to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb in their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -685,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
685 710
686static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
687{ 712{
688 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
689 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
690 if (ocfs2_table_header) 718 if (ocfs2_table_header)
691 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
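ocfs2_cluster_connect() now rejects any caller whose lp_max_version does not match the version registered through ocfs2_stack_glue_set_max_proto_version(). A small sketch of just that comparison, assuming byte-sized pv_major/pv_minor fields:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	struct ocfs2_protocol_version {
		unsigned char pv_major;
		unsigned char pv_minor;
	};

	/* Registered by the fs via ocfs2_stack_glue_set_max_proto_version(). */
	static struct ocfs2_protocol_version locking_max_version = { 1, 0 };

	/* Mirrors the new version gate at the top of ocfs2_cluster_connect(). */
	static int check_lproto(const struct ocfs2_protocol_version *lp_max_version)
	{
		if (memcmp(lp_max_version, &locking_max_version,
			   sizeof(struct ocfs2_protocol_version)))
			return -EINVAL;
		return 0;
	}

	int main(void)
	{
		struct ocfs2_protocol_version good = { 1, 0 }, stale = { 0, 9 };

		printf("matching caller   -> %d\n", check_lproto(&good));	/* 0 */
		printf("mismatched caller -> %d\n", check_lproto(&stale));	/* -EINVAL */
		return 0;
	}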
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to stack->sp_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling stack->sp_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
210 * This is an optional debugging hook. If provided, the 217 * This is an optional debugging hook. If provided, the
211 * stack can dump debugging information about this lock. 218 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
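The agnostic connect path declared above only decides which stack name to pass down: the name configured through cluster_stack_name if one is set, otherwise NULL for the default stack. A standalone sketch of that selection (the helper names here are illustrative):

	#include <stdio.h>

	/* Written via the o2cb sysfs file in the real code; empty by default. */
	static char cluster_stack_name[5] = "";

	/* Illustrative stand-in for ocfs2_cluster_connect(). */
	static int cluster_connect(const char *stack_name, const char *group)
	{
		printf("group %s -> %s stack\n",
		       group, stack_name ? stack_name : "classic (default)");
		return 0;
	}

	/*
	 * Mirrors ocfs2_cluster_connect_agnostic(): callers such as dlmfs
	 * that do not store a stack name let the glue pick the configured one.
	 */
	static int cluster_connect_agnostic(const char *group)
	{
		const char *stack_name = NULL;

		if (cluster_stack_name[0])
			stack_name = cluster_stack_name;
		return cluster_connect(stack_name, group);
	}

	int main(void)
	{
		return cluster_connect_agnostic("dlmfs");
	}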
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..c3c60bc3e072 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -637,12 +637,113 @@ bail:
637 return status; 637 return status;
638} 638}
639 639
640static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
641{
642 spin_lock(&osb->osb_lock);
643 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
644 spin_unlock(&osb->osb_lock);
645 atomic_set(&osb->s_num_inodes_stolen, 0);
646}
647
648static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
649{
650 spin_lock(&osb->osb_lock);
651 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
652 spin_unlock(&osb->osb_lock);
653 atomic_set(&osb->s_num_meta_stolen, 0);
654}
655
656void ocfs2_init_steal_slots(struct ocfs2_super *osb)
657{
658 ocfs2_init_inode_steal_slot(osb);
659 ocfs2_init_meta_steal_slot(osb);
660}
661
662static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
663{
664 spin_lock(&osb->osb_lock);
665 if (type == INODE_ALLOC_SYSTEM_INODE)
666 osb->s_inode_steal_slot = slot;
667 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
668 osb->s_meta_steal_slot = slot;
669 spin_unlock(&osb->osb_lock);
670}
671
672static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
673{
674 int slot = OCFS2_INVALID_SLOT;
675
676 spin_lock(&osb->osb_lock);
677 if (type == INODE_ALLOC_SYSTEM_INODE)
678 slot = osb->s_inode_steal_slot;
679 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
680 slot = osb->s_meta_steal_slot;
681 spin_unlock(&osb->osb_lock);
682
683 return slot;
684}
685
686static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
687{
688 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
689}
690
691static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
692{
693 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
694}
695
696static int ocfs2_steal_resource(struct ocfs2_super *osb,
697 struct ocfs2_alloc_context *ac,
698 int type)
699{
700 int i, status = -ENOSPC;
701 int slot = __ocfs2_get_steal_slot(osb, type);
702
703 /* Start stealing resources from the first slot after ours. */
704 if (slot == OCFS2_INVALID_SLOT)
705 slot = osb->slot_num + 1;
706
707 for (i = 0; i < osb->max_slots; i++, slot++) {
708 if (slot == osb->max_slots)
709 slot = 0;
710
711 if (slot == osb->slot_num)
712 continue;
713
714 status = ocfs2_reserve_suballoc_bits(osb, ac,
715 type,
716 (u32)slot, NULL,
717 NOT_ALLOC_NEW_GROUP);
718 if (status >= 0) {
719 __ocfs2_set_steal_slot(osb, slot, type);
720 break;
721 }
722
723 ocfs2_free_ac_resource(ac);
724 }
725
726 return status;
727}
728
729static int ocfs2_steal_inode(struct ocfs2_super *osb,
730 struct ocfs2_alloc_context *ac)
731{
732 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
733}
734
735static int ocfs2_steal_meta(struct ocfs2_super *osb,
736 struct ocfs2_alloc_context *ac)
737{
738 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
739}
740
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 741int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 742 int blocks,
642 struct ocfs2_alloc_context **ac) 743 struct ocfs2_alloc_context **ac)
643{ 744{
644 int status; 745 int status;
645 u32 slot; 746 int slot = ocfs2_get_meta_steal_slot(osb);
646 747
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 748 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 749 if (!(*ac)) {
@@ -653,12 +754,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 754
654 (*ac)->ac_bits_wanted = blocks; 755 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 756 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 757 (*ac)->ac_group_search = ocfs2_block_group_search;
658 758
759 if (slot != OCFS2_INVALID_SLOT &&
760 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
761 goto extent_steal;
762
763 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 764 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 765 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 766 (u32)osb->slot_num, NULL,
767 ALLOC_NEW_GROUP);
768
769
770 if (status >= 0) {
771 status = 0;
772 if (slot != OCFS2_INVALID_SLOT)
773 ocfs2_init_meta_steal_slot(osb);
774 goto bail;
775 } else if (status < 0 && status != -ENOSPC) {
776 mlog_errno(status);
777 goto bail;
778 }
779
780 ocfs2_free_ac_resource(*ac);
781
782extent_steal:
783 status = ocfs2_steal_meta(osb, *ac);
784 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 785 if (status < 0) {
663 if (status != -ENOSPC) 786 if (status != -ENOSPC)
664 mlog_errno(status); 787 mlog_errno(status);
@@ -685,43 +808,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 808 ac);
686} 809}
687 810
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 811int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 812 struct ocfs2_alloc_context **ac)
722{ 813{
723 int status; 814 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 815 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 816 u64 alloc_group;
726 817
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 818 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +845,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 845 * need to check our slots to see whether there is some space for us.
755 */ 846 */
756 if (slot != OCFS2_INVALID_SLOT && 847 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 848 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 849 goto inode_steal;
759 850
760 atomic_set(&osb->s_num_inodes_stolen, 0); 851 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 852 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 853 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 854 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 855 (u32)osb->slot_num,
765 &alloc_group, 856 &alloc_group,
766 ALLOC_NEW_GROUP | 857 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 858 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +880,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 880 ocfs2_free_ac_resource(*ac);
790 881
791inode_steal: 882inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 883 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 884 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 885 if (status < 0) {
795 if (status != -ENOSPC) 886 if (status != -ENOSPC)
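The rotation in ocfs2_steal_resource() is easiest to see with concrete numbers. With four slots and our node in slot 1, the loop below (the same arithmetic lifted into a standalone program; the constant's value is illustrative) probes slots 2, 3, then 0, and never its own:

	#include <stdio.h>

	#define OCFS2_INVALID_SLOT	(-1)	/* illustrative value */

	int main(void)
	{
		int max_slots = 4, our_slot = 1;	/* example topology */
		int slot = OCFS2_INVALID_SLOT, i;

		/* Start from the remembered steal slot, or the one after ours. */
		if (slot == OCFS2_INVALID_SLOT)
			slot = our_slot + 1;

		for (i = 0; i < max_slots; i++, slot++) {
			if (slot == max_slots)
				slot = 0;	/* wrap around */
			if (slot == our_slot)
				continue;	/* never steal from ourselves */
			printf("try slot %d\n", slot);	/* prints 2, 3, 0 */
		}
		return 0;
	}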
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..fa60723c43e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 755cd49a5ef3..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -301,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
301 302
302 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
303 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
304 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
305 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
306 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
307 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
308 312
309 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -1997,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1997 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1998 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1999 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
2000 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2001 2005
2002 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2003 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8fc6fb071c6d..d1b0d386f6d1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -116,10 +116,11 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
116}; 116};
117 117
118struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
119 int name_index; 119 int xi_name_index;
120 const char *name; 120 const char *xi_name;
121 const void *value; 121 int xi_name_len;
122 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
123}; 124};
124 125
125struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -137,6 +138,115 @@ struct ocfs2_xattr_search {
137 int not_found; 138 int not_found;
138}; 139};
139 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
140static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
141 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
142 int index, 252 int index,
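namevalue_size() captures the storage rule: a value at or under OCFS2_XATTR_INLINE_SIZE is stored next to its padded name, while anything larger leaves only a value-tree root behind. A standalone sketch with assumed constants (the padding macro and OCFS2_XATTR_ROOT_SIZE below approximate ocfs2_fs.h and are not copied from it):

	#include <stdint.h>
	#include <stdio.h>

	/* Assumed to match ocfs2_fs.h: lengths padded to 4-byte multiples. */
	#define OCFS2_XATTR_PAD		4
	#define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_PAD - 1) & \
					 ~(OCFS2_XATTR_PAD - 1))
	#define OCFS2_XATTR_INLINE_SIZE	80
	#define OCFS2_XATTR_ROOT_SIZE	64	/* illustrative value */

	/* Same shape as the namevalue_size() helper added in this patch. */
	static int namevalue_size(int name_len, uint64_t value_len)
	{
		if (value_len > OCFS2_XATTR_INLINE_SIZE)
			return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
		else
			return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
	}

	int main(void)
	{
		/* A small value stays inline; a big one keeps only a root. */
		printf("inline:  %d\n", namevalue_size(10, 20));	/* 12 + 20 = 32 */
		printf("outside: %d\n", namevalue_size(10, 4096));	/* 12 + 64 = 76 */
		return 0;
	}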
@@ -212,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
212 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
213} 323}
214 324
215static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
216{
217 u16 len = sb->s_blocksize -
218 offsetof(struct ocfs2_xattr_header, xh_entries);
219
220 return len / sizeof(struct ocfs2_xattr_entry);
221}
222
223#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
224#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
225#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
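The bucket macros above matter because a bucket spans several file-system blocks, so ocfs2_xa_bucket_offset_pointer() later in this patch must turn an offset from the bucket's ocfs2_xattr_header into a block index plus an offset inside that block. The arithmetic, shown standalone for 4K blocks:

	#include <stdio.h>

	/*
	 * For a 4K block size, s_blocksize_bits is 12. An offset measured
	 * from the bucket header lands in block (offset >> bits), at byte
	 * (offset % blocksize) within it, exactly as in
	 * ocfs2_xa_bucket_offset_pointer().
	 */
	int main(void)
	{
		int blocksize_bits = 12, blocksize = 1 << blocksize_bits;
		int offset = 9000;	/* some name+value offset in the bucket */

		int block = offset >> blocksize_bits;	/* 2 */
		int block_offset = offset % blocksize;	/* 808 */

		printf("offset %d -> block %d, offset %d\n",
		       offset, block, block_offset);
		return 0;
	}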
@@ -463,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
463 return hash; 565 return hash;
464} 566}
465 567
466/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
467 * ocfs2_xattr_hash_entry()
468 *
469 * Compute the hash of an extended attribute.
470 */
471static void ocfs2_xattr_hash_entry(struct inode *inode,
472 struct ocfs2_xattr_header *header,
473 struct ocfs2_xattr_entry *entry)
474{ 569{
475 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
476 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
477
478 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
479 entry->xe_name_hash = cpu_to_le32(hash);
480
481 return;
482} 572}
483 573
484static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
485{ 575{
486 int size = 0; 576 return namevalue_size_xi(xi) +
487 577 sizeof(struct ocfs2_xattr_entry);
488 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
489 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
490 else
491 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
492 size += sizeof(struct ocfs2_xattr_entry);
493 579
494 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
495} 584}
496 585
497int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
@@ -1308,452 +1397,897 @@ out:
1308 return ret; 1397 return ret;
1309} 1398}
1310 1399
1311static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1312 handle_t *handle, 1401 int num_entries)
1313 struct ocfs2_xattr_info *xi,
1314 struct ocfs2_xattr_search *xs,
1315 struct ocfs2_xattr_value_buf *vb,
1316 size_t offs)
1317{ 1402{
1318 int ret = 0; 1403 int free_space;
1319 size_t name_len = strlen(xi->name);
1320 void *val = xs->base + offs;
1321 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1322 1404
1323 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1324 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1325 if (ret) {
1326 mlog_errno(ret);
1327 goto out;
1328 }
1329 /* Decrease xattr count */
1330 le16_add_cpu(&xs->header->xh_count, -1);
1331 /* Remove the xattr entry and tree root which has already been set */
1332 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1333 memset(val, 0, size);
1334 1407
1335 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1336 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1337 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1338out: 1411 OCFS2_XATTR_HEADER_GAP;
1339 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1340} 1418}
1341 1419
1342static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1343 handle_t *handle, 1421 int type)
1344 struct ocfs2_xattr_info *xi,
1345 struct ocfs2_xattr_search *xs,
1346 struct ocfs2_xattr_value_buf *vb,
1347 size_t offs)
1348{ 1422{
1349 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1350 1425
1351 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1352 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1353 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1354 mlog_errno(ret); 1429}
1355 goto out;
1356 }
1357 1430
1358 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1359 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1360 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1361 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1362 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1363 ocfs2_xattr_set_local(xs->here, 0); 1436}
1364 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1365 1437
1366 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1367 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1368 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1369out: 1441 * ocfs2_xa_add_namevalue().
1370 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1371} 1446}
1372 1447
1373/* 1448/*
1374 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1375 * 1450 * downward-growing free space.
1376 * Set large size value in B tree.
1377 */ 1451 */
1378static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1379 struct ocfs2_xattr_info *xi,
1380 struct ocfs2_xattr_search *xs,
1381 struct ocfs2_xattr_set_ctxt *ctxt,
1382 struct ocfs2_xattr_value_buf *vb,
1383 size_t offs)
1384{ 1453{
1385 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1386 void *val = xs->base + offs; 1455}
1387 struct ocfs2_xattr_value_root *xv = NULL;
1388 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1389 int ret = 0;
1390 1456
1391 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1392 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1393 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1394 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1395 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1396 xv->xr_last_eb_blk = 0; 1462}
1397 xv->xr_list.l_tree_depth = 0; 1463
1398 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1399 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1400 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1401 1467{
1402 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1403 if (ret < 0) { 1469}
1404 mlog_errno(ret); 1470
1405 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1406 } 1571 }
1407 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1408 if (ret < 0) { 1573 return free_start;
1409 mlog_errno(ret); 1574}
1410 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1411 } 1594 }
1412 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1413 xi->value, xi->value_len); 1596 needed_space = 0;
1414 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1415 mlog_errno(ret); 1598}
1416 1599
1417 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset < namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1418} 1663}
1419 1664
1420/* 1665/*
1421 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1422 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1423 * Set, replace or remove extended attribute in local.
1424 */ 1668 */
1425static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1426 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1427 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1428 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1429 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1430{ 1684{
1431 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1432 int i;
1433 1686
1434 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1435 /* Insert the new xattr entry. */ 1688}
1436 le16_add_cpu(&xs->header->xh_count, 1); 1689
1437 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1438 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1439 last->xe_name_len = name_len; 1692{
1440 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1441 void *first_val; 1694
1442 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1443 size_t offs, size; 1696}
1444 1697
1445 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1446 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1447 val = xs->base + offs; 1700{
1448 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1449 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1450 OCFS2_XATTR_INLINE_SIZE) 1703
1451 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1452 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1453 else 1758 else
1454 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1455 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1456 1761 BUG_ON(needed_space < 0);
1457 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1458 OCFS2_XATTR_SIZE(xi->value_len)) {
1459 /* The old and the new value have the
1460 same size. Just replace the value. */
1461 ocfs2_xattr_set_local(xs->here, 1);
1462 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1463 /* Clear value bytes. */
1464 memset(val + OCFS2_XATTR_SIZE(name_len),
1465 0,
1466 OCFS2_XATTR_SIZE(xi->value_len));
1467 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1468 xi->value,
1469 xi->value_len);
1470 return;
1471 }
1472 /* Remove the old name+value. */
1473 memmove(first_val + size, first_val, val - first_val);
1474 memset(first_val, 0, size);
1475 xs->here->xe_name_hash = 0;
1476 xs->here->xe_name_offset = 0;
1477 ocfs2_xattr_set_local(xs->here, 1);
1478 xs->here->xe_value_size = 0;
1479
1480 min_offs += size;
1481
1482 /* Adjust all value offsets. */
1483 last = xs->header->xh_entries;
1484 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1485 size_t o = le16_to_cpu(last->xe_name_offset);
1486
1487 if (o < offs)
1488 last->xe_name_offset = cpu_to_le16(o + size);
1489 last += 1;
1490 }
1491 1762
1492 if (!xi->value) { 1763 if (free_start < size) {
1493 /* Remove the old entry. */ 1764 if (needed_space)
1494 last -= 1; 1765 return -ENOSPC;
1495 memmove(xs->here, xs->here + 1, 1766 } else {
1496 (void *)last - (void *)xs->here); 1767 /*
1497 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1498 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1499 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1500 } 1779 }
1501 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1502 /* Insert the new name+value. */ 1781}
1503 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1504 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1505 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1506 1788
1507 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1508 memset(val, 0, size); 1790{
1509 memcpy(val, xi->name, name_len); 1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1510 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1792 int count = le16_to_cpu(xh->xh_count);
1511 xi->value, 1793 int low = 0, high = count - 1, tmp;
1512 xi->value_len); 1794 struct ocfs2_xattr_entry *tmp_xe;
1513 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1795
1514 ocfs2_xattr_set_local(xs->here, 1); 1796 /*
1515 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1797 * We keep buckets sorted by name_hash, so we need to find
1798 * our insert place.
1799 */
1800 while (low <= high && count) {
1801 tmp = (low + high) / 2;
1802 tmp_xe = &xh->xh_entries[tmp];
1803
1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1516 } 1812 }
1517 1813
1518 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
1895 * ocfs2_xa_journal_access on the loc. However, The truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
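The comment block above is the key invariant of ocfs2_xa_value_truncate():
anything that may call ocfs2_extend_trans() can commit the open transaction,
so the journal access must be re-declared unconditionally afterward, with
the truncate's error taking precedence. A hedged, generic sketch of that
error-precedence shape (the function names here are illustrative, not
kernel API):

	/* Illustrative only: run a step that can invalidate prior journal
	 * state, then always restore that state, reporting the first
	 * failure encountered. */
	static int risky_then_restore(int (*risky)(void *ctx),
				      int (*restore)(void *ctx), void *ctx)
	{
		int risky_rc = risky(ctx);	/* may fail after partial work */
		int restore_rc = restore(ctx);	/* runs even on failure */

		/* Errors in the primary operation take precedence. */
		return risky_rc ? risky_rc : restore_rc;
	}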
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1519} 1935}
1520 1936
1521/* 1937/*
1522 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1942 *
1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1523 * 1945 *
1524 * Set extended attribute entry into inode or block. 1946 * If the value tree got partially truncated, we now have a corrupted
1947 * extended attribute. We're going to wipe its entry and leak the
1948 * clusters. Better to leak some storage than leave a corrupt entry.
1525 * 1949 *
1526 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1950 * If the value tree grew, it obviously didn't grow enough for the
1527 * we first insert the tree root (ocfs2_xattr_value_root) with set_entry_local(), 1951 * new entry. We're not going to try and reclaim those clusters either.
1528 * then set the value in the B tree with set_value_outside(). 1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1529 */ 1960 */
1530static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1531 struct ocfs2_xattr_info *xi, 1962 const char *what,
1532 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1533 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1534 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1535{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1536 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1537 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1538 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1539 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1540 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1541 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1542 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1543 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1544 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1545 .name = xi->name, 1976 } else if (!orig_clusters) {
1546 .value = xi->value, 1977 mlog(ML_ERROR,
1547 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1548 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1549 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1550 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1551 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1552 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
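The three outcomes of ocfs2_xa_cleanup_value_truncate() boil down to one
predicate: the entry is wiped when the value shrank, or when there was no
external value to begin with. A hypothetical restatement (not part of the
patch):

	/* Illustrative decision for the cleanup above:
	 *   new < orig  -> partial truncate, value corrupt: wipe the entry
	 *   orig == 0   -> allocation never became usable: wipe the entry
	 *   new > orig  -> old value still intact: keep the entry as-is
	 */
	static int cleanup_wipes_entry(unsigned int orig_clusters,
				       unsigned int new_clusters)
	{
		return new_clusters < orig_clusters || !orig_clusters;
	}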
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1553 2018
1554 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1555 BUG_ON(xs->xattr_bh == xs->inode_bh);
1556 vb.vb_access = ocfs2_journal_access_xb;
1557 } else
1558 BUG_ON(xs->xattr_bh != xs->inode_bh);
1559 2020
1560 /* Compute min_offs, last and free space. */ 2021out:
1561 last = xs->header->xh_entries; 2022 return rc;
2023}
1562 2024
1563 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1564 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1565 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1566 min_offs = offs; 2028 char *nameval_buf;
1567 last += 1;
1568 }
1569 2029
1570 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1571 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1572 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1573 2034
1574 if (!xs->not_found) { 2035/*
1575 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1576 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1577 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1578 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1579 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1580 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1581 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1582 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1583 } 2044 int rc = 0;
1584 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1585 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1586 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1587 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1588 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1589 ret = -ENOSPC; 2050
1590 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1591 } 2077 }
1592 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1593 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1594 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1595 } else if (xi->value) { 2081 orig_clusters);
1596 if (free < sizeof(struct ocfs2_xattr_entry) +
1597 OCFS2_XATTR_SIZE(name_len) +
1598 OCFS2_XATTR_SIZE(xi->value_len)) {
1599 ret = -ENOSPC;
1600 goto out; 2082 goto out;
1601 } 2083 }
1602 } 2084 }
1603 2085
1604 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1605 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1606 size_t size = OCFS2_XATTR_SIZE(name_len) +
1607 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1608 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1609 void *val = xs->base + offs;
1610 2088
1611 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1612 /* Replace existing local xattr with tree root */ 2090 return rc;
1613 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1614 ctxt, &vb, offs);
1615 if (ret < 0)
1616 mlog_errno(ret);
1617 goto out;
1618 } else if (!ocfs2_xattr_is_local(xs->here)) {
1619 /* For existing xattr which has value outside */
1620 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1621 (val + OCFS2_XATTR_SIZE(name_len));
1622 2092
1623 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1624 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1625 * If the new value must also be set outside, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1626 * first truncate the old value to the new length, 2096 * already exists, it will take care of modifying it appropriately.
1627 * then set the new value with set_value_outside(). 2097 *
1628 */ 2098 * Note that this modifies the data. You did journal_access already,
1629 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1630 &vb, 2100 */
1631 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1632 ctxt); 2102 struct ocfs2_xattr_info *xi,
1633 if (ret < 0) { 2103 u32 name_hash,
1634 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1635 goto out; 2105{
1636 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1637 2109
1638 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1639 handle, 2111 if (rc)
1640 xi, 2112 goto out;
1641 xs,
1642 &vb,
1643 offs);
1644 if (ret < 0) {
1645 mlog_errno(ret);
1646 goto out;
1647 }
1648 2113
1649 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1650 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1651 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1652 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1653 xi->value_len); 2118 if (rc)
1654 if (ret < 0) 2119 goto out;
1655 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1656 goto out; 2131 goto out;
1657 } else {
1658 /*
1659 * If the new value is to be stored locally,
1660 * just truncate the old value to zero.
1661 */
1662 ret = ocfs2_xattr_value_truncate(inode,
1663 &vb,
1664 0,
1665 ctxt);
1666 if (ret < 0)
1667 mlog_errno(ret);
1668 } 2132 }
1669 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1670 } 2162 }
1671 2163
1672 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
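ocfs2_xa_prepare_entry() leans on the layout shared by all three xattr
storage formats: the entry array grows forward from the header while packed
name+value blobs grow backward from the end of the region. A minimal sketch
of the free-space test that layout implies (the struct and field names are
illustrative; the real checks live behind the per-format xlo_check_space
operations):

	/* Illustrative only: entries occupy [0, entries_end), name+value
	 * data occupies [free_start, size); both grow into the middle gap. */
	struct xa_region {
		unsigned int size;		/* total bytes in the region */
		unsigned int entries_end;	/* end of the entry array */
		unsigned int free_start;	/* start of packed name+values */
	};

	static int has_room(const struct xa_region *r, unsigned int entry_sz,
			    unsigned int namevalue_sz)
	{
		/* A new xattr consumes one entry slot from the front and
		 * one name+value blob from the back. */
		return r->free_start - r->entries_end >=
		       entry_sz + namevalue_sz;
	}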
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1673 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1674 if (ret) { 2206 if (ret) {
1675 mlog_errno(ret); 2207 mlog_errno(ret);
1676 goto out; 2208 goto out;
1677 } 2209 }
1678 2210
1679 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1680 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1681 OCFS2_JOURNAL_ACCESS_WRITE);
1682 if (ret) {
1683 mlog_errno(ret);
1684 goto out;
1685 }
1686 }
1687
1688 /* 2211 /*
1689 * Set value in local, include set tree root in local. 2212 * From here on out, everything is going to modify the buffer a
1690 * This is the first step for value size >INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1691 */ 2215 */
1692 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1693 2216
1694 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1695 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1696 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1697 mlog_errno(ret); 2220 goto out_dirty;
1698 goto out;
1699 }
1700 } 2221 }
1701 2222
1702 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1703 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1704 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1705 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1706 2227 goto out_dirty;
1707 /*
1708 * Adjust extent record count or inline data size
1709 * to reserve space for extended attribute.
1710 */
1711 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1712 struct ocfs2_inline_data *idata = &di->id2.i_data;
1713 le16_add_cpu(&idata->id_count, -xattrsize);
1714 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1715 struct ocfs2_extent_list *el = &di->id2.i_list;
1716 le16_add_cpu(&el->l_count, -(xattrsize /
1717 sizeof(struct ocfs2_extent_rec)));
1718 }
1719 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1720 } 2228 }
1721 /* Update xattr flag */
1722 spin_lock(&oi->ip_lock);
1723 oi->ip_dyn_features |= flag;
1724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1725 spin_unlock(&oi->ip_lock);
1726 2229
1727 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1728 if (ret < 0) 2231 if (ret)
1729 mlog_errno(ret); 2232 mlog_errno(ret);
1730 2233
1731 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1732 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1733 * Set value outside in B tree.
1734 * This is the second step for value size > INLINE_SIZE.
1735 */
1736 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1737 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1738 &vb, offs);
1739 if (ret < 0) {
1740 int ret2;
1741 2236
1742 mlog_errno(ret);
1743 /*
1744 * If set value outside failed, we have to clean
1745 * the junk tree root we have already set in local.
1746 */
1747 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1748 xi, xs, &vb, offs);
1749 if (ret2 < 0)
1750 mlog_errno(ret2);
1751 }
1752 }
1753out: 2237out:
1754 return ret; 2238 return ret;
1755} 2239}
1756 2240
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
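The three initializers above plus ocfs2_xa_set() are the whole of the new
calling convention: pick the initializer matching the storage format, point
xl_entry at the found entry (or NULL for a fresh insert), then let
ocfs2_xa_set() add, replace, or remove. A condensed sketch of a caller,
modeled on the ocfs2_xattr_ibody_set() hunk below (error handling trimmed;
this is not a complete function):

	struct ocfs2_xa_loc loc;
	int ret;

	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
				 xs->not_found ? NULL : xs->here);
	ret = ocfs2_xa_set(&loc, xi, ctxt);	/* add, replace, or remove */
	if (!ret)
		xs->here = loc.xl_entry;	/* search now points here */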
2290
1757/* 2291/*
1758 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1759 * the chance to split the refcount tree. So need the allocators. 2293 * the chance to split the refcount tree. So need the allocators.
@@ -2149,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2149 return 0; 2683 return 0;
2150} 2684}
2151 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
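As a worked example of the reservation arithmetic in ocfs2_xattr_ibody_init()
(illustrative numbers only): with sizeof(struct ocfs2_extent_rec) == 16 and
an inline-xattr area of 256 bytes, an extent-list inode gives up
256 / 16 = 16 extent records from l_count, while an inline-data inode simply
gives up 256 bytes of id_count; fast symlinks reserve nothing beyond setting
i_xattr_inline_size.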
2734
2152/* 2735/*
2153 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2154 * 2737 *
@@ -2160,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2160 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2161 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2162{ 2745{
2746 int ret;
2163 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2164 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2165 int ret; 2749 struct ocfs2_xa_loc loc;
2166 2750
2167 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2168 return -ENOSPC; 2752 return -ENOSPC;
@@ -2175,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2175 } 2759 }
2176 } 2760 }
2177 2761
2178 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2179 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2180out: 2781out:
2181 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2182 2783
@@ -2236,12 +2837,11 @@ cleanup:
2236 return ret; 2837 return ret;
2237} 2838}
2238 2839
2239static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2240 struct inode *inode,
2241 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2242 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2243 struct buffer_head **ret_bh, 2843 int indexed,
2244 int indexed) 2844 struct buffer_head **ret_bh)
2245{ 2845{
2246 int ret; 2846 int ret;
2247 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2252,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2252 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2253 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2254 2854
2255 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2256 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2257 if (ret < 0) { 2857 if (ret < 0) {
2258 mlog_errno(ret); 2858 mlog_errno(ret);
2259 goto end; 2859 goto end;
2260 } 2860 }
2261 2861
2262 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2263 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2264 &first_blkno); 2864 &first_blkno);
2265 if (ret < 0) { 2865 if (ret < 0) {
@@ -2270,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2270 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2271 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2272 2872
2273 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2274 new_bh, 2874 new_bh,
2275 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2276 if (ret < 0) { 2876 if (ret < 0) {
@@ -2282,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2282 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2283 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2284 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2285 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2286 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2287 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2288 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2289
2290 if (indexed) { 2889 if (indexed) {
2291 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2292 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2297,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2297 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2298 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2299 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2300 2900
2301 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2302 if (ret < 0) {
2303 mlog_errno(ret);
2304 goto end;
2305 }
2306 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2307 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2308 2910
2309 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2310 new_bh = NULL; 2912 new_bh = NULL;
@@ -2326,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2326 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2327{ 2929{
2328 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2329 handle_t *handle = ctxt->handle;
2330 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2331 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2332 2934
2333 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2334 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2335 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2336 if (ret) { 2938 if (ret) {
2337 mlog_errno(ret); 2939 mlog_errno(ret);
2338 goto end; 2940 goto end;
@@ -2348,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2349 2951
2350 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2351 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2352 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2353 OCFS2_HAS_XATTR_FL);
2354 if (!ret || ret != -ENOSPC)
2355 goto end;
2356 2955
2357 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2358 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2359 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2360 } 2966 }
2361 2967
2362 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2363 2970
2364end: 2971end:
2365
2366 return ret; 2972 return ret;
2367} 2973}
2368 2974
@@ -2371,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2371 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2372 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2373{ 2979{
2374 u64 value_size;
2375 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2376 int free, i; 2981 int free, i;
2377 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2394,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2394 2999
2395 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2396 3001
2397 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2398 value_size = OCFS2_XATTR_ROOT_SIZE;
2399 else
2400 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2401
2402 if (free >= sizeof(struct ocfs2_xattr_entry) +
2403 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2404 return 1; 3003 return 1;
2405 3004
2406 return 0; 3005 return 0;
@@ -2424,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2424 char *base = NULL; 3023 char *base = NULL;
2425 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2426 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2427 xi->value_len); 3026 xi->xi_value_len);
2428 u64 value_size; 3027 u64 value_size;
2429 3028
2430 /* 3029 /*
@@ -2432,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2432 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2433 * we need this for writing. 3032 * we need this for writing.
2434 */ 3033 */
2435 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2436 credits += new_clusters * 3035 credits += new_clusters *
2437 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2438 3037
2439 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2440 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2441 3040
2442 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2443 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2444 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2445 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2484,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2484 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2485 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2486 */ 3085 */
2487 if (!xi->value) { 3086 if (!xi->xi_value) {
2488 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2489 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2490 3089
@@ -2514,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2514 } 3113 }
2515 } 3114 }
2516 3115
2517 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2518 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2519 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2520 3119
@@ -2547,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2547 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2548 * to guess metadata allocation. 3147 * to guess metadata allocation.
2549 */ 3148 */
2550 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2551 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2552 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2553 goto out; 3153 goto out;
2554 } 3154 }
2555 3155
@@ -2639,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2639 3239
2640 meta_add += extra_meta; 3240 meta_add += extra_meta;
2641 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2642 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2643 3243
2644 if (meta_add) { 3244 if (meta_add) {
2645 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2679,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2679{ 3279{
2680 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2681 3281
2682 if (!xi->value) { 3282 if (!xi->xi_value) {
2683 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2684 if (!xis->not_found) 3284 if (!xis->not_found)
2685 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2693,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2693 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2694 * external block, then we will remove it. 3294 * external block, then we will remove it.
2695 */ 3295 */
2696 xi->value = NULL; 3296 xi->xi_value = NULL;
2697 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2698 3298
2699 old_found = xis->not_found; 3299 old_found = xis->not_found;
2700 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2722,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2722 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2723 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2724 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2725 xi->name_index, 3325 xi->xi_name_index,
2726 xi->name, xbs); 3326 xi->xi_name, xbs);
2727 if (ret) 3327 if (ret)
2728 goto out; 3328 goto out;
2729 3329
@@ -2762,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2762 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2763 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2764 */ 3364 */
2765 xi->value = NULL; 3365 xi->xi_value = NULL;
2766 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2767 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2768 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2769 di, 3369 di,
@@ -2829,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2829 int ret; 3429 int ret;
2830 3430
2831 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2832 .name_index = name_index, 3432 .xi_name_index = name_index,
2833 .name = name, 3433 .xi_name = name,
2834 .value = value, 3434 .xi_name_len = strlen(name),
2835 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2836 }; 3437 };
2837 3438
2838 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2912,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2912 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2913 3514
2914 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2915 .name_index = name_index, 3516 .xi_name_index = name_index,
2916 .name = name, 3517 .xi_name = name,
2917 .value = value, 3518 .xi_name_len = strlen(name),
2918 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2919 }; 3521 };
2920 3522
2921 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3759,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3759 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3760{ 4362{
3761 int ret, i; 4363 int ret, i;
3762 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3763 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3764 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3765 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3813,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3813 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3814 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3815 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3816 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3817 value_len = OCFS2_XATTR_SIZE(
3818 le64_to_cpu(xe->xe_value_size));
3819 else
3820 value_len = OCFS2_XATTR_ROOT_SIZE;
3821 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3822 4419
3823 /* 4420 /*
3824 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4007,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4007 int new_bucket_head) 4604 int new_bucket_head)
4008{ 4605{
4009 int ret, i; 4606 int ret, i;
4010 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4011 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4012 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4013 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4098,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4098 name_value_len = 0; 4695 name_value_len = 0;
4099 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4100 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4101 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4102 if (ocfs2_xattr_is_local(xe))
4103 xe_len +=
4104 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4105 else
4106 xe_len += OCFS2_XATTR_ROOT_SIZE;
4107 name_value_len += xe_len;
4108 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4109 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4110 } 4701 }
@@ -4134,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4134 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4135 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4136 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4137 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4138 if (ocfs2_xattr_is_local(xe))
4139 xe_len +=
4140 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4141 else
4142 xe_len += OCFS2_XATTR_ROOT_SIZE;
4143 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4144 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4145 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4751,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4751} 5336}
4752 5337
4753/* 5338/*
4754 * Handle the normal xattr set, including replace, delete and new.
4755 *
4756 * Note: "local" indicates the real data's locality, so we can't
4757 * judge its bucket locality just by its length.
4758 */
4759static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4760 struct ocfs2_xattr_info *xi,
4761 struct ocfs2_xattr_search *xs,
4762 u32 name_hash,
4763 int local)
4764{
4765 struct ocfs2_xattr_entry *last, *xe;
4766 int name_len = strlen(xi->name);
4767 struct ocfs2_xattr_header *xh = xs->header;
4768 u16 count = le16_to_cpu(xh->xh_count), start;
4769 size_t blocksize = inode->i_sb->s_blocksize;
4770 char *val;
4771 size_t offs, size, new_size;
4772
4773 last = &xh->xh_entries[count];
4774 if (!xs->not_found) {
4775 xe = xs->here;
4776 offs = le16_to_cpu(xe->xe_name_offset);
4777 if (ocfs2_xattr_is_local(xe))
4778 size = OCFS2_XATTR_SIZE(name_len) +
4779 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4780 else
4781 size = OCFS2_XATTR_SIZE(name_len) +
4782 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4783
4784 /*
4785 * If the new value will be stored outside, xi->value has been
4786 * initialized as an empty ocfs2_xattr_value_root, and the same
4787 * goes with xi->value_len, so we can set new_size safely here.
4788 * See ocfs2_xattr_set_in_bucket.
4789 */
4790 new_size = OCFS2_XATTR_SIZE(name_len) +
4791 OCFS2_XATTR_SIZE(xi->value_len);
4792
4793 le16_add_cpu(&xh->xh_name_value_len, -size);
4794 if (xi->value) {
4795 if (new_size > size)
4796 goto set_new_name_value;
4797
4798 /* Now replace the old value with new one. */
4799 if (local)
4800 xe->xe_value_size = cpu_to_le64(xi->value_len);
4801 else
4802 xe->xe_value_size = 0;
4803
4804 val = ocfs2_xattr_bucket_get_val(inode,
4805 xs->bucket, offs);
4806 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4807 size - OCFS2_XATTR_SIZE(name_len));
4808 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4809 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4810 xi->value, xi->value_len);
4811
4812 le16_add_cpu(&xh->xh_name_value_len, new_size);
4813 ocfs2_xattr_set_local(xe, local);
4814 return;
4815 } else {
4816 /*
4817 * Remove the old entry if there is more than one.
4818 * We don't remove the last entry so that we can
4819 * use it to indicate the hash value of the empty
4820 * bucket.
4821 */
4822 last -= 1;
4823 le16_add_cpu(&xh->xh_count, -1);
4824 if (xh->xh_count) {
4825 memmove(xe, xe + 1,
4826 (void *)last - (void *)xe);
4827 memset(last, 0,
4828 sizeof(struct ocfs2_xattr_entry));
4829 } else
4830 xh->xh_free_start =
4831 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4832
4833 return;
4834 }
4835 } else {
4836 /* find a new entry for insert. */
4837 int low = 0, high = count - 1, tmp;
4838 struct ocfs2_xattr_entry *tmp_xe;
4839
4840 while (low <= high && count) {
4841 tmp = (low + high) / 2;
4842 tmp_xe = &xh->xh_entries[tmp];
4843
4844 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4845 low = tmp + 1;
4846 else if (name_hash <
4847 le32_to_cpu(tmp_xe->xe_name_hash))
4848 high = tmp - 1;
4849 else {
4850 low = tmp;
4851 break;
4852 }
4853 }
4854
4855 xe = &xh->xh_entries[low];
4856 if (low != count)
4857 memmove(xe + 1, xe, (void *)last - (void *)xe);
4858
4859 le16_add_cpu(&xh->xh_count, 1);
4860 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4861 xe->xe_name_hash = cpu_to_le32(name_hash);
4862 xe->xe_name_len = name_len;
4863 ocfs2_xattr_set_type(xe, xi->name_index);
4864 }
4865
4866set_new_name_value:
4867 /* Insert the new name+value. */
4868 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4869
4870 /*
4871 * We must make sure that the name/value pair
4872 * exists in the same block.
4873 */
4874 offs = le16_to_cpu(xh->xh_free_start);
4875 start = offs - size;
4876
4877 if (start >> inode->i_sb->s_blocksize_bits !=
4878 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4879 offs = offs - offs % blocksize;
4880 xh->xh_free_start = cpu_to_le16(offs);
4881 }
4882
4883 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4884 xe->xe_name_offset = cpu_to_le16(offs - size);
4885
4886 memset(val, 0, size);
4887 memcpy(val, xi->name, name_len);
4888 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4889
4890 xe->xe_value_size = cpu_to_le64(xi->value_len);
4891 ocfs2_xattr_set_local(xe, local);
4892 xs->here = xe;
4893 le16_add_cpu(&xh->xh_free_start, -size);
4894 le16_add_cpu(&xh->xh_name_value_len, size);
4895
4896 return;
4897}
4898
4899/*
4900 * Set the xattr entry in the specified bucket.
4901 * The bucket is indicated by xs->bucket and it should have enough
4902 * space for the xattr insertion.
4903 */
4904static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4905 handle_t *handle,
4906 struct ocfs2_xattr_info *xi,
4907 struct ocfs2_xattr_search *xs,
4908 u32 name_hash,
4909 int local)
4910{
4911 int ret;
4912 u64 blkno;
4913
4914 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4915 (unsigned long)xi->value_len, xi->name_index,
4916 (unsigned long long)bucket_blkno(xs->bucket));
4917
4918 if (!xs->bucket->bu_bhs[1]) {
4919 blkno = bucket_blkno(xs->bucket);
4920 ocfs2_xattr_bucket_relse(xs->bucket);
4921 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4922 if (ret) {
4923 mlog_errno(ret);
4924 goto out;
4925 }
4926 }
4927
4928 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4929 OCFS2_JOURNAL_ACCESS_WRITE);
4930 if (ret < 0) {
4931 mlog_errno(ret);
4932 goto out;
4933 }
4934
4935 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4936 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4937
4938out:
4939 return ret;
4940}
4941
4942/*
4943 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4944 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4945 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5009,66 +5405,6 @@ out:
5009 return ret; 5405 return ret;
5010} 5406}
5011 5407
5012static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5013 struct ocfs2_xattr_search *xs,
5014 int len,
5015 struct ocfs2_xattr_set_ctxt *ctxt)
5016{
5017 int ret, offset;
5018 struct ocfs2_xattr_entry *xe = xs->here;
5019 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5020
5021 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5022
5023 offset = xe - xh->xh_entries;
5024 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5025 offset, len, ctxt);
5026 if (ret)
5027 mlog_errno(ret);
5028
5029 return ret;
5030}
5031
5032static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5033 handle_t *handle,
5034 struct ocfs2_xattr_search *xs,
5035 char *val,
5036 int value_len)
5037{
5038 int ret, offset, block_off;
5039 struct ocfs2_xattr_value_root *xv;
5040 struct ocfs2_xattr_entry *xe = xs->here;
5041 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5042 void *base;
5043 struct ocfs2_xattr_value_buf vb = {
5044 .vb_access = ocfs2_journal_access,
5045 };
5046
5047 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5048
5049 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5050 xe - xh->xh_entries,
5051 &block_off,
5052 &offset);
5053 if (ret) {
5054 mlog_errno(ret);
5055 goto out;
5056 }
5057
5058 base = bucket_block(xs->bucket, block_off);
5059 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5060 OCFS2_XATTR_SIZE(xe->xe_name_len));
5061
5062 vb.vb_xv = xv;
5063 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5064 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5065 &vb, val, value_len);
5066 if (ret)
5067 mlog_errno(ret);
5068out:
5069 return ret;
5070}
5071
5072static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5073 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5074 u64 blkno, 5410 u64 blkno,
@@ -5167,128 +5503,6 @@ out:
5167 return ret; 5503 return ret;
5168} 5504}
5169 5505
5170static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5171 handle_t *handle,
5172 struct ocfs2_xattr_search *xs)
5173{
5174 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5175 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5176 le16_to_cpu(xh->xh_count) - 1];
5177 int ret = 0;
5178
5179 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5180 OCFS2_JOURNAL_ACCESS_WRITE);
5181 if (ret) {
5182 mlog_errno(ret);
5183 return;
5184 }
5185
5186 /* Remove the old entry. */
5187 memmove(xs->here, xs->here + 1,
5188 (void *)last - (void *)xs->here);
5189 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5190 le16_add_cpu(&xh->xh_count, -1);
5191
5192 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5193}
5194
5195/*
5196 * Set the xattr name/value in the bucket specified in xs.
5197 *
5198 * As the new value in xi may be stored in the bucket or in an outside cluster,
5199 * we divide the whole process into 4 steps:
5200 * 1. Insert the name/value in the bucket (ocfs2_xattr_set_entry_in_bucket).
5201 * 2. Truncate the outside cluster (ocfs2_xattr_bucket_value_truncate_xs).
5202 * 3. Set the value in the outside cluster (ocfs2_xattr_bucket_set_value_outside).
5203 * 4. If the clusters for the new outside value can't be allocated, we need
5204 * to free the xattr we allocated in set.
5205 */
5206static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5207 struct ocfs2_xattr_info *xi,
5208 struct ocfs2_xattr_search *xs,
5209 struct ocfs2_xattr_set_ctxt *ctxt)
5210{
5211 int ret, local = 1;
5212 size_t value_len;
5213 char *val = (char *)xi->value;
5214 struct ocfs2_xattr_entry *xe = xs->here;
5215 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5216 strlen(xi->name));
5217
5218 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5219 /*
5220 * We need to truncate the xattr storage first.
5221 *
5222 * If both the old and new value are stored to
5223 * outside block, we only need to truncate
5224 * the storage and then set the value outside.
5225 *
5226 * If the new value should be stored within block,
5227 * we should free all the outside block first and
5228 * the modification to the xattr block will be done
5229 * by following steps.
5230 */
5231 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5232 value_len = xi->value_len;
5233 else
5234 value_len = 0;
5235
5236 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5237 value_len,
5238 ctxt);
5239 if (ret)
5240 goto out;
5241
5242 if (value_len)
5243 goto set_value_outside;
5244 }
5245
5246 value_len = xi->value_len;
5247 /* So we have to handle the inside block change now. */
5248 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5249 /*
5250 * If the new value will be stored outside of the block,
5251 * initialize a new empty value root and insert it first.
5252 */
5253 local = 0;
5254 xi->value = &def_xv;
5255 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5256 }
5257
5258 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5259 name_hash, local);
5260 if (ret) {
5261 mlog_errno(ret);
5262 goto out;
5263 }
5264
5265 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5266 goto out;
5267
5268 /* allocate the space now for the outside block storage. */
5269 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5270 value_len, ctxt);
5271 if (ret) {
5272 mlog_errno(ret);
5273
5274 if (xs->not_found) {
5275 /*
5276 * We can't allocate enough clusters for outside
5277 * storage and we have allocated xattr already,
5278 * so need to remove it.
5279 */
5280 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5281 }
5282 goto out;
5283 }
5284
5285set_value_outside:
5286 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5287 xs, val, value_len);
5288out:
5289 return ret;
5290}
5291
5292/* 5506/*
5293 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5294 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5317,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5317 return 0; 5531 return 0;
5318} 5532}
5319 5533
5320static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5321 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5322 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5323 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5324{ 5542{
5325 struct ocfs2_xattr_header *xh; 5543 int ret;
5326 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5327 u16 count, header_size, xh_free_start;
5328 int free, max_free, need, old;
5329 size_t value_size = 0, name_len = strlen(xi->name);
5330 size_t blocksize = inode->i_sb->s_blocksize;
5331 int ret, allocation = 0;
5332
5333 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5334
5335try_again:
5336 xh = xs->header;
5337 count = le16_to_cpu(xh->xh_count);
5338 xh_free_start = le16_to_cpu(xh->xh_free_start);
5339 header_size = sizeof(struct ocfs2_xattr_header) +
5340 count * sizeof(struct ocfs2_xattr_entry);
5341 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5342 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5343
5344 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5345 "of %u which exceed block size\n",
5346 (unsigned long long)bucket_blkno(xs->bucket),
5347 header_size);
5348 5545
5349 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5350 value_size = OCFS2_XATTR_ROOT_SIZE;
5351 else if (xi->value)
5352 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5353 5547
5354 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5355 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5356 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5357 else { 5551 if (!ret) {
5358 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5359 5559
5360 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5361 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5362 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5363 * bucket to store it. 5563 if (ret) {
5364 */ 5564 mlog_errno(ret);
5365 xe = xs->here; 5565 goto out;
5366 if (ocfs2_xattr_is_local(xe)) 5566 }
5367 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5368 else
5369 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5370 5567
5371 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5372 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5373 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5374 5575
5375 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5376 /*
5377 * We need to make sure the new name/value pair
5378 * can exist in the same block.
5379 */
5380 if (xh_free_start % blocksize < need)
5381 free -= xh_free_start % blocksize;
5382
5383 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5384 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5385 " %u\n", xs->not_found,
5386 (unsigned long long)bucket_blkno(xs->bucket),
5387 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5388 le16_to_cpu(xh->xh_name_value_len));
5389
5390 if (free < need ||
5391 (xs->not_found &&
5392 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5393 if (need <= max_free &&
5394 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5395 /*
5396 * We can create the space by defragment. Since only the
5397 * name/value will be moved, the xe shouldn't be changed
5398 * in xs.
5399 */
5400 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5401 xs->bucket);
5402 if (ret) {
5403 mlog_errno(ret);
5404 goto out;
5405 }
5406 5576
5407 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5408 free = xh_free_start - header_size 5578 mlog_exit(ret);
5409 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5410 if (xh_free_start % blocksize < need) 5580}
5411 free -= xh_free_start % blocksize;
5412 5581
5413 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5414 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5415 5588
5416 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5417 "defragment. Need %u bytes, but we have %d, so "
5418 "allocate new bucket for it.\n", need, free);
5419 }
5420 5590
5421 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5422 * We have to add new buckets or clusters and one 5592 if (!ret)
5423 * allocation should leave us enough space for insert. 5593 goto out;
5424 */ 5594 if (ret != -ENOSPC) {
5425 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5426 5598
5427 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5428 * We do not allow for overlapping ranges between buckets. And
5429 * the maximum number of collisions we will allow for then is
5430 * one bucket's worth, so check it here whether we need to
5431 * add a new bucket for the insert.
5432 */
5433 ret = ocfs2_check_xattr_bucket_collision(inode,
5434 xs->bucket,
5435 xi->name);
5436 if (ret) {
5437 mlog_errno(ret);
5438 goto out;
5439 }
5440 5600
5441 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5442 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check it here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5443 xs->bucket, 5608 xs->bucket,
5444 ctxt); 5609 xi->xi_name);
5445 if (ret) { 5610 if (ret) {
5446 mlog_errno(ret); 5611 mlog_errno(ret);
5447 goto out; 5612 goto out;
5448 } 5613 }
5449 5614
5450 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5451 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5452 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5453 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5454 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5455 * quick. 5620 mlog_errno(ret);
5456 */ 5621 goto out;
5457 ocfs2_xattr_bucket_relse(xs->bucket);
5458 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5459 xi->name_index,
5460 xi->name, xs);
5461 if (ret && ret != -ENODATA)
5462 goto out;
5463 xs->not_found = ret;
5464 allocation = 1;
5465 goto try_again;
5466 } 5622 }
5467 5623
5468xattr_set: 5624 /*
5469 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5470out: 5644out:
5471 mlog_exit(ret); 5645 mlog_exit(ret);
5472 return ret; 5646 return ret;
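The restructuring above replaces the old try_again loop with a simple protocol: ocfs2_xa_set() reports -ENOSPC, the bucket helper defragments and retries once, and the index-block caller grows the hash by one bucket and retries once more. A runnable userspace toy of that control flow (all names and numbers here are illustrative, not the kernel API):

#include <errno.h>
#include <stdio.h>

/* stands in for ocfs2_xa_set(): succeed iff the bucket has room */
static int try_set(int free_bytes, int need)
{
        return need <= free_bytes ? 0 : -ENOSPC;
}

/* stands in for ocfs2_xattr_set_entry_bucket(): try, defragment, try again */
static int set_entry_bucket(int *free_bytes, int reclaimable, int need)
{
        int ret = try_set(*free_bytes, need);

        if (ret != -ENOSPC)
                return ret;
        *free_bytes += reclaimable;             /* "defragment" the bucket */
        return try_set(*free_bytes, need);
}

int main(void)
{
        int free_bytes = 10, need = 40;
        int ret = set_entry_bucket(&free_bytes, 20, need);

        if (ret == -ENOSPC) {                   /* index-block caller grows... */
                free_bytes += 100;              /* ..."adds a new bucket"... */
                ret = set_entry_bucket(&free_bytes, 0, need); /* ...retries once */
        }
        printf("ret = %d\n", ret);              /* prints "ret = 0" */
        return 0;
}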
@@ -5678,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5678 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5679 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5680 */ 5854 */
5681 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5682 5856
5683 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5684 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6354,9 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6354 int indexed) 6528 int indexed)
6355{ 6529{
6356 int ret; 6530 int ret;
6357 handle_t *handle;
6358 struct ocfs2_alloc_context *meta_ac; 6531 struct ocfs2_alloc_context *meta_ac;
6359 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = {
6534 .meta_ac = meta_ac,
6535 };
6360 6536
6361 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
6362 if (ret < 0) { 6538 if (ret < 0) {
@@ -6364,21 +6540,21 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6364 return ret; 6540 return ret;
6365 } 6541 }
6366 6542
6367 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6543 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6368 if (IS_ERR(handle)) { 6544 if (IS_ERR(ctxt.handle)) {
6369 ret = PTR_ERR(handle); 6545 ret = PTR_ERR(ctxt.handle);
6370 mlog_errno(ret); 6546 mlog_errno(ret);
6371 goto out; 6547 goto out;
6372 } 6548 }
6373 6549
6374 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6550 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6375 (unsigned long long)fe_bh->b_blocknr, indexed); 6551 (unsigned long long)fe_bh->b_blocknr, indexed);
6376 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6552 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6377 meta_ac, ret_bh, indexed); 6553 ret_bh);
6378 if (ret) 6554 if (ret)
6379 mlog_errno(ret); 6555 mlog_errno(ret);
6380 6556
6381 ocfs2_commit_trans(osb, handle); 6557 ocfs2_commit_trans(osb, ctxt.handle);
6382out: 6558out:
6383 ocfs2_free_alloc_context(meta_ac); 6559 ocfs2_free_alloc_context(meta_ac);
6384 return ret; 6560 return ret;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 64bc8998ac9a..e8865c11777f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -412,9 +412,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
412 pdev = part_to_dev(p); 412 pdev = part_to_dev(p);
413 413
414 p->start_sect = start; 414 p->start_sect = start;
415 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 415 p->alignment_offset =
416 p->discard_alignment = queue_sector_discard_alignment(disk->queue, 416 queue_limit_alignment_offset(&disk->queue->limits, start);
417 start); 417 p->discard_alignment =
418 queue_limit_discard_alignment(&disk->queue->limits, start);
418 p->nr_sects = len; 419 p->nr_sects = len;
419 p->partno = partno; 420 p->partno = partno;
420 p->policy = get_disk_ro(disk); 421 p->policy = get_disk_ro(disk);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 13b5d0708175..18e20feee251 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -270,7 +270,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
270 blocked = p->blocked; 270 blocked = p->blocked;
271 collect_sigign_sigcatch(p, &ignored, &caught); 271 collect_sigign_sigcatch(p, &ignored, &caught);
272 num_threads = atomic_read(&p->signal->count); 272 num_threads = atomic_read(&p->signal->count);
273 rcu_read_lock(); /* FIXME: is this correct? */
273 qsize = atomic_read(&__task_cred(p)->user->sigpending); 274 qsize = atomic_read(&__task_cred(p)->user->sigpending);
275 rcu_read_unlock();
274 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 276 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
275 unlock_task_sighand(p, &flags); 277 unlock_task_sighand(p, &flags);
276 } 278 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58324c299165..623e2ffb5d2b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1095,8 +1095,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1095 if (!capable(CAP_AUDIT_CONTROL)) 1095 if (!capable(CAP_AUDIT_CONTROL))
1096 return -EPERM; 1096 return -EPERM;
1097 1097
1098 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) 1098 rcu_read_lock();
1099 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1100 rcu_read_unlock();
1099 return -EPERM; 1101 return -EPERM;
1102 }
1103 rcu_read_unlock();
1100 1104
1101 if (count >= PAGE_SIZE) 1105 if (count >= PAGE_SIZE)
1102 count = PAGE_SIZE - 1; 1106 count = PAGE_SIZE - 1;
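The fix above wraps pid_task() in an RCU read-side critical section: the task pointer it returns is only guaranteed to stay valid while rcu_read_lock() is held, so the lookup and the comparison must both happen inside it. A minimal sketch of the guarded lookup as a kernel-style helper (a hypothetical function, not part of the patch):

#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static bool demo_is_current(struct pid *pid)
{
        bool same;

        rcu_read_lock();
        /* lookup and comparison both under the read lock */
        same = (pid_task(pid, PIDTYPE_PID) == current);
        rcu_read_unlock();
        return same;
}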
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
12#include <linux/poll.h> 12#include <linux/poll.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/syslog.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io.h> 18#include <asm/io.h>
18 19
19extern wait_queue_head_t log_wait; 20extern wait_queue_head_t log_wait;
20 21
21extern int do_syslog(int type, char __user *bug, int count);
22
23static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
24{ 23{
25 return do_syslog(1,NULL,0); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
26} 25}
27 26
28static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
29{ 28{
30 (void) do_syslog(0,NULL,0); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
31 return 0; 30 return 0;
32} 31}
33 32
34static ssize_t kmsg_read(struct file *file, char __user *buf, 33static ssize_t kmsg_read(struct file *file, char __user *buf,
35 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
36{ 35{
37 if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(2, buf, count); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(9, NULL, 0)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
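The magic numbers replaced above (1, 0, 2, 9) map onto the new SYSLOG_ACTION_* constants from <linux/syslog.h>; the same command numbers are reachable from userspace through klogctl(3). A small standalone reader, with the two constants it needs spelled out locally since older libc headers may not carry them:

#include <stdio.h>
#include <sys/klog.h>

#define SYSLOG_ACTION_READ_ALL    3   /* non-destructive read of the ring */
#define SYSLOG_ACTION_SIZE_BUFFER 10  /* size of the kernel log buffer */

int main(void)
{
        char buf[4096];
        int n = klogctl(SYSLOG_ACTION_READ_ALL, buf, sizeof(buf));

        if (n > 0)
                fwrite(buf, 1, n, stdout);
        return n < 0;
}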
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 123257bb356b..f8650dce74fb 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -10,16 +10,19 @@
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/stat.h> 11#include <linux/stat.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h>
14#include <linux/module.h>
13#include <asm/prom.h> 15#include <asm/prom.h>
14#include <asm/uaccess.h> 16#include <asm/uaccess.h>
15#include "internal.h" 17#include "internal.h"
16 18
17#ifndef HAVE_ARCH_DEVTREE_FIXUPS
18static inline void set_node_proc_entry(struct device_node *np, 19static inline void set_node_proc_entry(struct device_node *np,
19 struct proc_dir_entry *de) 20 struct proc_dir_entry *de)
20{ 21{
21} 22#ifdef HAVE_ARCH_DEVTREE_FIXUPS
23 np->pde = de;
22#endif 24#endif
25}
23 26
24static struct proc_dir_entry *proc_device_tree; 27static struct proc_dir_entry *proc_device_tree;
25 28
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..5afd554efad3 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
674 674
675 return NULL; 675 return NULL;
676} 676}
677
678EXPORT_SYMBOL(seq_list_start); 677EXPORT_SYMBOL(seq_list_start);
679 678
680struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) 679struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
684 683
685 return seq_list_start(head, pos - 1); 684 return seq_list_start(head, pos - 1);
686} 685}
687
688EXPORT_SYMBOL(seq_list_start_head); 686EXPORT_SYMBOL(seq_list_start_head);
689 687
690struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) 688struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
695 ++*ppos; 693 ++*ppos;
696 return lh == head ? NULL : lh; 694 return lh == head ? NULL : lh;
697} 695}
698
699EXPORT_SYMBOL(seq_list_next); 696EXPORT_SYMBOL(seq_list_next);
697
698/**
699 * seq_hlist_start - start an iteration of a hlist
700 * @head: the head of the hlist
701 * @pos: the start position of the sequence
702 *
703 * Called at seq_file->op->start().
704 */
705struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
706{
707 struct hlist_node *node;
708
709 hlist_for_each(node, head)
710 if (pos-- == 0)
711 return node;
712 return NULL;
713}
714EXPORT_SYMBOL(seq_hlist_start);
715
716/**
717 * seq_hlist_start_head - start an iteration of a hlist
718 * @head: the head of the hlist
719 * @pos: the start position of the sequence
720 *
721 * Called at seq_file->op->start(). Call this function if you want to
722 * print a header at the top of the output.
723 */
724struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
725{
726 if (!pos)
727 return SEQ_START_TOKEN;
728
729 return seq_hlist_start(head, pos - 1);
730}
731EXPORT_SYMBOL(seq_hlist_start_head);
732
733/**
734 * seq_hlist_next - move to the next position of the hlist
735 * @v: the current iterator
736 * @head: the head of the hlist
737 * @pos: the current position
738 *
739 * Called at seq_file->op->next().
740 */
741struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
742 loff_t *ppos)
743{
744 struct hlist_node *node = v;
745
746 ++*ppos;
747 if (v == SEQ_START_TOKEN)
748 return head->first;
749 else
750 return node->next;
751}
752EXPORT_SYMBOL(seq_hlist_next);
753
754/**
755 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
756 * @head: the head of the hlist
757 * @pos: the start position of the sequence
758 *
759 * Called at seq_file->op->start().
760 *
761 * This list-traversal primitive may safely run concurrently with
762 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
763 * as long as the traversal is guarded by rcu_read_lock().
764 */
765struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
766 loff_t pos)
767{
768 struct hlist_node *node;
769
770 __hlist_for_each_rcu(node, head)
771 if (pos-- == 0)
772 return node;
773 return NULL;
774}
775EXPORT_SYMBOL(seq_hlist_start_rcu);
776
777/**
778 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
779 * @head: the head of the hlist
780 * @pos: the start position of the sequence
781 *
782 * Called at seq_file->op->start(). Call this function if you want to
783 * print a header at the top of the output.
784 *
785 * This list-traversal primitive may safely run concurrently with
786 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
787 * as long as the traversal is guarded by rcu_read_lock().
788 */
789struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
790 loff_t pos)
791{
792 if (!pos)
793 return SEQ_START_TOKEN;
794
795 return seq_hlist_start_rcu(head, pos - 1);
796}
797EXPORT_SYMBOL(seq_hlist_start_head_rcu);
798
799/**
800 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
801 * @v: the current iterator
802 * @head: the head of the hlist
803 * @pos: the current position
804 *
805 * Called at seq_file->op->next().
806 *
807 * This list-traversal primitive may safely run concurrently with
808 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
809 * as long as the traversal is guarded by rcu_read_lock().
810 */
811struct hlist_node *seq_hlist_next_rcu(void *v,
812 struct hlist_head *head,
813 loff_t *ppos)
814{
815 struct hlist_node *node = v;
816
817 ++*ppos;
818 if (v == SEQ_START_TOKEN)
819 return rcu_dereference(head->first);
820 else
821 return rcu_dereference(node->next);
822}
823EXPORT_SYMBOL(seq_hlist_next_rcu);
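As the kernel-doc above says, these helpers slot directly into a seq_operations table. A minimal sketch of a seq_file built on an hlist (the list and the demo_* names are hypothetical, not from the patch; locking around the list is omitted):

#include <linux/list.h>
#include <linux/seq_file.h>

static HLIST_HEAD(demo_list);           /* entries embed a struct hlist_node */

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        /* *_start_head so the first iteration yields SEQ_START_TOKEN */
        return seq_hlist_start_head(&demo_list, *pos);
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        return seq_hlist_next(v, &demo_list, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_puts(m, "node\n");  /* header line */
        else
                seq_printf(m, "%p\n", v);  /* v is a struct hlist_node * */
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start = demo_start,
        .next  = demo_next,
        .stop  = demo_stop,
        .show  = demo_show,
};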
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 56641fe52a23..5c5a366aa332 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char 19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6
20 20
21XFS_LINUX := linux-2.6 21XFS_LINUX := linux-2.6
22 22
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..bc7405585def 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/vmalloc.h>
20#include <linux/highmem.h> 19#include <linux/highmem.h>
21#include <linux/swap.h> 20#include <linux/swap.h>
22#include <linux/blkdev.h> 21#include <linux/blkdev.h>
@@ -24,8 +23,25 @@
24#include "time.h" 23#include "time.h"
25#include "kmem.h" 24#include "kmem.h"
26 25
27#define MAX_VMALLOCS 6 26/*
28#define MAX_SLAB_SIZE 0x20000 27 * Greedy allocation. May fail and may return vmalloced memory.
28 *
29 * Must be freed using kmem_free_large.
30 */
31void *
32kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
33{
34 void *ptr;
35 size_t kmsize = maxsize;
36
37 while (!(ptr = kmem_zalloc_large(kmsize))) {
38 if ((kmsize >>= 1) <= minsize)
39 kmsize = minsize;
40 }
41 if (ptr)
42 *size = kmsize;
43 return ptr;
44}
29 45
30void * 46void *
31kmem_alloc(size_t size, unsigned int __nocast flags) 47kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 50 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 51 void *ptr;
36 52
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __func__, (long)size);
41 dump_stack();
42 }
43#endif
44
45 do { 53 do {
46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 54 ptr = kmalloc(size, lflags);
47 ptr = kmalloc(size, lflags);
48 else
49 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
50 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 55 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
51 return ptr; 56 return ptr;
52 if (!(++retries % 100)) 57 if (!(++retries % 100))
@@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
68 return ptr; 73 return ptr;
69} 74}
70 75
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
92void 76void
93kmem_free(const void *ptr) 77kmem_free(const void *ptr)
94{ 78{
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/vmalloc.h>
24 25
25/* 26/*
26 * General memory allocation interfaces 27 * General memory allocation interfaces
@@ -30,7 +31,6 @@
30#define KM_NOSLEEP 0x0002u 31#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 32#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 33#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
34 34
35/* 35/*
36 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
42{ 42{
43 gfp_t lflags; 43 gfp_t lflags;
44 44
45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
46 46
47 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
48 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
56 56
57extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
61extern void kmem_free(const void *); 60extern void kmem_free(const void *);
62 61
62static inline void *kmem_zalloc_large(size_t size)
63{
64 void *ptr;
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70}
71static inline void kmem_free_large(void *ptr)
72{
73 vfree(ptr);
74}
75
76extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
77
63/* 78/*
64 * Zone interfaces 79 * Zone interfaces
65 */ 80 */
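Putting the two new pieces together: kmem_zalloc_greedy() now always hands back vmalloc-backed memory (via kmem_zalloc_large()), so call sites must pair it with kmem_free_large() rather than kmem_free(). A hypothetical call site, assuming nothing beyond the interfaces added above:

#include <linux/string.h>

static void demo_use_greedy(void)
{
        size_t size;
        void *buf = kmem_zalloc_greedy(&size, PAGE_SIZE, 16 * PAGE_SIZE);

        if (buf) {
                /* on success, size holds the amount actually allocated */
                memset(buf, 0x5a, size);        /* ... use the buffer ... */
                kmem_free_large(buf);           /* vmalloc-backed: not kmem_free() */
        }
}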
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 883ca5ab8af5..bf85bbe4a9ae 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type)
106 struct posix_acl *acl; 106 struct posix_acl *acl;
107 struct xfs_acl *xfs_acl; 107 struct xfs_acl *xfs_acl;
108 int len = sizeof(struct xfs_acl); 108 int len = sizeof(struct xfs_acl);
109 char *ea_name; 109 unsigned char *ea_name;
110 int error; 110 int error;
111 111
112 acl = get_cached_acl(inode, type); 112 acl = get_cached_acl(inode, type);
@@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type)
133 if (!xfs_acl) 133 if (!xfs_acl)
134 return ERR_PTR(-ENOMEM); 134 return ERR_PTR(-ENOMEM);
135 135
136 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); 136 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
137 &len, ATTR_ROOT);
137 if (error) { 138 if (error) {
138 /* 139 /*
139 * If the attribute doesn't exist make sure we have a negative 140 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +163,7 @@ STATIC int
162xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 163xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
163{ 164{
164 struct xfs_inode *ip = XFS_I(inode); 165 struct xfs_inode *ip = XFS_I(inode);
165 char *ea_name; 166 unsigned char *ea_name;
166 int error; 167 int error;
167 168
168 if (S_ISLNK(inode->i_mode)) 169 if (S_ISLNK(inode->i_mode))
@@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 (sizeof(struct xfs_acl_entry) * 195 (sizeof(struct xfs_acl_entry) *
195 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 196 (XFS_ACL_MAX_ENTRIES - acl->a_count));
196 197
197 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, 198 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
198 len, ATTR_ROOT); 199 len, ATTR_ROOT);
199 200
200 kfree(xfs_acl); 201 kfree(xfs_acl);
@@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
262} 263}
263 264
264static int 265static int
265xfs_acl_exists(struct inode *inode, char *name) 266xfs_acl_exists(struct inode *inode, unsigned char *name)
266{ 267{
267 int len = sizeof(struct xfs_acl); 268 int len = sizeof(struct xfs_acl);
268 269
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..6f76ba85f193 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
36 37
37#include "xfs_sb.h" 38#include "xfs_sb.h"
38#include "xfs_inum.h" 39#include "xfs_inum.h"
@@ -76,6 +77,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
76#define xfs_buf_deallocate(bp) \ 77#define xfs_buf_deallocate(bp) \
77 kmem_zone_free(xfs_buf_zone, (bp)); 78 kmem_zone_free(xfs_buf_zone, (bp));
78 79
80static inline int
81xfs_buf_is_vmapped(
82 struct xfs_buf *bp)
83{
84 /*
85 * Return true if the buffer is vmapped.
86 *
87 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
88 * code is clever enough to know it doesn't have to map a single page,
89 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
90 */
91 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
92}
93
94static inline int
95xfs_buf_vmap_len(
96 struct xfs_buf *bp)
97{
98 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
99}
100
79/* 101/*
80 * Page Region interfaces. 102 * Page Region interfaces.
81 * 103 *
@@ -314,7 +336,7 @@ xfs_buf_free(
314 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 336 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
315 uint i; 337 uint i;
316 338
317 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 339 if (xfs_buf_is_vmapped(bp))
318 free_address(bp->b_addr - bp->b_offset); 340 free_address(bp->b_addr - bp->b_offset);
319 341
320 for (i = 0; i < bp->b_page_count; i++) { 342 for (i = 0; i < bp->b_page_count; i++) {
@@ -1051,22 +1073,30 @@ xfs_buf_ioerror(
1051} 1073}
1052 1074
1053int 1075int
1054xfs_bawrite( 1076xfs_bwrite(
1055 void *mp, 1077 struct xfs_mount *mp,
1056 struct xfs_buf *bp) 1078 struct xfs_buf *bp)
1057{ 1079{
1058 trace_xfs_buf_bawrite(bp, _RET_IP_); 1080 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
1081 int error = 0;
1059 1082
1060 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1083 bp->b_strat = xfs_bdstrat_cb;
1084 bp->b_mount = mp;
1085 bp->b_flags |= XBF_WRITE;
1086 if (!iowait)
1087 bp->b_flags |= _XBF_RUN_QUEUES;
1061 1088
1062 xfs_buf_delwri_dequeue(bp); 1089 xfs_buf_delwri_dequeue(bp);
1090 xfs_buf_iostrategy(bp);
1063 1091
1064 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1092 if (iowait) {
1065 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1093 error = xfs_buf_iowait(bp);
1094 if (error)
1095 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1096 xfs_buf_relse(bp);
1097 }
1066 1098
1067 bp->b_mount = mp; 1099 return error;
1068 bp->b_strat = xfs_bdstrat_cb;
1069 return xfs_bdstrat_cb(bp);
1070} 1100}
1071 1101
1072void 1102void
@@ -1085,6 +1115,126 @@ xfs_bdwrite(
1085 xfs_buf_delwri_queue(bp, 1); 1115 xfs_buf_delwri_queue(bp, 1);
1086} 1116}
1087 1117
1118/*
1119 * Called when we want to stop a buffer from getting written or read.
1120 * We attach the EIO error, muck with its flags, and call biodone
1121 * so that the proper iodone callbacks get called.
1122 */
1123STATIC int
1124xfs_bioerror(
1125 xfs_buf_t *bp)
1126{
1127#ifdef XFSERRORDEBUG
1128 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1129#endif
1130
1131 /*
1132 * No need to wait until the buffer is unpinned, we aren't flushing it.
1133 */
1134 XFS_BUF_ERROR(bp, EIO);
1135
1136 /*
1137 * We're calling biodone, so delete XBF_DONE flag.
1138 */
1139 XFS_BUF_UNREAD(bp);
1140 XFS_BUF_UNDELAYWRITE(bp);
1141 XFS_BUF_UNDONE(bp);
1142 XFS_BUF_STALE(bp);
1143
1144 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1145 xfs_biodone(bp);
1146
1147 return EIO;
1148}
1149
1150/*
1151 * Same as xfs_bioerror, except that we are releasing the buffer
1152 * here ourselves, and avoiding the biodone call.
1153 * This is meant for userdata errors; metadata bufs come with
1154 * iodone functions attached, so that we can track down errors.
1155 */
1156STATIC int
1157xfs_bioerror_relse(
1158 struct xfs_buf *bp)
1159{
1160 int64_t fl = XFS_BUF_BFLAGS(bp);
1161 /*
1162 * No need to wait until the buffer is unpinned.
1163 * We aren't flushing it.
1164 *
1165 * chunkhold expects B_DONE to be set, whether
1166 * we actually finish the I/O or not. We don't want to
1167 * change that interface.
1168 */
1169 XFS_BUF_UNREAD(bp);
1170 XFS_BUF_UNDELAYWRITE(bp);
1171 XFS_BUF_DONE(bp);
1172 XFS_BUF_STALE(bp);
1173 XFS_BUF_CLR_IODONE_FUNC(bp);
1174 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1175 if (!(fl & XBF_ASYNC)) {
1176 /*
1177 * Mark b_error and B_ERROR _both_.
1178 * Lots of chunkcache code assumes that.
1179 * There's no reason to mark error for
1180 * ASYNC buffers.
1181 */
1182 XFS_BUF_ERROR(bp, EIO);
1183 XFS_BUF_FINISH_IOWAIT(bp);
1184 } else {
1185 xfs_buf_relse(bp);
1186 }
1187
1188 return EIO;
1189}
1190
1191
1192/*
1193 * All xfs metadata buffers except log state machine buffers
1194 * get this attached as their b_bdstrat callback function.
1195 * This is so that we can catch a buffer
1196 * after prematurely unpinning it to forcibly shutdown the filesystem.
1197 */
1198int
1199xfs_bdstrat_cb(
1200 struct xfs_buf *bp)
1201{
1202 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1203 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1204 /*
1205 * Metadata write that didn't get logged but
1206 * written delayed anyway. These aren't associated
1207 * with a transaction, and can be ignored.
1208 */
1209 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1210 return xfs_bioerror_relse(bp);
1211 else
1212 return xfs_bioerror(bp);
1213 }
1214
1215 xfs_buf_iorequest(bp);
1216 return 0;
1217}
1218
1219/*
1220 * Wrapper around bdstrat so that we can stop data from going to disk in case
1221 * we are shutting down the filesystem. Typically user data goes thru this
1222 * path; one of the exceptions is the superblock.
1223 */
1224void
1225xfsbdstrat(
1226 struct xfs_mount *mp,
1227 struct xfs_buf *bp)
1228{
1229 if (XFS_FORCED_SHUTDOWN(mp)) {
1230 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1231 xfs_bioerror_relse(bp);
1232 return;
1233 }
1234
1235 xfs_buf_iorequest(bp);
1236}
1237
1088STATIC void 1238STATIC void
1089_xfs_buf_ioend( 1239_xfs_buf_ioend(
1090 xfs_buf_t *bp, 1240 xfs_buf_t *bp,
@@ -1107,6 +1257,9 @@ xfs_buf_bio_end_io(
1107 1257
1108 xfs_buf_ioerror(bp, -error); 1258 xfs_buf_ioerror(bp, -error);
1109 1259
1260 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1261 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1262
1110 do { 1263 do {
1111 struct page *page = bvec->bv_page; 1264 struct page *page = bvec->bv_page;
1112 1265
@@ -1216,6 +1369,10 @@ next_chunk:
1216 1369
1217submit_io: 1370submit_io:
1218 if (likely(bio->bi_size)) { 1371 if (likely(bio->bi_size)) {
1372 if (xfs_buf_is_vmapped(bp)) {
1373 flush_kernel_vmap_range(bp->b_addr,
1374 xfs_buf_vmap_len(bp));
1375 }
1219 submit_bio(rw, bio); 1376 submit_bio(rw, bio);
1220 if (size) 1377 if (size)
1221 goto next_chunk; 1378 goto next_chunk;
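The two vmap hunks above exist because a vmapped buffer is reachable through two kernel virtual addresses (the vmap alias and the per-page mappings), and on virtually-indexed caches those aliases can hold stale lines. The pairing, distilled into two sketch helpers (same kernel calls as in the patch; the demo_* wrappers are illustrative):

static void demo_pre_io(struct xfs_buf *bp)
{
        /* push dirty lines out of the vmap alias before the device reads memory */
        if (xfs_buf_is_vmapped(bp))
                flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
}

static void demo_post_read(struct xfs_buf *bp)
{
        /* drop stale lines in the vmap alias before the CPU reads fresh data */
        if (xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
}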
@@ -1296,7 +1453,7 @@ xfs_buf_iomove(
1296 xfs_buf_t *bp, /* buffer to process */ 1453 xfs_buf_t *bp, /* buffer to process */
1297 size_t boff, /* starting buffer offset */ 1454 size_t boff, /* starting buffer offset */
1298 size_t bsize, /* length to copy */ 1455 size_t bsize, /* length to copy */
1299 caddr_t data, /* data address */ 1456 void *data, /* data address */
1300 xfs_buf_rw_t mode) /* read/write/zero flag */ 1457 xfs_buf_rw_t mode) /* read/write/zero flag */
1301{ 1458{
1302 size_t bend, cpoff, csize; 1459 size_t bend, cpoff, csize;
@@ -1378,8 +1535,8 @@ xfs_alloc_bufhash(
1378 1535
1379 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1536 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1380 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1537 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1381 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1538 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1382 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1539 sizeof(xfs_bufhash_t));
1383 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1540 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1384 spin_lock_init(&btp->bt_hash[i].bh_lock); 1541 spin_lock_init(&btp->bt_hash[i].bh_lock);
1385 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1542 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1547,7 @@ STATIC void
1390xfs_free_bufhash( 1547xfs_free_bufhash(
1391 xfs_buftarg_t *btp) 1548 xfs_buftarg_t *btp)
1392{ 1549{
1393 kmem_free(btp->bt_hash); 1550 kmem_free_large(btp->bt_hash);
1394 btp->bt_hash = NULL; 1551 btp->bt_hash = NULL;
1395} 1552}
1396 1553
@@ -1595,6 +1752,11 @@ xfs_buf_delwri_queue(
1595 list_del(&bp->b_list); 1752 list_del(&bp->b_list);
1596 } 1753 }
1597 1754
1755 if (list_empty(dwq)) {
1756 /* start xfsbufd as it is about to have something to do */
1757 wake_up_process(bp->b_target->bt_task);
1758 }
1759
1598 bp->b_flags |= _XBF_DELWRI_Q; 1760 bp->b_flags |= _XBF_DELWRI_Q;
1599 list_add_tail(&bp->b_list, dwq); 1761 list_add_tail(&bp->b_list, dwq);
1600 bp->b_queuetime = jiffies; 1762 bp->b_queuetime = jiffies;
@@ -1626,6 +1788,35 @@ xfs_buf_delwri_dequeue(
1626 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); 1788 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1627} 1789}
1628 1790
1791/*
1792 * If a delwri buffer needs to be pushed before it has aged out, then promote
1793 * it to the head of the delwri queue so that it will be flushed on the next
1794 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1795 * than the age currently needed to flush the buffer. Hence the next time the
1796 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1797 */
1798void
1799xfs_buf_delwri_promote(
1800 struct xfs_buf *bp)
1801{
1802 struct xfs_buftarg *btp = bp->b_target;
1803 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1804
1805 ASSERT(bp->b_flags & XBF_DELWRI);
1806 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1807
1808 /*
1809 * Check the buffer age before locking the delayed write queue as we
1810 * don't need to promote buffers that are already past the flush age.
1811 */
1812 if (bp->b_queuetime < jiffies - age)
1813 return;
1814 bp->b_queuetime = jiffies - age;
1815 spin_lock(&btp->bt_delwrite_lock);
1816 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1817 spin_unlock(&btp->bt_delwrite_lock);
1818}
1819
1629STATIC void 1820STATIC void
1630xfs_buf_runall_queues( 1821xfs_buf_runall_queues(
1631 struct workqueue_struct *queue) 1822 struct workqueue_struct *queue)
@@ -1644,6 +1835,8 @@ xfsbufd_wakeup(
1644 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1835 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1645 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1836 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1646 continue; 1837 continue;
1838 if (list_empty(&btp->bt_delwrite_queue))
1839 continue;
1647 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1840 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1648 wake_up_process(btp->bt_task); 1841 wake_up_process(btp->bt_task);
1649 } 1842 }
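This hunk and the delwri-queue change earlier form a pair: the daemon may now sleep indefinitely when its queue is empty, so whoever queues the first buffer must wake it. A generic sketch of that idle-kthread pattern (demo_* names are hypothetical; the real code protects the list with bt_delwrite_lock):

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/sched.h>

static LIST_HEAD(demo_queue);
static struct task_struct *demo_task;

static int demo_daemon(void *data)
{
        while (!kthread_should_stop()) {
                long tout = list_empty(&demo_queue) ?
                                MAX_SCHEDULE_TIMEOUT : msecs_to_jiffies(100);

                schedule_timeout_interruptible(tout);
                /* ... drain demo_queue here ... */
        }
        return 0;
}

static void demo_queue_item(struct list_head *item)
{
        if (list_empty(&demo_queue))    /* empty -> non-empty transition */
                wake_up_process(demo_task);
        list_add_tail(item, &demo_queue);
}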
@@ -1694,20 +1887,53 @@ xfs_buf_delwri_split(
1694 1887
1695} 1888}
1696 1889
1890/*
1891 * Compare function is more complex than it needs to be because
1892 * the return value is only 32 bits and we are doing comparisons
1893 * on 64 bit values
1894 */
1895static int
1896xfs_buf_cmp(
1897 void *priv,
1898 struct list_head *a,
1899 struct list_head *b)
1900{
1901 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1902 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1903 xfs_daddr_t diff;
1904
1905 diff = ap->b_bn - bp->b_bn;
1906 if (diff < 0)
1907 return -1;
1908 if (diff > 0)
1909 return 1;
1910 return 0;
1911}
1912
1913void
1914xfs_buf_delwri_sort(
1915 xfs_buftarg_t *target,
1916 struct list_head *list)
1917{
1918 list_sort(NULL, list, xfs_buf_cmp);
1919}
1920
1697STATIC int 1921STATIC int
1698xfsbufd( 1922xfsbufd(
1699 void *data) 1923 void *data)
1700{ 1924{
1701 struct list_head tmp; 1925 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1702 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1703 int count;
1704 xfs_buf_t *bp;
1705 1926
1706 current->flags |= PF_MEMALLOC; 1927 current->flags |= PF_MEMALLOC;
1707 1928
1708 set_freezable(); 1929 set_freezable();
1709 1930
1710 do { 1931 do {
1932 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1933 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1934 int count = 0;
1935 struct list_head tmp;
1936
1711 if (unlikely(freezing(current))) { 1937 if (unlikely(freezing(current))) {
1712 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1938 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1713 refrigerator(); 1939 refrigerator();
@@ -1715,17 +1941,16 @@ xfsbufd(
1715 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1941 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1716 } 1942 }
1717 1943
1718 schedule_timeout_interruptible( 1944 /* sleep for a long time if there is nothing to do. */
1719 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1945 if (list_empty(&target->bt_delwrite_queue))
1946 tout = MAX_SCHEDULE_TIMEOUT;
1947 schedule_timeout_interruptible(tout);
1720 1948
1721 xfs_buf_delwri_split(target, &tmp, 1949 xfs_buf_delwri_split(target, &tmp, age);
1722 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1950 list_sort(NULL, &tmp, xfs_buf_cmp);
1723
1724 count = 0;
1725 while (!list_empty(&tmp)) { 1951 while (!list_empty(&tmp)) {
1726 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1952 struct xfs_buf *bp;
1727 ASSERT(target == bp->b_target); 1953 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1728
1729 list_del_init(&bp->b_list); 1954 list_del_init(&bp->b_list);
1730 xfs_buf_iostrategy(bp); 1955 xfs_buf_iostrategy(bp);
1731 count++; 1956 count++;
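The comment on xfs_buf_cmp() above is worth unpacking: list_sort() callbacks return int, so simply returning the 64-bit difference ap->b_bn - bp->b_bn would truncate and can flip or lose the sign. A standalone demonstration of the failure mode:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int64_t a = 0, b = 0x100000000LL;       /* a < b */
        int bad = (int)(a - b);                 /* low 32 bits are 0: "equal" */
        int good = (a - b < 0) ? -1 : (a - b > 0) ? 1 : 0;

        printf("bad=%d good=%d\n", bad, good);  /* prints "bad=0 good=-1" */
        return 0;
}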
@@ -1751,42 +1976,45 @@ xfs_flush_buftarg(
1751 xfs_buftarg_t *target, 1976 xfs_buftarg_t *target,
1752 int wait) 1977 int wait)
1753{ 1978{
1754 struct list_head tmp; 1979 xfs_buf_t *bp;
1755 xfs_buf_t *bp, *n;
1756 int pincount = 0; 1980 int pincount = 0;
1981 LIST_HEAD(tmp_list);
1982 LIST_HEAD(wait_list);
1757 1983
1758 xfs_buf_runall_queues(xfsconvertd_workqueue); 1984 xfs_buf_runall_queues(xfsconvertd_workqueue);
1759 xfs_buf_runall_queues(xfsdatad_workqueue); 1985 xfs_buf_runall_queues(xfsdatad_workqueue);
1760 xfs_buf_runall_queues(xfslogd_workqueue); 1986 xfs_buf_runall_queues(xfslogd_workqueue);
1761 1987
1762 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1988 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1763 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1989 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1764 1990
1765 /* 1991 /*
1766 * Dropped the delayed write list lock, now walk the temporary list 1992 * Dropped the delayed write list lock, now walk the temporary list.
1993 * All I/O is issued async and then if we need to wait for completion
1994 * we do that after issuing all the IO.
1767 */ 1995 */
1768 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1996 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1997 while (!list_empty(&tmp_list)) {
1998 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1769 ASSERT(target == bp->b_target); 1999 ASSERT(target == bp->b_target);
1770 if (wait) 2000 list_del_init(&bp->b_list);
2001 if (wait) {
1771 bp->b_flags &= ~XBF_ASYNC; 2002 bp->b_flags &= ~XBF_ASYNC;
1772 else 2003 list_add(&bp->b_list, &wait_list);
1773 list_del_init(&bp->b_list); 2004 }
1774
1775 xfs_buf_iostrategy(bp); 2005 xfs_buf_iostrategy(bp);
1776 } 2006 }
1777 2007
1778 if (wait) 2008 if (wait) {
2009 /* Expedite and wait for IO to complete. */
1779 blk_run_address_space(target->bt_mapping); 2010 blk_run_address_space(target->bt_mapping);
2011 while (!list_empty(&wait_list)) {
2012 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1780 2013
1781 /* 2014 list_del_init(&bp->b_list);
1782 * Remaining list items must be flushed before returning 2015 xfs_iowait(bp);
1783 */ 2016 xfs_buf_relse(bp);
1784 while (!list_empty(&tmp)) { 2017 }
1785 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1786
1787 list_del_init(&bp->b_list);
1788 xfs_iowait(bp);
1789 xfs_buf_relse(bp);
1790 } 2018 }
1791 2019
1792 return pincount; 2020 return pincount;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
232extern void xfs_buf_unlock(xfs_buf_t *); 232extern void xfs_buf_unlock(xfs_buf_t *);
233 233
234/* Buffer Read and Write Routines */ 234/* Buffer Read and Write Routines */
235extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 235extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
237
238extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
239extern int xfs_bdstrat_cb(struct xfs_buf *);
240
237extern void xfs_buf_ioend(xfs_buf_t *, int); 241extern void xfs_buf_ioend(xfs_buf_t *, int);
238extern void xfs_buf_ioerror(xfs_buf_t *, int); 242extern void xfs_buf_ioerror(xfs_buf_t *, int);
239extern int xfs_buf_iorequest(xfs_buf_t *); 243extern int xfs_buf_iorequest(xfs_buf_t *);
240extern int xfs_buf_iowait(xfs_buf_t *); 244extern int xfs_buf_iowait(xfs_buf_t *);
241extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
242 xfs_buf_rw_t); 246 xfs_buf_rw_t);
243 247
244static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
261 265
262/* Delayed Write Buffer Routines */ 266/* Delayed Write Buffer Routines */
263extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 267extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *);
264 269
265/* Buffer Daemon Setup Routines */ 270/* Buffer Daemon Setup Routines */
266extern int xfs_buf_init(void); 271extern int xfs_buf_init(void);
@@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void);
270 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 275 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
271 276
272 277
273#define XFS_B_ASYNC XBF_ASYNC
274#define XFS_B_DELWRI XBF_DELWRI
275#define XFS_B_READ XBF_READ
276#define XFS_B_WRITE XBF_WRITE
277#define XFS_B_STALE XBF_STALE
278
279#define XFS_BUF_TRYLOCK XBF_TRYLOCK
280#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
281#define XFS_BUF_LOCK XBF_LOCK
282#define XFS_BUF_MAPPED XBF_MAPPED
283
284#define BUF_BUSY XBF_DONT_BLOCK
285
286#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 278#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
287#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 279#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
288 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 280 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
289 281
290#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 282#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
291#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 283#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
292#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 284#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
293#define XFS_BUF_SUPER_STALE(bp) do { \ 285#define XFS_BUF_SUPER_STALE(bp) do { \
294 XFS_BUF_STALE(bp); \ 286 XFS_BUF_STALE(bp); \
295 xfs_buf_delwri_dequeue(bp); \ 287 xfs_buf_delwri_dequeue(bp); \
296 XFS_BUF_DONE(bp); \ 288 XFS_BUF_DONE(bp); \
297 } while (0) 289 } while (0)
298 290
299#define XFS_BUF_MANAGE XBF_FS_MANAGED
300#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
301 292
302#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
@@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
385 376
386#define xfs_biomove(bp, off, len, data, rw) \ 377#define xfs_biomove(bp, off, len, data, rw) \
387 xfs_buf_iomove((bp), (off), (len), (data), \ 378 xfs_buf_iomove((bp), (off), (len), (data), \
388 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
389 380
390#define xfs_biozero(bp, off, len) \ 381#define xfs_biozero(bp, off, len) \
391 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
392 383
393
394static inline int XFS_bwrite(xfs_buf_t *bp)
395{
396 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
397 int error = 0;
398
399 if (!iowait)
400 bp->b_flags |= _XBF_RUN_QUEUES;
401
402 xfs_buf_delwri_dequeue(bp);
403 xfs_buf_iostrategy(bp);
404 if (iowait) {
405 error = xfs_buf_iowait(bp);
406 xfs_buf_relse(bp);
407 }
408 return error;
409}
410
411#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
412
413#define xfs_iowait(bp) xfs_buf_iowait(bp) 384#define xfs_iowait(bp) xfs_buf_iowait(bp)
414 385
415#define xfs_baread(target, rablkno, ralen) \ 386#define xfs_baread(target, rablkno, ralen) \
@@ -424,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
424extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
425extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
426extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 397extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
398
427#ifdef CONFIG_KDB_MODULES 399#ifdef CONFIG_KDB_MODULES
428extern struct list_head *xfs_get_buftarg_list(void); 400extern struct list_head *xfs_get_buftarg_list(void);
429#endif 401#endif
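With the inline XFS_bwrite() gone from the header, the behaviour lives in xfs_bwrite() itself: it waits for the I/O and releases the buffer unless the caller has marked it XBF_ASYNC. A hypothetical call-site sketch of the two modes:

static int demo_write(struct xfs_mount *mp, struct xfs_buf *bp, bool async)
{
        if (async) {
                /* fire and forget: errors are handled in the completion path */
                bp->b_flags |= XBF_ASYNC;
                return xfs_bwrite(mp, bp);
        }
        /* synchronous: waits for the I/O and releases bp before returning */
        return xfs_bwrite(mp, bp);
}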
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
79 xfs_iflags_clear(ip, XFS_ITRUNCATED); 79 xfs_iflags_clear(ip, XFS_ITRUNCATED);
80 ret = -filemap_fdatawrite(mapping); 80 ret = -filemap_fdatawrite(mapping);
81 } 81 }
82 if (flags & XFS_B_ASYNC) 82 if (flags & XBF_ASYNC)
83 return ret; 83 return ret;
84 ret2 = xfs_wait_on_pages(ip, first, last); 84 ret2 = xfs_wait_on_pages(ip, first, last);
85 if (!ret) 85 if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..4ea1ee18aded 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -447,12 +447,12 @@ xfs_attrlist_by_handle(
447int 447int
448xfs_attrmulti_attr_get( 448xfs_attrmulti_attr_get(
449 struct inode *inode, 449 struct inode *inode,
450 char *name, 450 unsigned char *name,
451 char __user *ubuf, 451 unsigned char __user *ubuf,
452 __uint32_t *len, 452 __uint32_t *len,
453 __uint32_t flags) 453 __uint32_t flags)
454{ 454{
455 char *kbuf; 455 unsigned char *kbuf;
456 int error = EFAULT; 456 int error = EFAULT;
457 457
458 if (*len > XATTR_SIZE_MAX) 458 if (*len > XATTR_SIZE_MAX)
@@ -476,12 +476,12 @@ xfs_attrmulti_attr_get(
476int 476int
477xfs_attrmulti_attr_set( 477xfs_attrmulti_attr_set(
478 struct inode *inode, 478 struct inode *inode,
479 char *name, 479 unsigned char *name,
480 const char __user *ubuf, 480 const unsigned char __user *ubuf,
481 __uint32_t len, 481 __uint32_t len,
482 __uint32_t flags) 482 __uint32_t flags)
483{ 483{
484 char *kbuf; 484 unsigned char *kbuf;
485 int error = EFAULT; 485 int error = EFAULT;
486 486
487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +501,7 @@ xfs_attrmulti_attr_set(
501int 501int
502xfs_attrmulti_attr_remove( 502xfs_attrmulti_attr_remove(
503 struct inode *inode, 503 struct inode *inode,
504 char *name, 504 unsigned char *name,
505 __uint32_t flags) 505 __uint32_t flags)
506{ 506{
507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,7 +519,7 @@ xfs_attrmulti_by_handle(
519 xfs_fsop_attrmulti_handlereq_t am_hreq; 519 xfs_fsop_attrmulti_handlereq_t am_hreq;
520 struct dentry *dentry; 520 struct dentry *dentry;
521 unsigned int i, size; 521 unsigned int i, size;
522 char *attr_name; 522 unsigned char *attr_name;
523 523
524 if (!capable(CAP_SYS_ADMIN)) 524 if (!capable(CAP_SYS_ADMIN))
525 return -XFS_ERROR(EPERM); 525 return -XFS_ERROR(EPERM);
@@ -547,7 +547,7 @@ xfs_attrmulti_by_handle(
547 547
548 error = 0; 548 error = 0;
549 for (i = 0; i < am_hreq.opcount; i++) { 549 for (i = 0; i < am_hreq.opcount; i++) {
550 ops[i].am_error = strncpy_from_user(attr_name, 550 ops[i].am_error = strncpy_from_user((char *)attr_name,
551 ops[i].am_attrname, MAXNAMELEN); 551 ops[i].am_attrname, MAXNAMELEN);
552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
553 error = -ERANGE; 553 error = -ERANGE;
@@ -1431,6 +1431,9 @@ xfs_file_ioctl(
1431 if (!capable(CAP_SYS_ADMIN)) 1431 if (!capable(CAP_SYS_ADMIN))
1432 return -EPERM; 1432 return -EPERM;
1433 1433
1434 if (mp->m_flags & XFS_MOUNT_RDONLY)
1435 return -XFS_ERROR(EROFS);
1436
1434 if (copy_from_user(&inout, arg, sizeof(inout))) 1437 if (copy_from_user(&inout, arg, sizeof(inout)))
1435 return -XFS_ERROR(EFAULT); 1438 return -XFS_ERROR(EFAULT);
1436 1439
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
45extern int 45extern int
46xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
47 struct inode *inode, 47 struct inode *inode,
48 char *name, 48 unsigned char *name,
49 char __user *ubuf, 49 unsigned char __user *ubuf,
50 __uint32_t *len, 50 __uint32_t *len,
51 __uint32_t flags); 51 __uint32_t flags);
52 52
53extern int 53extern int
54 xfs_attrmulti_attr_set( 54xfs_attrmulti_attr_set(
55 struct inode *inode, 55 struct inode *inode,
56 char *name, 56 unsigned char *name,
57 const char __user *ubuf, 57 const unsigned char __user *ubuf,
58 __uint32_t len, 58 __uint32_t len,
59 __uint32_t flags); 59 __uint32_t flags);
60 60
61extern int 61extern int
62xfs_attrmulti_attr_remove( 62xfs_attrmulti_attr_remove(
63 struct inode *inode, 63 struct inode *inode,
64 char *name, 64 unsigned char *name,
65 __uint32_t flags); 65 __uint32_t flags);
66 66
67extern struct dentry * 67extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..0bf6d61f0528 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -411,7 +411,7 @@ xfs_compat_attrmulti_by_handle(
411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
412 struct dentry *dentry; 412 struct dentry *dentry;
413 unsigned int i, size; 413 unsigned int i, size;
414 char *attr_name; 414 unsigned char *attr_name;
415 415
416 if (!capable(CAP_SYS_ADMIN)) 416 if (!capable(CAP_SYS_ADMIN))
417 return -XFS_ERROR(EPERM); 417 return -XFS_ERROR(EPERM);
@@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
440 440
441 error = 0; 441 error = 0;
442 for (i = 0; i < am_hreq.opcount; i++) { 442 for (i = 0; i < am_hreq.opcount; i++) {
443 ops[i].am_error = strncpy_from_user(attr_name, 443 ops[i].am_error = strncpy_from_user((char *)attr_name,
444 compat_ptr(ops[i].am_attrname), 444 compat_ptr(ops[i].am_attrname),
445 MAXNAMELEN); 445 MAXNAMELEN);
446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..e8566bbf0f00 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -140,10 +140,10 @@ xfs_init_security(
140 struct xfs_inode *ip = XFS_I(inode); 140 struct xfs_inode *ip = XFS_I(inode);
141 size_t length; 141 size_t length;
142 void *value; 142 void *value;
143 char *name; 143 unsigned char *name;
144 int error; 144 int error;
145 145
146 error = security_inode_init_security(inode, dir, &name, 146 error = security_inode_init_security(inode, dir, (char **)&name,
147 &value, &length); 147 &value, &length);
148 if (error) { 148 if (error) {
149 if (error == -EOPNOTSUPP) 149 if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 0d32457abef1..eac6f80d786d 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -630,18 +630,9 @@ start:
630 * by root. This keeps people from modifying setuid and 630 * by root. This keeps people from modifying setuid and
631 * setgid binaries. 631 * setgid binaries.
632 */ 632 */
633 633 error = -file_remove_suid(file);
634 if (((xip->i_d.di_mode & S_ISUID) || 634 if (unlikely(error))
635 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == 635 goto out_unlock_internal;
636 (S_ISGID | S_IXGRP))) &&
637 !capable(CAP_FSETID)) {
638 error = xfs_write_clear_setuid(xip);
639 if (likely(!error))
640 error = -file_remove_suid(file);
641 if (unlikely(error)) {
642 goto out_unlock_internal;
643 }
644 }
645 636
646 /* We can write back this queue in page reclaim */ 637 /* We can write back this queue in page reclaim */
647 current->backing_dev_info = mapping->backing_dev_info; 638 current->backing_dev_info = mapping->backing_dev_info;
@@ -784,53 +775,6 @@ write_retry:
784} 775}
785 776
786/* 777/*
787 * All xfs metadata buffers except log state machine buffers
788 * get this attached as their b_bdstrat callback function.
789 * This is so that we can catch a buffer
790 * after prematurely unpinning it to forcibly shutdown the filesystem.
791 */
792int
793xfs_bdstrat_cb(struct xfs_buf *bp)
794{
795 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
796 trace_xfs_bdstrat_shut(bp, _RET_IP_);
797 /*
798 * Metadata write that didn't get logged but
799 * written delayed anyway. These aren't associated
800 * with a transaction, and can be ignored.
801 */
802 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
803 (XFS_BUF_ISREAD(bp)) == 0)
804 return (xfs_bioerror_relse(bp));
805 else
806 return (xfs_bioerror(bp));
807 }
808
809 xfs_buf_iorequest(bp);
810 return 0;
811}
812
813/*
814 * Wrapper around bdstrat so that we can stop data from going to disk in case
815 * we are shutting down the filesystem. Typically user data goes thru this
816 * path; one of the exceptions is the superblock.
817 */
818void
819xfsbdstrat(
820 struct xfs_mount *mp,
821 struct xfs_buf *bp)
822{
823 ASSERT(mp);
824 if (!XFS_FORCED_SHUTDOWN(mp)) {
825 xfs_buf_iorequest(bp);
826 return;
827 }
828
829 trace_xfs_bdstrat_shut(bp, _RET_IP_);
830 xfs_bioerror_relse(bp);
831}
832
833/*
834 * If the underlying (data/log/rt) device is readonly, there are some 778 * If the underlying (data/log/rt) device is readonly, there are some
835 * operations that cannot proceed. 779 * operations that cannot proceed.
836 */ 780 */
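The open-coded setuid/setgid clearing removed above duplicates what the generic VFS helper already does. Roughly, file_remove_suid() acts on this predicate (paraphrased from should_remove_suid() in mm/filemap.c of the same era; treat it as a sketch, not the exact source):

static int demo_should_remove_suid(mode_t mode)
{
        int kill = 0;

        if (mode & S_ISUID)
                kill = ATTR_KILL_SUID;
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
                kill |= ATTR_KILL_SGID;
        if (kill && !capable(CAP_FSETID) && S_ISREG(mode))
                return kill;    /* caller clears the bits via notify_change() */
        return 0;
}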
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index d1f7789c7ffb..342ae8c0d011 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -22,9 +22,6 @@ struct xfs_mount;
22struct xfs_inode; 22struct xfs_inode;
23struct xfs_buf; 23struct xfs_buf;
24 24
25/* errors from xfsbdstrat() must be extracted from the buffer */
26extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
27extern int xfs_bdstrat_cb(struct xfs_buf *);
28extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 25extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
29 26
30extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 27extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 77414db10dc2..25ea2408118f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -877,12 +877,11 @@ xfsaild(
 {
 	struct xfs_ail	*ailp = data;
 	xfs_lsn_t	last_pushed_lsn = 0;
-	long		tout = 0;
+	long		tout = 0; /* milliseconds */
 
 	while (!kthread_should_stop()) {
-		if (tout)
-			schedule_timeout_interruptible(msecs_to_jiffies(tout));
-		tout = 1000;
+		schedule_timeout_interruptible(tout ?
+				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
 
 		/* swsusp */
 		try_to_freeze();
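The timeout rework above lets the AIL thread park indefinitely when there is no work instead of waking once a second. A sketch of the resulting kthread idle loop; do_ail_work() is a hypothetical stand-in for the push logic, returning the next delay in milliseconds with 0 meaning idle:

    while (!kthread_should_stop()) {
            /* tout == 0 now means "sleep until woken", not "poll at 1s" */
            schedule_timeout_interruptible(tout ?
                            msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
            try_to_freeze();
            tout = do_ail_work(ailp);       /* hypothetical helper */
    }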
@@ -1022,12 +1021,45 @@ xfs_fs_dirty_inode(
 	XFS_I(inode)->i_update_core = 1;
 }
 
-/*
- * Attempt to flush the inode, this will actually fail
- * if the inode is pinned, but we dirty the inode again
- * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable.
- */
+STATIC int
+xfs_log_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		/* we need to return with the lock hold shared */
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Note - it's possible that we might have pushed ourselves out of the
+	 * way during trans_reserve which would flush the inode. But there's
+	 * no guarantee that the inode buffer has actually gone out yet (it's
+	 * delwri). Plus the buffer could be pinned anyway if it's part of
+	 * an inode in another recent transaction. So we play it safe and
+	 * fire off the transaction anyway.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
 STATIC int
 xfs_fs_write_inode(
 	struct inode		*inode,
@@ -1035,7 +1067,7 @@ xfs_fs_write_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = 0;
+	int			error = EAGAIN;
 
 	xfs_itrace_entry(ip);
 
@@ -1046,35 +1078,55 @@ xfs_fs_write_inode(
 		error = xfs_wait_on_pages(ip, 0, -1);
 		if (error)
 			goto out;
-	}
-
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		goto out;
 
-	/*
-	 * We make this non-blocking if the inode is contended, return
-	 * EAGAIN to indicate to the caller that they did not succeed.
-	 * This prevents the flush path from blocking on inodes inside
-	 * another operation right now, they get caught later by xfs_sync.
-	 */
-	if (sync) {
+		/*
+		 * Make sure the inode has hit stable storage. By using the
+		 * log and the fsync transactions we reduce the IOs we have
+		 * to do here from two (log and inode) to just the log.
+		 *
+		 * Note: We still need to do a delwri write of the inode after
+		 * this to flush it to the backing buffer so that bulkstat
+		 * works properly if this is the first time the inode has been
+		 * written. Because we hold the ilock atomically over the
+		 * transaction commit and the inode flush we are guaranteed
+		 * that the inode is not pinned when it returns. If the flush
+		 * lock is already held, then the inode has already been
+		 * flushed once and we don't need to flush it again. Hence
+		 * the code will only flush the inode if it isn't already
+		 * being flushed.
+		 */
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-
-		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+		if (ip->i_update_core) {
+			error = xfs_log_inode(ip);
+			if (error)
+				goto out_unlock;
+		}
 	} else {
-		error = EAGAIN;
+		/*
+		 * We make this non-blocking if the inode is contended, return
+		 * EAGAIN to indicate to the caller that they did not succeed.
+		 * This prevents the flush path from blocking on inodes inside
+		 * another operation right now, they get caught later by xfs_sync.
+		 */
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 			goto out;
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-			goto out_unlock;
+	}
 
-		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+		goto out_unlock;
+
+	/*
+	 * Now we have the flush lock and the inode is not pinned, we can check
+	 * if the inode is really clean as we know that there are no pending
+	 * transaction completions, it is not waiting on the delayed write
+	 * queue and there is no IO in progress.
+	 */
+	if (xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		error = 0;
+		goto out_unlock;
 	}
+	error = xfs_iflush(ip, 0);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
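Condensed control flow of the reworked xfs_fs_write_inode() above, for reference; the helpers are the ones used in the hunk, the framing and the elided unlock/funlock bookkeeping are editorial:

    if (sync) {
            xfs_ilock(ip, XFS_ILOCK_SHARED);        /* may block */
            if (ip->i_update_core)
                    error = xfs_log_inode(ip);      /* push the core via the log */
    } else if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
            return EAGAIN;                          /* contended: retry later */

    if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
            goto out_unlock;                        /* pinned or already flushing */
    error = xfs_inode_clean(ip) ? 0 : xfs_iflush(ip, 0);  /* delwri flush if dirty */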
@@ -1257,6 +1309,29 @@ xfs_fs_statfs(
 	return 0;
 }
 
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks = 0;
+
+	mp->m_resblks_save = mp->m_resblks;
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks;
+
+	if (mp->m_resblks_save) {
+		resblks = mp->m_resblks_save;
+		mp->m_resblks_save = 0;
+	} else
+		resblks = xfs_default_resblks(mp);
+
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
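The two helpers are deliberately asymmetric: save stashes the current pool size and drains the pool to zero, restore refills from the stash or falls back to the default. Expected pairing across the transitions handled later in this patch (sketch):

    /* rw -> ro remount, and freeze: */
    xfs_quiesce_data(mp);
    xfs_save_resvblks(mp);          /* drain pool so on-disk counters are exact */
    xfs_quiesce_attr(mp);

    /* ro -> rw remount, and unfreeze: */
    xfs_restore_resvblks(mp);       /* refill from stash, or xfs_default_resblks() */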
@@ -1336,11 +1411,27 @@ xfs_fs_remount(
 		}
 		mp->m_update_flags = 0;
 	}
+
+	/*
+	 * Fill out the reserve pool if it is empty. Use the stashed
+	 * value if it is non-zero, otherwise go with the default.
+	 */
+	xfs_restore_resvblks(mp);
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/*
+		 * After we have synced the data but before we sync the
+		 * metadata, we need to free up the reserve block pool so that
+		 * the used block count in the superblock on disk is correct at
+		 * the end of the remount. Stash the current reserve pool size
+		 * so that if we get remounted rw, we can return it to the same
+		 * size.
+		 */
+
 		xfs_quiesce_data(mp);
+		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
@@ -1359,11 +1450,22 @@ xfs_fs_freeze(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
 	return -xfs_fs_log_dummy(mp);
 }
 
 STATIC int
+xfs_fs_unfreeze(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_restore_resvblks(mp);
+	return 0;
+}
+
+STATIC int
 xfs_fs_show_options(
 	struct seq_file		*m,
 	struct vfsmount		*mnt)
@@ -1585,6 +1687,7 @@ static const struct super_operations xfs_super_operations = {
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
+	.unfreeze_fs		= xfs_fs_unfreeze,
 	.statfs			= xfs_fs_statfs,
 	.remount_fs		= xfs_fs_remount,
 	.show_options		= xfs_fs_show_options,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1f5e4bb5e970..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -90,14 +90,13 @@ xfs_inode_ag_lookup(
 STATIC int
 xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		ag,
+	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
 	int			exclusive)
 {
-	struct xfs_perag	*pag = &mp->m_perag[ag];
 	uint32_t		first_index;
 	int			last_error = 0;
 	int			skipped;
@@ -141,8 +140,6 @@ restart:
 		delay(1);
 		goto restart;
 	}
-
-	xfs_put_perag(mp, pag);
 	return last_error;
 }
 
148 145
@@ -160,10 +157,16 @@ xfs_inode_ag_iterator(
 	xfs_agnumber_t		ag;
 
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		if (!mp->m_perag[ag].pag_ici_init)
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, ag);
+		if (!pag->pag_ici_init) {
+			xfs_perag_put(pag);
 			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+		}
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
 						exclusive);
+		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
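This is the reference-counted lookup discipline the rest of the series converts callers to; the rule visible in every hunk below is get/use/put, with no cached pointers. Sketch of a generic AG walk (use_perag() is a hypothetical placeholder for the per-AG work):

    for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
            struct xfs_perag *pag = xfs_perag_get(mp, agno); /* takes a reference */

            use_perag(pag);         /* hypothetical per-AG work */
            xfs_perag_put(pag);     /* drop it; never stash the pointer */
    }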
@@ -231,7 +234,7 @@ xfs_sync_inode_data(
 	}
 
 	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-						0 : XFS_B_ASYNC, FI_NONE);
+						0 : XBF_ASYNC, FI_NONE);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
  out_wait:
@@ -267,8 +270,7 @@ xfs_sync_inode_attr(
 		goto out_unlock;
 	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -293,10 +295,7 @@ xfs_sync_data(
 	if (error)
 		return XFS_ERROR(error);
 
-	xfs_log_force(mp, 0,
-		      (flags & SYNC_WAIT) ?
-		       XFS_LOG_FORCE | XFS_LOG_SYNC :
-		       XFS_LOG_FORCE);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return 0;
 }
 
302 301
@@ -322,10 +321,6 @@ xfs_commit_dummy_trans(
 	struct xfs_inode	*ip = mp->m_rootip;
 	struct xfs_trans	*tp;
 	int			error;
-	int			log_flags = XFS_LOG_FORCE;
-
-	if (flags & SYNC_WAIT)
-		log_flags |= XFS_LOG_SYNC;
 
 	/*
 	 * Put a dummy transaction in the log to tell recovery
@@ -347,11 +342,11 @@ xfs_commit_dummy_trans(
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, 0, log_flags);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return error;
 }
 
-int
+STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp,
 	int			flags)
@@ -367,7 +362,7 @@ xfs_sync_fsdata(
 	if (flags & SYNC_TRYLOCK) {
 		ASSERT(!(flags & SYNC_WAIT));
 
-		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+		bp = xfs_getsb(mp, XBF_TRYLOCK);
 		if (!bp)
 			goto out;
 
@@ -387,7 +382,7 @@ xfs_sync_fsdata(
 	 * become pinned in between there and here.
 	 */
 	if (XFS_BUF_ISPINNED(bp))
-		xfs_log_force(mp, 0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 	}
 
 
@@ -448,9 +443,6 @@ xfs_quiesce_data(
 	xfs_sync_data(mp, SYNC_WAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
-	/* drop inode references pinned by filestreams */
-	xfs_filestream_flush(mp);
-
 	/* write superblock and hoover up shutdown errors */
 	error = xfs_sync_fsdata(mp, SYNC_WAIT);
 
@@ -467,16 +459,18 @@ xfs_quiesce_fs(
 {
 	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice. The first instance of the loop
 	 * will flush most meta data but that will generate more meta data
 	 * (typically directory updates).  Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also so sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
 	 */
 	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
 		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
@@ -575,7 +569,7 @@ xfs_flush_inodes(
 	igrab(inode);
 	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
 	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
 }
 
 /*
@@ -591,8 +585,8 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_log_force(mp, 0);
+		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -690,16 +684,17 @@ void
 xfs_inode_set_reclaim_tag(
 	xfs_inode_t	*ip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
 
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	read_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
 	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 }
 
 void
@@ -712,12 +707,64 @@ __xfs_inode_clear_reclaim_tag(
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 }
 
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background relaim, we just requeue the inode for the next pass.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
 STATIC int
 xfs_reclaim_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
+	int	error = 0;
+
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode.  Once we have the
@@ -735,33 +782,70 @@ xfs_reclaim_inode(
 	spin_unlock(&ip->i_flags_lock);
 	write_unlock(&pag->pag_ici_lock);
 
-	/*
-	 * If the inode is still dirty, then flush it out.  If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/* Now we have an inode that needs flushing */
+	error = xfs_iflush(ip, sync_mode);
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
+	}
 
 	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will relcaim the inode and
+	 * pass on the error.
 	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
+	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
 	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
 
+reclaim:
+	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_ireclaim(ip);
-	return 0;
+	return error;
+
 }
 
 int
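Restating the state table from the comment block as the check order the new xfs_reclaim_inode() actually performs (an editorial sketch; locking and requeue bookkeeping elided):

    if (is_bad_inode(VFS_I(ip)))
            goto reclaim;                   /* bad            => reclaim        */
    if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
            xfs_iunpin_wait(ip);            /* shutdown       => unpin, reclaim */
            goto reclaim;
    }
    if (xfs_ipincount(ip)) {
            if (!(sync_mode & SYNC_WAIT))
                    goto requeue;           /* pinned, delwri => requeue        */
            xfs_iunpin_wait(ip);            /* pinned, sync   => unpin          */
    }
    if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip))
            goto reclaim;                   /* stale or clean => reclaim        */
    error = xfs_iflush(ip, sync_mode);      /* dirty          => flush          */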
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ea932b43335d..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
 
 int xfs_sync_attr(struct xfs_mount *mp, int flags);
 int xfs_sync_data(struct xfs_mount *mp, int flags);
-int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c22a608321a3..a4574dcf5065 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -78,6 +78,33 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	)
 )
 
+#define DEFINE_PERAG_REF_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+		 unsigned long caller_ip), \
+	TP_ARGS(mp, agno, refcount, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_agnumber_t, agno) \
+		__field(int, refcount) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = mp->m_super->s_dev; \
+		__entry->agno = agno; \
+		__entry->refcount = refcount; \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->agno, \
+		  __entry->refcount, \
+		  (char *)__entry->caller_ip) \
+);
+
+DEFINE_PERAG_REF_EVENT(xfs_perag_get)
+DEFINE_PERAG_REF_EVENT(xfs_perag_put)
+
 #define DEFINE_ATTR_LIST_EVENT(name) \
 DEFINE_EVENT(xfs_attr_list_class, name, \
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
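The caller_ip argument and the %pf format imply that the get/put helpers record their caller's return address; the expected call-site shape is therefore something like the following (an assumption, since the helper bodies are outside this diff):

    trace_xfs_perag_get(mp, agno, atomic_read(&pag->pag_ref), _RET_IP_);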
@@ -456,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -1414,6 +1442,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
 		  __entry->count)
 );
 
+#define XFS_SWAPEXT_INODES \
+	{ 0,	"target" }, \
+	{ 1,	"temp" }
+
+#define XFS_INODE_FORMAT_STR \
+	{ 0,	"invalid" }, \
+	{ 1,	"local" }, \
+	{ 2,	"extent" }, \
+	{ 3,	"btree" }
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+	TP_PROTO(struct xfs_inode *ip, int which),
+	TP_ARGS(ip, which),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(int, which)
+		__field(xfs_ino_t, ino)
+		__field(int, format)
+		__field(int, nex)
+		__field(int, max_nex)
+		__field(int, broot_size)
+		__field(int, fork_off)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->which = which;
+		__entry->ino = ip->i_ino;
+		__entry->format = ip->i_d.di_format;
+		__entry->nex = ip->i_d.di_nextents;
+		__entry->max_nex = ip->i_df.if_ext_max;
+		__entry->broot_size = ip->i_df.if_broot_bytes;
+		__entry->fork_off = XFS_IFORK_BOFF(ip);
+	),
+	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+		  "Max in-fork extents %d, broot size %d, fork offset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+		  __entry->nex,
+		  __entry->max_nex,
+		  __entry->broot_size,
+		  __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int which), \
+	TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
 		value = NULL;
 	}
 
-	error = -xfs_attr_get(ip, name, value, &asize, xflags);
+	error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
@@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		xflags |= ATTR_REPLACE;
 
 	if (!value)
-		return -xfs_attr_remove(ip, name, xflags);
-	return -xfs_attr_set(ip, name, (void *)value, size, xflags);
+		return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return -xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
 }
 
 static struct xattr_handler xfs_xattr_user_handler = {
73 74
74static struct xattr_handler xfs_xattr_user_handler = { 75static struct xattr_handler xfs_xattr_user_handler = {
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
 }
 
 static int
-xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
 	char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
 	offset += prefix_len;
-	strncpy(offset, name, namelen);			/* real name */
+	strncpy(offset, (char *)name, namelen);		/* real name */
 	offset += namelen;
 	*offset = '\0';
 	context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 }
 
 static int
-xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
 	return 0;
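The unsigned char conversions throughout this file matter because attribute names are raw on-disk bytes and plain char may be signed. A toy illustration of the trap the new signatures rule out:

    char c = 0xe9;                          /* byte from an on-disk name */
    int wrong = (c > 127);                  /* 0 on signed-char targets  */
    int right = ((unsigned char)c > 127);   /* 1, as intended            */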
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d7c7eea09fc2..5f79dd78626b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1187,7 +1187,7 @@ xfs_qm_dqflush(
 	 * block, nada.
 	 */
 	if (!XFS_DQ_IS_DIRTY(dqp) ||
-	    (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
+	    (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
 		xfs_dqfunlock(dqp);
 		return 0;
 	}
@@ -1248,23 +1248,20 @@ xfs_qm_dqflush(
 	 */
 	if (XFS_BUF_ISPINNED(bp)) {
 		trace_xfs_dqflush_force(dqp);
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 	}
 
-	if (flags & XFS_QMOPT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & XFS_QMOPT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 
 	trace_xfs_dqflush_done(dqp);
 
 	/*
 	 * dqp is still locked, but caller is free to unlock it now.
 	 */
-	return (error);
+	return error;
 
 }
 
1270 1267
@@ -1445,7 +1442,7 @@ xfs_qm_dqpurge(
 	 * We don't care about getting disk errors here. We need
 	 * to purge this dquot anyway, so we go ahead regardless.
 	 */
-	error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+	error = xfs_qm_dqflush(dqp, SYNC_WAIT);
 	if (error)
 		xfs_fs_cmn_err(CE_WARN, mp,
 			"xfs_qm_dqpurge: dquot %p flush failed", dqp);
@@ -1529,25 +1526,17 @@ xfs_qm_dqflock_pushbuf_wait(
 	 * the flush lock when the I/O completes.
 	 */
 	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
-		    XFS_QI_DQCHUNKLEN(dqp->q_mount),
-		    XFS_INCORE_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			int	error;
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(dqp->q_mount,
-					      (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			error = xfs_bawrite(dqp->q_mount, bp);
-			if (error)
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-					"xfs_qm_dqflock_pushbuf_wait: "
-					"pushbuf error %d on dqp %p, bp %p",
-					error, dqp, bp);
-		} else {
-			xfs_buf_relse(bp);
-		}
-	}
+			XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
+	if (!bp)
+		goto out_lock;
+
+	if (XFS_BUF_ISDELAYWRITE(bp)) {
+		if (XFS_BUF_ISPINNED(bp))
+			xfs_log_force(dqp->q_mount, 0);
+		xfs_buf_delwri_promote(bp);
+		wake_up_process(bp->b_target->bt_task);
+	}
+	xfs_buf_relse(bp);
+out_lock:
 	xfs_dqflock(dqp);
 }
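The rewritten wait path swaps a private xfs_bawrite() for promotion within the shared delayed-write queue; the idiom, as used above:

    if (XFS_BUF_ISDELAYWRITE(bp)) {
            if (XFS_BUF_ISPINNED(bp))
                    xfs_log_force(mp, 0);            /* unpin via a log force    */
            xfs_buf_delwri_promote(bp);              /* head of the delwri queue */
            wake_up_process(bp->b_target->bt_task);  /* kick xfsbufd immediately */
    }
    xfs_buf_relse(bp);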
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..4e4ee9a57194 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
 
 	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
 	logvec->i_len  = sizeof(xfs_dq_logformat_t);
-	XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT);
+	logvec->i_type = XLOG_REG_TYPE_QFORMAT;
 	logvec++;
 	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
 	logvec->i_len  = sizeof(xfs_disk_dquot_t);
-	XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT);
+	logvec->i_type = XLOG_REG_TYPE_DQUOT;
 
 	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
 	logitem->qli_format.qlf_size = 2;
@@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push(
 	 * lock without sleeping, then there must not have been
 	 * anyone in the process of flushing the dquot.
 	 */
-	error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	error = xfs_qm_dqflush(dqp, 0);
 	if (error)
 		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 			"xfs_qm_dquot_logitem_push: push error %d on dqp %p",
@@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait(
 	/*
 	 * Give the log a push so we don't wait here too long.
 	 */
-	xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	xfs_log_force(dqp->q_mount, 0);
 	wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
 }
 
@@ -212,68 +212,31 @@ xfs_qm_dquot_logitem_pushbuf(
 	xfs_dquot_t	*dqp;
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
-	uint		dopush;
 
 	dqp = qip->qli_dquot;
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	/*
-	 * The qli_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(qip->qli_pushbuf_flag != 0);
-	ASSERT(qip->qli_push_owner == current_pid());
-
-	/*
 	 * If flushlock isn't locked anymore, chances are that the
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
 	if (completion_done(&dqp->q_flush) ||
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
 		return;
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-		    XFS_QI_DQCHUNKLEN(mp),
-		    XFS_INCORE_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  !completion_done(&dqp->q_flush));
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			if (dopush) {
-				int	error;
-#ifdef XFSRACEDEBUG
-				delay_for_intr();
-				delay(300);
-#endif
-				error = xfs_bawrite(mp, bp);
-				if (error)
-					xfs_fs_cmn_err(CE_WARN, mp,
-	"xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
-							error, qip, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-			xfs_buf_relse(bp);
-		}
+			XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
+	xfs_dqunlock(dqp);
+	if (!bp)
 		return;
-	}
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
+	return;
 
-	qip->qli_pushbuf_flag = 0;
-	xfs_dqunlock(dqp);
 }
 
 /*
@@ -291,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
 	xfs_dq_logitem_t	*qip)
 {
 	xfs_dquot_t		*dqp;
-	uint			retval;
 
 	dqp = qip->qli_dquot;
 	if (atomic_read(&dqp->q_pincount) > 0)
-		return (XFS_ITEM_PINNED);
+		return XFS_ITEM_PINNED;
 
 	if (! xfs_qm_dqlock_nowait(dqp))
-		return (XFS_ITEM_LOCKED);
+		return XFS_ITEM_LOCKED;
 
-	retval = XFS_ITEM_SUCCESS;
 	if (!xfs_dqflock_nowait(dqp)) {
 		/*
-		 * The dquot is already being flushed.	It may have been
-		 * flushed delayed write, however, and we don't want to
-		 * get stuck waiting for that to complete.  So, we want to check
-		 * to see if we can lock the dquot's buffer without sleeping.
-		 * If we can and it is marked for delayed write, then we
-		 * hold it and send it out from the push routine.  We don't
-		 * want to do that now since we might sleep in the device
-		 * strategy routine.  We also don't want to grab the buffer lock
-		 * here because we'd like not to call into the buffer cache
-		 * while holding the AIL lock.
-		 * Make sure to only return PUSHBUF if we set pushbuf_flag
-		 * ourselves.  If someone else is doing it then we don't
-		 * want to go to the push routine and duplicate their efforts.
+		 * dquot has already been flushed to the backing buffer,
+		 * leave it locked, pushbuf routine will unlock it.
 		 */
-		if (qip->qli_pushbuf_flag == 0) {
-			qip->qli_pushbuf_flag = 1;
-			ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
-#ifdef DEBUG
-			qip->qli_push_owner = current_pid();
-#endif
-			/*
-			 * The dquot is left locked.
-			 */
-			retval = XFS_ITEM_PUSHBUF;
-		} else {
-			retval = XFS_ITEM_FLUSHING;
-			xfs_dqunlock_nonotify(dqp);
-		}
+		return XFS_ITEM_PUSHBUF;
 	}
 
 	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
-	return (retval);
+	return XFS_ITEM_SUCCESS;
 }
 
 
@@ -467,7 +404,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
 
 	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
 	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
+	log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
 	qf->qql_format.qf_size = 1;
 }
 
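With the pushbuf flag gone, the trylock return value alone tells the AIL push loop what to do with the item. Roughly, the dispatch looks like this (an editorial sketch; IOP_TRYLOCK() is the existing log-item indirection of this era):

    switch (IOP_TRYLOCK(lip)) {
    case XFS_ITEM_SUCCESS:  /* locked and flushable: push the item          */
    case XFS_ITEM_PUSHBUF:  /* flush in flight: promote the backing buffer  */
    case XFS_ITEM_PINNED:   /* needs a log force before it can move         */
    case XFS_ITEM_LOCKED:   /* contended: skip it on this pass              */
            break;
    }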
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
 	xfs_log_item_t		 qli_item;	   /* common portion */
 	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
 	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
-	unsigned short		 qli_pushbuf_flag; /* 1 bit used in push_ail */
-#ifdef DEBUG
-	uint64_t		 qli_push_owner;
-#endif
 	xfs_dq_logformat_t	 qli_format;	   /* logged structure */
 } xfs_dq_logitem_t;
 
36 32
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9e627a8b5b0e..417e61e3d9dd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -118,9 +118,14 @@ xfs_Gqm_init(void)
 	 */
 	udqhash = kmem_zalloc_greedy(&hsize,
 				     XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
-				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t),
-				     KM_SLEEP | KM_MAYFAIL | KM_LARGE);
-	gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE);
+				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
+	if (!udqhash)
+		goto out;
+
+	gdqhash = kmem_zalloc_large(hsize);
+	if (!gdqhash)
+		goto out_free_udqhash;
+
 	hsize /= sizeof(xfs_dqhash_t);
 	ndquot = hsize << 8;
 
@@ -170,6 +175,11 @@ xfs_Gqm_init(void)
 	mutex_init(&qcheck_lock);
 #endif
 	return xqm;
+
+ out_free_udqhash:
+	kmem_free_large(udqhash);
+ out:
+	return NULL;
 }
 
 /*
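The new failure handling follows the usual kernel unwind shape: each allocation gets a label that frees everything acquired before it, so later failures fall through earlier cleanups. Generic form (sketch with hypothetical helpers):

    a = alloc_a();
    if (!a)
            goto out;
    b = alloc_b();
    if (!b)
            goto out_free_a;
    return object;          /* success */

    out_free_a:
            free_a(a);
    out:
            return NULL;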
@@ -189,8 +199,8 @@ xfs_qm_destroy(
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
 		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
 	}
-	kmem_free(xqm->qm_usr_dqhtable);
-	kmem_free(xqm->qm_grp_dqhtable);
+	kmem_free_large(xqm->qm_usr_dqhtable);
+	kmem_free_large(xqm->qm_grp_dqhtable);
 	xqm->qm_usr_dqhtable = NULL;
 	xqm->qm_grp_dqhtable = NULL;
 	xqm->qm_dqhashmask = 0;
@@ -219,8 +229,12 @@ xfs_qm_hold_quotafs_ref(
 	 */
 	mutex_lock(&xfs_Gqm_lock);
 
-	if (xfs_Gqm == NULL)
+	if (!xfs_Gqm) {
 		xfs_Gqm = xfs_Gqm_init();
+		if (!xfs_Gqm)
+			return ENOMEM;
+	}
+
 	/*
 	 * We can keep a list of all filesystems with quotas mounted for
 	 * debugging and statistical purposes, but ...
@@ -436,7 +450,7 @@ xfs_qm_unmount_quotas(
 STATIC int
 xfs_qm_dqflush_all(
 	xfs_mount_t	*mp,
-	int		flags)
+	int		sync_mode)
 {
 	int		recl;
 	xfs_dquot_t	*dqp;
@@ -472,7 +486,7 @@ again:
 		 * across a disk write.
 		 */
 		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flags);
+		error = xfs_qm_dqflush(dqp, sync_mode);
 		xfs_dqunlock(dqp);
 		if (error)
 			return error;
@@ -912,13 +926,11 @@ xfs_qm_sync(
 {
 	int		recl, restarts;
 	xfs_dquot_t	*dqp;
-	uint		flush_flags;
 	int		error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
-	flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
 	restarts = 0;
 
   again:
@@ -978,7 +990,7 @@ xfs_qm_sync(
 		 * across a disk write
 		 */
 		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flush_flags);
+		error = xfs_qm_dqflush(dqp, flags);
 		xfs_dqunlock(dqp);
 		if (error && XFS_FORCED_SHUTDOWN(mp))
 			return 0;	/* Need to prevent umount failure */
@@ -1782,7 +1794,7 @@ xfs_qm_quotacheck(
 	 * successfully.
 	 */
 	if (!error)
-		error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+		error = xfs_qm_dqflush_all(mp, 0);
 
 	/*
 	 * We can get this error if we couldn't do a dquot allocation inside
@@ -2004,7 +2016,7 @@ xfs_qm_shake_freelist(
 		 * We flush it delayed write, so don't bother
 		 * releasing the mplock.
 		 */
-		error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+		error = xfs_qm_dqflush(dqp, 0);
 		if (error) {
 			xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 				"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
@@ -2187,7 +2199,7 @@ xfs_qm_dqreclaim_one(void)
 	 * We flush it delayed write, so don't bother
 	 * releasing the freelist lock.
 	 */
-	error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	error = xfs_qm_dqflush(dqp, 0);
 	if (error) {
 		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
 			 be64_to_cpu(dp->d_blk_hardlimit);
 	if (limit && statp->f_blocks > limit) {
 		statp->f_blocks = limit;
-		statp->f_bfree =
+		statp->f_bfree = statp->f_bavail =
 			(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
 			 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
 	}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 873e07e29074..5d0ee8d492db 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1192,9 +1192,9 @@ xfs_qm_internalqcheck(
 	if (! XFS_IS_QUOTA_ON(mp))
 		return XFS_ERROR(ESRCH);
 
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	XFS_bflush(mp->m_ddev_targp);
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	XFS_bflush(mp->m_ddev_targp);
 
 	mutex_lock(&qcheck_lock);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..c3ab75cb1d9a 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,12 +589,18 @@ xfs_trans_unreserve_and_mod_dquots(
 	}
 }
 
-STATIC int
-xfs_quota_error(uint flags)
+STATIC void
+xfs_quota_warn(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dqp,
+	int			type)
 {
-	if (flags & XFS_QMOPT_ENOSPC)
-		return ENOSPC;
-	return EDQUOT;
+	/* no warnings for project quotas - we just return ENOSPC later */
+	if (dqp->dq_flags & XFS_DQ_PROJ)
+		return;
+	quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
+			   be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
+			   type);
 }
 
 /*
@@ -612,7 +618,6 @@ xfs_trans_dqresv(
 	long		ninos,
 	uint		flags)
 {
-	int		error;
 	xfs_qcnt_t	hardlimit;
 	xfs_qcnt_t	softlimit;
 	time_t		timer;
@@ -649,7 +654,6 @@ xfs_trans_dqresv(
 			warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
 			resbcountp = &dqp->q_res_rtbcount;
 		}
-		error = 0;
 
 		if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 		    dqp->q_core.d_id &&
@@ -667,18 +671,20 @@ xfs_trans_dqresv(
 			 * nblks.
 			 */
 			if (hardlimit > 0ULL &&
-			     (hardlimit <= nblks + *resbcountp)) {
-				error = xfs_quota_error(flags);
+			    hardlimit <= nblks + *resbcountp) {
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
 				goto error_return;
 			}
-
 			if (softlimit > 0ULL &&
-			     (softlimit <= nblks + *resbcountp)) {
+			    softlimit <= nblks + *resbcountp) {
 				if ((timer != 0 && get_seconds() > timer) ||
 				    (warns != 0 && warns >= warnlimit)) {
-					error = xfs_quota_error(flags);
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_BSOFTLONGWARN);
 					goto error_return;
 				}
+
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
 			}
 		}
 		if (ninos > 0) {
@@ -692,15 +698,19 @@ xfs_trans_dqresv(
 			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
 			if (!softlimit)
 				softlimit = q->qi_isoftlimit;
+
 			if (hardlimit > 0ULL && count >= hardlimit) {
-				error = xfs_quota_error(flags);
+				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
 				goto error_return;
-			} else if (softlimit > 0ULL && count >= softlimit) {
-				if  ((timer != 0 && get_seconds() > timer) ||
+			}
+			if (softlimit > 0ULL && count >= softlimit) {
+				if ((timer != 0 && get_seconds() > timer) ||
 				     (warns != 0 && warns >= warnlimit)) {
-					error = xfs_quota_error(flags);
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_ISOFTLONGWARN);
 					goto error_return;
 				}
+				xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
 			}
 		}
 	}
@@ -736,9 +746,14 @@ xfs_trans_dqresv(
 	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
 	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
 
+	xfs_dqunlock(dqp);
+	return 0;
+
 error_return:
 	xfs_dqunlock(dqp);
-	return error;
+	if (flags & XFS_QMOPT_ENOSPC)
+		return ENOSPC;
+	return EDQUOT;
 }
 
 
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 00fd357c3e46..d13eeba2c8f8 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
 };
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE	"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT	"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6702bd865811..b1a5a1ff88ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,17 +187,13 @@ typedef struct xfs_perag_busy {
 /*
  * Per-ag incore structure, copies of information in agf and agi,
  * to improve the performance of allocation group selection.
- *
- * pick sizes which fit in allocation buckets well
  */
-#if (BITS_PER_LONG == 32)
-#define XFS_PAGB_NUM_SLOTS	84
-#elif (BITS_PER_LONG == 64)
 #define XFS_PAGB_NUM_SLOTS	128
-#endif
 
-typedef struct xfs_perag
-{
+typedef struct xfs_perag {
+	struct xfs_mount *pag_mount;	/* owner filesystem */
+	xfs_agnumber_t	pag_agno;	/* AG this structure belongs to */
+	atomic_t	pag_ref;	/* perag reference count */
 	char		pagf_init;	/* this agf's entry is initialized */
 	char		pagi_init;	/* this agi's entry is initialized */
 	char		pagf_metadata;	/* the agf is preferred to be metadata */
@@ -210,8 +206,6 @@ typedef struct xfs_perag
 	__uint32_t	pagf_btreeblks;	/* # of blocks held in AGF btrees */
 	xfs_agino_t	pagi_freecount;	/* number of free inodes */
 	xfs_agino_t	pagi_count;	/* number of allocated inodes */
-	int		pagb_count;	/* pagb slots in use */
-	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
 
 	/*
 	 * Inode allocation search lookup optimisation.
@@ -230,6 +224,8 @@ typedef struct xfs_perag
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 #endif
+	int		pagb_count;	/* pagb slots in use */
+	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS];	/* unstable blocks */
 } xfs_perag_t;
 
 /*
index 275b1f4f9430..94cddbfb2560 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1662,11 +1662,13 @@ xfs_free_ag_extent(
 	xfs_agf_t	*agf;		/* a.g. freespace structure */
 	xfs_perag_t	*pag;		/* per allocation group data */
 
+	pag = xfs_perag_get(mp, agno);
+	pag->pagf_freeblks += len;
+	xfs_perag_put(pag);
+
 	agf = XFS_BUF_TO_AGF(agbp);
-	pag = &mp->m_perag[agno];
 	be32_add_cpu(&agf->agf_freeblks, len);
 	xfs_trans_agblocks_delta(tp, len);
-	pag->pagf_freeblks += len;
 	XFS_WANT_CORRUPTED_GOTO(
 		be32_to_cpu(agf->agf_freeblks) <=
 		be32_to_cpu(agf->agf_length),
@@ -1969,10 +1971,12 @@ xfs_alloc_get_freelist(
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
+	xfs_perag_put(pag);
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
@@ -2078,7 +2082,8 @@ xfs_alloc_put_freelist(
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
@@ -2089,6 +2094,7 @@ xfs_alloc_put_freelist(
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
@@ -2152,7 +2158,6 @@ xfs_read_agf(
 		xfs_trans_brelse(tp, *bpp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
-
 	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
 	return 0;
 }
@@ -2175,7 +2180,7 @@ xfs_alloc_read_agf(
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_read_agf(mp, tp, agno,
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
 			bpp);
 	if (error)
 		return error;
@@ -2184,7 +2189,7 @@ xfs_alloc_read_agf(
 	ASSERT(!XFS_BUF_GETERROR(*bpp));
 
 	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = &mp->m_perag[agno];
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2195,8 +2200,8 @@ xfs_alloc_read_agf(
 		pag->pagf_levels[XFS_BTNUM_CNTi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		spin_lock_init(&pag->pagb_lock);
-		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
-					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagb_count = 0;
+		memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
@@ -2211,6 +2216,7 @@ xfs_alloc_read_agf(
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
+	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2270,8 +2276,7 @@ xfs_alloc_vextent(
 		 * These three force us into a single a.g.
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		down_read(&mp->m_peraglock);
-		args->pag = &mp->m_perag[args->agno];
+		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
@@ -2280,14 +2285,12 @@ xfs_alloc_vextent(
 			goto error0;
 		}
 		if (!args->agbp) {
-			up_read(&mp->m_peraglock);
 			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		if ((error = xfs_alloc_ag_vextent(args)))
 			goto error0;
-		up_read(&mp->m_peraglock);
 		break;
 	case XFS_ALLOCTYPE_START_BNO:
 		/*
@@ -2339,9 +2342,8 @@ xfs_alloc_vextent(
 		 * Loop over allocation groups twice; first time with
 		 * trylock set, second time without.
 		 */
-		down_read(&mp->m_peraglock);
 		for (;;) {
-			args->pag = &mp->m_perag[args->agno];
+			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
@@ -2400,8 +2402,8 @@ xfs_alloc_vextent(
 				}
 			}
 		}
+			xfs_perag_put(args->pag);
 		}
-		up_read(&mp->m_peraglock);
 		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
 			if (args->agno == sagno)
 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2427,9 +2429,10 @@ xfs_alloc_vextent(
 				args->len);
 #endif
 	}
+	xfs_perag_put(args->pag);
 	return 0;
 error0:
-	up_read(&mp->m_peraglock);
+	xfs_perag_put(args->pag);
 	return error;
 }
 
2435 2438
@@ -2454,8 +2457,7 @@ xfs_free_extent(
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	down_read(&args.mp->m_peraglock);
-	args.pag = &args.mp->m_perag[args.agno];
+	args.pag = xfs_perag_get(args.mp, args.agno);
 	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
@@ -2465,7 +2467,7 @@ xfs_free_extent(
 #endif
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
-	up_read(&args.mp->m_peraglock);
+	xfs_perag_put(args.pag);
 	return error;
 }
 
@@ -2486,15 +2488,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 	xfs_agblock_t	bno,
 	xfs_extlen_t	len)
 {
-	xfs_mount_t	*mp;
 	xfs_perag_busy_t *bsy;
+	struct xfs_perag *pag;
 	int		n;
 
-	mp = tp->t_mountp;
-	spin_lock(&mp->m_perag[agno].pagb_lock);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
 
 	/* search pagb_list for an open slot */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	for (bsy = pag->pagb_list, n = 0;
 	     n < XFS_PAGB_NUM_SLOTS;
 	     bsy++, n++) {
 		if (bsy->busy_tp == NULL) {
@@ -2502,11 +2504,11 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		}
 	}
 
-	trace_xfs_alloc_busy(mp, agno, bno, len, n);
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
 
 	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &mp->m_perag[agno].pagb_list[n];
-		mp->m_perag[agno].pagb_count++;
+		bsy = &pag->pagb_list[n];
+		pag->pagb_count++;
 		bsy->busy_start = bno;
 		bsy->busy_length = len;
 		bsy->busy_tp = tp;
@@ -2521,7 +2523,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		xfs_trans_set_sync(tp);
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -2529,24 +2532,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 	xfs_agnumber_t	agno,
 	int		idx)
 {
-	xfs_mount_t	*mp;
+	struct xfs_perag *pag;
 	xfs_perag_busy_t *list;
 
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	list = mp->m_perag[agno].pagb_list;
-
 	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	list = pag->pagb_list;
 
-	trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp);
+	trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
 
 	if (list[idx].busy_tp == tp) {
 		list[idx].busy_tp = NULL;
-		mp->m_perag[agno].pagb_count--;
+		pag->pagb_count--;
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 
@@ -2560,17 +2562,15 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	xfs_agblock_t	bno,
 	xfs_extlen_t	len)
 {
-	xfs_mount_t	*mp;
+	struct xfs_perag *pag;
 	xfs_perag_busy_t *bsy;
 	xfs_agblock_t	uend, bend;
 	xfs_lsn_t	lsn = 0;
 	int		cnt;
 
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-
-	uend = bno + len - 1;
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	cnt = pag->pagb_count;
 
 	/*
 	 * search pagb_list for this slot, skipping open slots. We have to
@@ -2578,8 +2578,9 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * we have to get the most recent LSN for the log force to push out
 	 * all the transactions that span the range.
 	 */
-	for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
-		bsy = &mp->m_perag[agno].pagb_list[cnt];
+	uend = bno + len - 1;
+	for (cnt = 0; cnt < pag->pagb_count; cnt++) {
+		bsy = &pag->pagb_list[cnt];
 		if (!bsy->busy_tp)
 			continue;
 
@@ -2591,7 +2592,8 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
 			lsn = bsy->busy_tp->t_commit_lsn;
 	}
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
 
 	/*
@@ -2599,5 +2601,5 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * transaction that freed the block
 	 */
 	if (lsn)
-		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+		xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
 }
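
Every hunk in xfs_alloc.c above follows the same discipline: take a perag reference, do the work under pag->pagb_lock (or against the AGF), then drop the reference. Condensed into one hedged sketch — the function name is hypothetical; the fields mirror the diff:

	/* Illustrative only: the get/lock/modify/unlock/put pattern the
	 * busy-extent code now uses. example_busy_update() is hypothetical. */
	static void
	example_busy_update(xfs_trans_t *tp, xfs_agnumber_t agno)
	{
		struct xfs_perag *pag;

		pag = xfs_perag_get(tp->t_mountp, agno); /* replaces m_peraglock */
		spin_lock(&pag->pagb_lock);
		/* ... read or update pag->pagb_list and pag->pagb_count ... */
		spin_unlock(&pag->pagb_lock);
		xfs_perag_put(pag);			 /* always paired */
	}
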
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index adbd9141aea1..b726e10d2c1c 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -61,12 +61,14 @@ xfs_allocbt_set_root(
 	struct xfs_agf	*agf = XFS_BUF_TO_AGF(agbp);
 	xfs_agnumber_t	seqno = be32_to_cpu(agf->agf_seqno);
 	int		btnum = cur->bc_btnum;
+	struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec(
 {
 	struct xfs_agf	*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 	xfs_agnumber_t	seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag *pag;
 	__be32		len;
 	int		numrecs;
 
@@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec(
 	}
 
 	agf->agf_longest = len;
-	cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
 	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e953b6cfb2a8..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 STATIC int
 xfs_attr_name_to_xname(
 	struct xfs_name	*xname,
-	const char	*aname)
+	const unsigned char *aname)
 {
 	if (!aname)
 		return EINVAL;
 	xname->name = aname;
-	xname->len = strlen(aname);
+	xname->len = strlen((char *)aname);
 	if (xname->len >= MAXNAMELEN)
 		return EFAULT;		/* match IRIX behaviour */
 
@@ -124,7 +124,7 @@ STATIC int
 xfs_attr_get_int(
 	struct xfs_inode	*ip,
 	struct xfs_name		*name,
-	char			*value,
+	unsigned char		*value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -171,8 +171,8 @@ xfs_attr_get_int(
 int
 xfs_attr_get(
 	xfs_inode_t	*ip,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		*valuelenp,
 	int		flags)
 {
@@ -197,7 +197,7 @@ xfs_attr_get(
 /*
  * Calculate how many blocks we need for the new attribute,
  */
-int
+STATIC int
 xfs_attr_calc_size(
 	struct xfs_inode	*ip,
 	int			namelen,
@@ -235,8 +235,12 @@ xfs_attr_calc_size(
 }
 
 STATIC int
-xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
-		char *value, int valuelen, int flags)
+xfs_attr_set_int(
+	struct xfs_inode	*dp,
+	struct xfs_name		*name,
+	unsigned char		*value,
+	int			valuelen,
+	int			flags)
 {
 	xfs_da_args_t	args;
 	xfs_fsblock_t	firstblock;
@@ -452,8 +456,8 @@ out:
 int
 xfs_attr_set(
 	xfs_inode_t	*dp,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		valuelen,
 	int		flags)
 {
@@ -600,7 +604,7 @@ out:
 int
 xfs_attr_remove(
 	xfs_inode_t	*dp,
-	const char	*name,
+	const unsigned char *name,
 	int		flags)
 {
 	int		error;
@@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
  */
 /*ARGSUSED*/
 STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
-		     char *name, int namelen,
-		     int valuelen, char *value)
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
@@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
 	xfs_mount_t *mp;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t dst;
+	void *dst;
 	xfs_buf_t *bp;
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
@@ -2007,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 		blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 		error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-				     blkcnt,
-				     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+				     blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
 				     &bp);
 		if (error)
 			return(error);
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp))
 			? valuelen : XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+		xfs_biomove(bp, 0, tmp, dst, XBF_READ);
 		xfs_buf_relse(bp);
 		dst += tmp;
 		valuelen -= tmp;
@@ -2039,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_inode_t *dp;
 	xfs_bmbt_irec_t map;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t src;
+	void *src;
 	xfs_buf_t *bp;
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
@@ -2141,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
 		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
-				 XFS_BUF_LOCK | XBF_DONT_BLOCK);
+				 XBF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2208,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	/*
 	 * If the "remote" value is in the cache, remove it.
 	 */
-	bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
-			XFS_INCORE_TRYLOCK);
+	bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 	if (bp) {
 		XFS_BUF_STALE(bp);
 		XFS_BUF_UNDELAYWRITE(bp);
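
The xfs_attr.c hunks above switch every attribute name and value to unsigned char, matching the on-disk __uint8_t arrays and removing a cast at each nameval access. A hedged caller-side example of the changed xfs_attr_get() signature — the inode pointer, the attribute name and the ATTR_ROOT flag are placeholders, not taken from this patch:

	/* Illustrative caller only; ip, "selinux" and ATTR_ROOT are
	 * placeholders chosen for the example. */
	unsigned char	value[256];
	int		valuelen = sizeof(value);
	int		error;

	error = xfs_attr_get(ip, (const unsigned char *)"selinux",
			     value, &valuelen, ATTR_ROOT);
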
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 59b410ce69a1..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern {
 
 
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-			      char *, int, int, char *);
+			      unsigned char *, int, int, unsigned char *);
 
 typedef struct xfs_attr_list_context {
 	struct xfs_inode		*dp;		/* inode */
@@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context {
 /*
  * Overall external interface routines.
  */
-int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index baf41b5af756..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count; i++) {
-		nargs.name = (char *)sfe->nameval;
+		nargs.name = sfe->nameval;
 		nargs.namelen = sfe->namelen;
-		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.value = &sfe->nameval[nargs.namelen];
 		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 		error = context->put_listent(context,
 					     sfe->flags,
-					     (char *)sfe->nameval,
+					     sfe->nameval,
 					     (int)sfe->namelen,
 					     (int)sfe->valuelen,
-					     (char*)&sfe->nameval[sfe->namelen]);
+					     &sfe->nameval[sfe->namelen]);
 
 		/*
 		 * Either search callback finished early or
@@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		}
 
 		sbp->entno = i;
-		sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
-		sbp->name = (char *)sfe->nameval;
+		sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+		sbp->name = sfe->nameval;
 		sbp->namelen = sfe->namelen;
 		/* These are bytes, and both on-disk, don't endian-flip */
 		sbp->valuelen = sfe->valuelen;
@@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
 		name_loc = xfs_attr_leaf_name_local(leaf, i);
-		nargs.name = (char *)name_loc->nameval;
+		nargs.name = name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
-		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.value = &name_loc->nameval[nargs.namelen];
 		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
 		nargs.hashval = be32_to_cpu(entry->hashval);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_loc->nameval,
+						name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen),
-						(char *)&name_loc->nameval[name_loc->namelen]);
+						&name_loc->nameval[name_loc->namelen]);
 			if (retval)
 				return retval;
 		} else {
@@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 				return retval;
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_rmt->name,
+						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
-						(char*)args.value);
+						args.value);
 			kmem_free(args.value);
 		} else {
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_rmt->name,
+						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
 						NULL);
@@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 						map.br_blockcount);
 			bp = xfs_trans_get_buf(*trans,
 					dp->i_mount->m_ddev_targp,
-					dblkno, dblkcnt, XFS_BUF_LOCK);
+					dblkno, dblkcnt, XBF_LOCK);
 			xfs_trans_binval(*trans, bp);
 			/*
 			 * Roll to next transaction.
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index 76ab7b0cbb3a..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
 	__uint8_t	valuelen;	/* length of value */
 	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
 	xfs_dahash_t	hash;		/* this entry's hash value */
-	char		*name;		/* name value, pointer into buffer */
+	unsigned char	*name;		/* name value, pointer into buffer */
 } xfs_attr_sf_sort_t;
 
 #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 98251cdc52aa..1869fb973819 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2629,13 +2629,12 @@ xfs_bmap_btalloc(
 	if (startag == NULLAGNUMBER)
 		startag = ag = 0;
 	notinit = 0;
-	down_read(&mp->m_peraglock);
+	pag = xfs_perag_get(mp, ag);
 	while (blen < ap->alen) {
-		pag = &mp->m_perag[ag];
 		if (!pag->pagf_init &&
 		    (error = xfs_alloc_pagf_init(mp, args.tp,
 			    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
-			up_read(&mp->m_peraglock);
+			xfs_perag_put(pag);
 			return error;
 		}
 		/*
@@ -2667,13 +2666,13 @@ xfs_bmap_btalloc(
 				break;
 
 			error = xfs_filestream_new_ag(ap, &ag);
-			if (error) {
-				up_read(&mp->m_peraglock);
+			xfs_perag_put(pag);
+			if (error)
 				return error;
-			}
 
 			/* loop again to set 'blen'*/
 			startag = NULLAGNUMBER;
+			pag = xfs_perag_get(mp, ag);
 			continue;
 		}
 	}
@@ -2681,8 +2680,10 @@ xfs_bmap_btalloc(
 			ag = 0;
 		if (ag == startag)
 			break;
+		xfs_perag_put(pag);
+		pag = xfs_perag_get(mp, ag);
 	}
-	up_read(&mp->m_peraglock);
+	xfs_perag_put(pag);
 	/*
 	 * Since the above loop did a BUF_TRYLOCK, it is
 	 * possible that there is space for this request.
@@ -4470,7 +4471,7 @@ xfs_bmapi(
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
 	xfs_fileoff_t	aoff;		/* allocated file offset */
-	xfs_bmalloca_t	bma;		/* args for xfs_bmap_alloc */
+	xfs_bmalloca_t	bma = { 0 };	/* args for xfs_bmap_alloc */
 	xfs_btree_cur_t	*cur;		/* bmap btree cursor */
 	xfs_fileoff_t	end;		/* end of mapped file region */
 	int		eof;		/* we've hit the end of extents */
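
The one-character bma change above is easy to miss: xfs_bmalloca_t is an on-stack args struct, and `= { 0 }` guarantees every field xfs_bmapi() later reads starts from a known zero state. For reference, the initializer is equivalent to an explicit memset (illustrative snippet, not from the patch):

	xfs_bmalloca_t	bma = { 0 };		/* what the hunk above uses */

	xfs_bmalloca_t	bma2;
	memset(&bma2, 0, sizeof(bma2));		/* same effect, spelled out */
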
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 38751d5fac6f..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf(
 /*
  * Set all the fields in a bmap extent record from the uncompressed form.
  */
-void
+STATIC void
 xfs_bmbt_disk_set_all(
 	xfs_bmbt_rec_t	*r,
 	xfs_bmbt_irec_t *s)
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cf07ca7c22e7..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
 extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
 extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
 
-extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
 			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 36a0992dd669..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -977,7 +977,7 @@ xfs_btree_get_buf_block(
 	xfs_daddr_t		d;
 
 	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XFS_BUF_TRYLOCK));
+	ASSERT(!(flags & XBF_TRYLOCK));
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block(
 	int			error;
 
 	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XFS_BUF_TRYLOCK));
+	ASSERT(!(flags & XBF_TRYLOCK));
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a30f7e9eb2b9..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -250,7 +250,7 @@ xfs_buf_item_format(
 		((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
 	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
 	vecp->i_len = base_size;
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
+	vecp->i_type = XLOG_REG_TYPE_BFORMAT;
 	vecp++;
 	nvecs = 1;
 
@@ -297,14 +297,14 @@ xfs_buf_item_format(
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			break;
 		} else if (next_bit != last_bit + 1) {
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			vecp++;
 			first_bit = next_bit;
@@ -316,7 +316,7 @@ xfs_buf_item_format(
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 /* You would think we need to bump the nvecs here too, but we do not
  * this number is used by recovery, and it gets confused by the boundary
  * split here
@@ -467,8 +467,10 @@ xfs_buf_item_unpin_remove(
 /*
  * This is called to attempt to lock the buffer associated with this
  * buf log item.  Don't sleep on the buffer lock.  If we can't get
- * the lock right away, return 0.  If we can get the lock, pull the
- * buffer from the free list, mark it busy, and return 1.
+ * the lock right away, return 0.  If we can get the lock, take a
+ * reference to the buffer. If this is a delayed write buffer that
+ * needs AIL help to be written back, invoke the pushbuf routine
+ * rather than the normal success path.
  */
 STATIC uint
 xfs_buf_item_trylock(
@@ -477,24 +479,18 @@ xfs_buf_item_trylock(
 	xfs_buf_t	*bp;
 
 	bp = bip->bli_buf;
-
-	if (XFS_BUF_ISPINNED(bp)) {
+	if (XFS_BUF_ISPINNED(bp))
 		return XFS_ITEM_PINNED;
-	}
-
-	if (!XFS_BUF_CPSEMA(bp)) {
+	if (!XFS_BUF_CPSEMA(bp))
 		return XFS_ITEM_LOCKED;
-	}
 
-	/*
-	 * Remove the buffer from the free list.  Only do this
-	 * if it's on the free list.  Private buffers like the
-	 * superblock buffer are not.
-	 */
+	/* take a reference to the buffer.  */
 	XFS_BUF_HOLD(bp);
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 	trace_xfs_buf_item_trylock(bip);
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		return XFS_ITEM_PUSHBUF;
 	return XFS_ITEM_SUCCESS;
 }
 
@@ -626,11 +622,9 @@ xfs_buf_item_committed(
 }
 
 /*
- * This is called to asynchronously write the buffer associated with this
- * buf log item out to disk. The buffer will already have been locked by
- * a successful call to xfs_buf_item_trylock().  If the buffer still has
- * B_DELWRI set, then get it going out to disk with a call to bawrite().
- * If not, then just release the buffer.
+ * The buffer is locked, but is not a delayed write buffer. This happens
+ * if we race with IO completion and hence we don't want to try to write it
+ * again. Just release the buffer.
  */
 STATIC void
 xfs_buf_item_push(
@@ -642,17 +636,29 @@ xfs_buf_item_push(
 	trace_xfs_buf_item_push(bip);
 
 	bp = bip->bli_buf;
+	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+	xfs_buf_relse(bp);
+}
 
-	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		int	error;
-		error = xfs_bawrite(bip->bli_item.li_mountp, bp);
-		if (error)
-			xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
-			"xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
-							error, bip, bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
+/*
+ * The buffer is locked and is a delayed write buffer. Promote the buffer
+ * in the delayed write queue as the caller knows that they must invoke
+ * the xfsbufd to get this buffer written. We have to unlock the buffer
+ * to allow the xfsbufd to write it, too.
+ */
+STATIC void
+xfs_buf_item_pushbuf(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	trace_xfs_buf_item_pushbuf(bip);
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
+	xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
 }
 
 /* ARGSUSED */
@@ -677,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
-	.iop_pushbuf	= NULL,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committing
 };
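
The xfs_buf_item.c changes split the old push path in two: ->iop_trylock now reports XFS_ITEM_PUSHBUF for a locked delayed-write buffer, and the new ->iop_pushbuf promotes that buffer on the delwri queue for the xfsbufd rather than issuing xfs_bawrite() inline. In outline, the AIL pusher consumes those return codes like this — a sketch only; the real xfsaild loop in xfs_trans_ail.c handles more states and batching:

	/* Sketch of the consumer side; simplified from the xfsaild loop. */
	switch (IOP_TRYLOCK(lip)) {
	case XFS_ITEM_SUCCESS:
		IOP_PUSH(lip);		/* locked, not delwri: just release */
		break;
	case XFS_ITEM_PUSHBUF:
		IOP_PUSHBUF(lip);	/* delwri buffer: promote for xfsbufd */
		break;
	case XFS_ITEM_PINNED:
		/* pinned in the log; needs a log force first */
		break;
	case XFS_ITEM_LOCKED:
		break;			/* contended; skip this item */
	}
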
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c0c8869115b1..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
 enum xfs_dacmp
 xfs_da_compname(
 	struct xfs_da_args *args,
-	const char	*name,
+	const unsigned char *name,
 	int		len)
 {
 	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
 					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 30cd08f56a3a..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -209,7 +209,8 @@ typedef struct xfs_da_state {
  */
 struct xfs_nameops {
 	xfs_dahash_t	(*hashname)(struct xfs_name *);
-	enum xfs_dacmp	(*compname)(struct xfs_da_args *, const char *, int);
+	enum xfs_dacmp	(*compname)(struct xfs_da_args *,
+					const unsigned char *, int);
 };
 
 
@@ -260,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 
 uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
 enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
-				const char *name, int len);
+				const unsigned char *name, int len);
 
 
 xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 84ca1cf16a1e..cd27c9d6c71f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -45,15 +45,21 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
+
+static int xfs_swap_extents(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
+	xfs_swapext_t	*sxp);
+
 /*
- * Syssgi interface for swapext
+ * ioctl interface for swapext
  */
 int
 xfs_swapext(
 	xfs_swapext_t	*sxp)
 {
 	xfs_inode_t     *ip, *tip;
-	struct file	*file, *target_file;
+	struct file	*file, *tmp_file;
 	int		error = 0;
 
 	/* Pull information for the target fd */
@@ -68,46 +74,46 @@ xfs_swapext(
 		goto out_put_file;
 	}
 
-	target_file = fget((int)sxp->sx_fdtmp);
-	if (!target_file) {
+	tmp_file = fget((int)sxp->sx_fdtmp);
+	if (!tmp_file) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_file;
 	}
 
-	if (!(target_file->f_mode & FMODE_WRITE) ||
-	    (target_file->f_flags & O_APPEND)) {
+	if (!(tmp_file->f_mode & FMODE_WRITE) ||
+	    (tmp_file->f_flags & O_APPEND)) {
 		error = XFS_ERROR(EBADF);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
-	    IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
+	    IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	ip = XFS_I(file->f_path.dentry->d_inode);
-	tip = XFS_I(target_file->f_path.dentry->d_inode);
+	tip = XFS_I(tmp_file->f_path.dentry->d_inode);
 
 	if (ip->i_mount != tip->i_mount) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (ip->i_ino == tip->i_ino) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		error = XFS_ERROR(EIO);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	error = xfs_swap_extents(ip, tip, sxp);
 
- out_put_target_file:
-	fput(target_file);
+ out_put_tmp_file:
+	fput(tmp_file);
  out_put_file:
 	fput(file);
  out:
@@ -186,7 +192,7 @@ xfs_swap_extents_check_format(
 	return 0;
 }
 
-int
+static int
 xfs_swap_extents(
 	xfs_inode_t	*ip,	/* target inode */
 	xfs_inode_t	*tip,	/* tmp inode */
@@ -254,6 +260,9 @@ xfs_swap_extents(
 		goto out_unlock;
 	}
 
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
 	/* check inode formats now that data is flushed */
 	error = xfs_swap_extents_check_format(ip, tip);
 	if (error) {
@@ -421,6 +430,8 @@ xfs_swap_extents(
 
 	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
 
+	trace_xfs_swap_extent_after(ip, 0);
+	trace_xfs_swap_extent_after(tip, 1);
 out:
 	kmem_free(tempifp);
 	return error;
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
  */
 int	xfs_swapext(struct xfs_swapext *sx);
 
-int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
-		struct xfs_swapext *sxp);
-
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 93634a7e90e9..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,7 +44,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
-struct xfs_name xfs_name_dotdot = {"..", 2};
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
 
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
 STATIC enum xfs_dacmp
 xfs_ascii_ci_compname(
 	struct xfs_da_args *args,
-	const char	*name,
+	const unsigned char *name,
 	int		len)
 {
 	enum xfs_dacmp	result;
 	int		i;
@@ -247,7 +247,7 @@ xfs_dir_createname(
 int
 xfs_dir_cilookup_result(
 	struct xfs_da_args *args,
-	const char	*name,
+	const unsigned char *name,
 	int		len)
 {
 	if (args->cmpresult == XFS_CMP_DIFFERENT)
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
 				struct xfs_dabuf *bp);
 
-extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name,
-				int len);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+				const unsigned char *name, int len);
 
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ddc4ecc7807f..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
 void
 xfs_dir_startup(void)
 {
-	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
-	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
 /*
@@ -513,8 +513,9 @@ xfs_dir2_block_getdents(
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
-			    be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
+		if (filldir(dirent, (char *)dep->name, dep->namelen,
+			    cook & 0x7fffffff, be64_to_cpu(dep->inumber),
+			    DT_UNKNOWN)) {
 			*offset = cook & 0x7fffffff;
 			xfs_da_brelse(NULL, bp);
 			return 0;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 29f484c11b3a..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents(
 		dep = (xfs_dir2_data_entry_t *)ptr;
 		length = xfs_dir2_data_entsize(dep->namelen);
 
-		if (filldir(dirent, dep->name, dep->namelen,
+		if (filldir(dirent, (char *)dep->name, dep->namelen,
 			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
 			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
 			break;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ce6e355199b5..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
 /*
  * Log entries from a freespace block.
  */
-void
+STATIC void
 xfs_dir2_free_log_bests(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	xfs_dabuf_t		*bp,		/* freespace buffer */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
 	return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
 }
 
-extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
-				    int first, int last);
 extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
 				 struct xfs_dabuf *lbp);
 extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 9d4f17a69676..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -782,7 +782,7 @@ xfs_dir2_sf_getdents(
 		}
 
 		ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
-		if (filldir(dirent, sfep->name, sfep->namelen,
+		if (filldir(dirent, (char *)sfep->name, sfep->namelen,
 			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
 			*offset = off & 0x7fffffff;
 			return 0;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..6f35ed1b39b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
 	log_vector->i_len = size;
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT);
+	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
 }
 
@@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
 	log_vector->i_len = size;
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT);
+	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a631e1451abb..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,6 +140,7 @@ _xfs_filestream_pick_ag(
 	int		flags,
 	xfs_extlen_t	minlen)
 {
+	int		streams, max_streams;
 	int		err, trylock, nscan;
 	xfs_extlen_t	longest, free, minfree, maxfree = 0;
 	xfs_agnumber_t	ag, max_ag = NULLAGNUMBER;
@@ -155,15 +156,15 @@ _xfs_filestream_pick_ag(
 	trylock = XFS_ALLOC_FLAG_TRYLOCK;
 
 	for (nscan = 0; 1; nscan++) {
-
-		TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag));
-
-		pag = mp->m_perag + ag;
+		pag = xfs_perag_get(mp, ag);
+		TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
 
 		if (!pag->pagf_init) {
 			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
-			if (err && !trylock)
+			if (err && !trylock) {
+				xfs_perag_put(pag);
 				return err;
+			}
 		}
 
 		/* Might fail sometimes during the 1st pass with trylock set. */
@@ -173,6 +174,7 @@ _xfs_filestream_pick_ag(
 		/* Keep track of the AG with the most free blocks. */
 		if (pag->pagf_freeblks > maxfree) {
 			maxfree = pag->pagf_freeblks;
+			max_streams = atomic_read(&pag->pagf_fstrms);
 			max_ag = ag;
 		}
 
@@ -195,6 +197,8 @@ _xfs_filestream_pick_ag(
 
 			/* Break out, retaining the reference on the AG. */
 			free = pag->pagf_freeblks;
+			streams = atomic_read(&pag->pagf_fstrms);
+			xfs_perag_put(pag);
 			*agp = ag;
 			break;
 		}
@@ -202,6 +206,7 @@ _xfs_filestream_pick_ag(
 		/* Drop the reference on this AG, it's not usable. */
 		xfs_filestream_put_ag(mp, ag);
 next_ag:
+		xfs_perag_put(pag);
 		/* Move to the next AG, wrapping to AG 0 if necessary. */
 		if (++ag >= mp->m_sb.sb_agcount)
 			ag = 0;
@@ -229,6 +234,7 @@ next_ag:
 		if (max_ag != NULLAGNUMBER) {
 			xfs_filestream_get_ag(mp, max_ag);
 			TRACE_AG_PICK1(mp, max_ag, maxfree);
+			streams = max_streams;
 			free = maxfree;
 			*agp = max_ag;
 			break;
@@ -240,16 +246,14 @@ next_ag:
 		return 0;
 	}
 
-	TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp),
-			free, nscan, flags);
+	TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
 
 	return 0;
 }
 
 /*
  * Set the allocation group number for a file or a directory, updating inode
- * references and per-AG references as appropriate.  Must be called with the
- * m_peraglock held in read mode.
+ * references and per-AG references as appropriate.
  */
 static int
 _xfs_filestream_update_ag(
@@ -451,20 +455,6 @@ xfs_filestream_unmount(
451} 455}
452 456
453/* 457/*
454 * If the mount point's m_perag array is going to be reallocated, all
455 * outstanding cache entries must be flushed to avoid accessing reference count
456 * addresses that have been freed. The call to xfs_filestream_flush() must be
457 * made inside the block that holds the m_peraglock in write mode to do the
458 * reallocation.
459 */
460void
461xfs_filestream_flush(
462 xfs_mount_t *mp)
463{
464 xfs_mru_cache_flush(mp->m_filestream);
465}
466
467/*
468 * Return the AG of the filestream the file or directory belongs to, or 458 * Return the AG of the filestream the file or directory belongs to, or
469 * NULLAGNUMBER otherwise. 459 * NULLAGNUMBER otherwise.
470 */ 460 */
@@ -526,7 +516,6 @@ xfs_filestream_associate(
526 516
527 mp = pip->i_mount; 517 mp = pip->i_mount;
528 cache = mp->m_filestream; 518 cache = mp->m_filestream;
529 down_read(&mp->m_peraglock);
530 519
531 /* 520 /*
532 * We have a problem, Houston. 521 * We have a problem, Houston.
@@ -543,10 +532,8 @@ xfs_filestream_associate(
543 * 532 *
544 * So, if we can't get the iolock without sleeping then just give up 533 * So, if we can't get the iolock without sleeping then just give up
545 */ 534 */
546 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { 535 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
547 up_read(&mp->m_peraglock);
548 return 1; 536 return 1;
549 }
550 537
551 /* If the parent directory is already in the cache, use its AG. */ 538 /* If the parent directory is already in the cache, use its AG. */
552 item = xfs_mru_cache_lookup(cache, pip->i_ino); 539 item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -601,7 +588,6 @@ exit_did_pick:
601 588
602exit: 589exit:
603 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
604 up_read(&mp->m_peraglock);
605 return -err; 591 return -err;
606} 592}
607 593
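The hunks above all converge on one pattern: every xfs_perag_get() must be paired with an xfs_perag_put() on every exit path, including the error and next_ag paths. A minimal sketch of the pattern, using only names from the hunks above (kernel context assumed, condensed, not compilable standalone):

	struct xfs_perag	*pag;
	int			error = 0;

	pag = xfs_perag_get(mp, ag);		/* takes a reference */
	if (!pag->pagf_init) {
		error = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
		if (error && !trylock)
			goto out_put;		/* drop the reference on error, too */
	}
	/* ... read pag->pagf_freeblks, pag->pagf_fstrms, ... */
out_put:
	xfs_perag_put(pag);			/* releases the reference */
	return error;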
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 4aba67c5f64f..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf;
79 * the cache that reference per-ag array elements that have since been 79 * the cache that reference per-ag array elements that have since been
80 * reallocated. 80 * reallocated.
81 */ 81 */
82/*
83 * xfs_filestream_peek_ag is only used in tracing code
84 */
82static inline int 85static inline int
83xfs_filestream_peek_ag( 86xfs_filestream_peek_ag(
84 xfs_mount_t *mp, 87 xfs_mount_t *mp,
85 xfs_agnumber_t agno) 88 xfs_agnumber_t agno)
86{ 89{
87 return atomic_read(&mp->m_perag[agno].pagf_fstrms); 90 struct xfs_perag *pag;
91 int ret;
92
93 pag = xfs_perag_get(mp, agno);
94 ret = atomic_read(&pag->pagf_fstrms);
95 xfs_perag_put(pag);
96 return ret;
88} 97}
89 98
90static inline int 99static inline int
@@ -92,7 +101,13 @@ xfs_filestream_get_ag(
92 xfs_mount_t *mp, 101 xfs_mount_t *mp,
93 xfs_agnumber_t agno) 102 xfs_agnumber_t agno)
94{ 103{
95 return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); 104 struct xfs_perag *pag;
105 int ret;
106
107 pag = xfs_perag_get(mp, agno);
108 ret = atomic_inc_return(&pag->pagf_fstrms);
109 xfs_perag_put(pag);
110 return ret;
96} 111}
97 112
98static inline int 113static inline int
@@ -100,7 +115,13 @@ xfs_filestream_put_ag(
100 xfs_mount_t *mp, 115 xfs_mount_t *mp,
101 xfs_agnumber_t agno) 116 xfs_agnumber_t agno)
102{ 117{
103 return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); 118 struct xfs_perag *pag;
119 int ret;
120
121 pag = xfs_perag_get(mp, agno);
122 ret = atomic_dec_return(&pag->pagf_fstrms);
123 xfs_perag_put(pag);
124 return ret;
104} 125}
105 126
106/* allocation selection flags */ 127/* allocation selection flags */
@@ -114,7 +135,6 @@ int xfs_filestream_init(void);
114void xfs_filestream_uninit(void); 135void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp); 136int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp); 137void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); 138xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); 139int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip); 140void xfs_filestream_deassociate(struct xfs_inode *ip);
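With these helpers rewritten, callers never touch mp->m_perag directly; each helper takes and drops its own perag reference around a single atomic operation. A hypothetical call sequence (sketch only, names from the header above):

	int	n;

	n = xfs_filestream_get_ag(mp, agno);	/* atomic_inc_return under a perag ref */
	/* ... associate allocations with this AG ... */
	n = xfs_filestream_peek_ag(mp, agno);	/* read-only; tracing code only, per the comment */
	n = xfs_filestream_put_ag(mp, agno);	/* atomic_dec_return; drops the stream count */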
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a13919a6a364..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,27 +167,14 @@ xfs_growfs_data_private(
167 } 167 }
168 new = nb - mp->m_sb.sb_dblocks; 168 new = nb - mp->m_sb.sb_dblocks;
169 oagcount = mp->m_sb.sb_agcount; 169 oagcount = mp->m_sb.sb_agcount;
170 if (nagcount > oagcount) {
171 void *new_perag, *old_perag;
172
173 xfs_filestream_flush(mp);
174
175 new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
176 KM_MAYFAIL);
177 if (!new_perag)
178 return XFS_ERROR(ENOMEM);
179
180 down_write(&mp->m_peraglock);
181 memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
182 old_perag = mp->m_perag;
183 mp->m_perag = new_perag;
184
185 mp->m_flags |= XFS_MOUNT_32BITINODES;
186 nagimax = xfs_initialize_perag(mp, nagcount);
187 up_write(&mp->m_peraglock);
188 170
189 kmem_free(old_perag); 171 /* allocate the new per-ag structures */
172 if (nagcount > oagcount) {
173 error = xfs_initialize_perag(mp, nagcount, &nagimax);
174 if (error)
175 return error;
190 } 176 }
177
191 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 178 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
192 tp->t_flags |= XFS_TRANS_RESERVE; 179 tp->t_flags |= XFS_TRANS_RESERVE;
193 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 180 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -196,6 +183,11 @@ xfs_growfs_data_private(
196 return error; 183 return error;
197 } 184 }
198 185
186 /*
187 * Write new AG headers to disk. Non-transactional, but written
188 * synchronously so they are completed prior to the growfs transaction
189 * being logged.
190 */
199 nfree = 0; 191 nfree = 0;
200 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 192 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
201 /* 193 /*
@@ -359,6 +351,12 @@ xfs_growfs_data_private(
359 goto error0; 351 goto error0;
360 } 352 }
361 } 353 }
354
355 /*
356 * Update changed superblock fields transactionally. These are not
357 * seen by the rest of the world until the transaction commit applies
358 * them atomically to the superblock.
359 */
362 if (nagcount > oagcount) 360 if (nagcount > oagcount)
363 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); 361 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
364 if (nb > mp->m_sb.sb_dblocks) 362 if (nb > mp->m_sb.sb_dblocks)
@@ -369,9 +367,9 @@ xfs_growfs_data_private(
369 if (dpct) 367 if (dpct)
370 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 368 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
371 error = xfs_trans_commit(tp, 0); 369 error = xfs_trans_commit(tp, 0);
372 if (error) { 370 if (error)
373 return error; 371 return error;
374 } 372
375 /* New allocation groups fully initialized, so update mount struct */ 373 /* New allocation groups fully initialized, so update mount struct */
376 if (nagimax) 374 if (nagimax)
377 mp->m_maxagi = nagimax; 375 mp->m_maxagi = nagimax;
@@ -381,6 +379,8 @@ xfs_growfs_data_private(
381 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 379 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
382 } else 380 } else
383 mp->m_maxicount = 0; 381 mp->m_maxicount = 0;
382
383 /* update secondary superblocks. */
384 for (agno = 1; agno < nagcount; agno++) { 384 for (agno = 1; agno < nagcount; agno++) {
385 error = xfs_read_buf(mp, mp->m_ddev_targp, 385 error = xfs_read_buf(mp, mp->m_ddev_targp,
386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
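The rewritten growfs path now has a clear ordering, condensed below from the hunks above (a sketch with error handling elided, not the verbatim function):

	/* 1. allocate and initialise the new per-AG structures */
	if (nagcount > oagcount) {
		error = xfs_initialize_perag(mp, nagcount, &nagimax);
		if (error)
			return error;
	}
	/* 2. write the new AG headers synchronously, outside the transaction */
	/* 3. log the superblock field changes so they commit atomically */
	/* 4. after commit, expose the new AGs via mp->m_maxagi and m_maxicount */
	/* 5. update the secondary superblocks */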
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index cb907ba69c4c..9d884c127bb9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster, 207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK); 208 XBF_LOCK);
209 ASSERT(fbuf); 209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf)); 210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211 211
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
253 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
254 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
255 /* boundary */ 255 /* boundary */
256 struct xfs_perag *pag;
256 257
257 args.tp = tp; 258 args.tp = tp;
258 args.mp = tp->t_mountp; 259 args.mp = tp->t_mountp;
@@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc(
382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 383 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
383 be32_add_cpu(&agi->agi_count, newlen); 384 be32_add_cpu(&agi->agi_count, newlen);
384 be32_add_cpu(&agi->agi_freecount, newlen); 385 be32_add_cpu(&agi->agi_freecount, newlen);
385 down_read(&args.mp->m_peraglock); 386 pag = xfs_perag_get(args.mp, agno);
386 args.mp->m_perag[agno].pagi_freecount += newlen; 387 pag->pagi_freecount += newlen;
387 up_read(&args.mp->m_peraglock); 388 xfs_perag_put(pag);
388 agi->agi_newino = cpu_to_be32(newino); 389 agi->agi_newino = cpu_to_be32(newino);
389 390
390 /* 391 /*
@@ -486,9 +487,8 @@ xfs_ialloc_ag_select(
486 */ 487 */
487 agno = pagno; 488 agno = pagno;
488 flags = XFS_ALLOC_FLAG_TRYLOCK; 489 flags = XFS_ALLOC_FLAG_TRYLOCK;
489 down_read(&mp->m_peraglock);
490 for (;;) { 490 for (;;) {
491 pag = &mp->m_perag[agno]; 491 pag = xfs_perag_get(mp, agno);
492 if (!pag->pagi_init) { 492 if (!pag->pagi_init) {
493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
494 agbp = NULL; 494 agbp = NULL;
@@ -527,7 +527,7 @@ xfs_ialloc_ag_select(
527 agbp = NULL; 527 agbp = NULL;
528 goto nextag; 528 goto nextag;
529 } 529 }
530 up_read(&mp->m_peraglock); 530 xfs_perag_put(pag);
531 return agbp; 531 return agbp;
532 } 532 }
533 } 533 }
@@ -535,22 +535,19 @@ unlock_nextag:
535 if (agbp) 535 if (agbp)
536 xfs_trans_brelse(tp, agbp); 536 xfs_trans_brelse(tp, agbp);
537nextag: 537nextag:
538 xfs_perag_put(pag);
538 /* 539 /*
539 * No point in iterating over the rest, if we're shutting 540 * No point in iterating over the rest, if we're shutting
540 * down. 541 * down.
541 */ 542 */
542 if (XFS_FORCED_SHUTDOWN(mp)) { 543 if (XFS_FORCED_SHUTDOWN(mp))
543 up_read(&mp->m_peraglock);
544 return NULL; 544 return NULL;
545 }
546 agno++; 545 agno++;
547 if (agno >= agcount) 546 if (agno >= agcount)
548 agno = 0; 547 agno = 0;
549 if (agno == pagno) { 548 if (agno == pagno) {
550 if (flags == 0) { 549 if (flags == 0)
551 up_read(&mp->m_peraglock);
552 return NULL; 550 return NULL;
553 }
554 flags = 0; 551 flags = 0;
555 } 552 }
556 } 553 }
@@ -672,6 +669,7 @@ xfs_dialloc(
672 xfs_agnumber_t tagno; /* testing allocation group number */ 669 xfs_agnumber_t tagno; /* testing allocation group number */
673 xfs_btree_cur_t *tcur; /* temp cursor */ 670 xfs_btree_cur_t *tcur; /* temp cursor */
674 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ 671 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
672 struct xfs_perag *pag;
675 673
676 674
677 if (*IO_agbp == NULL) { 675 if (*IO_agbp == NULL) {
@@ -771,13 +769,13 @@ nextag:
771 *inop = NULLFSINO; 769 *inop = NULLFSINO;
772 return noroom ? ENOSPC : 0; 770 return noroom ? ENOSPC : 0;
773 } 771 }
774 down_read(&mp->m_peraglock); 772 pag = xfs_perag_get(mp, tagno);
775 if (mp->m_perag[tagno].pagi_inodeok == 0) { 773 if (pag->pagi_inodeok == 0) {
776 up_read(&mp->m_peraglock); 774 xfs_perag_put(pag);
777 goto nextag; 775 goto nextag;
778 } 776 }
779 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); 777 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
780 up_read(&mp->m_peraglock); 778 xfs_perag_put(pag);
781 if (error) 779 if (error)
782 goto nextag; 780 goto nextag;
783 agi = XFS_BUF_TO_AGI(agbp); 781 agi = XFS_BUF_TO_AGI(agbp);
@@ -790,6 +788,7 @@ nextag:
790 */ 788 */
791 agno = tagno; 789 agno = tagno;
792 *IO_agbp = NULL; 790 *IO_agbp = NULL;
791 pag = xfs_perag_get(mp, agno);
793 792
794 restart_pagno: 793 restart_pagno:
795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 794 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +807,6 @@ nextag:
808 * If in the same AG as the parent, try to get near the parent. 807 * If in the same AG as the parent, try to get near the parent.
809 */ 808 */
810 if (pagno == agno) { 809 if (pagno == agno) {
811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */ 810 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */ 811 int doneright; /* done, to the right */
814 int searchdistance = 10; 812 int searchdistance = 10;
@@ -1006,9 +1004,7 @@ alloc_inode:
1006 goto error0; 1004 goto error0;
1007 be32_add_cpu(&agi->agi_freecount, -1); 1005 be32_add_cpu(&agi->agi_freecount, -1);
1008 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1006 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1009 down_read(&mp->m_peraglock); 1007 pag->pagi_freecount--;
1010 mp->m_perag[tagno].pagi_freecount--;
1011 up_read(&mp->m_peraglock);
1012 1008
1013 error = xfs_check_agi_freecount(cur, agi); 1009 error = xfs_check_agi_freecount(cur, agi);
1014 if (error) 1010 if (error)
@@ -1016,12 +1012,14 @@ alloc_inode:
1016 1012
1017 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1013 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1018 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1014 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1015 xfs_perag_put(pag);
1019 *inop = ino; 1016 *inop = ino;
1020 return 0; 1017 return 0;
1021error1: 1018error1:
1022 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 1019 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1023error0: 1020error0:
1024 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1021 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1022 xfs_perag_put(pag);
1025 return error; 1023 return error;
1026} 1024}
1027 1025
@@ -1052,6 +1050,7 @@ xfs_difree(
1052 xfs_mount_t *mp; /* mount structure for filesystem */ 1050 xfs_mount_t *mp; /* mount structure for filesystem */
1053 int off; /* offset of inode in inode chunk */ 1051 int off; /* offset of inode in inode chunk */
1054 xfs_inobt_rec_incore_t rec; /* btree record */ 1052 xfs_inobt_rec_incore_t rec; /* btree record */
1053 struct xfs_perag *pag;
1055 1054
1056 mp = tp->t_mountp; 1055 mp = tp->t_mountp;
1057 1056
@@ -1088,9 +1087,7 @@ xfs_difree(
1088 /* 1087 /*
1089 * Get the allocation group header. 1088 * Get the allocation group header.
1090 */ 1089 */
1091 down_read(&mp->m_peraglock);
1092 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1090 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1093 up_read(&mp->m_peraglock);
1094 if (error) { 1091 if (error) {
1095 cmn_err(CE_WARN, 1092 cmn_err(CE_WARN,
1096 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1093 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
@@ -1157,9 +1154,9 @@ xfs_difree(
1157 be32_add_cpu(&agi->agi_count, -ilen); 1154 be32_add_cpu(&agi->agi_count, -ilen);
1158 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1155 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1159 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1156 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1160 down_read(&mp->m_peraglock); 1157 pag = xfs_perag_get(mp, agno);
1161 mp->m_perag[agno].pagi_freecount -= ilen - 1; 1158 pag->pagi_freecount -= ilen - 1;
1162 up_read(&mp->m_peraglock); 1159 xfs_perag_put(pag);
1163 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1160 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1164 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1161 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1165 1162
@@ -1188,9 +1185,9 @@ xfs_difree(
1188 */ 1185 */
1189 be32_add_cpu(&agi->agi_freecount, 1); 1186 be32_add_cpu(&agi->agi_freecount, 1);
1190 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1187 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1191 down_read(&mp->m_peraglock); 1188 pag = xfs_perag_get(mp, agno);
1192 mp->m_perag[agno].pagi_freecount++; 1189 pag->pagi_freecount++;
1193 up_read(&mp->m_peraglock); 1190 xfs_perag_put(pag);
1194 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1191 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1195 } 1192 }
1196 1193
@@ -1312,9 +1309,7 @@ xfs_imap(
1312 xfs_buf_t *agbp; /* agi buffer */ 1309 xfs_buf_t *agbp; /* agi buffer */
1313 int i; /* temp state */ 1310 int i; /* temp state */
1314 1311
1315 down_read(&mp->m_peraglock);
1316 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1312 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1317 up_read(&mp->m_peraglock);
1318 if (error) { 1313 if (error) {
1319 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1314 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1320 "xfs_ialloc_read_agi() returned " 1315 "xfs_ialloc_read_agi() returned "
@@ -1379,7 +1374,6 @@ xfs_imap(
1379 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1374 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1380 return XFS_ERROR(EINVAL); 1375 return XFS_ERROR(EINVAL);
1381 } 1376 }
1382
1383 return 0; 1377 return 0;
1384} 1378}
1385 1379
@@ -1523,8 +1517,7 @@ xfs_ialloc_read_agi(
1523 return error; 1517 return error;
1524 1518
1525 agi = XFS_BUF_TO_AGI(*bpp); 1519 agi = XFS_BUF_TO_AGI(*bpp);
1526 pag = &mp->m_perag[agno]; 1520 pag = xfs_perag_get(mp, agno);
1527
1528 if (!pag->pagi_init) { 1521 if (!pag->pagi_init) {
1529 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1522 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1530 pag->pagi_count = be32_to_cpu(agi->agi_count); 1523 pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1530,7 @@ xfs_ialloc_read_agi(
1537 */ 1530 */
1538 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 1531 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1539 XFS_FORCED_SHUTDOWN(mp)); 1532 XFS_FORCED_SHUTDOWN(mp));
1533 xfs_perag_put(pag);
1540 return 0; 1534 return 0;
1541} 1535}
1542 1536
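The same mechanical conversion repeats throughout this file. Distilled to a before/after pair (taken from the agi_freecount hunk above):

	/* before: global rwsem around a flat array */
	down_read(&mp->m_peraglock);
	mp->m_perag[agno].pagi_freecount += newlen;
	up_read(&mp->m_peraglock);

	/* after: per-AG reference, no global lock */
	pag = xfs_perag_get(mp, agno);
	pag->pagi_freecount += newlen;
	xfs_perag_put(pag);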
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 155e798f30a1..e281eb4a1c49 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -374,7 +374,7 @@ xfs_iget(
374 return EINVAL; 374 return EINVAL;
375 375
376 /* get the perag structure and ensure that it's inode capable */ 376 /* get the perag structure and ensure that it's inode capable */
377 pag = xfs_get_perag(mp, ino); 377 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
378 if (!pag->pagi_inodeok) 378 if (!pag->pagi_inodeok)
379 return EINVAL; 379 return EINVAL;
380 ASSERT(pag->pag_ici_init); 380 ASSERT(pag->pag_ici_init);
@@ -398,7 +398,7 @@ again:
398 if (error) 398 if (error)
399 goto out_error_or_again; 399 goto out_error_or_again;
400 } 400 }
401 xfs_put_perag(mp, pag); 401 xfs_perag_put(pag);
402 402
403 *ipp = ip; 403 *ipp = ip;
404 404
@@ -417,7 +417,7 @@ out_error_or_again:
417 delay(1); 417 delay(1);
418 goto again; 418 goto again;
419 } 419 }
420 xfs_put_perag(mp, pag); 420 xfs_perag_put(pag);
421 return error; 421 return error;
422} 422}
423 423
@@ -488,12 +488,12 @@ xfs_ireclaim(
488 * added to the tree assert that it's been there before to catch 488 * added to the tree assert that it's been there before to catch
489 * problems with the inode life time early on. 489 * problems with the inode life time early on.
490 */ 490 */
491 pag = xfs_get_perag(mp, ip->i_ino); 491 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
492 write_lock(&pag->pag_ici_lock); 492 write_lock(&pag->pag_ici_lock);
493 if (!radix_tree_delete(&pag->pag_ici_root, agino)) 493 if (!radix_tree_delete(&pag->pag_ici_root, agino))
494 ASSERT(0); 494 ASSERT(0);
495 write_unlock(&pag->pag_ici_lock); 495 write_unlock(&pag->pag_ici_lock);
496 xfs_put_perag(mp, pag); 496 xfs_perag_put(pag);
497 497
498 /* 498 /*
499 * Here we do an (almost) spurious inode lock in order to coordinate 499 * Here we do an (almost) spurious inode lock in order to coordinate
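Note the calling-convention change visible here: the old xfs_get_perag() took an inode number and converted to an AG internally, while xfs_perag_get() takes an AG number, so callers now convert explicitly. Sketch of the two forms, from the hunks above:

	/* old */
	pag = xfs_get_perag(mp, ip->i_ino);
	/* ... */
	xfs_put_perag(mp, pag);

	/* new */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	/* ... */
	xfs_perag_put(pag);		/* no mount pointer needed to drop the ref */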
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ef77fd88c8e3..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
151 "an error %d on %s. Returning error.", 151 "an error %d on %s. Returning error.",
152 error, mp->m_fsname); 152 error, mp->m_fsname);
153 } else { 153 } else {
154 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 154 ASSERT(buf_flags & XBF_TRYLOCK);
155 } 155 }
156 return error; 156 return error;
157 } 157 }
@@ -239,7 +239,7 @@ xfs_inotobp(
239 if (error) 239 if (error)
240 return error; 240 return error;
241 241
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
243 if (error) 243 if (error)
244 return error; 244 return error;
245 245
@@ -285,7 +285,7 @@ xfs_itobp(
285 return error; 285 return error;
286 286
287 if (!bp) { 287 if (!bp) {
288 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 288 ASSERT(buf_flags & XBF_TRYLOCK);
289 ASSERT(tp == NULL); 289 ASSERT(tp == NULL);
290 *bpp = NULL; 290 *bpp = NULL;
291 return EAGAIN; 291 return EAGAIN;
@@ -807,7 +807,7 @@ xfs_iread(
807 * Get pointers to the on-disk inode and the buffer containing it. 807 * Get pointers to the on-disk inode and the buffer containing it.
808 */ 808 */
809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
810 XFS_BUF_LOCK, iget_flags); 810 XBF_LOCK, iget_flags);
811 if (error) 811 if (error)
812 return error; 812 return error;
813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1751,7 +1751,7 @@ xfs_iunlink(
1751 * Here we put the head pointer into our next pointer, 1751 * Here we put the head pointer into our next pointer,
1752 * and then we fall through to point the head at us. 1752 * and then we fall through to point the head at us.
1753 */ 1753 */
1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1755 if (error) 1755 if (error)
1756 return error; 1756 return error;
1757 1757
@@ -1833,7 +1833,7 @@ xfs_iunlink_remove(
1833 * of dealing with the buffer when there is no need to 1833 * of dealing with the buffer when there is no need to
1834 * change it. 1834 * change it.
1835 */ 1835 */
1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1837 if (error) { 1837 if (error) {
1838 cmn_err(CE_WARN, 1838 cmn_err(CE_WARN,
1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1895,7 +1895,7 @@ xfs_iunlink_remove(
1895 * Now last_ibp points to the buffer previous to us on 1895 * Now last_ibp points to the buffer previous to us on
1896 * the unlinked list. Pull us from the list. 1896 * the unlinked list. Pull us from the list.
1897 */ 1897 */
1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1899 if (error) { 1899 if (error) {
1900 cmn_err(CE_WARN, 1900 cmn_err(CE_WARN,
1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1946,8 +1946,9 @@ xfs_ifree_cluster(
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip, **ip_found;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 xfs_perag_t *pag = xfs_get_perag(mp, inum); 1949 struct xfs_perag *pag;
1950 1950
1951 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1951 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 1952 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1952 blks_per_cluster = 1; 1953 blks_per_cluster = 1;
1953 ninodes = mp->m_sb.sb_inopblock; 1954 ninodes = mp->m_sb.sb_inopblock;
@@ -2039,7 +2040,7 @@ xfs_ifree_cluster(
2039 2040
2040 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2041 mp->m_bsize * blks_per_cluster, 2042 mp->m_bsize * blks_per_cluster,
2042 XFS_BUF_LOCK); 2043 XBF_LOCK);
2043 2044
2044 pre_flushed = 0; 2045 pre_flushed = 0;
2045 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -2088,7 +2089,7 @@ xfs_ifree_cluster(
2088 } 2089 }
2089 2090
2090 kmem_free(ip_found); 2091 kmem_free(ip_found);
2091 xfs_put_perag(mp, pag); 2092 xfs_perag_put(pag);
2092} 2093}
2093 2094
2094/* 2095/*
@@ -2150,7 +2151,7 @@ xfs_ifree(
2150 2151
2151 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2152 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2152 2153
2153 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 2154 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
2154 if (error) 2155 if (error)
2155 return error; 2156 return error;
2156 2157
@@ -2483,13 +2484,16 @@ __xfs_iunpin_wait(
2483 return; 2484 return;
2484 2485
2485 /* Give the log a push to start the unpinning I/O */ 2486 /* Give the log a push to start the unpinning I/O */
2486 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? 2487 if (iip && iip->ili_last_lsn)
2487 iip->ili_last_lsn : 0, XFS_LOG_FORCE); 2488 xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0);
2489 else
2490 xfs_log_force(ip->i_mount, 0);
2491
2488 if (wait) 2492 if (wait)
2489 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2493 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2490} 2494}
2491 2495
2492static inline void 2496void
2493xfs_iunpin_wait( 2497xfs_iunpin_wait(
2494 xfs_inode_t *ip) 2498 xfs_inode_t *ip)
2495{ 2499{
@@ -2675,7 +2679,7 @@ xfs_iflush_cluster(
2675 xfs_buf_t *bp) 2679 xfs_buf_t *bp)
2676{ 2680{
2677 xfs_mount_t *mp = ip->i_mount; 2681 xfs_mount_t *mp = ip->i_mount;
2678 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2682 struct xfs_perag *pag;
2679 unsigned long first_index, mask; 2683 unsigned long first_index, mask;
2680 unsigned long inodes_per_cluster; 2684 unsigned long inodes_per_cluster;
2681 int ilist_size; 2685 int ilist_size;
@@ -2686,6 +2690,7 @@ xfs_iflush_cluster(
2686 int bufwasdelwri; 2690 int bufwasdelwri;
2687 int i; 2691 int i;
2688 2692
2693 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2689 ASSERT(pag->pagi_inodeok); 2694 ASSERT(pag->pagi_inodeok);
2690 ASSERT(pag->pag_ici_init); 2695 ASSERT(pag->pag_ici_init);
2691 2696
@@ -2693,7 +2698,7 @@ xfs_iflush_cluster(
2693 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2698 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2694 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2699 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2695 if (!ilist) 2700 if (!ilist)
2696 return 0; 2701 goto out_put;
2697 2702
2698 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2703 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2699 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2704 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2762,6 +2767,8 @@ xfs_iflush_cluster(
2762out_free: 2767out_free:
2763 read_unlock(&pag->pag_ici_lock); 2768 read_unlock(&pag->pag_ici_lock);
2764 kmem_free(ilist); 2769 kmem_free(ilist);
2770out_put:
2771 xfs_perag_put(pag);
2765 return 0; 2772 return 0;
2766 2773
2767 2774
@@ -2805,6 +2812,7 @@ cluster_corrupt_out:
2805 */ 2812 */
2806 xfs_iflush_abort(iq); 2813 xfs_iflush_abort(iq);
2807 kmem_free(ilist); 2814 kmem_free(ilist);
2815 xfs_perag_put(pag);
2808 return XFS_ERROR(EFSCORRUPTED); 2816 return XFS_ERROR(EFSCORRUPTED);
2809} 2817}
2810 2818
@@ -2827,8 +2835,6 @@ xfs_iflush(
2827 xfs_dinode_t *dip; 2835 xfs_dinode_t *dip;
2828 xfs_mount_t *mp; 2836 xfs_mount_t *mp;
2829 int error; 2837 int error;
2830 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2831 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2832 2838
2833 XFS_STATS_INC(xs_iflush_count); 2839 XFS_STATS_INC(xs_iflush_count);
2834 2840
@@ -2841,15 +2847,6 @@ xfs_iflush(
2841 mp = ip->i_mount; 2847 mp = ip->i_mount;
2842 2848
2843 /* 2849 /*
2844 * If the inode isn't dirty, then just release the inode flush lock and
2845 * do nothing.
2846 */
2847 if (xfs_inode_clean(ip)) {
2848 xfs_ifunlock(ip);
2849 return 0;
2850 }
2851
2852 /*
2853 * We can't flush the inode until it is unpinned, so wait for it if we 2850 * We can't flush the inode until it is unpinned, so wait for it if we
2854 * are allowed to block. We know no one new can pin it, because we are 2851 * are allowed to block. We know no one new can pin it, because we are
2855 * holding the inode lock shared and you need to hold it exclusively to 2852 * holding the inode lock shared and you need to hold it exclusively to
@@ -2860,7 +2857,7 @@ xfs_iflush(
2860 * in the same cluster are dirty, they will probably write the inode 2857 * in the same cluster are dirty, they will probably write the inode
2861 * out for us if they occur after the log force completes. 2858 * out for us if they occur after the log force completes.
2862 */ 2859 */
2863 if (noblock && xfs_ipincount(ip)) { 2860 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2864 xfs_iunpin_nowait(ip); 2861 xfs_iunpin_nowait(ip);
2865 xfs_ifunlock(ip); 2862 xfs_ifunlock(ip);
2866 return EAGAIN; 2863 return EAGAIN;
@@ -2894,60 +2891,10 @@ xfs_iflush(
2894 } 2891 }
2895 2892
2896 /* 2893 /*
2897 * Decide how buffer will be flushed out. This is done before
2898 * the call to xfs_iflush_int because this field is zeroed by it.
2899 */
2900 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2901 /*
2902 * Flush out the inode buffer according to the directions
2903 * of the caller. In the cases where the caller has given
2904 * us a choice choose the non-delwri case. This is because
2905 * the inode is in the AIL and we need to get it out soon.
2906 */
2907 switch (flags) {
2908 case XFS_IFLUSH_SYNC:
2909 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2910 flags = 0;
2911 break;
2912 case XFS_IFLUSH_ASYNC_NOBLOCK:
2913 case XFS_IFLUSH_ASYNC:
2914 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2915 flags = INT_ASYNC;
2916 break;
2917 case XFS_IFLUSH_DELWRI:
2918 flags = INT_DELWRI;
2919 break;
2920 default:
2921 ASSERT(0);
2922 flags = 0;
2923 break;
2924 }
2925 } else {
2926 switch (flags) {
2927 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2928 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2929 case XFS_IFLUSH_DELWRI:
2930 flags = INT_DELWRI;
2931 break;
2932 case XFS_IFLUSH_ASYNC_NOBLOCK:
2933 case XFS_IFLUSH_ASYNC:
2934 flags = INT_ASYNC;
2935 break;
2936 case XFS_IFLUSH_SYNC:
2937 flags = 0;
2938 break;
2939 default:
2940 ASSERT(0);
2941 flags = 0;
2942 break;
2943 }
2944 }
2945
2946 /*
2947 * Get the buffer containing the on-disk inode. 2894 * Get the buffer containing the on-disk inode.
2948 */ 2895 */
2949 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2896 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2950 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2897 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2951 if (error || !bp) { 2898 if (error || !bp) {
2952 xfs_ifunlock(ip); 2899 xfs_ifunlock(ip);
2953 return error; 2900 return error;
@@ -2965,7 +2912,7 @@ xfs_iflush(
2965 * get stuck waiting in the write for too long. 2912 * get stuck waiting in the write for too long.
2966 */ 2913 */
2967 if (XFS_BUF_ISPINNED(bp)) 2914 if (XFS_BUF_ISPINNED(bp))
2968 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 2915 xfs_log_force(mp, 0);
2969 2916
2970 /* 2917 /*
2971 * inode clustering: 2918 * inode clustering:
@@ -2975,13 +2922,10 @@ xfs_iflush(
2975 if (error) 2922 if (error)
2976 goto cluster_corrupt_out; 2923 goto cluster_corrupt_out;
2977 2924
2978 if (flags & INT_DELWRI) { 2925 if (flags & SYNC_WAIT)
2979 xfs_bdwrite(mp, bp);
2980 } else if (flags & INT_ASYNC) {
2981 error = xfs_bawrite(mp, bp);
2982 } else {
2983 error = xfs_bwrite(mp, bp); 2926 error = xfs_bwrite(mp, bp);
2984 } 2927 else
2928 xfs_bdwrite(mp, bp);
2985 return error; 2929 return error;
2986 2930
2987corrupt_out: 2931corrupt_out:
@@ -3016,16 +2960,6 @@ xfs_iflush_int(
3016 iip = ip->i_itemp; 2960 iip = ip->i_itemp;
3017 mp = ip->i_mount; 2961 mp = ip->i_mount;
3018 2962
3019
3020 /*
3021 * If the inode isn't dirty, then just release the inode
3022 * flush lock and do nothing.
3023 */
3024 if (xfs_inode_clean(ip)) {
3025 xfs_ifunlock(ip);
3026 return 0;
3027 }
3028
3029 /* set *dip = inode's place in the buffer */ 2963 /* set *dip = inode's place in the buffer */
3030 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2964 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3031 2965
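Two API simplifications meet in this file. First, the log force interface splits into xfs_log_force(mp, flags) for "force everything" and xfs_log_force_lsn(mp, lsn, flags) for a specific LSN, as the __xfs_iunpin_wait() hunk shows. Second, the six XFS_IFLUSH_* modes collapse into a single SYNC_WAIT bit, so the whole flush-mode decision reduces to (sketch, assembled from the hunks above):

	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
			  (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
	/* ... */
	if (flags & SYNC_WAIT)
		error = xfs_bwrite(mp, bp);	/* synchronous write */
	else
		xfs_bdwrite(mp, bp);		/* delayed write, batched by xfsbufd */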
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ec1f28c4fc4f..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 421
422/* 422/*
423 * Flags for xfs_iflush()
424 */
425#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
426#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
427#define XFS_IFLUSH_SYNC 3
428#define XFS_IFLUSH_ASYNC 4
429#define XFS_IFLUSH_DELWRI 5
430#define XFS_IFLUSH_ASYNC_NOBLOCK 6
431
432/*
433 * Flags for xfs_itruncate_start(). 423 * Flags for xfs_itruncate_start().
434 */ 424 */
435#define XFS_ITRUNC_DEFINITE 0x1 425#define XFS_ITRUNC_DEFINITE 0x1
@@ -483,6 +473,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
483void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
484void xfs_ipin(xfs_inode_t *); 474void xfs_ipin(xfs_inode_t *);
485void xfs_iunpin(xfs_inode_t *); 475void xfs_iunpin(xfs_inode_t *);
476void xfs_iunpin_wait(xfs_inode_t *);
486int xfs_iflush(xfs_inode_t *, uint); 477int xfs_iflush(xfs_inode_t *, uint);
487void xfs_ichgtime(xfs_inode_t *, int); 478void xfs_ichgtime(xfs_inode_t *, int);
488void xfs_lock_inodes(xfs_inode_t **, int, uint); 479void xfs_lock_inodes(xfs_inode_t **, int, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f38855d21ea5..d4dc063111f8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -228,7 +228,7 @@ xfs_inode_item_format(
228 228
229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
230 vecp->i_len = sizeof(xfs_inode_log_format_t); 230 vecp->i_len = sizeof(xfs_inode_log_format_t);
231 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); 231 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
232 vecp++; 232 vecp++;
233 nvecs = 1; 233 nvecs = 1;
234 234
@@ -279,7 +279,7 @@ xfs_inode_item_format(
279 279
280 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 280 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
281 vecp->i_len = sizeof(struct xfs_icdinode); 281 vecp->i_len = sizeof(struct xfs_icdinode);
282 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 282 vecp->i_type = XLOG_REG_TYPE_ICORE;
283 vecp++; 283 vecp++;
284 nvecs++; 284 nvecs++;
285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -336,7 +336,7 @@ xfs_inode_item_format(
336 vecp->i_addr = 336 vecp->i_addr =
337 (char *)(ip->i_df.if_u1.if_extents); 337 (char *)(ip->i_df.if_u1.if_extents);
338 vecp->i_len = ip->i_df.if_bytes; 338 vecp->i_len = ip->i_df.if_bytes;
339 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 339 vecp->i_type = XLOG_REG_TYPE_IEXT;
340 } else 340 } else
341#endif 341#endif
342 { 342 {
@@ -355,7 +355,7 @@ xfs_inode_item_format(
355 vecp->i_addr = (xfs_caddr_t)ext_buffer; 355 vecp->i_addr = (xfs_caddr_t)ext_buffer;
356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
357 XFS_DATA_FORK); 357 XFS_DATA_FORK);
358 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 358 vecp->i_type = XLOG_REG_TYPE_IEXT;
359 } 359 }
360 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
361 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -373,7 +373,7 @@ xfs_inode_item_format(
373 ASSERT(ip->i_df.if_broot != NULL); 373 ASSERT(ip->i_df.if_broot != NULL);
374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
375 vecp->i_len = ip->i_df.if_broot_bytes; 375 vecp->i_len = ip->i_df.if_broot_bytes;
376 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); 376 vecp->i_type = XLOG_REG_TYPE_IBROOT;
377 vecp++; 377 vecp++;
378 nvecs++; 378 nvecs++;
379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -399,7 +399,7 @@ xfs_inode_item_format(
399 ASSERT((ip->i_df.if_real_bytes == 0) || 399 ASSERT((ip->i_df.if_real_bytes == 0) ||
400 (ip->i_df.if_real_bytes == data_bytes)); 400 (ip->i_df.if_real_bytes == data_bytes));
401 vecp->i_len = (int)data_bytes; 401 vecp->i_len = (int)data_bytes;
402 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); 402 vecp->i_type = XLOG_REG_TYPE_ILOCAL;
403 vecp++; 403 vecp++;
404 nvecs++; 404 nvecs++;
405 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 405 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -477,7 +477,7 @@ xfs_inode_item_format(
477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
478 XFS_ATTR_FORK); 478 XFS_ATTR_FORK);
479#endif 479#endif
480 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); 480 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
481 iip->ili_format.ilf_asize = vecp->i_len; 481 iip->ili_format.ilf_asize = vecp->i_len;
482 vecp++; 482 vecp++;
483 nvecs++; 483 nvecs++;
@@ -492,7 +492,7 @@ xfs_inode_item_format(
492 ASSERT(ip->i_afp->if_broot != NULL); 492 ASSERT(ip->i_afp->if_broot != NULL);
493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
494 vecp->i_len = ip->i_afp->if_broot_bytes; 494 vecp->i_len = ip->i_afp->if_broot_bytes;
495 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); 495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
496 vecp++; 496 vecp++;
497 nvecs++; 497 nvecs++;
498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -516,7 +516,7 @@ xfs_inode_item_format(
516 ASSERT((ip->i_afp->if_real_bytes == 0) || 516 ASSERT((ip->i_afp->if_real_bytes == 0) ||
517 (ip->i_afp->if_real_bytes == data_bytes)); 517 (ip->i_afp->if_real_bytes == data_bytes));
518 vecp->i_len = (int)data_bytes; 518 vecp->i_len = (int)data_bytes;
519 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); 519 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
520 vecp++; 520 vecp++;
521 nvecs++; 521 nvecs++;
522 iip->ili_format.ilf_asize = (unsigned)data_bytes; 522 iip->ili_format.ilf_asize = (unsigned)data_bytes;
@@ -602,33 +602,20 @@ xfs_inode_item_trylock(
602 602
603 if (!xfs_iflock_nowait(ip)) { 603 if (!xfs_iflock_nowait(ip)) {
604 /* 604 /*
605 * If someone else isn't already trying to push the inode 605 * the inode has already been flushed to the backing buffer;
606 * buffer, we get to do it. 606 * leave it locked in shared mode, and the pushbuf routine
607 * will unlock it.
607 */ 608 */
608 if (iip->ili_pushbuf_flag == 0) { 609 return XFS_ITEM_PUSHBUF;
609 iip->ili_pushbuf_flag = 1;
610#ifdef DEBUG
611 iip->ili_push_owner = current_pid();
612#endif
613 /*
614 * Inode is left locked in shared mode.
615 * Pushbuf routine gets to unlock it.
616 */
617 return XFS_ITEM_PUSHBUF;
618 } else {
619 /*
620 * We hold the AIL lock, so we must specify the
621 * NONOTIFY flag so that we won't double trip.
622 */
623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
624 return XFS_ITEM_FLUSHING;
625 }
626 /* NOTREACHED */
627 } 610 }
628 611
629 /* Stale items should force out the iclog */ 612 /* Stale items should force out the iclog */
630 if (ip->i_flags & XFS_ISTALE) { 613 if (ip->i_flags & XFS_ISTALE) {
631 xfs_ifunlock(ip); 614 xfs_ifunlock(ip);
615 /*
616 * we hold the AIL lock - notify the unlock routine of this
617 * so it doesn't try to get the lock again.
618 */
632 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 619 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
633 return XFS_ITEM_PINNED; 620 return XFS_ITEM_PINNED;
634 } 621 }
@@ -746,11 +733,8 @@ xfs_inode_item_committed(
746 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 733 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
747 * failed to get the inode flush lock but did get the inode locked SHARED. 734 * failed to get the inode flush lock but did get the inode locked SHARED.
748 * Here we're trying to see if the inode buffer is incore, and if so whether it's 735 * Here we're trying to see if the inode buffer is incore, and if so whether it's
749 * marked delayed write. If that's the case, we'll initiate a bawrite on that 736 * marked delayed write. If that's the case, we'll promote it and that will
750 * buffer to expedite the process. 737 * allow the caller to write the buffer by triggering the xfsbufd to run.
751 *
752 * We aren't holding the AIL lock (or the flush lock) when this gets called,
753 * so it is inherently race-y.
754 */ 738 */
755STATIC void 739STATIC void
756xfs_inode_item_pushbuf( 740xfs_inode_item_pushbuf(
@@ -759,82 +743,30 @@ xfs_inode_item_pushbuf(
759 xfs_inode_t *ip; 743 xfs_inode_t *ip;
760 xfs_mount_t *mp; 744 xfs_mount_t *mp;
761 xfs_buf_t *bp; 745 xfs_buf_t *bp;
762 uint dopush;
763 746
764 ip = iip->ili_inode; 747 ip = iip->ili_inode;
765
766 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 748 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
767 749
768 /* 750 /*
769 * The ili_pushbuf_flag keeps others from
770 * trying to duplicate our effort.
771 */
772 ASSERT(iip->ili_pushbuf_flag != 0);
773 ASSERT(iip->ili_push_owner == current_pid());
774
775 /*
776 * If a flush is not in progress anymore, chances are that the 751 * If a flush is not in progress anymore, chances are that the
777 * inode was taken off the AIL. So, just get out. 752 * inode was taken off the AIL. So, just get out.
778 */ 753 */
779 if (completion_done(&ip->i_flush) || 754 if (completion_done(&ip->i_flush) ||
780 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 755 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
781 iip->ili_pushbuf_flag = 0;
782 xfs_iunlock(ip, XFS_ILOCK_SHARED); 756 xfs_iunlock(ip, XFS_ILOCK_SHARED);
783 return; 757 return;
784 } 758 }
785 759
786 mp = ip->i_mount; 760 mp = ip->i_mount;
787 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 761 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
788 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); 762 iip->ili_format.ilf_len, XBF_TRYLOCK);
789 763
790 if (bp != NULL) {
791 if (XFS_BUF_ISDELAYWRITE(bp)) {
792 /*
793 * We were racing with iflush because we don't hold
794 * the AIL lock or the flush lock. However, at this point,
795 * we have the buffer, and we know that it's dirty.
796 * So, it's possible that iflush raced with us, and
797 * this item is already taken off the AIL.
798 * If not, we can flush it async.
799 */
800 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
801 !completion_done(&ip->i_flush));
802 iip->ili_pushbuf_flag = 0;
803 xfs_iunlock(ip, XFS_ILOCK_SHARED);
804
805 trace_xfs_inode_item_push(bp, _RET_IP_);
806
807 if (XFS_BUF_ISPINNED(bp)) {
808 xfs_log_force(mp, (xfs_lsn_t)0,
809 XFS_LOG_FORCE);
810 }
811 if (dopush) {
812 int error;
813 error = xfs_bawrite(mp, bp);
814 if (error)
815 xfs_fs_cmn_err(CE_WARN, mp,
816 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
817 error, iip, bp);
818 } else {
819 xfs_buf_relse(bp);
820 }
821 } else {
822 iip->ili_pushbuf_flag = 0;
823 xfs_iunlock(ip, XFS_ILOCK_SHARED);
824 xfs_buf_relse(bp);
825 }
826 return;
827 }
828 /*
829 * We have to be careful about resetting pushbuf flag too early (above).
830 * Even though in theory we can do it as soon as we have the buflock,
831 * we don't want others to be doing work needlessly. They'll come to
832 * this function thinking that pushing the buffer is their
833 * responsibility only to find that the buffer is still locked by
834 * another doing the same thing
835 */
836 iip->ili_pushbuf_flag = 0;
837 xfs_iunlock(ip, XFS_ILOCK_SHARED); 764 xfs_iunlock(ip, XFS_ILOCK_SHARED);
765 if (!bp)
766 return;
767 if (XFS_BUF_ISDELAYWRITE(bp))
768 xfs_buf_delwri_promote(bp);
769 xfs_buf_relse(bp);
838 return; 770 return;
839} 771}
840 772
@@ -867,10 +799,14 @@ xfs_inode_item_push(
867 iip->ili_format.ilf_fields != 0); 799 iip->ili_format.ilf_fields != 0);
868 800
869 /* 801 /*
870 * Write out the inode. The completion routine ('iflush_done') will 802 * Push the inode to its backing buffer. This will not remove the
871 * pull it from the AIL, mark it clean, unlock the flush lock. 803 * inode from the AIL - a further push will be required to trigger a
804 * buffer push. However, this allows all the dirty inodes to be pushed
805 * to the buffer before it is pushed to disk. The buffer IO completion
806 * will pull the inode from the AIL, mark it clean and unlock the flush
807 * lock.
872 */ 808 */
873 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 809 (void) xfs_iflush(ip, 0);
874 xfs_iunlock(ip, XFS_ILOCK_SHARED); 810 xfs_iunlock(ip, XFS_ILOCK_SHARED);
875 811
876 return; 812 return;
@@ -934,7 +870,6 @@ xfs_inode_item_init(
934 /* 870 /*
935 We have zeroed memory. No need ... 871 We have zeroed memory. No need ...
936 iip->ili_extents_buf = NULL; 872 iip->ili_extents_buf = NULL;
937 iip->ili_pushbuf_flag = 0;
938 */ 873 */
939 874
940 iip->ili_format.ilf_type = XFS_LI_INODE; 875 iip->ili_format.ilf_type = XFS_LI_INODE;
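The old pushbuf path serialised pushers with ili_pushbuf_flag and issued a racy async write itself; the new one simply promotes a delayed-write buffer to the head of the delwri queue and lets xfsbufd write it. The entire tail of the new routine, for reference (from the hunk above):

	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XBF_TRYLOCK);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	if (!bp)
		return;
	if (XFS_BUF_ISDELAYWRITE(bp))
		xfs_buf_delwri_promote(bp);	/* move to head of delwri queue */
	xfs_buf_relse(bp);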
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index cc8df1ac7783..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item {
144 data exts */ 144 data exts */
145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged 145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
146 attr exts */ 146 attr exts */
147 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
148
149#ifdef DEBUG
150 uint64_t ili_push_owner; /* one who sets pushbuf_flag
151 above gets to push the buf */
152#endif
153#ifdef XFS_TRANS_DEBUG 147#ifdef XFS_TRANS_DEBUG
154 int ili_root_size; 148 int ili_root_size;
155 char *ili_orig_root; 149 char *ili_orig_root;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..3af02314c605 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -408,8 +408,10 @@ xfs_bulkstat(
408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); 408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
409 nimask = ~(nicluster - 1); 409 nimask = ~(nicluster - 1);
410 nbcluster = nicluster >> mp->m_sb.sb_inopblog; 410 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, 411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
412 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 412 if (!irbuf)
413 return ENOMEM;
414
413 nirbuf = irbsize / sizeof(*irbuf); 415 nirbuf = irbsize / sizeof(*irbuf);
414 416
415 /* 417 /*
@@ -420,9 +422,7 @@ xfs_bulkstat(
420 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 422 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
421 cond_resched(); 423 cond_resched();
422 bp = NULL; 424 bp = NULL;
423 down_read(&mp->m_peraglock);
424 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 425 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
425 up_read(&mp->m_peraglock);
426 if (error) { 426 if (error) {
427 /* 427 /*
428 * Skip this allocation group and go to the next one. 428 * Skip this allocation group and go to the next one.
@@ -729,7 +729,7 @@ xfs_bulkstat(
729 /* 729 /*
730 * Done, we're either out of filesystem or space to put the data. 730 * Done, we're either out of filesystem or space to put the data.
731 */ 731 */
732 kmem_free(irbuf); 732 kmem_free_large(irbuf);
733 *ubcountp = ubelem; 733 *ubcountp = ubelem;
734 /* 734 /*
735 * Found some inodes, return them now and return the error next time. 735 * Found some inodes, return them now and return the error next time.
@@ -849,9 +849,7 @@ xfs_inumbers(
849 agbp = NULL; 849 agbp = NULL;
850 while (left > 0 && agno < mp->m_sb.sb_agcount) { 850 while (left > 0 && agno < mp->m_sb.sb_agcount) {
851 if (agbp == NULL) { 851 if (agbp == NULL) {
852 down_read(&mp->m_peraglock);
853 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 852 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
854 up_read(&mp->m_peraglock);
855 if (error) { 853 if (error) {
856 /* 854 /*
857 * If we can't read the AGI of this ag, 855 * If we can't read the AGI of this ag,
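kmem_zalloc_greedy() loses its flags argument here and may now return NULL, and its allocations must be freed with kmem_free_large() rather than kmem_free(). The new pairing, as used above (sketch):

	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
	if (!irbuf)
		return ENOMEM;
	/* ... fill and drain irbuf ... */
	kmem_free_large(irbuf);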
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 600b5b06aaeb..4f16be4b6ee5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -50,7 +50,6 @@ kmem_zone_t *xfs_log_ticket_zone;
50 (off) += (bytes);} 50 (off) += (bytes);}
51 51
52/* Local miscellaneous function prototypes */ 52/* Local miscellaneous function prototypes */
53STATIC int xlog_bdstrat_cb(struct xfs_buf *);
54STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
55 xlog_in_core_t **, xfs_lsn_t *); 54 xlog_in_core_t **, xfs_lsn_t *);
56STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
@@ -80,11 +79,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log,
80STATIC void xlog_state_switch_iclogs(xlog_t *log, 79STATIC void xlog_state_switch_iclogs(xlog_t *log,
81 xlog_in_core_t *iclog, 80 xlog_in_core_t *iclog,
82 int eventual_size); 81 int eventual_size);
83STATIC int xlog_state_sync(xlog_t *log,
84 xfs_lsn_t lsn,
85 uint flags,
86 int *log_flushed);
87STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
88STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 82STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
89 83
90/* local functions to manipulate grant head */ 84/* local functions to manipulate grant head */
@@ -297,65 +291,6 @@ xfs_log_done(xfs_mount_t *mp,
297 return lsn; 291 return lsn;
298} /* xfs_log_done */ 292} /* xfs_log_done */
299 293
300
301/*
302 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
303 * the force is done synchronously.
304 *
305 * Asynchronous forces are implemented by setting the WANT_SYNC
306 * bit in the appropriate in-core log and then returning.
307 *
308 * Synchronous forces are implemented with a signal variable. All callers
309 * to force a given lsn to disk will wait on a the sv attached to the
310 * specific in-core log. When given in-core log finally completes its
311 * write to disk, that thread will wake up all threads waiting on the
312 * sv.
313 */
314int
315_xfs_log_force(
316 xfs_mount_t *mp,
317 xfs_lsn_t lsn,
318 uint flags,
319 int *log_flushed)
320{
321 xlog_t *log = mp->m_log;
322 int dummy;
323
324 if (!log_flushed)
325 log_flushed = &dummy;
326
327 ASSERT(flags & XFS_LOG_FORCE);
328
329 XFS_STATS_INC(xs_log_force);
330
331 if (log->l_flags & XLOG_IO_ERROR)
332 return XFS_ERROR(EIO);
333 if (lsn == 0)
334 return xlog_state_sync_all(log, flags, log_flushed);
335 else
336 return xlog_state_sync(log, lsn, flags, log_flushed);
337} /* _xfs_log_force */
338
339/*
340 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
341 * about errors or whether the log was flushed or not. This is the normal
342 * interface to use when trying to unpin items or move the log forward.
343 */
344void
345xfs_log_force(
346 xfs_mount_t *mp,
347 xfs_lsn_t lsn,
348 uint flags)
349{
350 int error;
351 error = _xfs_log_force(mp, lsn, flags, NULL);
352 if (error) {
353 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
354 "error %d returned.", error);
355 }
356}
357
358
359/* 294/*
360 * Attaches a new iclog I/O completion callback routine during 295 * Attaches a new iclog I/O completion callback routine during
361 * transaction commit. If the log is in error state, a non-zero 296 * transaction commit. If the log is in error state, a non-zero
@@ -602,7 +537,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
602 if (mp->m_flags & XFS_MOUNT_RDONLY) 537 if (mp->m_flags & XFS_MOUNT_RDONLY)
603 return 0; 538 return 0;
604 539
605 error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); 540 error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
606 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 541 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
607 542
608#ifdef DEBUG 543#ifdef DEBUG
@@ -618,7 +553,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
618 if (! (XLOG_FORCED_SHUTDOWN(log))) { 553 if (! (XLOG_FORCED_SHUTDOWN(log))) {
619 reg[0].i_addr = (void*)&magic; 554 reg[0].i_addr = (void*)&magic;
620 reg[0].i_len = sizeof(magic); 555 reg[0].i_len = sizeof(magic);
621 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT); 556 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
622 557
623 error = xfs_log_reserve(mp, 600, 1, &tic, 558 error = xfs_log_reserve(mp, 600, 1, &tic,
624 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 559 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
@@ -988,35 +923,6 @@ xlog_iodone(xfs_buf_t *bp)
988} /* xlog_iodone */ 923} /* xlog_iodone */
989 924
990/* 925/*
991 * The bdstrat callback function for log bufs. This gives us a central
992 * place to trap bufs in case we get hit by a log I/O error and need to
993 * shutdown. Actually, in practice, even when we didn't get a log error,
994 * we transition the iclogs to IOERROR state *after* flushing all existing
995 * iclogs to disk. This is because we don't want anymore new transactions to be
996 * started or completed afterwards.
997 */
998STATIC int
999xlog_bdstrat_cb(struct xfs_buf *bp)
1000{
1001 xlog_in_core_t *iclog;
1002
1003 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1004
1005 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
1006 /* note for irix bstrat will need struct bdevsw passed
1007 * Fix the following macro if the code ever is merged
1008 */
1009 XFS_bdstrat(bp);
1010 return 0;
1011 }
1012
1013 XFS_BUF_ERROR(bp, EIO);
1014 XFS_BUF_STALE(bp);
1015 xfs_biodone(bp);
1016 return XFS_ERROR(EIO);
1017}
1018
1019/*
1020 * Return size of each in-core log record buffer. 926 * Return size of each in-core log record buffer.
1021 * 927 *
1022 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 928 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
@@ -1158,7 +1064,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1158 if (!bp) 1064 if (!bp)
1159 goto out_free_log; 1065 goto out_free_log;
1160 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1066 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1161 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1162 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1067 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1163 ASSERT(XFS_BUF_ISBUSY(bp)); 1068 ASSERT(XFS_BUF_ISBUSY(bp));
1164 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 1069 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1196,7 +1101,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1196 if (!XFS_BUF_CPSEMA(bp)) 1101 if (!XFS_BUF_CPSEMA(bp))
1197 ASSERT(0); 1102 ASSERT(0);
1198 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1103 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1199 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1200 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1104 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1201 iclog->ic_bp = bp; 1105 iclog->ic_bp = bp;
1202 iclog->ic_data = bp->b_addr; 1106 iclog->ic_data = bp->b_addr;
@@ -1268,7 +1172,7 @@ xlog_commit_record(xfs_mount_t *mp,
1268 1172
1269 reg[0].i_addr = NULL; 1173 reg[0].i_addr = NULL;
1270 reg[0].i_len = 0; 1174 reg[0].i_len = 0;
1271 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT); 1175 reg[0].i_type = XLOG_REG_TYPE_COMMIT;
1272 1176
1273 ASSERT_ALWAYS(iclog); 1177 ASSERT_ALWAYS(iclog);
1274 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1178 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1343,6 +1247,37 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1343 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1247 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1344} /* xlog_grant_push_ail */ 1248} /* xlog_grant_push_ail */
1345 1249
1250/*
1251 * The bdstrat callback function for log bufs. This gives us a central
1252 * place to trap bufs in case we get hit by a log I/O error and need to
1253 * shutdown. Actually, in practice, even when we didn't get a log error,
1254 * we transition the iclogs to IOERROR state *after* flushing all existing
1255 * iclogs to disk. This is because we don't want any more new transactions to be
1256 * started or completed afterwards.
1257 */
1258STATIC int
1259xlog_bdstrat(
1260 struct xfs_buf *bp)
1261{
1262 struct xlog_in_core *iclog;
1263
1264 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1265 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1266 XFS_BUF_ERROR(bp, EIO);
1267 XFS_BUF_STALE(bp);
1268 xfs_biodone(bp);
1269 /*
1270 * It would seem logical to return EIO here, but we rely on
1271 * the log state machine to propagate I/O errors instead of
1272 * doing it here.
1273 */
1274 return 0;
1275 }
1276
1277 bp->b_flags |= _XBF_RUN_QUEUES;
1278 xfs_buf_iorequest(bp);
1279 return 0;
1280}
1346 1281
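The deliberate return 0 on the IOERROR path of xlog_bdstrat() above is the key design point: the buffer is failed through xfs_biodone(), so the error reaches the registered iodone handler (xlog_iodone) rather than the submitter. A minimal sketch of this "report via completion, not via return value" pattern, using a hypothetical buffer type (none of these names are from the patch):

#include <errno.h>

/* Hypothetical buffer with a completion callback, for illustration only. */
struct buf {
        int     error;                          /* completion status */
        int     shutdown;                       /* log/device already failed? */
        void    (*iodone)(struct buf *);        /* completion handler */
};

static void submit_io(struct buf *bp)
{
        /* queue the real I/O here */
}

/*
 * Strategy gate in the style of xlog_bdstrat(): on shutdown, fail the
 * buffer through its completion handler instead of submitting it, and
 * return 0 so the error travels through the completion path only.
 */
static int strat_gate(struct buf *bp)
{
        if (bp->shutdown) {
                bp->error = EIO;        /* analogous to XFS_BUF_ERROR(bp, EIO) */
                bp->iodone(bp);         /* analogous to xfs_biodone(bp) */
                return 0;               /* not EIO: see the comment in xlog_bdstrat */
        }
        submit_io(bp);
        return 0;
}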
1347/* 1282/*
1348 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 1283 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
@@ -1462,7 +1397,7 @@ xlog_sync(xlog_t *log,
1462 */ 1397 */
1463 XFS_BUF_WRITE(bp); 1398 XFS_BUF_WRITE(bp);
1464 1399
1465 if ((error = XFS_bwrite(bp))) { 1400 if ((error = xlog_bdstrat(bp))) {
1466 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1401 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1467 XFS_BUF_ADDR(bp)); 1402 XFS_BUF_ADDR(bp));
1468 return error; 1403 return error;
@@ -1502,7 +1437,7 @@ xlog_sync(xlog_t *log,
1502 /* account for internal log which doesn't start at block #0 */ 1437 /* account for internal log which doesn't start at block #0 */
1503 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1438 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1504 XFS_BUF_WRITE(bp); 1439 XFS_BUF_WRITE(bp);
1505 if ((error = XFS_bwrite(bp))) { 1440 if ((error = xlog_bdstrat(bp))) {
1506 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1441 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1507 bp, XFS_BUF_ADDR(bp)); 1442 bp, XFS_BUF_ADDR(bp));
1508 return error; 1443 return error;
@@ -2854,7 +2789,6 @@ xlog_state_switch_iclogs(xlog_t *log,
2854 log->l_iclog = iclog->ic_next; 2789 log->l_iclog = iclog->ic_next;
2855} /* xlog_state_switch_iclogs */ 2790} /* xlog_state_switch_iclogs */
2856 2791
2857
2858/* 2792/*
2859 * Write out all data in the in-core log as of this exact moment in time. 2793 * Write out all data in the in-core log as of this exact moment in time.
2860 * 2794 *
@@ -2882,11 +2816,17 @@ xlog_state_switch_iclogs(xlog_t *log,
2882 * b) when we return from flushing out this iclog, it is still 2816 * b) when we return from flushing out this iclog, it is still
2883 * not in the active nor dirty state. 2817 * not in the active nor dirty state.
2884 */ 2818 */
2885STATIC int 2819int
2886xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) 2820_xfs_log_force(
2821 struct xfs_mount *mp,
2822 uint flags,
2823 int *log_flushed)
2887{ 2824{
2888 xlog_in_core_t *iclog; 2825 struct log *log = mp->m_log;
2889 xfs_lsn_t lsn; 2826 struct xlog_in_core *iclog;
2827 xfs_lsn_t lsn;
2828
2829 XFS_STATS_INC(xs_log_force);
2890 2830
2891 spin_lock(&log->l_icloglock); 2831 spin_lock(&log->l_icloglock);
2892 2832
@@ -2932,7 +2872,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
2932 2872
2933 if (xlog_state_release_iclog(log, iclog)) 2873 if (xlog_state_release_iclog(log, iclog))
2934 return XFS_ERROR(EIO); 2874 return XFS_ERROR(EIO);
2935 *log_flushed = 1; 2875
2876 if (log_flushed)
2877 *log_flushed = 1;
2936 spin_lock(&log->l_icloglock); 2878 spin_lock(&log->l_icloglock);
2937 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && 2879 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
2938 iclog->ic_state != XLOG_STATE_DIRTY) 2880 iclog->ic_state != XLOG_STATE_DIRTY)
@@ -2976,19 +2918,37 @@ maybe_sleep:
2976 */ 2918 */
2977 if (iclog->ic_state & XLOG_STATE_IOERROR) 2919 if (iclog->ic_state & XLOG_STATE_IOERROR)
2978 return XFS_ERROR(EIO); 2920 return XFS_ERROR(EIO);
2979 *log_flushed = 1; 2921 if (log_flushed)
2980 2922 *log_flushed = 1;
2981 } else { 2923 } else {
2982 2924
2983no_sleep: 2925no_sleep:
2984 spin_unlock(&log->l_icloglock); 2926 spin_unlock(&log->l_icloglock);
2985 } 2927 }
2986 return 0; 2928 return 0;
2987} /* xlog_state_sync_all */ 2929}
2988 2930
2931/*
2932 * Wrapper for _xfs_log_force(), to be used when the caller doesn't care
2933 * about errors or whether the log was flushed or not. This is the normal
2934 * interface to use when trying to unpin items or move the log forward.
2935 */
2936void
2937xfs_log_force(
2938 xfs_mount_t *mp,
2939 uint flags)
2940{
2941 int error;
2942
2943 error = _xfs_log_force(mp, flags, NULL);
2944 if (error) {
2945 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
2946 "error %d returned.", error);
2947 }
2948}
2989 2949
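As a usage sketch of the split above (the call sites below are illustrative, not part of this patch): callers that only want to push the log use the void wrapper, while callers that must know the outcome call the underscore variant directly.

STATIC void
example_push_log(struct xfs_mount *mp)
{
        int     log_flushed = 0;
        int     error;

        /* fire-and-forget: errors are only warned about inside the wrapper */
        xfs_log_force(mp, 0);

        /* synchronous force when the caller needs the error and flush state */
        error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
        if (error) {
                /* EIO here means the log has been shut down */
        }
}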
2990/* 2950/*
2991 * Used by code which implements synchronous log forces. 2951 * Force the in-core log to disk for a specific LSN.
2992 * 2952 *
2993 * Find in-core log with lsn. 2953 * Find in-core log with lsn.
2994 * If it is in the DIRTY state, just return. 2954 * If it is in the DIRTY state, just return.
@@ -2996,109 +2956,142 @@ no_sleep:
2996 * state and go to sleep or return. 2956 * state and go to sleep or return.
2997 * If it is in any other state, go to sleep or return. 2957 * If it is in any other state, go to sleep or return.
2998 * 2958 *
2999 * If filesystem activity goes to zero, the iclog will get flushed only by 2959 * Synchronous forces are implemented with a signal variable. All callers
3000 * bdflush(). 2960 * to force a given lsn to disk will wait on the sv attached to the
2961 * specific in-core log. When a given in-core log finally completes its
2962 * write to disk, that thread will wake up all threads waiting on the
2963 * sv.
3001 */ 2964 */
3002STATIC int 2965int
3003xlog_state_sync(xlog_t *log, 2966_xfs_log_force_lsn(
3004 xfs_lsn_t lsn, 2967 struct xfs_mount *mp,
3005 uint flags, 2968 xfs_lsn_t lsn,
3006 int *log_flushed) 2969 uint flags,
2970 int *log_flushed)
3007{ 2971{
3008 xlog_in_core_t *iclog; 2972 struct log *log = mp->m_log;
3009 int already_slept = 0; 2973 struct xlog_in_core *iclog;
2974 int already_slept = 0;
3010 2975
3011try_again: 2976 ASSERT(lsn != 0);
3012 spin_lock(&log->l_icloglock);
3013 iclog = log->l_iclog;
3014 2977
3015 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2978 XFS_STATS_INC(xs_log_force);
3016 spin_unlock(&log->l_icloglock);
3017 return XFS_ERROR(EIO);
3018 }
3019
3020 do {
3021 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3022 iclog = iclog->ic_next;
3023 continue;
3024 }
3025 2979
3026 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2980try_again:
2981 spin_lock(&log->l_icloglock);
2982 iclog = log->l_iclog;
2983 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3027 spin_unlock(&log->l_icloglock); 2984 spin_unlock(&log->l_icloglock);
3028 return 0; 2985 return XFS_ERROR(EIO);
3029 } 2986 }
3030 2987
3031 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 2988 do {
3032 /* 2989 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3033 * We sleep here if we haven't already slept (e.g. 2990 iclog = iclog->ic_next;
3034 * this is the first time we've looked at the correct 2991 continue;
3035 * iclog buf) and the buffer before us is going to 2992 }
3036 * be sync'ed. The reason for this is that if we 2993
3037 * are doing sync transactions here, by waiting for 2994 if (iclog->ic_state == XLOG_STATE_DIRTY) {
3038 * the previous I/O to complete, we can allow a few 2995 spin_unlock(&log->l_icloglock);
3039 * more transactions into this iclog before we close 2996 return 0;
3040 * it down. 2997 }
3041 * 2998
3042 * Otherwise, we mark the buffer WANT_SYNC, and bump 2999 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3043 * up the refcnt so we can release the log (which drops 3000 /*
3044 * the ref count). The state switch keeps new transaction 3001 * We sleep here if we haven't already slept (e.g.
3045 * commits from using this buffer. When the current commits 3002 * this is the first time we've looked at the correct
3046 * finish writing into the buffer, the refcount will drop to 3003 * iclog buf) and the buffer before us is going to
3047 * zero and the buffer will go out then. 3004 * be sync'ed. The reason for this is that if we
3048 */ 3005 * are doing sync transactions here, by waiting for
3049 if (!already_slept && 3006 * the previous I/O to complete, we can allow a few
3050 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | 3007 * more transactions into this iclog before we close
3051 XLOG_STATE_SYNCING))) { 3008 * it down.
3052 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3009 *
3053 XFS_STATS_INC(xs_log_force_sleep); 3010 * Otherwise, we mark the buffer WANT_SYNC, and bump
3054 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, 3011 * up the refcnt so we can release the log (which
3055 &log->l_icloglock, s); 3012 * drops the ref count). The state switch keeps new
3056 *log_flushed = 1; 3013 * transaction commits from using this buffer. When
3057 already_slept = 1; 3014 * the current commits finish writing into the buffer,
3058 goto try_again; 3015 * the refcount will drop to zero and the buffer will
3059 } else { 3016 * go out then.
3017 */
3018 if (!already_slept &&
3019 (iclog->ic_prev->ic_state &
3020 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3021 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3022
3023 XFS_STATS_INC(xs_log_force_sleep);
3024
3025 sv_wait(&iclog->ic_prev->ic_write_wait,
3026 PSWP, &log->l_icloglock, s);
3027 if (log_flushed)
3028 *log_flushed = 1;
3029 already_slept = 1;
3030 goto try_again;
3031 }
3060 atomic_inc(&iclog->ic_refcnt); 3032 atomic_inc(&iclog->ic_refcnt);
3061 xlog_state_switch_iclogs(log, iclog, 0); 3033 xlog_state_switch_iclogs(log, iclog, 0);
3062 spin_unlock(&log->l_icloglock); 3034 spin_unlock(&log->l_icloglock);
3063 if (xlog_state_release_iclog(log, iclog)) 3035 if (xlog_state_release_iclog(log, iclog))
3064 return XFS_ERROR(EIO); 3036 return XFS_ERROR(EIO);
3065 *log_flushed = 1; 3037 if (log_flushed)
3038 *log_flushed = 1;
3066 spin_lock(&log->l_icloglock); 3039 spin_lock(&log->l_icloglock);
3067 } 3040 }
3068 }
3069 3041
3070 if ((flags & XFS_LOG_SYNC) && /* sleep */ 3042 if ((flags & XFS_LOG_SYNC) && /* sleep */
3071 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3043 !(iclog->ic_state &
3044 (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3045 /*
3046 * Don't wait on completion if we know that we've
3047 * gotten a log write error.
3048 */
3049 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3050 spin_unlock(&log->l_icloglock);
3051 return XFS_ERROR(EIO);
3052 }
3053 XFS_STATS_INC(xs_log_force_sleep);
3054 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3055 /*
3056 * No need to grab the log lock here since we're
3057 * only deciding whether or not to return EIO
3058 * and the memory read should be atomic.
3059 */
3060 if (iclog->ic_state & XLOG_STATE_IOERROR)
3061 return XFS_ERROR(EIO);
3072 3062
3073 /* 3063 if (log_flushed)
3074 * Don't wait on completion if we know that we've 3064 *log_flushed = 1;
3075 * gotten a log write error. 3065 } else { /* just return */
3076 */
3077 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3078 spin_unlock(&log->l_icloglock); 3066 spin_unlock(&log->l_icloglock);
3079 return XFS_ERROR(EIO);
3080 } 3067 }
3081 XFS_STATS_INC(xs_log_force_sleep);
3082 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3083 /*
3084 * No need to grab the log lock here since we're
3085 * only deciding whether or not to return EIO
3086 * and the memory read should be atomic.
3087 */
3088 if (iclog->ic_state & XLOG_STATE_IOERROR)
3089 return XFS_ERROR(EIO);
3090 *log_flushed = 1;
3091 } else { /* just return */
3092 spin_unlock(&log->l_icloglock);
3093 }
3094 return 0;
3095 3068
3096 } while (iclog != log->l_iclog); 3069 return 0;
3070 } while (iclog != log->l_iclog);
3097 3071
3098 spin_unlock(&log->l_icloglock); 3072 spin_unlock(&log->l_icloglock);
3099 return 0; 3073 return 0;
3100} /* xlog_state_sync */ 3074}
3101 3075
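The sv_wait() calls in the function above atomically drop l_icloglock and sleep; the matching wakeup comes from the iclog I/O completion side. A condensed sketch of that sleep/wake protocol, using the sv_t compat wrappers this file already uses (the waker side is an assumption based on the comment above, not code from this hunk):

/* Waiter side (l_icloglock held; sv_wait drops it while sleeping): */
        XFS_STATS_INC(xs_log_force_sleep);
        sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
        /* woken up: the iclog write completed or moved to IOERROR */

/* Waker side, on iclog write completion (assumed pairing): */
        spin_lock(&log->l_icloglock);
        /* ... mark the iclog done or errored ... */
        sv_broadcast(&iclog->ic_force_wait);    /* wake all force waiters */
        spin_unlock(&log->l_icloglock);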
3076/*
3077 * Wrapper for _xfs_log_force_lsn(), to be used when the caller doesn't care
3078 * about errors or whether the log was flushed or not. This is the normal
3079 * interface to use when trying to unpin items or move the log forward.
3080 */
3081void
3082xfs_log_force_lsn(
3083 xfs_mount_t *mp,
3084 xfs_lsn_t lsn,
3085 uint flags)
3086{
3087 int error;
3088
3089 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3090 if (error) {
3091 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force_lsn: "
3092 "error %d returned.", error);
3093 }
3094}
3102 3095
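A hedged usage sketch for the LSN-targeted variant: the typical caller forces up to the commit LSN recorded in a log item, for instance an inode's last-logged LSN (the i_itemp/ili_last_lsn fields here are assumed from the inode log item, which is not shown in this patch):

STATIC int
example_force_inode(struct xfs_inode *ip)
{
        xfs_lsn_t       lsn;

        if (!ip->i_itemp)
                return 0;               /* no log item attached */
        lsn = ip->i_itemp->ili_last_lsn;        /* assumed field */
        if (!lsn)
                return 0;               /* inode never logged: nothing to force */
        return _xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
}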
3103/* 3096/*
3104 * Called when we want to mark the current iclog as being ready to sync to 3097 * Called when we want to mark the current iclog as being ready to sync to
@@ -3463,7 +3456,6 @@ xfs_log_force_umount(
3463 xlog_ticket_t *tic; 3456 xlog_ticket_t *tic;
3464 xlog_t *log; 3457 xlog_t *log;
3465 int retval; 3458 int retval;
3466 int dummy;
3467 3459
3468 log = mp->m_log; 3460 log = mp->m_log;
3469 3461
@@ -3537,13 +3529,14 @@ xfs_log_force_umount(
3537 } 3529 }
3538 spin_unlock(&log->l_grant_lock); 3530 spin_unlock(&log->l_grant_lock);
3539 3531
3540 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3532 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3541 ASSERT(!logerror); 3533 ASSERT(!logerror);
3542 /* 3534 /*
3543 * Force the incore logs to disk before shutting the 3535 * Force the incore logs to disk before shutting the
3544 * log down completely. 3536 * log down completely.
3545 */ 3537 */
3546 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); 3538 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3539
3547 spin_lock(&log->l_icloglock); 3540 spin_lock(&log->l_icloglock);
3548 retval = xlog_state_ioerror(log); 3541 retval = xlog_state_ioerror(log);
3549 spin_unlock(&log->l_icloglock); 3542 spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..7074be9d13e9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -70,14 +70,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
70 * Flags to xfs_log_force() 70 * Flags to xfs_log_force()
71 * 71 *
72 * XFS_LOG_SYNC: Synchronous force in-core log to disk 72 * XFS_LOG_SYNC: Synchronous force in-core log to disk
73 * XFS_LOG_FORCE: Start in-core log write now.
74 * XFS_LOG_URGE: Start write within some window of time.
75 *
76 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
77 */ 73 */
78#define XFS_LOG_SYNC 0x1 74#define XFS_LOG_SYNC 0x1
79#define XFS_LOG_FORCE 0x2
80#define XFS_LOG_URGE 0x4
81 75
82#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
83 77
@@ -110,10 +104,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
110#define XLOG_REG_TYPE_TRANSHDR 19 104#define XLOG_REG_TYPE_TRANSHDR 19
111#define XLOG_REG_TYPE_MAX 19 105#define XLOG_REG_TYPE_MAX 19
112 106
113#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
114
115typedef struct xfs_log_iovec { 107typedef struct xfs_log_iovec {
116 xfs_caddr_t i_addr; /* beginning address of region */ 108 xfs_caddr_t i_addr; /* beginning address of region */
117 int i_len; /* length in bytes of region */ 109 int i_len; /* length in bytes of region */
118 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
119} xfs_log_iovec_t; 111} xfs_log_iovec_t;
@@ -140,12 +132,17 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
140 void **iclog, 132 void **iclog,
141 uint flags); 133 uint flags);
142int _xfs_log_force(struct xfs_mount *mp, 134int _xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags, 135 uint flags,
145 int *log_forced); 136 int *log_forced);
146void xfs_log_force(struct xfs_mount *mp, 137void xfs_log_force(struct xfs_mount *mp,
147 xfs_lsn_t lsn,
148 uint flags); 138 uint flags);
139int _xfs_log_force_lsn(struct xfs_mount *mp,
140 xfs_lsn_t lsn,
141 uint flags,
142 int *log_forced);
143void xfs_log_force_lsn(struct xfs_mount *mp,
144 xfs_lsn_t lsn,
145 uint flags);
149int xfs_log_mount(struct xfs_mount *mp, 146int xfs_log_mount(struct xfs_mount *mp,
150 struct xfs_buftarg *log_target, 147 struct xfs_buftarg *log_target,
151 xfs_daddr_t start_block, 148 xfs_daddr_t start_block,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d55662db7077..fd02a18facd5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -443,14 +443,9 @@ typedef struct log {
443 443
444/* common routines */ 444/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_find_tail(xlog_t *log,
447 xfs_daddr_t *head_blk,
448 xfs_daddr_t *tail_blk);
449extern int xlog_recover(xlog_t *log); 446extern int xlog_recover(xlog_t *log);
450extern int xlog_recover_finish(xlog_t *log); 447extern int xlog_recover_finish(xlog_t *log);
451extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
452extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
453extern void xlog_put_bp(struct xfs_buf *);
454 449
455extern kmem_zone_t *xfs_log_ticket_zone; 450extern kmem_zone_t *xfs_log_ticket_zone;
456 451
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 69ac2e5ef20c..22e6efdc17ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -50,8 +50,6 @@
50 50
51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
53STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item);
55#if defined(DEBUG) 53#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 54STATIC void xlog_recover_check_summary(xlog_t *);
57#else 55#else
@@ -68,7 +66,7 @@ STATIC void xlog_recover_check_summary(xlog_t *);
68 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
69#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
70 68
71xfs_buf_t * 69STATIC xfs_buf_t *
72xlog_get_bp( 70xlog_get_bp(
73 xlog_t *log, 71 xlog_t *log,
74 int nbblks) 72 int nbblks)
@@ -88,7 +86,7 @@ xlog_get_bp(
88 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
89} 87}
90 88
91void 89STATIC void
92xlog_put_bp( 90xlog_put_bp(
93 xfs_buf_t *bp) 91 xfs_buf_t *bp)
94{ 92{
@@ -805,7 +803,7 @@ xlog_find_head(
805 * We could speed up search by using current head_blk buffer, but it is not 803 * We could speed up search by using current head_blk buffer, but it is not
806 * available. 804 * available.
807 */ 805 */
808int 806STATIC int
809xlog_find_tail( 807xlog_find_tail(
810 xlog_t *log, 808 xlog_t *log,
811 xfs_daddr_t *head_blk, 809 xfs_daddr_t *head_blk,
@@ -1367,36 +1365,45 @@ xlog_clear_stale_blocks(
1367 1365
1368STATIC xlog_recover_t * 1366STATIC xlog_recover_t *
1369xlog_recover_find_tid( 1367xlog_recover_find_tid(
1370 xlog_recover_t *q, 1368 struct hlist_head *head,
1371 xlog_tid_t tid) 1369 xlog_tid_t tid)
1372{ 1370{
1373 xlog_recover_t *p = q; 1371 xlog_recover_t *trans;
1372 struct hlist_node *n;
1374 1373
1375 while (p != NULL) { 1374 hlist_for_each_entry(trans, n, head, r_list) {
1376 if (p->r_log_tid == tid) 1375 if (trans->r_log_tid == tid)
1377 break; 1376 return trans;
1378 p = p->r_next;
1379 } 1377 }
1380 return p; 1378 return NULL;
1381} 1379}
1382 1380
1383STATIC void 1381STATIC void
1384xlog_recover_put_hashq( 1382xlog_recover_new_tid(
1385 xlog_recover_t **q, 1383 struct hlist_head *head,
1386 xlog_recover_t *trans) 1384 xlog_tid_t tid,
1385 xfs_lsn_t lsn)
1387{ 1386{
1388 trans->r_next = *q; 1387 xlog_recover_t *trans;
1389 *q = trans; 1388
1389 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1390 trans->r_log_tid = tid;
1391 trans->r_lsn = lsn;
1392 INIT_LIST_HEAD(&trans->r_itemq);
1393
1394 INIT_HLIST_NODE(&trans->r_list);
1395 hlist_add_head(&trans->r_list, head);
1390} 1396}
1391 1397
1392STATIC void 1398STATIC void
1393xlog_recover_add_item( 1399xlog_recover_add_item(
1394 xlog_recover_item_t **itemq) 1400 struct list_head *head)
1395{ 1401{
1396 xlog_recover_item_t *item; 1402 xlog_recover_item_t *item;
1397 1403
1398 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1404 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1399 xlog_recover_insert_item_backq(itemq, item); 1405 INIT_LIST_HEAD(&item->ri_list);
1406 list_add_tail(&item->ri_list, head);
1400} 1407}
1401 1408
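Together, the two helpers above give recovery a simple hash-bucket protocol for transaction ids. A condensed fragment of the expected lookup-or-create flow, mirroring xlog_recover_process_data() further down in this patch (ohead and rhead are the operation and record headers in scope there; the buckets must start out zeroed):

        struct hlist_head       rhash[XLOG_RHASH_SIZE]; /* zeroed by caller */
        xlog_recover_t          *trans;
        xlog_tid_t              tid;

        tid = be32_to_cpu(ohead->oh_tid);
        trans = xlog_recover_find_tid(&rhash[XLOG_RHASH(tid)], tid);
        if (trans == NULL && (ohead->oh_flags & XLOG_START_TRANS))
                xlog_recover_new_tid(&rhash[XLOG_RHASH(tid)], tid,
                                     be64_to_cpu(rhead->h_lsn));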
1402STATIC int 1409STATIC int
@@ -1409,8 +1416,7 @@ xlog_recover_add_to_cont_trans(
1409 xfs_caddr_t ptr, old_ptr; 1416 xfs_caddr_t ptr, old_ptr;
1410 int old_len; 1417 int old_len;
1411 1418
1412 item = trans->r_itemq; 1419 if (list_empty(&trans->r_itemq)) {
1413 if (item == NULL) {
1414 /* finish copying rest of trans header */ 1420 /* finish copying rest of trans header */
1415 xlog_recover_add_item(&trans->r_itemq); 1421 xlog_recover_add_item(&trans->r_itemq);
1416 ptr = (xfs_caddr_t) &trans->r_theader + 1422 ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1418,7 +1424,8 @@ xlog_recover_add_to_cont_trans(
1418 memcpy(ptr, dp, len); /* d, s, l */ 1424 memcpy(ptr, dp, len); /* d, s, l */
1419 return 0; 1425 return 0;
1420 } 1426 }
1421 item = item->ri_prev; 1427 /* take the tail entry */
1428 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1422 1429
1423 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1430 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1424 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1431 old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1455,8 +1462,7 @@ xlog_recover_add_to_trans(
1455 1462
1456 if (!len) 1463 if (!len)
1457 return 0; 1464 return 0;
1458 item = trans->r_itemq; 1465 if (list_empty(&trans->r_itemq)) {
1459 if (item == NULL) {
1460 /* we need to catch log corruptions here */ 1466 /* we need to catch log corruptions here */
1461 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1467 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1462 xlog_warn("XFS: xlog_recover_add_to_trans: " 1468 xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1474,12 +1480,15 @@ xlog_recover_add_to_trans(
1474 memcpy(ptr, dp, len); 1480 memcpy(ptr, dp, len);
1475 in_f = (xfs_inode_log_format_t *)ptr; 1481 in_f = (xfs_inode_log_format_t *)ptr;
1476 1482
1477 if (item->ri_prev->ri_total != 0 && 1483 /* take the tail entry */
1478 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1484 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1485 if (item->ri_total != 0 &&
1486 item->ri_total == item->ri_cnt) {
1487 /* tail item is in use, get a new one */
1479 xlog_recover_add_item(&trans->r_itemq); 1488 xlog_recover_add_item(&trans->r_itemq);
1489 item = list_entry(trans->r_itemq.prev,
1490 xlog_recover_item_t, ri_list);
1480 } 1491 }
1481 item = trans->r_itemq;
1482 item = item->ri_prev;
1483 1492
1484 if (item->ri_total == 0) { /* first region to be added */ 1493 if (item->ri_total == 0) { /* first region to be added */
1485 if (in_f->ilf_size == 0 || 1494 if (in_f->ilf_size == 0 ||
@@ -1504,96 +1513,29 @@ xlog_recover_add_to_trans(
1504 return 0; 1513 return 0;
1505} 1514}
1506 1515
1507STATIC void 1516/*
1508xlog_recover_new_tid( 1517 * Sort the log items in the transaction. Cancelled buffers need
1509 xlog_recover_t **q, 1518 * to be put first so they are processed before any items that might
1510 xlog_tid_t tid, 1519 * modify the buffers. If they are cancelled, then the modifications
1511 xfs_lsn_t lsn) 1520 * don't need to be replayed.
1512{ 1521 */
1513 xlog_recover_t *trans;
1514
1515 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1516 trans->r_log_tid = tid;
1517 trans->r_lsn = lsn;
1518 xlog_recover_put_hashq(q, trans);
1519}
1520
1521STATIC int
1522xlog_recover_unlink_tid(
1523 xlog_recover_t **q,
1524 xlog_recover_t *trans)
1525{
1526 xlog_recover_t *tp;
1527 int found = 0;
1528
1529 ASSERT(trans != NULL);
1530 if (trans == *q) {
1531 *q = (*q)->r_next;
1532 } else {
1533 tp = *q;
1534 while (tp) {
1535 if (tp->r_next == trans) {
1536 found = 1;
1537 break;
1538 }
1539 tp = tp->r_next;
1540 }
1541 if (!found) {
1542 xlog_warn(
1543 "XFS: xlog_recover_unlink_tid: trans not found");
1544 ASSERT(0);
1545 return XFS_ERROR(EIO);
1546 }
1547 tp->r_next = tp->r_next->r_next;
1548 }
1549 return 0;
1550}
1551
1552STATIC void
1553xlog_recover_insert_item_backq(
1554 xlog_recover_item_t **q,
1555 xlog_recover_item_t *item)
1556{
1557 if (*q == NULL) {
1558 item->ri_prev = item->ri_next = item;
1559 *q = item;
1560 } else {
1561 item->ri_next = *q;
1562 item->ri_prev = (*q)->ri_prev;
1563 (*q)->ri_prev = item;
1564 item->ri_prev->ri_next = item;
1565 }
1566}
1567
1568STATIC void
1569xlog_recover_insert_item_frontq(
1570 xlog_recover_item_t **q,
1571 xlog_recover_item_t *item)
1572{
1573 xlog_recover_insert_item_backq(q, item);
1574 *q = item;
1575}
1576
1577STATIC int 1522STATIC int
1578xlog_recover_reorder_trans( 1523xlog_recover_reorder_trans(
1579 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1580{ 1525{
1581 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *item, *n;
1582 xfs_buf_log_format_t *buf_f; 1527 LIST_HEAD(sort_list);
1583 ushort flags = 0;
1584 1528
1585 first_item = itemq = trans->r_itemq; 1529 list_splice_init(&trans->r_itemq, &sort_list);
1586 trans->r_itemq = NULL; 1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1587 do { 1531 xfs_buf_log_format_t *buf_f;
1588 itemq_next = itemq->ri_next;
1589 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1590 1532
1591 switch (ITEM_TYPE(itemq)) { 1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534
1535 switch (ITEM_TYPE(item)) {
1592 case XFS_LI_BUF: 1536 case XFS_LI_BUF:
1593 flags = buf_f->blf_flags; 1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1594 if (!(flags & XFS_BLI_CANCEL)) { 1538 list_move(&item->ri_list, &trans->r_itemq);
1595 xlog_recover_insert_item_frontq(&trans->r_itemq,
1596 itemq);
1597 break; 1539 break;
1598 } 1540 }
1599 case XFS_LI_INODE: 1541 case XFS_LI_INODE:
@@ -1601,7 +1543,7 @@ xlog_recover_reorder_trans(
1601 case XFS_LI_QUOTAOFF: 1543 case XFS_LI_QUOTAOFF:
1602 case XFS_LI_EFD: 1544 case XFS_LI_EFD:
1603 case XFS_LI_EFI: 1545 case XFS_LI_EFI:
1604 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1546 list_move_tail(&item->ri_list, &trans->r_itemq);
1605 break; 1547 break;
1606 default: 1548 default:
1607 xlog_warn( 1549 xlog_warn(
@@ -1609,8 +1551,8 @@ xlog_recover_reorder_trans(
1609 ASSERT(0); 1551 ASSERT(0);
1610 return XFS_ERROR(EIO); 1552 return XFS_ERROR(EIO);
1611 } 1553 }
1612 itemq = itemq_next; 1554 }
1613 } while (first_item != itemq); 1555 ASSERT(list_empty(&sort_list));
1614 return 0; 1556 return 0;
1615} 1557}
1616 1558
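The rewrite above is the stock list_head idiom for a stable partition: splice the whole queue onto a private list, then walk it with the _safe iterator and move each entry back to the head or the tail of the original queue. A generic, self-contained sketch of the same idiom (the item layout and the to_front predicate are illustrative, not from the patch):

#include <linux/list.h>

struct rec_item {
        struct list_head        ri_list;
        int                     to_front;       /* illustrative predicate */
};

static void partition(struct list_head *itemq)
{
        struct rec_item *item, *n;
        LIST_HEAD(sort_list);

        list_splice_init(itemq, &sort_list);    /* take the queue private */
        list_for_each_entry_safe(item, n, &sort_list, ri_list) {
                if (item->to_front)
                        list_move(&item->ri_list, itemq);       /* to head */
                else
                        list_move_tail(&item->ri_list, itemq);  /* to tail */
        }
        /* sort_list is empty again here, as the ASSERT above checks */
}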
@@ -2242,9 +2184,9 @@ xlog_recover_do_buffer_trans(
2242 } 2184 }
2243 2185
2244 mp = log->l_mp; 2186 mp = log->l_mp;
2245 buf_flags = XFS_BUF_LOCK; 2187 buf_flags = XBF_LOCK;
2246 if (!(flags & XFS_BLI_INODE_BUF)) 2188 if (!(flags & XFS_BLI_INODE_BUF))
2247 buf_flags |= XFS_BUF_MAPPED; 2189 buf_flags |= XBF_MAPPED;
2248 2190
2249 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2250 if (XFS_BUF_ISERROR(bp)) { 2192 if (XFS_BUF_ISERROR(bp)) {
@@ -2346,7 +2288,7 @@ xlog_recover_do_inode_trans(
2346 } 2288 }
2347 2289
2348 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2349 XFS_BUF_LOCK); 2291 XBF_LOCK);
2350 if (XFS_BUF_ISERROR(bp)) { 2292 if (XFS_BUF_ISERROR(bp)) {
2351 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2293 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2352 bp, in_f->ilf_blkno); 2294 bp, in_f->ilf_blkno);
@@ -2814,14 +2756,13 @@ xlog_recover_do_trans(
2814 int pass) 2756 int pass)
2815{ 2757{
2816 int error = 0; 2758 int error = 0;
2817 xlog_recover_item_t *item, *first_item; 2759 xlog_recover_item_t *item;
2818 2760
2819 error = xlog_recover_reorder_trans(trans); 2761 error = xlog_recover_reorder_trans(trans);
2820 if (error) 2762 if (error)
2821 return error; 2763 return error;
2822 2764
2823 first_item = item = trans->r_itemq; 2765 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2824 do {
2825 switch (ITEM_TYPE(item)) { 2766 switch (ITEM_TYPE(item)) {
2826 case XFS_LI_BUF: 2767 case XFS_LI_BUF:
2827 error = xlog_recover_do_buffer_trans(log, item, pass); 2768 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2854,8 +2795,7 @@ xlog_recover_do_trans(
2854 2795
2855 if (error) 2796 if (error)
2856 return error; 2797 return error;
2857 item = item->ri_next; 2798 }
2858 } while (first_item != item);
2859 2799
2860 return 0; 2800 return 0;
2861} 2801}
@@ -2869,21 +2809,18 @@ STATIC void
2869xlog_recover_free_trans( 2809xlog_recover_free_trans(
2870 xlog_recover_t *trans) 2810 xlog_recover_t *trans)
2871{ 2811{
2872 xlog_recover_item_t *first_item, *item, *free_item; 2812 xlog_recover_item_t *item, *n;
2873 int i; 2813 int i;
2874 2814
2875 item = first_item = trans->r_itemq; 2815 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2876 do { 2816 /* Free the regions in the item. */
2877 free_item = item; 2817 list_del(&item->ri_list);
2878 item = item->ri_next; 2818 for (i = 0; i < item->ri_cnt; i++)
2879 /* Free the regions in the item. */ 2819 kmem_free(item->ri_buf[i].i_addr);
2880 for (i = 0; i < free_item->ri_cnt; i++) {
2881 kmem_free(free_item->ri_buf[i].i_addr);
2882 }
2883 /* Free the item itself */ 2820 /* Free the item itself */
2884 kmem_free(free_item->ri_buf); 2821 kmem_free(item->ri_buf);
2885 kmem_free(free_item); 2822 kmem_free(item);
2886 } while (first_item != item); 2823 }
2887 /* Free the transaction recover structure */ 2824 /* Free the transaction recover structure */
2888 kmem_free(trans); 2825 kmem_free(trans);
2889} 2826}
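The free path above relies on the _safe iterator because each loop body frees the node it is standing on. The minimal form of that delete-while-iterating rule, with illustrative types:

#include <linux/list.h>
#include <linux/slab.h>

struct node {
        struct list_head        list;
};

static void free_all(struct list_head *head)
{
        struct node     *item, *n;

        /* 'n' caches the next entry before 'item' is unlinked and freed */
        list_for_each_entry_safe(item, n, head, list) {
                list_del(&item->list);
                kfree(item);
        }
}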
@@ -2891,14 +2828,12 @@ xlog_recover_free_trans(
2891STATIC int 2828STATIC int
2892xlog_recover_commit_trans( 2829xlog_recover_commit_trans(
2893 xlog_t *log, 2830 xlog_t *log,
2894 xlog_recover_t **q,
2895 xlog_recover_t *trans, 2831 xlog_recover_t *trans,
2896 int pass) 2832 int pass)
2897{ 2833{
2898 int error; 2834 int error;
2899 2835
2900 if ((error = xlog_recover_unlink_tid(q, trans))) 2836 hlist_del(&trans->r_list);
2901 return error;
2902 if ((error = xlog_recover_do_trans(log, trans, pass))) 2837 if ((error = xlog_recover_do_trans(log, trans, pass)))
2903 return error; 2838 return error;
2904 xlog_recover_free_trans(trans); /* no error */ 2839 xlog_recover_free_trans(trans); /* no error */
@@ -2926,7 +2861,7 @@ xlog_recover_unmount_trans(
2926STATIC int 2861STATIC int
2927xlog_recover_process_data( 2862xlog_recover_process_data(
2928 xlog_t *log, 2863 xlog_t *log,
2929 xlog_recover_t *rhash[], 2864 struct hlist_head rhash[],
2930 xlog_rec_header_t *rhead, 2865 xlog_rec_header_t *rhead,
2931 xfs_caddr_t dp, 2866 xfs_caddr_t dp,
2932 int pass) 2867 int pass)
@@ -2960,7 +2895,7 @@ xlog_recover_process_data(
2960 } 2895 }
2961 tid = be32_to_cpu(ohead->oh_tid); 2896 tid = be32_to_cpu(ohead->oh_tid);
2962 hash = XLOG_RHASH(tid); 2897 hash = XLOG_RHASH(tid);
2963 trans = xlog_recover_find_tid(rhash[hash], tid); 2898 trans = xlog_recover_find_tid(&rhash[hash], tid);
2964 if (trans == NULL) { /* not found; add new tid */ 2899 if (trans == NULL) { /* not found; add new tid */
2965 if (ohead->oh_flags & XLOG_START_TRANS) 2900 if (ohead->oh_flags & XLOG_START_TRANS)
2966 xlog_recover_new_tid(&rhash[hash], tid, 2901 xlog_recover_new_tid(&rhash[hash], tid,
@@ -2978,7 +2913,7 @@ xlog_recover_process_data(
2978 switch (flags) { 2913 switch (flags) {
2979 case XLOG_COMMIT_TRANS: 2914 case XLOG_COMMIT_TRANS:
2980 error = xlog_recover_commit_trans(log, 2915 error = xlog_recover_commit_trans(log,
2981 &rhash[hash], trans, pass); 2916 trans, pass);
2982 break; 2917 break;
2983 case XLOG_UNMOUNT_TRANS: 2918 case XLOG_UNMOUNT_TRANS:
2984 error = xlog_recover_unmount_trans(trans); 2919 error = xlog_recover_unmount_trans(trans);
@@ -3211,7 +3146,7 @@ xlog_recover_process_one_iunlink(
3211 /* 3146 /*
3212 * Get the on disk inode to find the next inode in the bucket. 3147 * Get the on disk inode to find the next inode in the bucket.
3213 */ 3148 */
3214 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3149 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3215 if (error) 3150 if (error)
3216 goto fail_iput; 3151 goto fail_iput;
3217 3152
@@ -3517,7 +3452,7 @@ xlog_do_recovery_pass(
3517 int error = 0, h_size; 3452 int error = 0, h_size;
3518 int bblks, split_bblks; 3453 int bblks, split_bblks;
3519 int hblks, split_hblks, wrapped_hblks; 3454 int hblks, split_hblks, wrapped_hblks;
3520 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3455 struct hlist_head rhash[XLOG_RHASH_SIZE];
3521 3456
3522 ASSERT(head_blk != tail_blk); 3457 ASSERT(head_blk != tail_blk);
3523 3458
@@ -3978,8 +3913,7 @@ xlog_recover_finish(
3978 * case the unlink transactions would have problems 3913 * case the unlink transactions would have problems
3979 * pushing the EFIs out of the way. 3914 * pushing the EFIs out of the way.
3980 */ 3915 */
3981 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3916 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3982 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3983 3917
3984 xlog_recover_process_iunlinks(log); 3918 xlog_recover_process_iunlinks(log);
3985 3919
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..75d749207258 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -35,22 +35,21 @@
35 * item headers are in ri_buf[0]. Additional buffers follow. 35 * item headers are in ri_buf[0]. Additional buffers follow.
36 */ 36 */
37typedef struct xlog_recover_item { 37typedef struct xlog_recover_item {
38 struct xlog_recover_item *ri_next; 38 struct list_head ri_list;
39 struct xlog_recover_item *ri_prev; 39 int ri_type;
40 int ri_type; 40 int ri_cnt; /* count of regions found */
41 int ri_cnt; /* count of regions found */ 41 int ri_total; /* total regions */
42 int ri_total; /* total regions */ 42 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
43 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
44} xlog_recover_item_t; 43} xlog_recover_item_t;
45 44
46struct xlog_tid; 45struct xlog_tid;
47typedef struct xlog_recover { 46typedef struct xlog_recover {
48 struct xlog_recover *r_next; 47 struct hlist_node r_list;
49 xlog_tid_t r_log_tid; /* log's transaction id */ 48 xlog_tid_t r_log_tid; /* log's transaction id */
50 xfs_trans_header_t r_theader; /* trans header for partial */ 49 xfs_trans_header_t r_theader; /* trans header for partial */
51 int r_state; /* not needed */ 50 int r_state; /* not needed */
52 xfs_lsn_t r_lsn; /* xact lsn */ 51 xfs_lsn_t r_lsn; /* xact lsn */
53 xlog_recover_item_t *r_itemq; /* q for items */ 52 struct list_head r_itemq; /* q for items */
54} xlog_recover_t; 53} xlog_recover_t;
55 54
56#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) 55#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb403b40e120..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -201,6 +201,38 @@ xfs_uuid_unmount(
201 201
202 202
203/* 203/*
204 * Reference counting access wrappers to the perag structures.
205 */
206struct xfs_perag *
207xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
208{
209 struct xfs_perag *pag;
210 int ref = 0;
211
212 spin_lock(&mp->m_perag_lock);
213 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
214 if (pag) {
215 ASSERT(atomic_read(&pag->pag_ref) >= 0);
216 /* catch leaks in the positive direction during testing */
217 ASSERT(atomic_read(&pag->pag_ref) < 1000);
218 ref = atomic_inc_return(&pag->pag_ref);
219 }
220 spin_unlock(&mp->m_perag_lock);
221 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
222 return pag;
223}
224
225void
226xfs_perag_put(struct xfs_perag *pag)
227{
228 int ref;
229
230 ASSERT(atomic_read(&pag->pag_ref) > 0);
231 ref = atomic_dec_return(&pag->pag_ref);
232 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
233}
234
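With these wrappers, every per-AG access becomes a get/use/put sequence instead of a bare m_perag[] index. A sketch of the calling pattern (the helper itself is illustrative; pagf_freeblks is read the same way in the converted call sites later in this patch):

STATIC xfs_extlen_t
example_ag_freeblks(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_perag        *pag;
        xfs_extlen_t            free = 0;

        pag = xfs_perag_get(mp, agno);  /* takes a reference, may be NULL */
        if (pag) {
                free = pag->pagf_freeblks;
                xfs_perag_put(pag);     /* drops the reference */
        }
        return free;
}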
235/*
204 * Free up the resources associated with a mount structure. Assume that 236 * Free up the resources associated with a mount structure. Assume that
205 * the structure was initially zeroed, so we can tell which fields got 237 * the structure was initially zeroed, so we can tell which fields got
206 * initialized. 238 * initialized.
@@ -209,13 +241,16 @@ STATIC void
209xfs_free_perag( 241xfs_free_perag(
210 xfs_mount_t *mp) 242 xfs_mount_t *mp)
211{ 243{
212 if (mp->m_perag) { 244 xfs_agnumber_t agno;
213 int agno; 245 struct xfs_perag *pag;
214 246
215 for (agno = 0; agno < mp->m_maxagi; agno++) 247 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
216 if (mp->m_perag[agno].pagb_list) 248 spin_lock(&mp->m_perag_lock);
217 kmem_free(mp->m_perag[agno].pagb_list); 249 pag = radix_tree_delete(&mp->m_perag_tree, agno);
218 kmem_free(mp->m_perag); 250 ASSERT(pag);
251 ASSERT(atomic_read(&pag->pag_ref) == 0);
252 spin_unlock(&mp->m_perag_lock);
253 kmem_free(pag);
219 } 254 }
220} 255}
221 256
@@ -389,22 +424,57 @@ xfs_initialize_perag_icache(
389 } 424 }
390} 425}
391 426
392xfs_agnumber_t 427int
393xfs_initialize_perag( 428xfs_initialize_perag(
394 xfs_mount_t *mp, 429 xfs_mount_t *mp,
395 xfs_agnumber_t agcount) 430 xfs_agnumber_t agcount,
431 xfs_agnumber_t *maxagi)
396{ 432{
397 xfs_agnumber_t index, max_metadata; 433 xfs_agnumber_t index, max_metadata;
434 xfs_agnumber_t first_initialised = 0;
398 xfs_perag_t *pag; 435 xfs_perag_t *pag;
399 xfs_agino_t agino; 436 xfs_agino_t agino;
400 xfs_ino_t ino; 437 xfs_ino_t ino;
401 xfs_sb_t *sbp = &mp->m_sb; 438 xfs_sb_t *sbp = &mp->m_sb;
402 xfs_ino_t max_inum = XFS_MAXINUMBER_32; 439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM;
403 441
404 /* Check to see if the filesystem can overflow 32 bit inodes */ 442 /* Check to see if the filesystem can overflow 32 bit inodes */
405 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); 443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
406 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); 444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
407 445
446 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Any AG not already present in
449 * the tree is allocated, initialised and inserted here.
450 */
451 for (index = 0; index < agcount; index++) {
452 pag = xfs_perag_get(mp, index);
453 if (pag) {
454 xfs_perag_put(pag);
455 continue;
456 }
457 if (!first_initialised)
458 first_initialised = index;
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag)
461 goto out_unwind;
462 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind;
464 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG();
467 spin_unlock(&mp->m_perag_lock);
468 radix_tree_preload_end();
469 error = -EEXIST;
470 goto out_unwind;
471 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end();
476 }
477
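The insertion loop above follows the usual radix-tree discipline: radix_tree_preload() runs before the spinlock is taken because it can sleep to fill the per-cpu node pool, the insert itself then cannot fail for lack of memory, and radix_tree_preload_end() re-enables preemption afterwards. The bare fragment of that pattern (tree, lock and item names are illustrative):

        error = radix_tree_preload(GFP_NOFS);   /* may sleep: call unlocked */
        if (error)
                goto out_free_item;
        spin_lock(&tree_lock);
        error = radix_tree_insert(&tree, index, item); /* -EEXIST still possible */
        spin_unlock(&tree_lock);
        radix_tree_preload_end();               /* re-enable preemption */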
408 /* Clear the mount flag if no inode can overflow 32 bits 478 /* Clear the mount flag if no inode can overflow 32 bits
409 * on this filesystem, or if specifically requested.. 479 * on this filesystem, or if specifically requested..
410 */ 480 */
@@ -438,21 +508,33 @@ xfs_initialize_perag(
438 } 508 }
439 509
440 /* This ag is preferred for inodes */ 510 /* This ag is preferred for inodes */
441 pag = &mp->m_perag[index]; 511 pag = xfs_perag_get(mp, index);
442 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
443 if (index < max_metadata) 513 if (index < max_metadata)
444 pag->pagf_metadata = 1; 514 pag->pagf_metadata = 1;
445 xfs_initialize_perag_icache(pag); 515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag);
446 } 517 }
447 } else { 518 } else {
448 /* Setup default behavior for smaller filesystems */ 519 /* Setup default behavior for smaller filesystems */
449 for (index = 0; index < agcount; index++) { 520 for (index = 0; index < agcount; index++) {
450 pag = &mp->m_perag[index]; 521 pag = xfs_perag_get(mp, index);
451 pag->pagi_inodeok = 1; 522 pag->pagi_inodeok = 1;
452 xfs_initialize_perag_icache(pag); 523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag);
453 } 525 }
454 } 526 }
455 return index; 527 if (maxagi)
528 *maxagi = index;
529 return 0;
530
531out_unwind:
532 kmem_free(pag);
533 for (; index > first_initialised; index--) {
534 pag = radix_tree_delete(&mp->m_perag_tree, index);
535 kmem_free(pag);
536 }
537 return error;
456} 538}
457 539
458void 540void
@@ -583,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
583 * access to the superblock. 665 * access to the superblock.
584 */ 666 */
585 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 667 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
586 extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; 668 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
587 669
588 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 670 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
589 extra_flags); 671 extra_flags);
@@ -731,12 +813,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
731 error = xfs_ialloc_pagi_init(mp, NULL, index); 813 error = xfs_ialloc_pagi_init(mp, NULL, index);
732 if (error) 814 if (error)
733 return error; 815 return error;
734 pag = &mp->m_perag[index]; 816 pag = xfs_perag_get(mp, index);
735 ifree += pag->pagi_freecount; 817 ifree += pag->pagi_freecount;
736 ialloc += pag->pagi_count; 818 ialloc += pag->pagi_count;
737 bfree += pag->pagf_freeblks; 819 bfree += pag->pagf_freeblks;
738 bfreelst += pag->pagf_flcount; 820 bfreelst += pag->pagf_flcount;
739 btree += pag->pagf_btreeblks; 821 btree += pag->pagf_btreeblks;
822 xfs_perag_put(pag);
740 } 823 }
741 /* 824 /*
742 * Overwrite incore superblock counters with just-read data 825 * Overwrite incore superblock counters with just-read data
@@ -1008,6 +1091,22 @@ xfs_mount_reset_sbqflags(
1008 return xfs_trans_commit(tp, 0); 1091 return xfs_trans_commit(tp, 0);
1009} 1092}
1010 1093
1094__uint64_t
1095xfs_default_resblks(xfs_mount_t *mp)
1096{
1097 __uint64_t resblks;
1098
1099 /*
1100 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1101 * This may drive us straight to ENOSPC on mount, but that implies
1102 * we were already there on the last unmount. Warn if this occurs.
1103 */
1104 resblks = mp->m_sb.sb_dblocks;
1105 do_div(resblks, 20);
1106 resblks = min_t(__uint64_t, resblks, 1024);
1107 return resblks;
1108}
1109
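Worked numbers for the clamp above, assuming 4 KiB filesystem blocks: a 1 GiB filesystem has sb_dblocks = 262144, so 5% is 13107 blocks and the result is capped at 1024 blocks (4 MiB); a 40 MiB filesystem has sb_dblocks = 10240, so the 5% figure of 512 blocks (2 MiB) wins. Since dblocks/20 reaches 1024 at roughly 80 MiB, the reserve pool is effectively a flat 1024 blocks on anything larger than that.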
1011/* 1110/*
1012 * This function does the following on an initial mount of a file system: 1111 * This function does the following on an initial mount of a file system:
1013 * - reads the superblock from disk and init the mount struct 1112 * - reads the superblock from disk and init the mount struct
@@ -1152,13 +1251,13 @@ xfs_mountfs(
1152 /* 1251 /*
1153 * Allocate and initialize the per-ag data. 1252 * Allocate and initialize the per-ag data.
1154 */ 1253 */
1155 init_rwsem(&mp->m_peraglock); 1254 spin_lock_init(&mp->m_perag_lock);
1156 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), 1255 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
1157 KM_MAYFAIL); 1256 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1158 if (!mp->m_perag) 1257 if (error) {
1258 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
1159 goto out_remove_uuid; 1259 goto out_remove_uuid;
1160 1260 }
1161 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1162 1261
1163 if (!sbp->sb_logblocks) { 1262 if (!sbp->sb_logblocks) {
1164 cmn_err(CE_WARN, "XFS: no log defined"); 1263 cmn_err(CE_WARN, "XFS: no log defined");
@@ -1318,18 +1417,14 @@ xfs_mountfs(
1318 * when at ENOSPC. This is needed for operations like create with 1417 * when at ENOSPC. This is needed for operations like create with
1319 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations 1418 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1320 * are not allowed to use this reserved space. 1419 * are not allowed to use this reserved space.
1321 *
1322 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1323 * This may drive us straight to ENOSPC on mount, but that implies
1324 * we were already there on the last unmount. Warn if this occurs.
1325 */ 1420 */
1326 resblks = mp->m_sb.sb_dblocks; 1421 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1327 do_div(resblks, 20); 1422 resblks = xfs_default_resblks(mp);
1328 resblks = min_t(__uint64_t, resblks, 1024); 1423 error = xfs_reserve_blocks(mp, &resblks, NULL);
1329 error = xfs_reserve_blocks(mp, &resblks, NULL); 1424 if (error)
1330 if (error) 1425 cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
1331 cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " 1426 "blocks. Continuing without a reserve pool.");
1332 "Continuing without a reserve pool."); 1427 }
1333 1428
1334 return 0; 1429 return 0;
1335 1430
@@ -1372,8 +1467,19 @@ xfs_unmountfs(
1372 * push out the iclog we will never get that unlocked. hence we 1467 * push out the iclog we will never get that unlocked. hence we
1373 * need to force the log first. 1468 * need to force the log first.
1374 */ 1469 */
1375 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1470 xfs_log_force(mp, XFS_LOG_SYNC);
1376 xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); 1471
1472 /*
1473 * Do a delwri reclaim pass first so that as many dirty inodes are
1474 * queued up for IO as possible. Then flush the buffers and make a
1475 * synchronous pass so that all the remaining inodes are reclaimed.
1476 * This keeps reclaim as quick as possible by avoiding synchronous
1477 * writeout and by minimising blocking on inodes already in the
1478 * delwri state.
1479 */
1480 xfs_reclaim_inodes(mp, 0);
1481 XFS_bflush(mp->m_ddev_targp);
1482 xfs_reclaim_inodes(mp, SYNC_WAIT);
1377 1483
1378 xfs_qm_unmount(mp); 1484 xfs_qm_unmount(mp);
1379 1485
@@ -1382,7 +1488,7 @@ xfs_unmountfs(
1382 * that nothing is pinned. This is important because bflush() 1488 * that nothing is pinned. This is important because bflush()
1383 * will skip pinned buffers. 1489 * will skip pinned buffers.
1384 */ 1490 */
1385 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1491 xfs_log_force(mp, XFS_LOG_SYNC);
1386 1492
1387 xfs_binval(mp->m_ddev_targp); 1493 xfs_binval(mp->m_ddev_targp);
1388 if (mp->m_rtdev_targp) { 1494 if (mp->m_rtdev_targp) {
@@ -1548,15 +1654,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1548 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); 1654 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1549 1655
1550 /* find modified range */ 1656 /* find modified range */
1657 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1658 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1659 last = xfs_sb_info[f + 1].offset - 1;
1551 1660
1552 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 1661 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1553 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 1662 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1554 first = xfs_sb_info[f].offset; 1663 first = xfs_sb_info[f].offset;
1555 1664
1556 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1557 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1558 last = xfs_sb_info[f + 1].offset - 1;
1559
1560 xfs_trans_log_buf(tp, bp, first, last); 1665 xfs_trans_log_buf(tp, bp, first, last);
1561} 1666}
1562 1667
@@ -1887,7 +1992,7 @@ xfs_getsb(
1887 1992
1888 ASSERT(mp->m_sb_bp != NULL); 1993 ASSERT(mp->m_sb_bp != NULL);
1889 bp = mp->m_sb_bp; 1994 bp = mp->m_sb_bp;
1890 if (flags & XFS_BUF_TRYLOCK) { 1995 if (flags & XBF_TRYLOCK) {
1891 if (!XFS_BUF_CPSEMA(bp)) { 1996 if (!XFS_BUF_CPSEMA(bp)) {
1892 return NULL; 1997 return NULL;
1893 } 1998 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1df7e4502967..14dafd608230 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, 78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
79 struct xfs_inode *, dm_right_t, 79 struct xfs_inode *, dm_right_t,
80 struct xfs_inode *, dm_right_t, 80 struct xfs_inode *, dm_right_t,
81 const char *, const char *, mode_t, int, int); 81 const unsigned char *, const unsigned char *,
82 mode_t, int, int);
82typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, 83typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
83 char *, char *); 84 char *, char *);
84typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, 85typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -207,8 +208,8 @@ typedef struct xfs_mount {
207 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 208 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
208 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 209 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
209 uint m_in_maxlevels; /* max inobt btree levels. */ 210 uint m_in_maxlevels; /* max inobt btree levels. */
210 struct xfs_perag *m_perag; /* per-ag accounting info */ 211 struct radix_tree_root m_perag_tree; /* per-ag accounting info */
211 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ 212 spinlock_t m_perag_lock; /* lock for m_perag_tree */
212 struct mutex m_growlock; /* growfs mutex */ 213 struct mutex m_growlock; /* growfs mutex */
213 int m_fixedfsid[2]; /* unchanged for life of FS */ 214 int m_fixedfsid[2]; /* unchanged for life of FS */
214 uint m_dmevmask; /* DMI events for this FS */ 215 uint m_dmevmask; /* DMI events for this FS */
@@ -224,6 +225,7 @@ typedef struct xfs_mount {
224 __uint64_t m_maxioffset; /* maximum inode offset */ 225 __uint64_t m_maxioffset; /* maximum inode offset */
225 __uint64_t m_resblks; /* total reserved blocks */ 226 __uint64_t m_resblks; /* total reserved blocks */
226 __uint64_t m_resblks_avail;/* available reserved blocks */ 227 __uint64_t m_resblks_avail;/* available reserved blocks */
228 __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
227 int m_dalign; /* stripe unit */ 229 int m_dalign; /* stripe unit */
228 int m_swidth; /* stripe width */ 230 int m_swidth; /* stripe width */
229 int m_sinoalign; /* stripe unit inode alignment */ 231 int m_sinoalign; /* stripe unit inode alignment */
@@ -243,7 +245,7 @@ typedef struct xfs_mount {
243 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ 245 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
244 atomic_t m_active_trans; /* number trans frozen */ 246 atomic_t m_active_trans; /* number trans frozen */
245#ifdef HAVE_PERCPU_SB 247#ifdef HAVE_PERCPU_SB
246 xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ 248 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
247 unsigned long m_icsb_counters; /* disabled per-cpu counters */ 249 unsigned long m_icsb_counters; /* disabled per-cpu counters */
248 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ 250 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
249 struct mutex m_icsb_mutex; /* balancer sync lock */ 251 struct mutex m_icsb_mutex; /* balancer sync lock */
@@ -384,19 +386,10 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
384} 386}
385 387
386/* 388/*
387 * perag get/put wrappers for eventual ref counting 389 * perag get/put wrappers for ref counting
388 */ 390 */
389static inline xfs_perag_t * 391struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
390xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino) 392void xfs_perag_put(struct xfs_perag *pag);
391{
392 return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
393}
394
395static inline void
396xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
397{
398 /* nothing to see here, move along */
399}
400 393
401/* 394/*
402 * Per-cpu superblock locking functions 395 * Per-cpu superblock locking functions
@@ -428,6 +421,7 @@ typedef struct xfs_mod_sb {
428} xfs_mod_sb_t; 421} xfs_mod_sb_t;
429 422
430extern int xfs_log_sbcount(xfs_mount_t *, uint); 423extern int xfs_log_sbcount(xfs_mount_t *, uint);
424extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
431extern int xfs_mountfs(xfs_mount_t *mp); 425extern int xfs_mountfs(xfs_mount_t *mp);
432 426
433extern void xfs_unmountfs(xfs_mount_t *); 427extern void xfs_unmountfs(xfs_mount_t *);
@@ -450,7 +444,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
450#endif /* __KERNEL__ */ 444#endif /* __KERNEL__ */
451 445
452extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 446extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
453extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t); 447extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
448 xfs_agnumber_t *);
454extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 449extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
455extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 450extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
456 451
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
398 * guaranteed that all the free functions for all the elements have finished 398 * guaranteed that all the free functions for all the elements have finished
399 * executing and the reaper is not running. 399 * executing and the reaper is not running.
400 */ 400 */
401void 401static void
402xfs_mru_cache_flush( 402xfs_mru_cache_flush(
403 xfs_mru_cache_t *mru) 403 xfs_mru_cache_t *mru)
404{ 404{
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
42int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, 42int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
43 unsigned int grp_count, 43 unsigned int grp_count,
44 xfs_mru_cache_free_func_t free_func); 44 xfs_mru_cache_free_func_t free_func);
45void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
46void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); 45void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
47int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, 46int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
48 void *value); 47 void *value);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 91bfd60f4c74..fdcab3f81dde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -223,16 +223,9 @@ typedef struct xfs_qoff_logformat {
223#define XFS_QMOPT_RES_INOS 0x0800000 223#define XFS_QMOPT_RES_INOS 0x0800000
224 224
225/* 225/*
226 * flags for dqflush and dqflush_all.
227 */
228#define XFS_QMOPT_SYNC 0x1000000
229#define XFS_QMOPT_ASYNC 0x2000000
230#define XFS_QMOPT_DELWRI 0x4000000
231
232/*
233 * flags for dqalloc. 226 * flags for dqalloc.
234 */ 227 */
235#define XFS_QMOPT_INHERIT 0x8000000 228#define XFS_QMOPT_INHERIT 0x1000000
236 229
237/* 230/*
238 * flags to xfs_trans_mod_dquot. 231 * flags to xfs_trans_mod_dquot.
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 5aa07caea5f1..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -47,48 +47,6 @@
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48 48
49/* 49/*
50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
51 * which clears the setuid and setgid bits when a file is written.
52 */
53int
54xfs_write_clear_setuid(
55 xfs_inode_t *ip)
56{
57 xfs_mount_t *mp;
58 xfs_trans_t *tp;
59 int error;
60
61 mp = ip->i_mount;
62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
63 if ((error = xfs_trans_reserve(tp, 0,
64 XFS_WRITEID_LOG_RES(mp),
65 0, 0, 0))) {
66 xfs_trans_cancel(tp, 0);
67 return error;
68 }
69 xfs_ilock(ip, XFS_ILOCK_EXCL);
70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
71 xfs_trans_ihold(tp, ip);
72 ip->i_d.di_mode &= ~S_ISUID;
73
74 /*
75 * Note that we don't have to worry about mandatory
76 * file locking being disabled here because we only
77 * clear the S_ISGID bit if the Group execute bit is
78 * on, but if it was on then mandatory locking wouldn't
79 * have been enabled.
80 */
81 if (ip->i_d.di_mode & S_IXGRP) {
82 ip->i_d.di_mode &= ~S_ISGID;
83 }
84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
85 xfs_trans_set_sync(tp);
86 error = xfs_trans_commit(tp, 0);
87 xfs_iunlock(ip, XFS_ILOCK_EXCL);
88 return 0;
89}
90
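The mode logic that the deleted helper wrapped in a transaction is simple on its own: always drop S_ISUID, and drop S_ISGID only when group-execute is set, since setgid without group-execute is the mandatory-locking marker its comment refers to. A standalone restatement of just that rule (helper name is illustrative):

#include <sys/stat.h>

/* Strip privilege bits on write, per the rule in the deleted helper. */
static mode_t strip_write_privs(mode_t mode)
{
        mode &= ~S_ISUID;
        /* S_ISGID without group-exec marks mandatory locking: keep it */
        if (mode & S_IXGRP)
                mode &= ~S_ISGID;
        return mode;
}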
91/*
92 * Force a shutdown of the filesystem instantly while keeping 50 * Force a shutdown of the filesystem instantly while keeping
93 * the filesystem consistent. We don't do an unmount here; just shutdown 51 * the filesystem consistent. We don't do an unmount here; just shutdown
94 * the shop, make sure that absolutely nothing persistent happens to 52 * the shop, make sure that absolutely nothing persistent happens to
@@ -153,88 +111,6 @@ xfs_do_force_shutdown(
 	}
 }
 
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
- * so that the proper iodone callbacks get called.
- */
-int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 */
-	XFS_BUF_ERROR(bp, EIO);
-	/*
-	 * We're calling biodone, so delete B_DONE flag. Either way
-	 * we have to call the iodone callback, and calling biodone
-	 * probably is the best way since it takes care of
-	 * GRIO as well.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
-
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	xfs_biodone(bp);
-
-	return (EIO);
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	xfs_buf_t *bp)
-{
-	int64_t fl;
-
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
-
-	fl = XFS_BUF_BFLAGS(bp);
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	if (!(fl & XFS_B_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
-	return (EIO);
-}
-
 /*
  * Prints out an ALERT message about I/O error.
  */
@@ -306,37 +182,6 @@ xfs_read_buf(
 }
 
 /*
- * Wrapper around bwrite() so that we can trap
- * write errors, and act accordingly.
- */
-int
-xfs_bwrite(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	int			error;
-
-	/*
-	 * XXXsup how does this work for quotas.
-	 */
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
-	bp->b_mount = mp;
-	XFS_BUF_WRITE(bp);
-
-	if ((error = XFS_bwrite(bp))) {
-		ASSERT(mp);
-		/*
-		 * Cannot put a buftrace here since if the buffer is not
-		 * B_HOLD then we will brelse() the buffer before returning
-		 * from bwrite and we could be tracing a buffer that has
-		 * been reused.
-		 */
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	}
-	return (error);
-}
-
-/*
  * helper function to extract extent size hint from inode
  */
 xfs_extlen_t
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 571f2174435c..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -39,10 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 /*
  * Prototypes for functions in xfs_rw.c.
  */
-extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern int xfs_bioerror(struct xfs_buf *bp);
-extern int xfs_bioerror_relse(struct xfs_buf *bp);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 237badcbac3b..be942d4e3324 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -981,9 +981,8 @@ shut_us_down:
 	 */
 	if (sync) {
 		if (!error) {
-			error = _xfs_log_force(mp, commit_lsn,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      log_flushed);
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
@@ -1121,7 +1120,7 @@ xfs_trans_fill_vecs(
 	tp->t_header.th_num_items = nitems;
 	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
 	log_vector->i_len = sizeof(xfs_trans_header_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
 }
 
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca64f33c63a3..c93e3a102857 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -861,8 +861,7 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_SUCCESS	0
 #define XFS_ITEM_PINNED		1
 #define XFS_ITEM_LOCKED		2
-#define XFS_ITEM_FLUSHING	3
-#define XFS_ITEM_PUSHBUF	4
+#define XFS_ITEM_PUSHBUF	3
 
 /*
  * This structure is used to maintain a list of block ranges that have been
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
 }
 
 /*
- * Function that does the work of pushing on the AIL
+ * xfsaild_push does the work of pushing on the AIL. Returning a timeout of
+ * zero indicates that the caller should sleep until woken.
  */
 long
 xfsaild_push(
 	struct xfs_ail	*ailp,
 	xfs_lsn_t	*last_lsn)
 {
-	long		tout = 1000; /* milliseconds */
+	long		tout = 0;
 	xfs_lsn_t	last_pushed_lsn = *last_lsn;
 	xfs_lsn_t	target = ailp->xa_target;
 	xfs_lsn_t	lsn;
@@ -252,6 +253,7 @@ xfsaild_push(
 	int		flush_log, count, stuck;
 	xfs_mount_t	*mp = ailp->xa_mount;
 	struct xfs_ail_cursor	*cur = &ailp->xa_cursors;
+	int		push_xfsbufd = 0;
 
 	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_cursor_init(ailp, cur);
@@ -262,7 +264,7 @@ xfsaild_push(
 		 */
 		xfs_trans_ail_cursor_done(ailp, cur);
 		spin_unlock(&ailp->xa_lock);
-		last_pushed_lsn = 0;
+		*last_lsn = 0;
 		return tout;
 	}
 
@@ -279,7 +281,6 @@ xfsaild_push(
 	 * prevents use from spinning when we can't do anything or there is
 	 * lots of contention on the AIL lists.
 	 */
-	tout = 10;
 	lsn = lip->li_lsn;
 	flush_log = stuck = count = 0;
 	while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -308,6 +309,7 @@ xfsaild_push(
 			XFS_STATS_INC(xs_push_ail_pushbuf);
 			IOP_PUSHBUF(lip);
 			last_pushed_lsn = lsn;
+			push_xfsbufd = 1;
 			break;
 
 		case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
 			stuck++;
 			break;
 
-		case XFS_ITEM_FLUSHING:
-			XFS_STATS_INC(xs_push_ail_flushing);
-			last_pushed_lsn = lsn;
-			stuck++;
-			break;
-
 		default:
 			ASSERT(0);
 			break;
@@ -371,19 +367,24 @@ xfsaild_push(
 		 * move forward in the AIL.
 		 */
 		XFS_STATS_INC(xs_push_ail_flush);
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
+	}
+
+	if (push_xfsbufd) {
+		/* we've got delayed write buffers to flush */
+		wake_up_process(mp->m_ddev_targp->bt_task);
 	}
 
 	if (!count) {
 		/* We're past our target or empty, so idle */
-		tout = 1000;
+		last_pushed_lsn = 0;
 	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
 		/*
 		 * We reached the target so wait a bit longer for I/O to
 		 * complete and remove pushed items from the AIL before we
 		 * start the next scan from the start of the AIL.
 		 */
-		tout += 20;
+		tout = 50;
 		last_pushed_lsn = 0;
 	} else if ((stuck * 100) / count > 90) {
 		/*
@@ -395,11 +396,14 @@ xfsaild_push(
 		 * Backoff a bit more to allow some I/O to complete before
 		 * continuing from where we were.
 		 */
-		tout += 10;
+		tout = 20;
+	} else {
+		/* more to do, but wait a short while before continuing */
+		tout = 10;
 	}
 	*last_lsn = last_pushed_lsn;
 	return tout;
-} /* xfsaild_push */
+}
 
 
 /*
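Taken together, the reworked return values give xfsaild_push() a graded backoff: 0 with an empty AIL (sleep until woken), 50ms once the target is reached, 20ms when over 90% of the scanned items are stuck, and 10ms when there is simply more work to do. Below is a sketch of a caller thread honoring the zero-timeout convention described in the new comment; the loop shape and helper name are assumptions for illustration, not code from this patch.

/* Hypothetical caller, not part of this patch. */
static int example_aild(void *data)
{
	struct xfs_ail	*ailp = data;
	xfs_lsn_t	last_pushed_lsn = 0;
	long		tout;

	while (!kthread_should_stop()) {
		tout = xfsaild_push(ailp, &last_pushed_lsn);

		set_current_state(TASK_INTERRUPTIBLE);
		if (tout)
			schedule_timeout(msecs_to_jiffies(tout));
		else
			schedule();	/* zero means wait for an explicit wakeup */
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}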
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 49130628d5ef..5ffd544434eb 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -75,13 +75,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 	xfs_buf_log_item_t	*bip;
 
 	if (flags == 0)
-		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+		flags = XBF_LOCK | XBF_MAPPED;
 
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL)
-		return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
+		return xfs_buf_get(target_dev, blkno, len,
+				   flags | XBF_DONT_BLOCK);
 
 	/*
 	 * If we find the buffer in the cache with this transaction
@@ -117,14 +118,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 	}
 
 	/*
-	 * We always specify the BUF_BUSY flag within a transaction so
-	 * that get_buf does not try to push out a delayed write buffer
+	 * We always specify the XBF_DONT_BLOCK flag within a transaction
+	 * so that get_buf does not try to push out a delayed write buffer
 	 * which might cause another transaction to take place (if the
 	 * buffer was delayed alloc). Such recursive transactions can
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		return NULL;
 	}
@@ -290,15 +291,15 @@ xfs_trans_read_buf(
 	int			error;
 
 	if (flags == 0)
-		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+		flags = XBF_LOCK | XBF_MAPPED;
 
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL) {
-		bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
+		bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 		if (!bp)
-			return (flags & XFS_BUF_TRYLOCK) ?
+			return (flags & XBF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
 		if (XFS_BUF_GETERROR(bp) != 0) {
@@ -385,14 +386,14 @@ xfs_trans_read_buf(
 	}
 
 	/*
-	 * We always specify the BUF_BUSY flag within a transaction so
-	 * that get_buf does not try to push out a delayed write buffer
+	 * We always specify the XBF_DONT_BLOCK flag within a transaction
+	 * so that get_buf does not try to push out a delayed write buffer
 	 * which might cause another transaction to take place (if the
 	 * buffer was delayed alloc). Such recursive transactions can
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		*bpp = NULL;
 		return 0;
@@ -472,8 +473,8 @@ shutdown_abort:
 	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
 		cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
 #endif
-	ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
-				     (XFS_B_STALE|XFS_B_DELWRI));
+	ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
+				     (XBF_STALE|XBF_DELWRI));
 
 	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 	xfs_buf_relse(bp);
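All the lookup paths above now express one rule in XBF_* terms: a zero flags argument defaults to XBF_LOCK | XBF_MAPPED, and XBF_DONT_BLOCK is always OR'd in within a transaction so the buffer cache never pushes out delayed-write buffers (and so never starts a recursive transaction) on the caller's behalf. A hedged sketch of the resulting call pattern; the target, block number, and length are illustrative.

/* Illustrative only: flags == 0 becomes XBF_LOCK | XBF_MAPPED, and
 * XBF_DONT_BLOCK is added internally because tp is non-NULL. */
struct xfs_buf	*bp;
int		error;

bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1, 0);
if (bp == NULL)
	return XFS_ERROR(ENOMEM);

/* the read side takes the same flags and applies the same rule */
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno, 1,
			   XBF_LOCK | XBF_MAPPED, &bp);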
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..b09904555d07 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -151,8 +151,8 @@ typedef enum {
 } xfs_btnum_t;
 
 struct xfs_name {
-	const char	*name;
+	const unsigned char	*name;
 	int		len;
 };
 
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 6f268756bf36..ddd2c5d1b854 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -256,7 +256,7 @@ xfs_setattr(
 	    iattr->ia_size > ip->i_d.di_size) {
 		code = xfs_flush_pages(ip,
 				ip->i_d.di_size, iattr->ia_size,
-				XFS_B_ASYNC, FI_NONE);
+				XBF_ASYNC, FI_NONE);
 	}
 
 	/* wait for all I/O to complete */
@@ -597,7 +597,7 @@ xfs_fsync(
 {
 	xfs_trans_t	*tp;
 	int		error = 0;
-	int		log_flushed = 0, changed = 1;
+	int		log_flushed = 0;
 
 	xfs_itrace_entry(ip);
 
@@ -627,19 +627,16 @@ xfs_fsync(
 	 * disk yet, the inode will be still be pinned.  If it is,
 	 * force the log.
 	 */
-
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
 	if (xfs_ipincount(ip)) {
-		error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-			      XFS_LOG_FORCE | XFS_LOG_SYNC,
-			      &log_flushed);
-	} else {
-		/*
-		 * If the inode is not pinned and nothing has changed
-		 * we don't need to flush the cache.
-		 */
-		changed = 0;
+		if (ip->i_itemp->ili_last_lsn) {
+			error = _xfs_log_force_lsn(ip->i_mount,
+					ip->i_itemp->ili_last_lsn,
+					XFS_LOG_SYNC, &log_flushed);
+		} else {
+			error = _xfs_log_force(ip->i_mount,
+					XFS_LOG_SYNC, &log_flushed);
+		}
 	}
 	} else {
 		/*
@@ -674,7 +671,7 @@ xfs_fsync(
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
-	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
 		/*
 		 * If the log write didn't issue an ordered tag we need
 		 * to flush the disk cache for the data device now.
@@ -1096,7 +1093,7 @@ xfs_release(
 	 */
 	truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 	if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
-		xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
+		xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 	}
 
 	if (ip->i_d.di_nlink != 0) {
@@ -2199,7 +2196,8 @@ xfs_symlink(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name, target_path, 0, 0, 0);
+					link_name->name,
+					(unsigned char *)target_path, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2395,7 +2393,8 @@ std_return:
 					dp, DM_RIGHT_NULL,
 					error ? NULL : ip,
 					DM_RIGHT_NULL, link_name->name,
-					target_path, 0, error, 0);
+					(unsigned char *)target_path,
+					0, error, 0);
 	}
 
 	if (!error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 167a467403a5..774f40729ca1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -43,11 +43,11 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
 		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
 		struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
-		int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-		int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+		unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+		unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
 ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
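With the attr interfaces switched to unsigned char, callers hand in byte buffers and cast string literals at the boundary. A minimal sketch against the new prototypes; the attribute name, buffer size, and zero flags value (taking it to select the user attribute namespace is an assumption here) are illustrative.

/* Hypothetical example, not part of this patch. */
static int example_get_attr(struct xfs_inode *ip)
{
	unsigned char	value[64];
	int		valuelen = sizeof(value);

	return xfs_attr_get(ip, (const unsigned char *)"example",
			    value, &valuelen, 0);
}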