aboutsummaryrefslogtreecommitdiffstats
path: root/fs/nilfs2/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nilfs2/inode.c')
-rw-r--r--fs/nilfs2/inode.c439
1 files changed, 341 insertions, 98 deletions
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..b9b45fc2903e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,30 @@
34#include "cpfile.h" 34#include "cpfile.h"
35#include "ifile.h" 35#include "ifile.h"
36 36
37struct nilfs_iget_args {
38 u64 ino;
39 __u64 cno;
40 struct nilfs_root *root;
41 int for_gc;
42};
43
44void nilfs_inode_add_blocks(struct inode *inode, int n)
45{
46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root)
50 atomic_add(n, &root->blocks_count);
51}
52
53void nilfs_inode_sub_blocks(struct inode *inode, int n)
54{
55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root)
59 atomic_sub(n, &root->blocks_count);
60}
37 61
38/** 62/**
39 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -50,14 +74,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
50 struct buffer_head *bh_result, int create) 74 struct buffer_head *bh_result, int create)
51{ 75{
52 struct nilfs_inode_info *ii = NILFS_I(inode); 76 struct nilfs_inode_info *ii = NILFS_I(inode);
77 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
53 __u64 blknum = 0; 78 __u64 blknum = 0;
54 int err = 0, ret; 79 int err = 0, ret;
55 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
56 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 80 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
57 81
58 down_read(&NILFS_MDT(dat)->mi_sem); 82 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
59 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); 83 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
60 up_read(&NILFS_MDT(dat)->mi_sem); 84 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
61 if (ret >= 0) { /* found */ 85 if (ret >= 0) { /* found */
62 map_bh(bh_result, inode->i_sb, blknum); 86 map_bh(bh_result, inode->i_sb, blknum);
63 if (ret > 0) 87 if (ret > 0)
@@ -90,11 +114,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
90 inode->i_ino, 114 inode->i_ino,
91 (unsigned long long)blkoff); 115 (unsigned long long)blkoff);
92 err = 0; 116 err = 0;
93 } else if (err == -EINVAL) {
94 nilfs_error(inode->i_sb, __func__,
95 "broken bmap (inode=%lu)\n",
96 inode->i_ino);
97 err = -EIO;
98 } 117 }
99 nilfs_transaction_abort(inode->i_sb); 118 nilfs_transaction_abort(inode->i_sb);
100 goto out; 119 goto out;
@@ -103,6 +122,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
103 nilfs_transaction_commit(inode->i_sb); /* never fails */ 122 nilfs_transaction_commit(inode->i_sb); /* never fails */
104 /* Error handling should be detailed */ 123 /* Error handling should be detailed */
105 set_buffer_new(bh_result); 124 set_buffer_new(bh_result);
125 set_buffer_delay(bh_result);
106 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 126 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
107 to proper value */ 127 to proper value */
108 } else if (ret == -ENOENT) { 128 } else if (ret == -ENOENT) {
@@ -179,10 +199,9 @@ static int nilfs_set_page_dirty(struct page *page)
179 199
180 if (ret) { 200 if (ret) {
181 struct inode *inode = page->mapping->host; 201 struct inode *inode = page->mapping->host;
182 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
183 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 202 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
184 203
185 nilfs_set_file_dirty(sbi, inode, nr_dirty); 204 nilfs_set_file_dirty(inode, nr_dirty);
186 } 205 }
187 return ret; 206 return ret;
188} 207}
@@ -223,7 +242,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
223 start + copied); 242 start + copied);
224 copied = generic_write_end(file, mapping, pos, len, copied, page, 243 copied = generic_write_end(file, mapping, pos, len, copied, page,
225 fsdata); 244 fsdata);
226 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 245 nilfs_set_file_dirty(inode, nr_dirty);
227 err = nilfs_transaction_commit(inode->i_sb); 246 err = nilfs_transaction_commit(inode->i_sb);
228 return err ? : copied; 247 return err ? : copied;
229} 248}
@@ -261,7 +280,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
261const struct address_space_operations nilfs_aops = { 280const struct address_space_operations nilfs_aops = {
262 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
263 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
264 .sync_page = block_sync_page,
265 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
266 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
267 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
@@ -276,9 +294,10 @@ const struct address_space_operations nilfs_aops = {
276struct inode *nilfs_new_inode(struct inode *dir, int mode) 294struct inode *nilfs_new_inode(struct inode *dir, int mode)
277{ 295{
278 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
279 struct nilfs_sb_info *sbi = NILFS_SB(sb); 297 struct the_nilfs *nilfs = sb->s_fs_info;
280 struct inode *inode; 298 struct inode *inode;
281 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
300 struct nilfs_root *root;
282 int err = -ENOMEM; 301 int err = -ENOMEM;
283 ino_t ino; 302 ino_t ino;
284 303
@@ -289,15 +308,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
289 mapping_set_gfp_mask(inode->i_mapping, 308 mapping_set_gfp_mask(inode->i_mapping,
290 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 309 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
291 310
311 root = NILFS_I(dir)->i_root;
292 ii = NILFS_I(inode); 312 ii = NILFS_I(inode);
293 ii->i_state = 1 << NILFS_I_NEW; 313 ii->i_state = 1 << NILFS_I_NEW;
314 ii->i_root = root;
294 315
295 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); 316 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
296 if (unlikely(err)) 317 if (unlikely(err))
297 goto failed_ifile_create_inode; 318 goto failed_ifile_create_inode;
298 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 319 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
299 320
300 atomic_inc(&sbi->s_inodes_count); 321 atomic_inc(&root->inodes_count);
301 inode_init_owner(inode, dir, mode); 322 inode_init_owner(inode, dir, mode);
302 inode->i_ino = ino; 323 inode->i_ino = ino;
303 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 324 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -311,20 +332,16 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
311 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
312 } 333 }
313 334
314 ii->i_flags = NILFS_I(dir)->i_flags; 335 ii->i_flags = nilfs_mask_flags(
315 if (S_ISLNK(mode)) 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
316 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
317 if (!S_ISDIR(mode))
318 ii->i_flags &= ~NILFS_DIRSYNC_FL;
319 337
320 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
321 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
322 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
323 ii->i_cno = 0;
324 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
325 spin_lock(&sbi->s_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
326 inode->i_generation = sbi->s_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
327 spin_unlock(&sbi->s_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
328 insert_inode_hash(inode); 345 insert_inode_hash(inode);
329 346
330 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
@@ -350,33 +367,21 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
350 return ERR_PTR(err); 367 return ERR_PTR(err);
351} 368}
352 369
353void nilfs_free_inode(struct inode *inode)
354{
355 struct super_block *sb = inode->i_sb;
356 struct nilfs_sb_info *sbi = NILFS_SB(sb);
357
358 /* XXX: check error code? Is there any thing I can do? */
359 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
360 atomic_dec(&sbi->s_inodes_count);
361}
362
363void nilfs_set_inode_flags(struct inode *inode) 370void nilfs_set_inode_flags(struct inode *inode)
364{ 371{
365 unsigned int flags = NILFS_I(inode)->i_flags; 372 unsigned int flags = NILFS_I(inode)->i_flags;
366 373
367 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
368 S_DIRSYNC); 375 S_DIRSYNC);
369 if (flags & NILFS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
370 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
371 if (flags & NILFS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
372 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
373 if (flags & NILFS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
374 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
375#ifndef NILFS_ATIME_DISABLE 382 if (flags & FS_NOATIME_FL)
376 if (flags & NILFS_NOATIME_FL)
377#endif
378 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
379 if (flags & NILFS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
380 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
381 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
382 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
@@ -410,7 +415,6 @@ int nilfs_read_inode_common(struct inode *inode,
410 0 : le32_to_cpu(raw_inode->i_dir_acl); 415 0 : le32_to_cpu(raw_inode->i_dir_acl);
411#endif 416#endif
412 ii->i_dir_start_lookup = 0; 417 ii->i_dir_start_lookup = 0;
413 ii->i_cno = 0;
414 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 418 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
415 419
416 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 420 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -424,21 +428,21 @@ int nilfs_read_inode_common(struct inode *inode,
424 return 0; 428 return 0;
425} 429}
426 430
427static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, 431static int __nilfs_read_inode(struct super_block *sb,
432 struct nilfs_root *root, unsigned long ino,
428 struct inode *inode) 433 struct inode *inode)
429{ 434{
430 struct nilfs_sb_info *sbi = NILFS_SB(sb); 435 struct the_nilfs *nilfs = sb->s_fs_info;
431 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
432 struct buffer_head *bh; 436 struct buffer_head *bh;
433 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
434 int err; 438 int err;
435 439
436 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 440 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
437 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); 441 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
438 if (unlikely(err)) 442 if (unlikely(err))
439 goto bad_inode; 443 goto bad_inode;
440 444
441 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 445 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
442 446
443 err = nilfs_read_inode_common(inode, raw_inode); 447 err = nilfs_read_inode_common(inode, raw_inode);
444 if (err) 448 if (err)
@@ -461,33 +465,110 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
461 inode, inode->i_mode, 465 inode, inode->i_mode,
462 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 466 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
463 } 467 }
464 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 468 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
465 brelse(bh); 469 brelse(bh);
466 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 470 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
467 nilfs_set_inode_flags(inode); 471 nilfs_set_inode_flags(inode);
468 return 0; 472 return 0;
469 473
470 failed_unmap: 474 failed_unmap:
471 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 475 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
472 brelse(bh); 476 brelse(bh);
473 477
474 bad_inode: 478 bad_inode:
475 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 479 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
476 return err; 480 return err;
477} 481}
478 482
479struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) 483static int nilfs_iget_test(struct inode *inode, void *opaque)
484{
485 struct nilfs_iget_args *args = opaque;
486 struct nilfs_inode_info *ii;
487
488 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
489 return 0;
490
491 ii = NILFS_I(inode);
492 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
493 return !args->for_gc;
494
495 return args->for_gc && args->cno == ii->i_cno;
496}
497
498static int nilfs_iget_set(struct inode *inode, void *opaque)
499{
500 struct nilfs_iget_args *args = opaque;
501
502 inode->i_ino = args->ino;
503 if (args->for_gc) {
504 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
505 NILFS_I(inode)->i_cno = args->cno;
506 NILFS_I(inode)->i_root = NULL;
507 } else {
508 if (args->root && args->ino == NILFS_ROOT_INO)
509 nilfs_get_root(args->root);
510 NILFS_I(inode)->i_root = args->root;
511 }
512 return 0;
513}
514
515struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
516 unsigned long ino)
517{
518 struct nilfs_iget_args args = {
519 .ino = ino, .root = root, .cno = 0, .for_gc = 0
520 };
521
522 return ilookup5(sb, ino, nilfs_iget_test, &args);
523}
524
525struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
526 unsigned long ino)
527{
528 struct nilfs_iget_args args = {
529 .ino = ino, .root = root, .cno = 0, .for_gc = 0
530 };
531
532 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
533}
534
535struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
536 unsigned long ino)
480{ 537{
481 struct inode *inode; 538 struct inode *inode;
482 int err; 539 int err;
483 540
484 inode = iget_locked(sb, ino); 541 inode = nilfs_iget_locked(sb, root, ino);
485 if (unlikely(!inode)) 542 if (unlikely(!inode))
486 return ERR_PTR(-ENOMEM); 543 return ERR_PTR(-ENOMEM);
487 if (!(inode->i_state & I_NEW)) 544 if (!(inode->i_state & I_NEW))
488 return inode; 545 return inode;
489 546
490 err = __nilfs_read_inode(sb, ino, inode); 547 err = __nilfs_read_inode(sb, root, ino, inode);
548 if (unlikely(err)) {
549 iget_failed(inode);
550 return ERR_PTR(err);
551 }
552 unlock_new_inode(inode);
553 return inode;
554}
555
556struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
557 __u64 cno)
558{
559 struct nilfs_iget_args args = {
560 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
561 };
562 struct inode *inode;
563 int err;
564
565 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
566 if (unlikely(!inode))
567 return ERR_PTR(-ENOMEM);
568 if (!(inode->i_state & I_NEW))
569 return inode;
570
571 err = nilfs_init_gcinode(inode);
491 if (unlikely(err)) { 572 if (unlikely(err)) {
492 iget_failed(inode); 573 iget_failed(inode);
493 return ERR_PTR(err); 574 return ERR_PTR(err);
@@ -515,6 +596,16 @@ void nilfs_write_inode_common(struct inode *inode,
515 raw_inode->i_flags = cpu_to_le32(ii->i_flags); 596 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
516 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 597 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
517 598
599 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
600 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
601
602 /* zero-fill unused portion in the case of super root block */
603 raw_inode->i_xattr = 0;
604 raw_inode->i_pad = 0;
605 memset((void *)raw_inode + sizeof(*raw_inode), 0,
606 nilfs->ns_inode_size - sizeof(*raw_inode));
607 }
608
518 if (has_bmap) 609 if (has_bmap)
519 nilfs_bmap_write(ii->i_bmap, raw_inode); 610 nilfs_bmap_write(ii->i_bmap, raw_inode);
520 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 611 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -528,21 +619,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
528{ 619{
529 ino_t ino = inode->i_ino; 620 ino_t ino = inode->i_ino;
530 struct nilfs_inode_info *ii = NILFS_I(inode); 621 struct nilfs_inode_info *ii = NILFS_I(inode);
531 struct super_block *sb = inode->i_sb; 622 struct inode *ifile = ii->i_root->ifile;
532 struct nilfs_sb_info *sbi = NILFS_SB(sb);
533 struct nilfs_inode *raw_inode; 623 struct nilfs_inode *raw_inode;
534 624
535 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); 625 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
536 626
537 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 627 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
538 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); 628 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
539 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 629 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
540 630
541 nilfs_write_inode_common(inode, raw_inode, 0); 631 nilfs_write_inode_common(inode, raw_inode, 0);
542 /* XXX: call with has_bmap = 0 is a workaround to avoid 632 /* XXX: call with has_bmap = 0 is a workaround to avoid
543 deadlock of bmap. This delays update of i_bmap to just 633 deadlock of bmap. This delays update of i_bmap to just
544 before writing */ 634 before writing */
545 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); 635 nilfs_ifile_unmap_inode(ifile, ino, ibh);
546} 636}
547 637
548#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 638#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
@@ -555,7 +645,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
555 645
556 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 646 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
557 return; 647 return;
558 repeat: 648repeat:
559 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 649 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
560 if (ret == -ENOENT) 650 if (ret == -ENOENT)
561 return; 651 return;
@@ -572,14 +662,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
572 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 662 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
573 goto repeat; 663 goto repeat;
574 664
575 failed: 665failed:
576 if (ret == -EINVAL) 666 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 nilfs_error(ii->vfs_inode.i_sb, __func__, 667 "failed to truncate bmap (ino=%lu, err=%d)",
578 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 668 ii->vfs_inode.i_ino, ret);
579 else
580 nilfs_warning(ii->vfs_inode.i_sb, __func__,
581 "failed to truncate bmap (ino=%lu, err=%d)",
582 ii->vfs_inode.i_ino, ret);
583} 669}
584 670
585void nilfs_truncate(struct inode *inode) 671void nilfs_truncate(struct inode *inode)
@@ -608,7 +694,7 @@ void nilfs_truncate(struct inode *inode)
608 nilfs_set_transaction_flag(NILFS_TI_SYNC); 694 nilfs_set_transaction_flag(NILFS_TI_SYNC);
609 695
610 nilfs_mark_inode_dirty(inode); 696 nilfs_mark_inode_dirty(inode);
611 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 697 nilfs_set_file_dirty(inode, 0);
612 nilfs_transaction_commit(sb); 698 nilfs_transaction_commit(sb);
613 /* May construct a logical segment and may fail in sync mode. 699 /* May construct a logical segment and may fail in sync mode.
614 But truncate has no return value. */ 700 But truncate has no return value. */
@@ -617,6 +703,7 @@ void nilfs_truncate(struct inode *inode)
617static void nilfs_clear_inode(struct inode *inode) 703static void nilfs_clear_inode(struct inode *inode)
618{ 704{
619 struct nilfs_inode_info *ii = NILFS_I(inode); 705 struct nilfs_inode_info *ii = NILFS_I(inode);
706 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
620 707
621 /* 708 /*
622 * Free resources allocated in nilfs_read_inode(), here. 709 * Free resources allocated in nilfs_read_inode(), here.
@@ -625,10 +712,16 @@ static void nilfs_clear_inode(struct inode *inode)
625 brelse(ii->i_bh); 712 brelse(ii->i_bh);
626 ii->i_bh = NULL; 713 ii->i_bh = NULL;
627 714
715 if (mdi && mdi->mi_palloc_cache)
716 nilfs_palloc_destroy_cache(inode);
717
628 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 718 if (test_bit(NILFS_I_BMAP, &ii->i_state))
629 nilfs_bmap_clear(ii->i_bmap); 719 nilfs_bmap_clear(ii->i_bmap);
630 720
631 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 721 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
722
723 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
724 nilfs_put_root(ii->i_root);
632} 725}
633 726
634void nilfs_evict_inode(struct inode *inode) 727void nilfs_evict_inode(struct inode *inode)
@@ -636,8 +729,9 @@ void nilfs_evict_inode(struct inode *inode)
636 struct nilfs_transaction_info ti; 729 struct nilfs_transaction_info ti;
637 struct super_block *sb = inode->i_sb; 730 struct super_block *sb = inode->i_sb;
638 struct nilfs_inode_info *ii = NILFS_I(inode); 731 struct nilfs_inode_info *ii = NILFS_I(inode);
732 int ret;
639 733
640 if (inode->i_nlink || unlikely(is_bad_inode(inode))) { 734 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
641 if (inode->i_data.nrpages) 735 if (inode->i_data.nrpages)
642 truncate_inode_pages(&inode->i_data, 0); 736 truncate_inode_pages(&inode->i_data, 0);
643 end_writeback(inode); 737 end_writeback(inode);
@@ -649,12 +743,17 @@ void nilfs_evict_inode(struct inode *inode)
649 if (inode->i_data.nrpages) 743 if (inode->i_data.nrpages)
650 truncate_inode_pages(&inode->i_data, 0); 744 truncate_inode_pages(&inode->i_data, 0);
651 745
746 /* TODO: some of the following operations may fail. */
652 nilfs_truncate_bmap(ii, 0); 747 nilfs_truncate_bmap(ii, 0);
653 nilfs_mark_inode_dirty(inode); 748 nilfs_mark_inode_dirty(inode);
654 end_writeback(inode); 749 end_writeback(inode);
750
751 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
752 if (!ret)
753 atomic_dec(&ii->i_root->inodes_count);
754
655 nilfs_clear_inode(inode); 755 nilfs_clear_inode(inode);
656 nilfs_free_inode(inode); 756
657 /* nilfs_free_inode() marks inode buffer dirty */
658 if (IS_SYNC(inode)) 757 if (IS_SYNC(inode))
659 nilfs_set_transaction_flag(NILFS_TI_SYNC); 758 nilfs_set_transaction_flag(NILFS_TI_SYNC);
660 nilfs_transaction_commit(sb); 759 nilfs_transaction_commit(sb);
@@ -700,20 +799,30 @@ out_err:
700 return err; 799 return err;
701} 800}
702 801
703int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 802int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
704 struct buffer_head **pbh)
705{ 803{
804 struct nilfs_root *root = NILFS_I(inode)->i_root;
805 if ((mask & MAY_WRITE) && root &&
806 root->cno != NILFS_CPTREE_CURRENT_CNO)
807 return -EROFS; /* snapshot is not writable */
808
809 return generic_permission(inode, mask, flags, NULL);
810}
811
812int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
813{
814 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
706 struct nilfs_inode_info *ii = NILFS_I(inode); 815 struct nilfs_inode_info *ii = NILFS_I(inode);
707 int err; 816 int err;
708 817
709 spin_lock(&sbi->s_inode_lock); 818 spin_lock(&nilfs->ns_inode_lock);
710 if (ii->i_bh == NULL) { 819 if (ii->i_bh == NULL) {
711 spin_unlock(&sbi->s_inode_lock); 820 spin_unlock(&nilfs->ns_inode_lock);
712 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 821 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
713 pbh); 822 inode->i_ino, pbh);
714 if (unlikely(err)) 823 if (unlikely(err))
715 return err; 824 return err;
716 spin_lock(&sbi->s_inode_lock); 825 spin_lock(&nilfs->ns_inode_lock);
717 if (ii->i_bh == NULL) 826 if (ii->i_bh == NULL)
718 ii->i_bh = *pbh; 827 ii->i_bh = *pbh;
719 else { 828 else {
@@ -724,36 +833,36 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
724 *pbh = ii->i_bh; 833 *pbh = ii->i_bh;
725 834
726 get_bh(*pbh); 835 get_bh(*pbh);
727 spin_unlock(&sbi->s_inode_lock); 836 spin_unlock(&nilfs->ns_inode_lock);
728 return 0; 837 return 0;
729} 838}
730 839
731int nilfs_inode_dirty(struct inode *inode) 840int nilfs_inode_dirty(struct inode *inode)
732{ 841{
733 struct nilfs_inode_info *ii = NILFS_I(inode); 842 struct nilfs_inode_info *ii = NILFS_I(inode);
734 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 843 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
735 int ret = 0; 844 int ret = 0;
736 845
737 if (!list_empty(&ii->i_dirty)) { 846 if (!list_empty(&ii->i_dirty)) {
738 spin_lock(&sbi->s_inode_lock); 847 spin_lock(&nilfs->ns_inode_lock);
739 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 848 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
740 test_bit(NILFS_I_BUSY, &ii->i_state); 849 test_bit(NILFS_I_BUSY, &ii->i_state);
741 spin_unlock(&sbi->s_inode_lock); 850 spin_unlock(&nilfs->ns_inode_lock);
742 } 851 }
743 return ret; 852 return ret;
744} 853}
745 854
746int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 855int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
747 unsigned nr_dirty)
748{ 856{
749 struct nilfs_inode_info *ii = NILFS_I(inode); 857 struct nilfs_inode_info *ii = NILFS_I(inode);
858 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
750 859
751 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 860 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
752 861
753 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 862 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
754 return 0; 863 return 0;
755 864
756 spin_lock(&sbi->s_inode_lock); 865 spin_lock(&nilfs->ns_inode_lock);
757 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 866 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
758 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 867 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
759 /* Because this routine may race with nilfs_dispose_list(), 868 /* Because this routine may race with nilfs_dispose_list(),
@@ -761,36 +870,34 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
761 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 870 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
762 /* This will happen when somebody is freeing 871 /* This will happen when somebody is freeing
763 this inode. */ 872 this inode. */
764 nilfs_warning(sbi->s_super, __func__, 873 nilfs_warning(inode->i_sb, __func__,
765 "cannot get inode (ino=%lu)\n", 874 "cannot get inode (ino=%lu)\n",
766 inode->i_ino); 875 inode->i_ino);
767 spin_unlock(&sbi->s_inode_lock); 876 spin_unlock(&nilfs->ns_inode_lock);
768 return -EINVAL; /* NILFS_I_DIRTY may remain for 877 return -EINVAL; /* NILFS_I_DIRTY may remain for
769 freeing inode */ 878 freeing inode */
770 } 879 }
771 list_del(&ii->i_dirty); 880 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
772 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
773 set_bit(NILFS_I_QUEUED, &ii->i_state); 881 set_bit(NILFS_I_QUEUED, &ii->i_state);
774 } 882 }
775 spin_unlock(&sbi->s_inode_lock); 883 spin_unlock(&nilfs->ns_inode_lock);
776 return 0; 884 return 0;
777} 885}
778 886
779int nilfs_mark_inode_dirty(struct inode *inode) 887int nilfs_mark_inode_dirty(struct inode *inode)
780{ 888{
781 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
782 struct buffer_head *ibh; 889 struct buffer_head *ibh;
783 int err; 890 int err;
784 891
785 err = nilfs_load_inode_block(sbi, inode, &ibh); 892 err = nilfs_load_inode_block(inode, &ibh);
786 if (unlikely(err)) { 893 if (unlikely(err)) {
787 nilfs_warning(inode->i_sb, __func__, 894 nilfs_warning(inode->i_sb, __func__,
788 "failed to reget inode block.\n"); 895 "failed to reget inode block.\n");
789 return err; 896 return err;
790 } 897 }
791 nilfs_update_inode(inode, ibh); 898 nilfs_update_inode(inode, ibh);
792 nilfs_mdt_mark_buffer_dirty(ibh); 899 mark_buffer_dirty(ibh);
793 nilfs_mdt_mark_dirty(sbi->s_ifile); 900 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
794 brelse(ibh); 901 brelse(ibh);
795 return 0; 902 return 0;
796} 903}
@@ -805,9 +912,10 @@ int nilfs_mark_inode_dirty(struct inode *inode)
805 * construction. This function can be called both as a single operation 912 * construction. This function can be called both as a single operation
806 * and as a part of indivisible file operations. 913 * and as a part of indivisible file operations.
807 */ 914 */
808void nilfs_dirty_inode(struct inode *inode) 915void nilfs_dirty_inode(struct inode *inode, int flags)
809{ 916{
810 struct nilfs_transaction_info ti; 917 struct nilfs_transaction_info ti;
918 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
811 919
812 if (is_bad_inode(inode)) { 920 if (is_bad_inode(inode)) {
813 nilfs_warning(inode->i_sb, __func__, 921 nilfs_warning(inode->i_sb, __func__,
@@ -815,7 +923,142 @@ void nilfs_dirty_inode(struct inode *inode)
815 dump_stack(); 923 dump_stack();
816 return; 924 return;
817 } 925 }
926 if (mdi) {
927 nilfs_mdt_mark_dirty(inode);
928 return;
929 }
818 nilfs_transaction_begin(inode->i_sb, &ti, 0); 930 nilfs_transaction_begin(inode->i_sb, &ti, 0);
819 nilfs_mark_inode_dirty(inode); 931 nilfs_mark_inode_dirty(inode);
820 nilfs_transaction_commit(inode->i_sb); /* never fails */ 932 nilfs_transaction_commit(inode->i_sb); /* never fails */
821} 933}
934
935int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
936 __u64 start, __u64 len)
937{
938 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
939 __u64 logical = 0, phys = 0, size = 0;
940 __u32 flags = 0;
941 loff_t isize;
942 sector_t blkoff, end_blkoff;
943 sector_t delalloc_blkoff;
944 unsigned long delalloc_blklen;
945 unsigned int blkbits = inode->i_blkbits;
946 int ret, n;
947
948 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
949 if (ret)
950 return ret;
951
952 mutex_lock(&inode->i_mutex);
953
954 isize = i_size_read(inode);
955
956 blkoff = start >> blkbits;
957 end_blkoff = (start + len - 1) >> blkbits;
958
959 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
960 &delalloc_blkoff);
961
962 do {
963 __u64 blkphy;
964 unsigned int maxblocks;
965
966 if (delalloc_blklen && blkoff == delalloc_blkoff) {
967 if (size) {
968 /* End of the current extent */
969 ret = fiemap_fill_next_extent(
970 fieinfo, logical, phys, size, flags);
971 if (ret)
972 break;
973 }
974 if (blkoff > end_blkoff)
975 break;
976
977 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
978 logical = blkoff << blkbits;
979 phys = 0;
980 size = delalloc_blklen << blkbits;
981
982 blkoff = delalloc_blkoff + delalloc_blklen;
983 delalloc_blklen = nilfs_find_uncommitted_extent(
984 inode, blkoff, &delalloc_blkoff);
985 continue;
986 }
987
988 /*
989 * Limit the number of blocks that we look up so as
990 * not to get into the next delayed allocation extent.
991 */
992 maxblocks = INT_MAX;
993 if (delalloc_blklen)
994 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
995 maxblocks);
996 blkphy = 0;
997
998 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
999 n = nilfs_bmap_lookup_contig(
1000 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
1001 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1002
1003 if (n < 0) {
1004 int past_eof;
1005
1006 if (unlikely(n != -ENOENT))
1007 break; /* error */
1008
1009 /* HOLE */
1010 blkoff++;
1011 past_eof = ((blkoff << blkbits) >= isize);
1012
1013 if (size) {
1014 /* End of the current extent */
1015
1016 if (past_eof)
1017 flags |= FIEMAP_EXTENT_LAST;
1018
1019 ret = fiemap_fill_next_extent(
1020 fieinfo, logical, phys, size, flags);
1021 if (ret)
1022 break;
1023 size = 0;
1024 }
1025 if (blkoff > end_blkoff || past_eof)
1026 break;
1027 } else {
1028 if (size) {
1029 if (phys && blkphy << blkbits == phys + size) {
1030 /* The current extent goes on */
1031 size += n << blkbits;
1032 } else {
1033 /* Terminate the current extent */
1034 ret = fiemap_fill_next_extent(
1035 fieinfo, logical, phys, size,
1036 flags);
1037 if (ret || blkoff > end_blkoff)
1038 break;
1039
1040 /* Start another extent */
1041 flags = FIEMAP_EXTENT_MERGED;
1042 logical = blkoff << blkbits;
1043 phys = blkphy << blkbits;
1044 size = n << blkbits;
1045 }
1046 } else {
1047 /* Start a new extent */
1048 flags = FIEMAP_EXTENT_MERGED;
1049 logical = blkoff << blkbits;
1050 phys = blkphy << blkbits;
1051 size = n << blkbits;
1052 }
1053 blkoff += n;
1054 }
1055 cond_resched();
1056 } while (true);
1057
1058 /* If ret is 1 then we just hit the end of the extent array */
1059 if (ret == 1)
1060 ret = 0;
1061
1062 mutex_unlock(&inode->i_mutex);
1063 return ret;
1064}
                                
                       
 













                                  















                                                             
                                 
                                                                              








                                                                            
                                                                 
                                                           
                                                      

                                                                
                                              







                                                                        



                                                                            
 
                                                  

 
                                                                   
 





                                        

 


                                                             
 
                                               



                              
                                              
                                     

                                               





                                                                  









                                                                
                                                  


                                                                        

                                                                       




                                                                                       



                                                                           





                                                                                
                                                              
                                                      
                                                                                  




                                                


                                                
                                         







                                                                               






                                                                            
                                        
                  





                                                





                                                                       



                                         
                                
                         
         
                  
 

                     

 
                                                                    
                                                         
                                    
 

                                                
                                                                 

 
                                


                                                                       
                                       





                                                               
                                                               




                              
                                                    
 

                                                                      


                                                              




                                                     
                                                   




                                       
                                     
 


                                   
                       
 
                                          






                                                   


                                                                   



                                           







                                                                    


                                      

                                   







                                                                  
                                     

                            




                                          

                                                               
                                
                                
         
                                  

                              
                                                                


                                    

                                          

         
                           





                                                                  
           
                              

                                   








                                                                  
                          











                                                                          
                                                          
 
                                                                                
                                 
                    








                                                                                
                                                     

                                        
 



                                                                 
 
                                                                                                     
                   

 
                                                              
 
                                                                                

                               
                                                     







                                                                                           
                                                                                





































                                                                         
                                                                 









                                                      

                                                              



























































































                                                                            

                                                                               
                                                     
 

                                                                  


                                                                        
                                                                                


                                                            

                                            



                               

                                                                       
 



                                                                       
 

                                          



                                                               
                                         

                                                                
                                                                        
         

                                    

 
                                                                         






                                                          

                                                                          




                                                         

                                                               







                                                                


                                                                               
 
                                                 
                            

                
                   

                                                
                                 
                 
                                                 
                                        

         
                                 
                                             

                         


                                                           

                                    
                                                                  



                                                            
                                       


                      
                                       

                     
                               
                            


                          

                                                                 
 
                



                                                                         
 
                                                         

                              
         
 
                                                                              

                              
         
 
                                                                




                               

 

                                                                               



                              
                            
                      
                                                                
 



                                                                            
 


                                           
                                            


                                 

                                                                              

                                        
                               
         
                               


                                                                  
                                                  

                                                
                                         
                         
                                                              
                        

                                                                          
                                                                       
                 
                

                                        
                                 
                 
                                          
                                                      
         





                                      


                             

                     
                               
          
                          

                   
                                                                  




                      
                                                       




                                   
                                                                




                                     
                                                                      
                                        
                                                                              











                                      
                             
                                             

                             
                                                            
                                       



                     
                                                                  
















                                                                              
                              














                                                                    
                  

















                                                                              
                  


                                    


                                                                           






                                        

                                         

                
                            



                                                                        
         
 


                                                              
                                                                          
 
                           

                              
                         
         
 
                                             

                                                              
                              
         
                               
                                               
 

                                                      
                              
         





















                                                                   



























                                                                   


                                                                        
 





                                       

                                         
 
                            



                                                                        
         
 
                                                                   
 
                           

                              
                         
         
 
                                             

                                                              
                              
         
                               
                                               
 

                                                     
                              
         











                                                        




































                                                                            

                                                               








                                       
                             


                                                                 


                                                                      
 


                                     




                                                                         
                                                                  


                                                                

                                   

                                                                      

                                              
                                         
                         


                                                                           
                                              







                                                              

                                             
                                         
                         







                                                           
                                                     
                                                                        
                                     
                                           







                                            
                           

                             
                         
         
 
                                             

                                                              
                              
         




                                     
                                                               


                                                                       
                                                


















                                                                            

                                                                 













                                                               


                                                        













                                                                              
                           

                             
                         
         
 
                                             

                                                              
                              
         






                                                      
                                                    
                                         



                                                     
                                           
















                                                                        
                                                                    




                                
                                                              


                                   
                                 

  
                                                         

                                              
                                          
                                



                                                 
                              


                                     

























                                                          





                                                                     
                                                               


                                        
                                                  
                                                     




                                                     


                                                                            

                                                    


                                    






                                                         
                                                



                           
/*
 * POSIX message queues filesystem for Linux.
 *
 * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
 *                          Michal Wronski          (michal.wronski@gmail.com)
 *
 * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
 * Lockless receive & send, fd based notify:
 * 			    Manfred Spraul	    (manfred@colorfullife.com)
 *
 * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
 *
 * This file is released under the GPL.
 */

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/sysctl.h>
#include <linux/poll.h>
#include <linux/mqueue.h>
#include <linux/msg.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/signal.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
#include <linux/pid.h>
#include <linux/ipc_namespace.h>
#include <linux/slab.h>

#include <net/sock.h>
#include "util.h"

#define MQUEUE_MAGIC	0x19800202
#define DIRENT_SIZE	20
#define FILENT_SIZE	80

#define SEND		0
#define RECV		1

#define STATE_NONE	0
#define STATE_PENDING	1
#define STATE_READY	2

/*
 * One entry on a queue's list of sleeping tasks.  wq_add() links entries
 * into mqueue_inode_info.e_wait_q[sr].list and records the sleeping task.
 */
struct ext_wait_queue {		/* queue of sleeping tasks */
	struct task_struct *task;	/* the sleeper; wq_add() sets this to current */
	struct list_head list;	/* link in the e_wait_q[] list */
	struct msg_msg *msg;	/* ptr of loaded message */
	int state;		/* one of STATE_* values */
};

/*
 * Per-queue state.  The VFS inode is embedded in this structure so that
 * MQUEUE_I() can map an inode back to it with container_of().
 */
struct mqueue_inode_info {
	/*
	 * Held by the file operations and by mqueue_evict_inode() while they
	 * look at or change the message table, attributes and notification
	 * state below.
	 */
	spinlock_t lock;
	struct inode vfs_inode;	/* embedded VFS inode, see MQUEUE_I() */
	wait_queue_head_t wait_q;	/* passed to poll_wait() by mqueue_poll_file() */

	/* table of mq_maxmsg message pointers, kmalloc'ed in mqueue_get_inode() */
	struct msg_msg **messages;
	/* mq_maxmsg / mq_msgsize limits and the mq_curmsgs message count */
	struct mq_attr attr;

	/* sigev_notify and sigev_signo are reported by mqueue_read_file() */
	struct sigevent notify;
	/* initialised to NULL; readers treat NULL as "no notification" */
	struct pid* notify_owner;
	struct user_struct *user;	/* user who created, for accounting */
	struct sock *notify_sock;
	struct sk_buff *notify_cookie;

	/* for tasks waiting for free space and messages, respectively */
	struct ext_wait_queue e_wait_q[2];

	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
};

static const struct inode_operations mqueue_dir_inode_operations;
static const struct file_operations mqueue_file_operations;
static const struct super_operations mqueue_super_ops;
static void remove_notification(struct mqueue_inode_info *info);

static struct kmem_cache *mqueue_inode_cachep;

static struct ctl_table_header * mq_sysctl_table;

/* Map a VFS inode back to the mqueue_inode_info structure that embeds it. */
static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
{
	struct mqueue_inode_info *info;

	info = container_of(inode, struct mqueue_inode_info, vfs_inode);
	return info;
}

/*
 * Pass the ipc namespace pointer kept in the inode's superblock
 * (sb->s_fs_info) through get_ipc_ns() and return the result.
 *
 * This routine should be called with the mq_lock held.
 */
static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
{
	void *ns = inode->i_sb->s_fs_info;

	return get_ipc_ns(ns);
}

/*
 * Locked variant of __get_ns_from_inode(): takes mq_lock around the lookup
 * so callers do not have to hold it themselves.
 */
static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
{
	struct ipc_namespace *ipc_ns;

	spin_lock(&mq_lock);
	ipc_ns = __get_ns_from_inode(inode);
	spin_unlock(&mq_lock);

	return ipc_ns;
}

static struct inode *mqueue_get_inode(struct super_block *sb,
		struct ipc_namespace *ipc_ns, int mode,
		struct mq_attr *attr)
{
	struct user_struct *u = current_user();
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = mode;
		inode->i_uid = current_fsuid();
		inode->i_gid = current_fsgid();
		inode->i_mtime = inode->i_ctime = inode->i_atime =
				CURRENT_TIME;

		if (S_ISREG(mode)) {
			struct mqueue_inode_info *info;
			struct task_struct *p = current;
			unsigned long mq_bytes, mq_msg_tblsz;

			inode->i_fop = &mqueue_file_operations;
			inode->i_size = FILENT_SIZE;
			/* mqueue specific info */
			info = MQUEUE_I(inode);
			spin_lock_init(&info->lock);
			init_waitqueue_head(&info->wait_q);
			INIT_LIST_HEAD(&info->e_wait_q[0].list);
			INIT_LIST_HEAD(&info->e_wait_q[1].list);
			info->notify_owner = NULL;
			info->qsize = 0;
			info->user = NULL;	/* set when all is ok */
			memset(&info->attr, 0, sizeof(info->attr));
			info->attr.mq_maxmsg = ipc_ns->mq_msg_max;
			info->attr.mq_msgsize = ipc_ns->mq_msgsize_max;
			if (attr) {
				info->attr.mq_maxmsg = attr->mq_maxmsg;
				info->attr.mq_msgsize = attr->mq_msgsize;
			}
			mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
			info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
			if (!info->messages)
				goto out_inode;

			mq_bytes = (mq_msg_tblsz +
				(info->attr.mq_maxmsg * info->attr.mq_msgsize));

			spin_lock(&mq_lock);
			if (u->mq_bytes + mq_bytes < u->mq_bytes ||
		 	    u->mq_bytes + mq_bytes >
			    task_rlimit(p, RLIMIT_MSGQUEUE)) {
				spin_unlock(&mq_lock);
				/* mqueue_evict_inode() releases info->messages */
				goto out_inode;
			}
			u->mq_bytes += mq_bytes;
			spin_unlock(&mq_lock);

			/* all is ok */
			info->user = get_uid(u);
		} else if (S_ISDIR(mode)) {
			inc_nlink(inode);
			/* Some things misbehave if size == 0 on a directory */
			inode->i_size = 2 * DIRENT_SIZE;
			inode->i_op = &mqueue_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
		}
	}
	return inode;
out_inode:
	iput(inode);
	return NULL;
}

/*
 * Fill in a freshly allocated superblock: set block size, magic and
 * super operations, then create the root directory inode and its dentry.
 * @data is the ipc namespace handed to mqueue_get_inode().
 *
 * Returns 0 on success or -ENOMEM if the root inode or dentry cannot be
 * created.
 */
static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
{
	struct ipc_namespace *ns = data;
	struct inode *root_inode;

	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = MQUEUE_MAGIC;
	sb->s_op = &mqueue_super_ops;

	root_inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO,
				NULL);
	if (!root_inode)
		return -ENOMEM;

	sb->s_root = d_alloc_root(root_inode);
	if (!sb->s_root) {
		/* no dentry took ownership of the inode, so drop it here */
		iput(root_inode);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mount callback.  For a kernel-internal mount (MS_KERNMOUNT) the caller's
 * @data is passed on as the namespace; otherwise the current task's ipc
 * namespace is used instead.
 */
static struct dentry *mqueue_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name,
			 void *data)
{
	void *ns = data;

	if (!(flags & MS_KERNMOUNT))
		ns = current->nsproxy->ipc_ns;

	return mount_ns(fs_type, flags, ns, mqueue_fill_super);
}

/*
 * One-time initialisation of the VFS inode embedded in a
 * mqueue_inode_info object; @foo points at that object.
 */
static void init_once(void *foo)
{
	struct mqueue_inode_info *info = foo;

	inode_init_once(&info->vfs_inode);
}

/*
 * Allocate a mqueue_inode_info from mqueue_inode_cachep and hand back its
 * embedded VFS inode, or NULL if the allocation fails.
 */
static struct inode *mqueue_alloc_inode(struct super_block *sb)
{
	struct mqueue_inode_info *info;

	info = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);

	return info ? &info->vfs_inode : NULL;
}

/*
 * RCU callback queued by mqueue_destroy_inode(): recover the inode from
 * its rcu head, reinitialise its dentry list and return the containing
 * mqueue_inode_info to the slab cache.
 */
static void mqueue_i_callback(struct rcu_head *head)
{
	struct inode *inode;

	inode = container_of(head, struct inode, i_rcu);
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
}

/*
 * Free an inode via call_rcu(): the memory is released later by
 * mqueue_i_callback(), not here.
 */
static void mqueue_destroy_inode(struct inode *inode)
{
	struct rcu_head *head = &inode->i_rcu;

	call_rcu(head, mqueue_i_callback);
}

/*
 * Tear down an inode that is being evicted.  Directories need nothing
 * beyond end_writeback().  For a queue, every message still stored and the
 * message table itself are freed.  If the queue was charged to a user, the
 * charge is reversed, the namespace's queue count is decremented and the
 * user reference is dropped.
 */
static void mqueue_evict_inode(struct inode *inode)
{
	struct mqueue_inode_info *info;
	struct ipc_namespace *ipc_ns;
	struct user_struct *user;
	unsigned long mq_bytes;
	int i;

	end_writeback(inode);

	if (S_ISDIR(inode->i_mode))
		return;

	ipc_ns = get_ns_from_inode(inode);
	info = MQUEUE_I(inode);

	spin_lock(&info->lock);
	i = 0;
	while (i < info->attr.mq_curmsgs) {
		free_msg(info->messages[i]);
		i++;
	}
	kfree(info->messages);
	spin_unlock(&info->lock);

	user = info->user;
	if (user) {
		/* Total amount of bytes accounted for the mqueue */
		mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *)
		    + info->attr.mq_msgsize);

		spin_lock(&mq_lock);
		user->mq_bytes -= mq_bytes;
		/*
		 * get_ns_from_inode() ensures that the
		 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
		 * to which we now hold a reference, or it is NULL.
		 * We can't put it here under mq_lock, though.
		 */
		if (ipc_ns)
			ipc_ns->mq_queues_count--;
		spin_unlock(&mq_lock);

		free_uid(user);
	}

	if (ipc_ns)
		put_ipc_ns(ipc_ns);
}

/*
 * Create a new queue inode under @dir and attach it to @dentry.
 *
 * A slot is reserved in the namespace's queue count under mq_lock before
 * the inode is built, and released again if building the inode fails.
 * The attributes for the new queue come from dentry->d_fsdata.
 *
 * Returns 0 on success, -EACCES if the superblock has no ipc namespace,
 * -ENOSPC if the queue limit is reached and the caller lacks
 * CAP_SYS_RESOURCE, or -ENOMEM if mqueue_get_inode() fails.
 */
static int mqueue_create(struct inode *dir, struct dentry *dentry,
				int mode, struct nameidata *nd)
{
	struct mq_attr *attr = dentry->d_fsdata;
	struct ipc_namespace *ipc_ns;
	struct inode *inode;
	int error;

	spin_lock(&mq_lock);
	ipc_ns = __get_ns_from_inode(dir);
	if (!ipc_ns) {
		/* nothing to put: no namespace reference was obtained */
		spin_unlock(&mq_lock);
		return -EACCES;
	}
	if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
			!capable(CAP_SYS_RESOURCE)) {
		error = -ENOSPC;
		goto fail_unlock;
	}
	ipc_ns->mq_queues_count++;
	spin_unlock(&mq_lock);

	inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
	if (!inode) {
		error = -ENOMEM;
		/* give back the slot reserved above */
		spin_lock(&mq_lock);
		ipc_ns->mq_queues_count--;
		goto fail_unlock;
	}

	put_ipc_ns(ipc_ns);
	dir->i_size += DIRENT_SIZE;
	dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;

	d_instantiate(dentry, inode);
	dget(dentry);
	return 0;

fail_unlock:
	/* entered with mq_lock held and a namespace reference to drop */
	spin_unlock(&mq_lock);
	put_ipc_ns(ipc_ns);
	return error;
}

/*
 * Remove a queue's directory entry: shrink and re-stamp the directory,
 * drop one link from the queue inode and release a dentry reference
 * (pairs with the dget() in mqueue_create()).  Always returns 0.
 */
static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *victim = dentry->d_inode;

	dir->i_size -= DIRENT_SIZE;
	dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;

	drop_nlink(victim);
	dput(dentry);

	return 0;
}

/*
 * read() on a queue file.
 *
 * Reading does not perform any kind of mq_receive.  It only exposes the
 * queue size and the notification settings, i.e. the values a user may be
 * interested in that are not reachable through the standard routines.  The
 * text is formatted under info->lock and then copied out with
 * simple_read_from_buffer(); on a successful non-empty read the inode's
 * atime and ctime are updated.
 */
static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
				size_t count, loff_t *off)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct mqueue_inode_info *info = MQUEUE_I(inode);
	char buffer[FILENT_SIZE];
	int notify = 0;
	int signo = 0;
	ssize_t ret;

	spin_lock(&info->lock);
	/* Without a registered owner both notification fields read as 0 */
	if (info->notify_owner) {
		notify = info->notify.sigev_notify;
		if (notify == SIGEV_SIGNAL)
			signo = info->notify.sigev_signo;
	}
	snprintf(buffer, sizeof(buffer),
			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
			info->qsize,
			notify,
			signo,
			pid_vnr(info->notify_owner));
	spin_unlock(&info->lock);
	buffer[sizeof(buffer)-1] = '\0';

	ret = simple_read_from_buffer(u_data, count, off, buffer,
				strlen(buffer));
	if (ret > 0)
		inode->i_atime = inode->i_ctime = CURRENT_TIME;

	return ret;
}

/*
 * ->flush() handler: if the calling thread group is the one registered for
 * notification on this queue, drop that registration.  Always returns 0.
 */
static int mqueue_flush_file(struct file *filp, fl_owner_t id)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct mqueue_inode_info *info = MQUEUE_I(inode);

	spin_lock(&info->lock);
	if (info->notify_owner == task_tgid(current))
		remove_notification(info);
	spin_unlock(&info->lock);

	return 0;
}

/*
 * ->poll() handler: the queue is readable while it holds at least one
 * message and writable while it holds fewer than mq_maxmsg messages.
 */
static unsigned int mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct mqueue_inode_info *info = MQUEUE_I(inode);
	int mask = 0;

	poll_wait(filp, &info->wait_q, poll_tab);

	/* Sample both conditions under the queue lock. */
	spin_lock(&info->lock);
	if (info->attr.mq_curmsgs != 0)
		mask |= POLLIN | POLLRDNORM;
	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
		mask |= POLLOUT | POLLWRNORM;
	spin_unlock(&info->lock);

	return mask;
}

/*
 * Queue the current task on info->e_wait_q[sr] (sr is SEND or RECV).
 *
 * The entry is linked immediately before the first waiter whose static_prio
 * is less than or equal to current's; when no such waiter exists it goes to
 * the tail of the list.
 */
static void wq_add(struct mqueue_inode_info *info, int sr,
			struct ext_wait_queue *ewp)
{
	struct list_head *head = &info->e_wait_q[sr].list;
	struct list_head *before = head;
	struct ext_wait_queue *cur;

	ewp->task = current;

	list_for_each_entry(cur, head, list) {
		if (cur->task->static_prio <= current->static_prio) {
			before = &cur->list;
			break;
		}
	}

	/* list_add_tail() links the new entry just ahead of 'before'. */
	list_add_tail(&ewp->list, before);
}

/*
 * Puts current task to sleep. Caller must hold queue lock. After return
 * lock isn't held.
 * sr: SEND or RECV
 *
 * timeout is an absolute CLOCK_REALTIME expiry, or NULL.
 *
 * Returns 0 once another task has moved ewp->state to STATE_READY,
 * -ERESTARTSYS if a signal is pending, or -ETIMEDOUT when the timed
 * sleep reports expiry (returns 0).
 */
static int wq_sleep(struct mqueue_inode_info *info, int sr,
		    ktime_t *timeout, struct ext_wait_queue *ewp)
{
	int retval;
	signed long time;

	wq_add(info, sr, ewp);

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);

		spin_unlock(&info->lock);
		/*
		 * Argument order is (expires, delta, mode, clock).  The slack
		 * (0) and the mode used to be passed the other way round,
		 * which only worked because HRTIMER_MODE_ABS happens to be 0.
		 */
		time = schedule_hrtimeout_range_clock(timeout, 0,
		    HRTIMER_MODE_ABS, CLOCK_REALTIME);

		/*
		 * A waker sets STATE_PENDING, wakes us and only then stores
		 * STATE_READY; spin until it has finished with ewp.
		 */
		while (ewp->state == STATE_PENDING)
			cpu_relax();

		/* Fast path: handed over without retaking the queue lock. */
		if (ewp->state == STATE_READY) {
			retval = 0;
			goto out;
		}
		spin_lock(&info->lock);
		/* Re-check under the lock before deciding to give up. */
		if (ewp->state == STATE_READY) {
			retval = 0;
			goto out_unlock;
		}
		if (signal_pending(current)) {
			retval = -ERESTARTSYS;
			break;
		}
		if (time == 0) {
			retval = -ETIMEDOUT;
			break;
		}
	}
	/* Giving up: we are still on the wait list, so unlink ourselves. */
	list_del(&ewp->list);
out_unlock:
	spin_unlock(&info->lock);
out:
	return retval;
}

/*
 * Returns waiting task that should be serviced first or NULL if none exists
 */
static struct ext_wait_queue *wq_get_first_waiter(
		struct mqueue_inode_info *info, int sr)
{
	struct list_head *ptr;

	ptr = info->e_wait_q[sr].list.prev;
	if (ptr == &info->e_wait_q[sr].list)
		return NULL;
	return list_entry(ptr, struct ext_wait_queue, list);
}

/* Auxiliary functions to manipulate messages' list */
/*
 * Insert ptr into info->messages[], which is kept sorted by ascending
 * m_type.  Entries whose m_type is greater than or equal to the new one are
 * shifted up by one slot, so the new message lands below existing messages
 * of the same m_type.  Also accounts the message in mq_curmsgs and qsize.
 */
static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
{
	int slot;

	for (slot = info->attr.mq_curmsgs;
	     slot > 0 && info->messages[slot - 1]->m_type >= ptr->m_type;
	     slot--)
		info->messages[slot] = info->messages[slot - 1];

	info->messages[slot] = ptr;
	info->attr.mq_curmsgs++;
	info->qsize += ptr->m_ts;
}

static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
{
	info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts;
	return info->messages[info->attr.mq_curmsgs];
}

/* Store code in the last byte of the NOTIFY_COOKIE_LEN-byte cookie in skb. */
static inline void set_cookie(struct sk_buff *skb, char code)
{
	char *cookie = (char *)skb->data;

	cookie[NOTIFY_COOKIE_LEN - 1] = code;
}

/*
 * Deliver the registered notification, if any, after a message has been
 * queued.  Split out only to keep sys_mq_timedsend short.
 *
 * The registration is one-shot: once delivered, the owner pid reference is
 * dropped and notify_owner is cleared.  Tasks sleeping on info->wait_q are
 * woken regardless of whether a notification was sent.
 */
static void __do_notify(struct mqueue_inode_info *info)
{
	/* notification
	 * invoked when there is registered process and there isn't process
	 * waiting synchronously for message AND state of queue changed from
	 * empty to not empty. Here we are sure that no one is waiting
	 * synchronously. */
	if (info->notify_owner &&
	    info->attr.mq_curmsgs == 1) {
		struct siginfo sig_i;
		switch (info->notify.sigev_notify) {
		case SIGEV_NONE:
			break;
		case SIGEV_SIGNAL:
			/* sends signal */

			/*
			 * Only a handful of fields are filled in below, and
			 * the structure is handed on by pointer.  Zero it
			 * first so the remaining bytes are not indeterminate
			 * stack contents.
			 */
			memset(&sig_i, 0, sizeof(sig_i));
			sig_i.si_signo = info->notify.sigev_signo;
			sig_i.si_errno = 0;
			sig_i.si_code = SI_MESGQ;
			sig_i.si_value = info->notify.sigev_value;
			/* Sender's tgid as seen from the owner's pid namespace. */
			sig_i.si_pid = task_tgid_nr_ns(current,
						ns_of_pid(info->notify_owner));
			sig_i.si_uid = current_uid();

			kill_pid_info(info->notify.sigev_signo,
				      &sig_i, info->notify_owner);
			break;
		case SIGEV_THREAD:
			/* Mark the cookie as "woken up" and queue it on the socket. */
			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
			netlink_sendskb(info->notify_sock, info->notify_cookie);
			break;
		}
		/* after notification unregisters process */
		put_pid(info->notify_owner);
		info->notify_owner = NULL;
	}
	wake_up(&info->wait_q);
}

/*
 * Copy an absolute timeout from user space into *ts and convert it to a
 * ktime_t in *expires.
 *
 * Returns 0 on success, -EFAULT if the copy fails, or -EINVAL if the
 * timespec does not pass timespec_valid().  *expires is written only on
 * success.
 */
static int prepare_timeout(const struct timespec __user *u_abs_timeout,
			   ktime_t *expires, struct timespec *ts)
{
	int err = 0;

	if (copy_from_user(ts, u_abs_timeout, sizeof(*ts)))
		err = -EFAULT;
	else if (!timespec_valid(ts))
		err = -EINVAL;
	else
		*expires = timespec_to_ktime(*ts);

	return err;
}

/*
 * Drop the current notification registration on a queue.
 *
 * For a SIGEV_THREAD registration the cookie is first marked NOTIFY_REMOVED
 * and sent to the notification socket.  In every case the reference on the
 * owner pid is released and notify_owner is cleared.
 */
static void remove_notification(struct mqueue_inode_info *info)
{
	if (info->notify_owner) {
		if (info->notify.sigev_notify == SIGEV_THREAD) {
			set_cookie(info->notify_cookie, NOTIFY_REMOVED);
			netlink_sendskb(info->notify_sock,
					info->notify_cookie);
		}
	}

	put_pid(info->notify_owner);
	info->notify_owner = NULL;
}

/*
 * Validate user-supplied queue attributes.
 *
 * Both mq_maxmsg and mq_msgsize must be positive.  A caller with
 * CAP_SYS_RESOURCE is limited only by HARD_MSGMAX on mq_maxmsg; anyone else
 * is held to the namespace's mq_msg_max and mq_msgsize_max.  Finally the
 * storage size computations are checked for overflow.
 *
 * Returns 1 if the attributes are acceptable, 0 otherwise.
 */
static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
{
	unsigned long payload_bytes;
	unsigned long total_bytes;

	if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
		return 0;

	if (capable(CAP_SYS_RESOURCE)) {
		if (attr->mq_maxmsg > HARD_MSGMAX)
			return 0;
	} else if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
		   attr->mq_msgsize > ipc_ns->mq_msgsize_max) {
		return 0;
	}

	/* check for overflow */
	if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
		return 0;

	/* Adding one message pointer per slot must not wrap the total. */
	total_bytes = (unsigned long)(attr->mq_maxmsg * (attr->mq_msgsize
			+ sizeof (struct msg_msg *)));
	payload_bytes = (unsigned long)(attr->mq_maxmsg * attr->mq_msgsize);
	if (total_bytes < payload_bytes)
		return 0;

	return 1;
}

/*
 * Invoked when creating a new queue via sys_mq_open
 *
 * Validates attr (when given), creates the queue file under dir and opens
 * it with oflag.  attr is passed to the create step through
 * dentry->d_fsdata and is owned by the caller, so the pointer is cleared
 * again on every path once it has been stored.
 *
 * Returns the opened file or an ERR_PTR().  On the error paths taken in
 * this function the references on dentry and on ipc_ns->mq_mnt are dropped
 * here.
 */
static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
			struct dentry *dentry, int oflag, mode_t mode,
			struct mq_attr *attr)
{
	const struct cred *cred = current_cred();
	struct file *result;
	int ret;

	if (attr) {
		if (!mq_attr_ok(ipc_ns, attr)) {
			ret = -EINVAL;
			goto out;
		}
		/* store for use during create */
		dentry->d_fsdata = attr;
	}

	mode &= ~current_umask();
	ret = mnt_want_write(ipc_ns->mq_mnt);
	if (ret) {
		/*
		 * Don't leave the caller's attr pointer behind in the
		 * dentry; the paths below clear it as well.
		 */
		dentry->d_fsdata = NULL;
		goto out;
	}
	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
	dentry->d_fsdata = NULL;
	if (ret)
		goto out_drop_write;

	result = dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
	/*
	 * dentry_open() took a persistent mnt_want_write(),
	 * so we can now drop this one.
	 */
	mnt_drop_write(ipc_ns->mq_mnt);
	return result;

out_drop_write:
	mnt_drop_write(ipc_ns->mq_mnt);
out:
	dput(dentry);
	mntput(ipc_ns->mq_mnt);
	return ERR_PTR(ret);
}

/*
 * Opens existing queue
 *
 * Rejects the access mode value O_RDWR | O_WRONLY with -EINVAL and maps any
 * inode_permission() failure to -EACCES.  On those two error paths the
 * references on dentry and ipc_ns->mq_mnt are dropped here; otherwise the
 * result of dentry_open() is returned as is.
 */
static struct file *do_open(struct ipc_namespace *ipc_ns,
				struct dentry *dentry, int oflag)
{
	/* Access mode bits -> permission mask for inode_permission(). */
	static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
						  MAY_READ | MAY_WRITE };
	const struct cred *cred = current_cred();
	const int accmode = oflag & O_ACCMODE;
	int ret;

	if (accmode == (O_RDWR | O_WRONLY))
		ret = -EINVAL;
	else if (inode_permission(dentry->d_inode, oflag2acc[accmode]))
		ret = -EACCES;
	else
		return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);

	dput(dentry);
	mntput(ipc_ns->mq_mnt);
	return ERR_PTR(ret);
}

/*
 * mq_open: open, and with O_CREAT possibly create, the queue named u_name
 * in the caller's IPC namespace.
 *
 * Returns a new file descriptor (allocated with O_CLOEXEC) on success or a
 * negative errno.  The lookup and the create/open step run with the i_mutex
 * of the mqueue mount's root directory held.
 */
SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
		struct mq_attr __user *, u_attr)
{
	struct dentry *dentry;
	struct file *filp;
	char *name;
	struct mq_attr attr;
	int fd, error;
	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;

	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
		return -EFAULT;

	audit_mq_open(oflag, mode, u_attr ? &attr : NULL);

	if (IS_ERR(name = getname(u_name)))
		return PTR_ERR(name);

	/* On failure fd already holds the negative errno that is returned. */
	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out_putname;

	mutex_lock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
	dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
	if (IS_ERR(dentry)) {
		error = PTR_ERR(dentry);
		goto out_putfd;
	}
	/* Paired with the mntput() at "out" or inside do_open()/do_create(). */
	mntget(ipc_ns->mq_mnt);

	if (oflag & O_CREAT) {
		if (dentry->d_inode) {	/* entry already exists */
			audit_inode(name, dentry);
			if (oflag & O_EXCL) {
				error = -EEXIST;
				goto out;
			}
			filp = do_open(ipc_ns, dentry, oflag);
		} else {
			filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root,
						dentry, oflag, mode,
						u_attr ? &attr : NULL);
		}
	} else {
		if (!dentry->d_inode) {
			error = -ENOENT;
			goto out;
		}
		audit_inode(name, dentry);
		filp = do_open(ipc_ns, dentry, oflag);
	}

	/*
	 * do_open() and do_create() drop the dentry and mount references
	 * themselves when they fail, so skip the "out" label here.
	 */
	if (IS_ERR(filp)) {
		error = PTR_ERR(filp);
		goto out_putfd;
	}

	fd_install(fd, filp);
	goto out_upsem;

out:
	dput(dentry);
	mntput(ipc_ns->mq_mnt);
out_putfd:
	put_unused_fd(fd);
	fd = error;
out_upsem:
	mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
out_putname:
	putname(name);
	return fd;
}

/*
 * mq_unlink: remove the queue named u_name from the caller's IPC namespace.
 *
 * An extra reference on the queue inode is held across vfs_unlink() and is
 * released only after the root directory's i_mutex has been dropped.
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
{
	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
	struct inode *inode = NULL;
	struct dentry *dentry;
	char *name;
	int err;

	name = getname(u_name);
	if (IS_ERR(name))
		return PTR_ERR(name);

	mutex_lock_nested(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex,
			I_MUTEX_PARENT);

	dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out_unlock;
	}

	/* A negative dentry means there is no such queue. */
	inode = dentry->d_inode;
	if (!inode) {
		err = -ENOENT;
		goto out_dput;
	}
	ihold(inode);

	err = mnt_want_write(ipc_ns->mq_mnt);
	if (!err) {
		err = vfs_unlink(dentry->d_parent->d_inode, dentry);
		mnt_drop_write(ipc_ns->mq_mnt);
	}

out_dput:
	dput(dentry);
out_unlock:
	mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
	putname(name);
	if (inode)
		iput(inode);

	return err;
}

/* Pipelined send and receive functions.
 *
 * If a receiver finds no waiting message, then it registers itself in the
 * list of waiting receivers. A sender checks that list before adding the new
 * message into the message array. If there is a waiting receiver, then it
 * bypasses the message array and directly hands the message over to the
 * receiver.
 * The receiver accepts the message and returns without grabbing the queue
 * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
 * are necessary. The same algorithm is used for sysv semaphores, see
 * ipc/sem.c for more details.
 *
 * The same algorithm is used for senders.
 */

/* pipelined_send() - send a message directly to the task waiting in
 * sys_mq_timedreceive() (without inserting message into a queue).
 *
 * The message pointer is stored in the receiver's wait entry and the entry
 * is unlinked from its wait list.  The info argument is not used here.
 */
static inline void pipelined_send(struct mqueue_inode_info *info,
				  struct msg_msg *message,
				  struct ext_wait_queue *receiver)
{
	receiver->msg = message;
	list_del(&receiver->list);
	/* wq_sleep() spins while the state is STATE_PENDING. */
	receiver->state = STATE_PENDING;
	wake_up_process(receiver->task);
	/* Write barrier ahead of the final STATE_READY store. */
	smp_wmb();
	receiver->state = STATE_READY;
}

/* pipelined_receive() - if there is a task waiting in sys_mq_timedsend(),
 * take its message and put it into the queue (we have one free place for
 * sure).  With no waiting sender, only tasks sleeping on info->wait_q are
 * woken.
 */
static inline void pipelined_receive(struct mqueue_inode_info *info)
{
	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);

	if (!sender) {
		/* for poll */
		wake_up_interruptible(&info->wait_q);
		return;
	}
	msg_insert(sender->msg, info);
	list_del(&sender->list);
	/* wq_sleep() spins while the state is STATE_PENDING. */
	sender->state = STATE_PENDING;
	wake_up_process(sender->task);
	/* Write barrier ahead of the final STATE_READY store. */
	smp_wmb();
	sender->state = STATE_READY;
}

/*
 * mq_timedsend: queue a message of msg_len bytes with priority msg_prio on
 * the queue referred to by mqdes, optionally blocking until the absolute
 * timeout u_abs_timeout.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a priority of
 * MQ_PRIO_MAX or above, -EBADF for a bad, non-mqueue or non-writable
 * descriptor, -EMSGSIZE if msg_len exceeds the queue's mq_msgsize, -EAGAIN
 * when the queue is full and O_NONBLOCK is set, plus whatever
 * prepare_timeout(), load_msg() or wq_sleep() report.
 */
SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
		size_t, msg_len, unsigned int, msg_prio,
		const struct timespec __user *, u_abs_timeout)
{
	struct file *filp;
	struct inode *inode;
	struct ext_wait_queue wait;
	struct ext_wait_queue *receiver;
	struct msg_msg *msg_ptr;
	struct mqueue_inode_info *info;
	ktime_t expires, *timeout = NULL;
	struct timespec ts;
	int ret;

	if (u_abs_timeout) {
		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
		if (res)
			return res;
		timeout = &expires;
	}

	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
		return -EINVAL;

	audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);

	filp = fget(mqdes);
	if (unlikely(!filp)) {
		ret = -EBADF;
		goto out;
	}

	inode = filp->f_path.dentry->d_inode;
	/* Only descriptors that refer to an mqueue file are accepted. */
	if (unlikely(filp->f_op != &mqueue_file_operations)) {
		ret = -EBADF;
		goto out_fput;
	}
	info = MQUEUE_I(inode);
	audit_inode(NULL, filp->f_path.dentry);

	if (unlikely(!(filp->f_mode & FMODE_WRITE))) {
		ret = -EBADF;
		goto out_fput;
	}

	if (unlikely(msg_len > info->attr.mq_msgsize)) {
		ret = -EMSGSIZE;
		goto out_fput;
	}

	/* First try to allocate memory, before doing anything with
	 * existing queues. */
	msg_ptr = load_msg(u_msg_ptr, msg_len);
	if (IS_ERR(msg_ptr)) {
		ret = PTR_ERR(msg_ptr);
		goto out_fput;
	}
	msg_ptr->m_ts = msg_len;
	msg_ptr->m_type = msg_prio;

	spin_lock(&info->lock);

	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
		/* Queue is full: fail immediately or wait for a free slot. */
		if (filp->f_flags & O_NONBLOCK) {
			spin_unlock(&info->lock);
			ret = -EAGAIN;
		} else {
			wait.task = current;
			wait.msg = (void *) msg_ptr;
			wait.state = STATE_NONE;
			/* wq_sleep() returns with info->lock released. */
			ret = wq_sleep(info, SEND, timeout, &wait);
		}
		/* The message was not taken over by anyone: release it. */
		if (ret < 0)
			free_msg(msg_ptr);
	} else {
		receiver = wq_get_first_waiter(info, RECV);
		if (receiver) {
			pipelined_send(info, msg_ptr, receiver);
		} else {
			/* adds message to the queue */
			msg_insert(msg_ptr, info);
			__do_notify(info);
		}
		inode->i_atime = inode->i_mtime = inode->i_ctime =
				CURRENT_TIME;
		spin_unlock(&info->lock);
		ret = 0;
	}
out_fput:
	fput(filp);
out:
	return ret;
}

/*
 * mq_timedreceive: take one message from the queue referred to by mqdes and
 * copy it to u_msg_ptr, optionally blocking until the absolute timeout
 * u_abs_timeout.  When u_msg_prio is non-NULL the message's m_type is
 * stored there.
 *
 * Returns the message length on success or a negative errno: -EBADF for a
 * bad, non-mqueue or non-readable descriptor, -EMSGSIZE if msg_len is
 * smaller than the queue's mq_msgsize, -EAGAIN when the queue is empty and
 * O_NONBLOCK is set, -EFAULT if copying to user space fails, plus whatever
 * prepare_timeout() or wq_sleep() report.
 */
SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
		size_t, msg_len, unsigned int __user *, u_msg_prio,
		const struct timespec __user *, u_abs_timeout)
{
	ssize_t ret;
	struct msg_msg *msg_ptr;
	struct file *filp;
	struct inode *inode;
	struct mqueue_inode_info *info;
	struct ext_wait_queue wait;
	ktime_t expires, *timeout = NULL;
	struct timespec ts;

	if (u_abs_timeout) {
		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
		if (res)
			return res;
		timeout = &expires;
	}

	audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);

	filp = fget(mqdes);
	if (unlikely(!filp)) {
		ret = -EBADF;
		goto out;
	}

	inode = filp->f_path.dentry->d_inode;
	/* Only descriptors that refer to an mqueue file are accepted. */
	if (unlikely(filp->f_op != &mqueue_file_operations)) {
		ret = -EBADF;
		goto out_fput;
	}
	info = MQUEUE_I(inode);
	audit_inode(NULL, filp->f_path.dentry);

	if (unlikely(!(filp->f_mode & FMODE_READ))) {
		ret = -EBADF;
		goto out_fput;
	}

	/* checks if buffer is big enough */
	if (unlikely(msg_len < info->attr.mq_msgsize)) {
		ret = -EMSGSIZE;
		goto out_fput;
	}

	spin_lock(&info->lock);
	if (info->attr.mq_curmsgs == 0) {
		/* Queue is empty: fail immediately or wait for a sender. */
		if (filp->f_flags & O_NONBLOCK) {
			spin_unlock(&info->lock);
			ret = -EAGAIN;
		} else {
			wait.task = current;
			wait.state = STATE_NONE;
			/* wq_sleep() returns with info->lock released. */
			ret = wq_sleep(info, RECV, timeout, &wait);
			/* Only consumed below when ret == 0. */
			msg_ptr = wait.msg;
		}
	} else {
		msg_ptr = msg_get(info);

		inode->i_atime = inode->i_mtime = inode->i_ctime =
				CURRENT_TIME;

		/* There is now free space in queue. */
		pipelined_receive(info);
		spin_unlock(&info->lock);
		ret = 0;
	}
	if (ret == 0) {
		ret = msg_ptr->m_ts;

		if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
			ret = -EFAULT;
		}
		/* The message is released whether or not the copy worked. */
		free_msg(msg_ptr);
	}
out_fput:
	fput(filp);
out:
	return ret;
}

/*
 * mq_notify: register (u_notification != NULL) or remove
 * (u_notification == NULL) the caller's notification on the queue referred
 * to by mqdes.
 *
 * Note: a request to deregister from a task that is not the current owner
 * of the notification is silently ignored.  POSIX does not explicitly
 * define this case.
 *
 * For SIGEV_THREAD, sigev_signo is used as the file descriptor of the
 * netlink socket and sigev_value.sival_ptr points to the
 * NOTIFY_COOKIE_LEN-byte cookie in user space.
 *
 * Returns 0 on success or a negative errno (-EFAULT, -EINVAL, -ENOMEM,
 * -EBADF, -EBUSY if another registration exists, or an error from the
 * netlink helpers).
 */
SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
		const struct sigevent __user *, u_notification)
{
	int ret;
	struct file *filp;
	struct sock *sock;
	struct inode *inode;
	struct sigevent notification;
	struct mqueue_inode_info *info;
	struct sk_buff *nc;

	if (u_notification) {
		if (copy_from_user(&notification, u_notification,
					sizeof(struct sigevent)))
			return -EFAULT;
	}

	audit_mq_notify(mqdes, u_notification ? &notification : NULL);

	nc = NULL;
	sock = NULL;
	if (u_notification != NULL) {
		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
			     notification.sigev_notify != SIGEV_SIGNAL &&
			     notification.sigev_notify != SIGEV_THREAD))
			return -EINVAL;
		if (notification.sigev_notify == SIGEV_SIGNAL &&
			!valid_signal(notification.sigev_signo)) {
			return -EINVAL;
		}
		if (notification.sigev_notify == SIGEV_THREAD) {
			long timeo;

			/* create the notify skb */
			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
			if (!nc) {
				ret = -ENOMEM;
				goto out;
			}
			if (copy_from_user(nc->data,
					notification.sigev_value.sival_ptr,
					NOTIFY_COOKIE_LEN)) {
				ret = -EFAULT;
				goto out;
			}

			/* TODO: add a header? */
			skb_put(nc, NOTIFY_COOKIE_LEN);
			/* and attach it to the socket */
retry:
			filp = fget(notification.sigev_signo);
			if (!filp) {
				ret = -EBADF;
				goto out;
			}
			sock = netlink_getsockbyfilp(filp);
			fput(filp);
			if (IS_ERR(sock)) {
				ret = PTR_ERR(sock);
				sock = NULL;
				goto out;
			}

			timeo = MAX_SCHEDULE_TIMEOUT;
			ret = netlink_attachskb(sock, nc, &timeo, NULL);
			if (ret == 1) {
				/*
				 * A return of 1 asks for the lookup to be
				 * repeated; the socket reference is already
				 * gone.  Forget the pointer so that a failing
				 * fget() on the retry cannot reach
				 * netlink_detachskb() with a stale socket.
				 */
				sock = NULL;
				goto retry;
			}
			if (ret) {
				/* On error both socket and skb were released. */
				sock = NULL;
				nc = NULL;
				goto out;
			}
		}
	}

	filp = fget(mqdes);
	if (!filp) {
		ret = -EBADF;
		goto out;
	}

	inode = filp->f_path.dentry->d_inode;
	/* Only descriptors that refer to an mqueue file are accepted. */
	if (unlikely(filp->f_op != &mqueue_file_operations)) {
		ret = -EBADF;
		goto out_fput;
	}
	info = MQUEUE_I(inode);

	ret = 0;
	spin_lock(&info->lock);
	if (u_notification == NULL) {
		if (info->notify_owner == task_tgid(current)) {
			remove_notification(info);
			inode->i_atime = inode->i_ctime = CURRENT_TIME;
		}
	} else if (info->notify_owner != NULL) {
		ret = -EBUSY;
	} else {
		switch (notification.sigev_notify) {
		case SIGEV_NONE:
			info->notify.sigev_notify = SIGEV_NONE;
			break;
		case SIGEV_THREAD:
			/*
			 * Ownership of the socket and cookie moves to the
			 * queue; clear the locals so "out" leaves them alone.
			 */
			info->notify_sock = sock;
			info->notify_cookie = nc;
			sock = NULL;
			nc = NULL;
			info->notify.sigev_notify = SIGEV_THREAD;
			break;
		case SIGEV_SIGNAL:
			info->notify.sigev_signo = notification.sigev_signo;
			info->notify.sigev_value = notification.sigev_value;
			info->notify.sigev_notify = SIGEV_SIGNAL;
			break;
		}

		info->notify_owner = get_pid(task_tgid(current));
		inode->i_atime = inode->i_ctime = CURRENT_TIME;
	}
	spin_unlock(&info->lock);
out_fput:
	fput(filp);
out:
	/* Release whatever was prepared but not handed over to the queue. */
	if (sock) {
		netlink_detachskb(sock, nc);
	} else if (nc) {
		dev_kfree_skb(nc);
	}
	return ret;
}

/*
 * mq_getsetattr: optionally change the O_NONBLOCK flag of the open queue
 * description (u_mqstat != NULL) and optionally report the attributes as
 * they were before the change (u_omqstat != NULL).
 *
 * Only O_NONBLOCK may be set in the new mq_flags; anything else yields
 * -EINVAL.  Returns 0 on success, -EBADF for a bad or non-mqueue
 * descriptor, or -EFAULT if a user copy fails.
 */
SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
		const struct mq_attr __user *, u_mqstat,
		struct mq_attr __user *, u_omqstat)
{
	struct mq_attr new_attr, old_attr;
	struct mqueue_inode_info *info;
	struct inode *inode;
	struct file *filp;
	int ret;

	if (u_mqstat) {
		if (copy_from_user(&new_attr, u_mqstat, sizeof(new_attr)))
			return -EFAULT;
		if (new_attr.mq_flags & ~O_NONBLOCK)
			return -EINVAL;
	}

	filp = fget(mqdes);
	if (!filp)
		return -EBADF;

	/* Only descriptors that refer to an mqueue file are accepted. */
	ret = -EBADF;
	if (unlikely(filp->f_op != &mqueue_file_operations))
		goto out_fput;

	inode = filp->f_path.dentry->d_inode;
	info = MQUEUE_I(inode);

	spin_lock(&info->lock);

	/* Snapshot the attributes before any change is applied. */
	old_attr = info->attr;
	old_attr.mq_flags = filp->f_flags & O_NONBLOCK;

	if (u_mqstat) {
		audit_mq_getsetattr(mqdes, &new_attr);

		spin_lock(&filp->f_lock);
		if (new_attr.mq_flags & O_NONBLOCK)
			filp->f_flags |= O_NONBLOCK;
		else
			filp->f_flags &= ~O_NONBLOCK;
		spin_unlock(&filp->f_lock);

		inode->i_atime = inode->i_ctime = CURRENT_TIME;
	}

	spin_unlock(&info->lock);

	ret = 0;
	if (u_omqstat && copy_to_user(u_omqstat, &old_attr, sizeof(old_attr)))
		ret = -EFAULT;

out_fput:
	fput(filp);
	return ret;
}

/* Inode operations table wiring up queue creation, removal and lookup. */
static const struct inode_operations mqueue_dir_inode_operations = {
	.create	= mqueue_create,
	.unlink	= mqueue_unlink,
	.lookup	= simple_lookup,
};

/*
 * File operations for queue files.  The mq_* system calls compare a file's
 * f_op against this table to recognise message queue descriptors.
 */
static const struct file_operations mqueue_file_operations = {
	.read	= mqueue_read_file,
	.poll	= mqueue_poll_file,
	.flush	= mqueue_flush_file,
	.llseek	= default_llseek,
};

/* Superblock operations: inode allocation, teardown and statfs. */
static const struct super_operations mqueue_super_ops = {
	.alloc_inode	= mqueue_alloc_inode,
	.destroy_inode	= mqueue_destroy_inode,
	.evict_inode	= mqueue_evict_inode,
	.statfs		= simple_statfs,
};

/* Filesystem type registered under the name "mqueue". */
static struct file_system_type mqueue_fs_type = {
	.name		= "mqueue",
	.mount		= mqueue_mount,
	.kill_sb	= kill_litter_super,
};

/*
 * Initialise the message queue state of an IPC namespace: reset the queue
 * count, install the default limits and mount an mqueue instance for it.
 *
 * Returns 0 on success.  If the mount fails, ns->mq_mnt is left NULL and
 * the mount error is returned.
 */
int mq_init_ns(struct ipc_namespace *ns)
{
	int err;

	ns->mq_queues_count  = 0;
	ns->mq_queues_max    = DFLT_QUEUESMAX;
	ns->mq_msg_max       = DFLT_MSGMAX;
	ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;

	ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
	if (!IS_ERR(ns->mq_mnt))
		return 0;

	err = PTR_ERR(ns->mq_mnt);
	ns->mq_mnt = NULL;
	return err;
}

void mq_clear_sbinfo(struct ipc_namespace *ns)
{
	ns->mq_mnt->mnt_sb->s_fs_info = NULL;
}

/* Drop one reference on the namespace's mqueue mount. */
void mq_put_mnt(struct ipc_namespace *ns)
{
	struct vfsmount *mnt = ns->mq_mnt;

	mntput(mnt);
}

/*
 * Boot-time setup: create the inode cache, register the sysctl table and
 * the filesystem type, then mount the instance used by init_ipc_ns.
 *
 * Returns 0 on success.  On failure everything set up so far is undone and
 * the error is returned.
 */
static int __init init_mqueue_fs(void)
{
	int err;

	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
				sizeof(struct mqueue_inode_info), 0,
				SLAB_HWCACHE_ALIGN, init_once);
	if (!mqueue_inode_cachep)
		return -ENOMEM;

	/* ignore failures - they are not fatal */
	mq_sysctl_table = mq_register_sysctl_table();

	err = register_filesystem(&mqueue_fs_type);
	if (err)
		goto fail_sysctl;

	spin_lock_init(&mq_lock);

	init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
	if (!IS_ERR(init_ipc_ns.mq_mnt))
		return 0;

	err = PTR_ERR(init_ipc_ns.mq_mnt);
	unregister_filesystem(&mqueue_fs_type);
fail_sysctl:
	if (mq_sysctl_table)
		unregister_sysctl_table(mq_sysctl_table);
	kmem_cache_destroy(mqueue_inode_cachep);
	return err;
}

__initcall(init_mqueue_fs);