Diffstat (limited to 'fs')
-rw-r--r--  fs/Makefile            |    4
-rw-r--r--  fs/afs/afs.h           |   23
-rw-r--r--  fs/afs/afs_fs.h        |    3
-rw-r--r--  fs/afs/dir.c           |   18
-rw-r--r--  fs/afs/file.c          |    2
-rw-r--r--  fs/afs/fsclient.c      |  298
-rw-r--r--  fs/afs/inode.c         |   10
-rw-r--r--  fs/afs/internal.h      |    6
-rw-r--r--  fs/afs/super.c         |   44
-rw-r--r--  fs/afs/vnode.c         |   85
-rw-r--r--  fs/afs/write.c         |    5
-rw-r--r--  fs/aio.c               |   28
-rw-r--r--  fs/anon_inodes.c       |  200
-rw-r--r--  fs/autofs/autofs_i.h   |    4
-rw-r--r--  fs/autofs/inode.c      |   47
-rw-r--r--  fs/autofs/root.c       |   83
-rw-r--r--  fs/autofs4/inode.c     |   16
-rw-r--r--  fs/autofs4/root.c      |   18
-rw-r--r--  fs/compat.c            |   49
-rw-r--r--  fs/eventfd.c           |  228
-rw-r--r--  fs/eventpoll.c         | 1178
-rw-r--r--  fs/exec.c              |   13
-rw-r--r--  fs/mpage.c             |  174
-rw-r--r--  fs/partitions/Kconfig  |    2
-rw-r--r--  fs/partitions/efi.c    |   12
-rw-r--r--  fs/signalfd.c          |  349
-rw-r--r--  fs/timerfd.c           |  227
27 files changed, 2164 insertions(+), 962 deletions(-)
diff --git a/fs/Makefile b/fs/Makefile
index 9edf4112bee0..720c29d57a62 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -22,6 +22,10 @@ endif
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
+obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
+obj-$(CONFIG_SIGNALFD)		+= signalfd.o
+obj-$(CONFIG_TIMERFD)		+= timerfd.o
+obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 52d0752265b8..245257948140 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -16,6 +16,9 @@
 
 #define AFS_MAXCELLNAME	64	/* maximum length of a cell name */
 #define AFS_MAXVOLNAME	64	/* maximum length of a volume name */
+#define AFSNAMEMAX	256	/* maximum length of a filename plus NUL */
+#define AFSPATHMAX	1024	/* maximum length of a pathname plus NUL */
+#define AFSOPAQUEMAX	1024	/* maximum length of an opaque field */
 
 typedef unsigned	afs_volid_t;
 typedef unsigned	afs_vnodeid_t;
@@ -143,4 +146,24 @@ struct afs_volsync {
 	time_t		creation;	/* volume creation time */
 };
 
+/*
+ * AFS volume status record
+ */
+struct afs_volume_status {
+	u32	vid;			/* volume ID */
+	u32	parent_id;		/* parent volume ID */
+	u8	online;			/* true if volume currently online and available */
+	u8	in_service;		/* true if volume currently in service */
+	u8	blessed;		/* same as in_service */
+	u8	needs_salvage;		/* true if consistency checking required */
+	u32	type;			/* volume type (afs_voltype_t) */
+	u32	min_quota;		/* minimum space set aside (blocks) */
+	u32	max_quota;		/* maximum space this volume may occupy (blocks) */
+	u32	blocks_in_use;		/* space this volume currently occupies (blocks) */
+	u32	part_blocks_avail;	/* space available in volume's partition */
+	u32	part_max_blocks;	/* size of volume's partition */
+};
+
+#define AFS_BLOCK_SIZE	1024
+
 #endif /* AFS_H */
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index d963ef4daee8..a18c374ebe08 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -28,7 +28,8 @@ enum AFS_FS_Operations {
 	FSMAKEDIR		= 141,	/* AFS Create a directory */
 	FSREMOVEDIR		= 142,	/* AFS Remove a directory */
 	FSGIVEUPCALLBACKS	= 147,	/* AFS Discard callback promises */
-	FSGETVOLUMEINFO		= 148,	/* AFS Get root volume information */
+	FSGETVOLUMEINFO		= 148,	/* AFS Get information about a volume */
+	FSGETVOLUMESTATUS	= 149,	/* AFS Get volume status information */
 	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
 	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2fb31276196b..719af4fb15dc 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -497,7 +497,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 
 	ASSERTCMP(dentry->d_inode, ==, NULL);
 
-	if (dentry->d_name.len > 255) {
+	if (dentry->d_name.len >= AFSNAMEMAX) {
 		_leave(" = -ENAMETOOLONG");
 		return ERR_PTR(-ENAMETOOLONG);
 	}
@@ -736,7 +736,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -801,7 +801,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -847,7 +847,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -921,7 +921,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -990,7 +990,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	       dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -1038,11 +1038,11 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	       content);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	ret = -EINVAL;
-	if (strlen(content) > 1023)
+	if (strlen(content) >= AFSPATHMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -1112,7 +1112,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	       new_dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (new_dentry->d_name.len > 255)
+	if (new_dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(orig_dvnode->volume->cell);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 3e25795e5a42..9c0e721d9fc2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -236,7 +236,7 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
 {
 	int ret = 1;
 
-	kenter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%lu", page->index, offset);
 
 	BUG_ON(!PageLocked(page));
 
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 56cc0efa2a0c..5dff1308b6f0 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -202,6 +202,29 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
 }
 
 /*
+ * decode an AFSFetchVolumeStatus block
+ */
+static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
+					    struct afs_volume_status *vs)
+{
+	const __be32 *bp = *_bp;
+
+	vs->vid			= ntohl(*bp++);
+	vs->parent_id		= ntohl(*bp++);
+	vs->online		= ntohl(*bp++);
+	vs->in_service		= ntohl(*bp++);
+	vs->blessed		= ntohl(*bp++);
+	vs->needs_salvage	= ntohl(*bp++);
+	vs->type		= ntohl(*bp++);
+	vs->min_quota		= ntohl(*bp++);
+	vs->max_quota		= ntohl(*bp++);
+	vs->blocks_in_use	= ntohl(*bp++);
+	vs->part_blocks_avail	= ntohl(*bp++);
+	vs->part_max_blocks	= ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
  * deliver reply data to an FS.FetchStatus
  */
 static int afs_deliver_fs_fetch_status(struct afs_call *call,
@@ -1450,3 +1473,278 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
+
+/*
+ * deliver reply data to an FS.GetVolumeStatus
+ */
+static int afs_deliver_fs_get_volume_status(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+	char *p;
+	int ret;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the returned status record */
+	case 1:
+		_debug("extract status");
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       12 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		bp = call->buffer;
+		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name length */
+	case 2:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("volname length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name */
+	case 3:
+		_debug("extract volname");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("volname '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_volname_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 4:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_volname_padding:
+
+		/* extract the offline message length */
+	case 5:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("offline msg length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the offline message */
+	case 6:
+		_debug("extract offline");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("offline '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the offline message padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_offline_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 7:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_offline_padding:
+
+		/* extract the message of the day length */
+	case 8:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("motd length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the message of the day */
+	case 9:
+		_debug("extract motd");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("motd '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the message of the day padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_motd_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 10:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_motd_padding:
+
+	case 11:
+		_debug("trailer %d", skb->len);
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * destroy an FS.GetVolumeStatus call
+ */
+static void afs_get_volume_status_call_destructor(struct afs_call *call)
+{
+	kfree(call->reply3);
+	call->reply3 = NULL;
+	afs_flat_call_destructor(call);
+}
+
+/*
+ * FS.GetVolumeStatus operation type
+ */
+static const struct afs_call_type afs_RXFSGetVolumeStatus = {
+	.name		= "FS.GetVolumeStatus",
+	.deliver	= afs_deliver_fs_get_volume_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_get_volume_status_call_destructor,
+};
+
+/*
+ * fetch the status of a volume
+ */
+int afs_fs_get_volume_status(struct afs_server *server,
+			     struct key *key,
+			     struct afs_vnode *vnode,
+			     struct afs_volume_status *vs,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+	void *tmpbuf;
+
+	_enter("");
+
+	tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
+	if (!tmpbuf)
+		return -ENOMEM;
+
+	call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
+	if (!call) {
+		kfree(tmpbuf);
+		return -ENOMEM;
+	}
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = vs;
+	call->reply3 = tmpbuf;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSGETVOLUMESTATUS);
+	bp[1] = htonl(vnode->fid.vid);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
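
For orientation, the reply that the unmarshalling cases above walk through is a fixed 12-word AFSFetchVolumeStatus block (case 1), followed by three XDR strings: the volume name (cases 2-4), the offline message (cases 5-7) and the message of the day (cases 8-10). Each string is a 4-byte length, the data bytes, then padding up to a 4-byte boundary, which is what the "call->count = 4 - (call->count & 3)" arithmetic computes. A standalone sketch of that padding rule (illustrative userspace C, not kernel code):

	#include <stdio.h>

	/* pad bytes following an n-byte XDR string, as consumed by
	 * cases 4, 7 and 10 of the unmarshaller above */
	static unsigned int xdr_pad(unsigned int n)
	{
		return (n & 3) ? 4 - (n & 3) : 0;
	}

	int main(void)
	{
		unsigned int n;

		for (n = 0; n <= 5; n++)
			printf("len %u -> pad %u\n", n, xdr_pad(n));
		return 0;	/* prints pads 0, 3, 2, 1, 0, 3 */
	}
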
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 515a5d12d8fb..47f5fed7195d 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -209,11 +209,15 @@ bad_inode:
  */
 void afs_zap_data(struct afs_vnode *vnode)
 {
-	_enter("zap data {%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
 
 	/* nuke all the non-dirty pages that aren't locked, mapped or being
-	 * written back */
-	invalidate_remote_inode(&vnode->vfs_inode);
+	 * written back in a regular file and completely discard the pages in a
+	 * directory or symlink */
+	if (S_ISREG(vnode->vfs_inode.i_mode))
+		invalidate_remote_inode(&vnode->vfs_inode);
+	else
+		invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
 }
 
 /*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a30d4fa768e3..4953ba5a6f44 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -506,6 +506,10 @@ extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
 extern int afs_fs_setattr(struct afs_server *, struct key *,
 			  struct afs_vnode *, struct iattr *,
 			  const struct afs_wait_mode *);
+extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
+				    struct afs_vnode *,
+				    struct afs_volume_status *,
+				    const struct afs_wait_mode *);
 
 /*
  * inode.c
@@ -672,6 +676,8 @@ extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
 extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
 				unsigned, unsigned);
 extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
+extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
+				       struct afs_volume_status *);
 
 /*
  * volume.c
diff --git a/fs/afs/super.c b/fs/afs/super.c
index d24be334b608..579af632c8e8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -21,22 +21,20 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
+#include <linux/statfs.h>
 #include "internal.h"
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
 			    unsigned long flags);
-
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags, const char *dev_name,
 		      void *data, struct vfsmount *mnt);
-
 static struct inode *afs_alloc_inode(struct super_block *sb);
-
 static void afs_put_super(struct super_block *sb);
-
 static void afs_destroy_inode(struct inode *inode);
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
@@ -47,7 +45,7 @@ struct file_system_type afs_fs_type = {
 };
 
 static const struct super_operations afs_super_ops = {
-	.statfs		= simple_statfs,
+	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
 	.drop_inode	= generic_delete_inode,
 	.write_inode	= afs_write_inode,
@@ -488,6 +486,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	vnode->flags		= 1 << AFS_VNODE_UNSET;
 	vnode->cb_promised	= false;
 
+	_leave(" = %p", &vnode->vfs_inode);
 	return &vnode->vfs_inode;
 }
 
@@ -498,7 +497,7 @@ static void afs_destroy_inode(struct inode *inode)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	_enter("{%lu}", inode->i_ino);
+	_enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
 
 	_debug("DESTROY INODE %p", inode);
 
@@ -507,3 +506,36 @@ static void afs_destroy_inode(struct inode *inode)
 	kmem_cache_free(afs_inode_cachep, vnode);
 	atomic_dec(&afs_count_active_inodes);
 }
+
+/*
+ * return information about an AFS volume
+ */
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct afs_volume_status vs;
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	struct key *key;
+	int ret;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	ret = afs_vnode_get_volume_status(vnode, key, &vs);
+	key_put(key);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	buf->f_type	= dentry->d_sb->s_magic;
+	buf->f_bsize	= AFS_BLOCK_SIZE;
+	buf->f_namelen	= AFSNAMEMAX - 1;
+
+	if (vs.max_quota == 0)
+		buf->f_blocks = vs.part_max_blocks;
+	else
+		buf->f_blocks = vs.max_quota;
+	buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
+	return 0;
+}
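
As a cross-check on the arithmetic above: with AFS_BLOCK_SIZE of 1024, a volume whose max_quota is 20000 blocks with 5000 blocks in use reports f_blocks = 20000 and f_bfree = f_bavail = 15000, while an unquota'd volume falls back to the partition size. A minimal userspace sketch (not part of the patch; the mount point is hypothetical) that reads these fields back through statfs(2):

	#include <stdio.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs sfs;

		/* /afs/example.com stands in for any kAFS mount point */
		if (statfs("/afs/example.com", &sfs) < 0) {
			perror("statfs");
			return 1;
		}
		printf("bsize=%ld blocks=%ld bfree=%ld namelen=%ld\n",
		       (long)sfs.f_bsize, (long)sfs.f_blocks,
		       (long)sfs.f_bfree, (long)sfs.f_namelen);
		return 0;
	}
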
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index ec814660209f..c36c98ce2c3c 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -175,24 +175,33 @@ static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
 {
 	struct afs_server *server;
 
+	_enter("{%p}", vnode->server);
+
 	set_bit(AFS_VNODE_DELETED, &vnode->flags);
 
 	server = vnode->server;
-	if (vnode->cb_promised) {
-		spin_lock(&server->cb_lock);
+	if (server) {
 		if (vnode->cb_promised) {
-			rb_erase(&vnode->cb_promise, &server->cb_promises);
-			vnode->cb_promised = false;
+			spin_lock(&server->cb_lock);
+			if (vnode->cb_promised) {
+				rb_erase(&vnode->cb_promise,
+					 &server->cb_promises);
+				vnode->cb_promised = false;
+			}
+			spin_unlock(&server->cb_lock);
 		}
-		spin_unlock(&server->cb_lock);
-	}
 
-	spin_lock(&vnode->server->fs_lock);
-	rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
-	spin_unlock(&vnode->server->fs_lock);
+		spin_lock(&server->fs_lock);
+		rb_erase(&vnode->server_rb, &server->fs_vnodes);
+		spin_unlock(&server->fs_lock);
 
-	vnode->server = NULL;
-	afs_put_server(server);
+		vnode->server = NULL;
+		afs_put_server(server);
+	} else {
+		ASSERT(!vnode->cb_promised);
+	}
+
+	_leave("");
 }
 
 /*
@@ -225,7 +234,7 @@ void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
  */
 static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
 {
-	_enter("%p,%d", vnode, ret);
+	_enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret);
 
 	spin_lock(&vnode->lock);
 
@@ -860,3 +869,55 @@ no_server:
 	spin_unlock(&vnode->lock);
 	return PTR_ERR(server);
 }
+
+/*
+ * get the status of a volume
+ */
+int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
+				struct afs_volume_status *vs)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 67ae4dbf66b3..28f37516c126 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -395,8 +395,9 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 	if (n == 0)
 		goto no_more;
 	if (pages[0]->index != start) {
-		for (n--; n >= 0; n--)
-			put_page(pages[n]);
+		do {
+			put_page(pages[--n]);
+		} while (n > 0);
 		goto no_more;
 	}
 
diff --git a/fs/aio.c b/fs/aio.c
index ac1c1587aa02..dbe699e9828c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#include <linux/eventfd.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -417,6 +418,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
+	req->ki_eventfd = ERR_PTR(-EINVAL);
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -458,6 +460,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (!IS_ERR(req->ki_eventfd))
+		fput(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -942,6 +946,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 		return 1;
 	}
 
+	/*
+	 * Check if the user asked us to deliver the result through an
+	 * eventfd. The eventfd_signal() function is safe to be called
+	 * from IRQ context.
+	 */
+	if (!IS_ERR(iocb->ki_eventfd))
+		eventfd_signal(iocb->ki_eventfd, 1);
+
 	info = &ctx->ring_info;
 
 	/* add a completion event to the ring buffer.
@@ -1526,8 +1538,7 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
-		     iocb->aio_reserved3)) {
+	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
 		pr_debug("EINVAL: io_submit: reserve field set\n");
 		return -EINVAL;
 	}
@@ -1551,6 +1562,19 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		fput(file);
 		return -EAGAIN;
 	}
+	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+		/*
+		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
+		 * instance of the file* now. The file descriptor must be
+		 * an eventfd() fd, and will be signaled for each completed
+		 * event using the eventfd_signal() function.
+		 */
+		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		if (unlikely(IS_ERR(req->ki_eventfd))) {
+			ret = PTR_ERR(req->ki_eventfd);
+			goto out_put_req;
+		}
+	}
 
 	req->ki_filp = file;
 	ret = put_user(req->ki_key, &user_iocb->aio_key);
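
To make the new plumbing concrete, here is a minimal userspace sketch (not part of the patch) of IOCB_FLAG_RESFD: an eventfd is attached to the iocb via aio_resfd, and the read() on the eventfd blocks until aio_complete() calls eventfd_signal(). It assumes a kernel with this series applied; __NR_eventfd may need to be defined by hand where the libc headers predate eventfd(2), and error handling is omitted for brevity:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/aio_abi.h>	/* struct iocb, IOCB_FLAG_RESFD */

	int main(void)
	{
		aio_context_t ctx = 0;
		struct iocb cb, *cbs[1] = { &cb };
		static char buf[4096];
		uint64_t done;
		int fd, efd;

		fd = open("/tmp/data", O_RDONLY);	/* hypothetical input file */
		efd = syscall(__NR_eventfd, 0);		/* no glibc wrapper yet */
		syscall(__NR_io_setup, 128, &ctx);

		memset(&cb, 0, sizeof(cb));
		cb.aio_lio_opcode = IOCB_CMD_PREAD;
		cb.aio_fildes = fd;
		cb.aio_buf = (uint64_t)(unsigned long)buf;
		cb.aio_nbytes = sizeof(buf);
		cb.aio_flags = IOCB_FLAG_RESFD;		/* route the completion... */
		cb.aio_resfd = efd;			/* ...to this eventfd */

		syscall(__NR_io_submit, ctx, 1, cbs);

		/* blocks until aio_complete() signals; the counter read back
		 * is the number of completed events */
		read(efd, &done, sizeof(done));
		printf("completions: %llu\n", (unsigned long long)done);

		syscall(__NR_io_destroy, ctx);
		return 0;
	}

The eventfd can equally be placed in an epoll or select() set, which is the point of the flag: AIO completions become pollable alongside ordinary file descriptors.
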
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
new file mode 100644
index 000000000000..40fe3a3222e4
--- /dev/null
+++ b/fs/anon_inodes.c
@@ -0,0 +1,200 @@
+/*
+ *  fs/anon_inodes.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *  Thanks to Arnd Bergmann for code review and suggestions.
+ *  More changes for Thomas Gleixner suggestions.
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+
+static struct vfsmount *anon_inode_mnt __read_mostly;
+static struct inode *anon_inode_inode;
+static const struct file_operations anon_inode_fops;
+
+static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
+			     mnt);
+}
+
+static int anon_inodefs_delete_dentry(struct dentry *dentry)
+{
+	/*
+	 * We faked vfs to believe the dentry was hashed when we created it.
+	 * Now we restore the flag so that dput() will work correctly.
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
+	return 1;
+}
+
+static struct file_system_type anon_inode_fs_type = {
+	.name		= "anon_inodefs",
+	.get_sb		= anon_inodefs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+static struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_delete	= anon_inodefs_delete_dentry,
+};
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ *                    anonymous inode, and a dentry that describes the
+ *                    "class" of the file
+ *
+ * @pfd:    [out] pointer to the file descriptor
+ * @pinode: [out] pointer to the inode
+ * @pfile:  [out] pointer to the file struct
+ * @name:   [in]  name of the "class" of the new file
+ * @fops:   [in]  file operations for the new file
+ * @priv:   [in]  private data for the new file (will be file's private_data)
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.
+ */
+int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
+		     const char *name, const struct file_operations *fops,
+		     void *priv)
+{
+	struct qstr this;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int error, fd;
+
+	if (IS_ERR(anon_inode_inode))
+		return -ENODEV;
+	file = get_empty_filp();
+	if (!file)
+		return -ENFILE;
+
+	inode = igrab(anon_inode_inode);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_put_filp;
+	}
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto err_iput;
+	fd = error;
+
+	/*
+	 * Link the inode to a directory entry by creating a unique name
+	 * using the inode sequence number.
+	 */
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0;
+	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_put_unused_fd;
+	dentry->d_op = &anon_inodefs_dentry_operations;
+	/* Do not publish this dentry inside the global dentry hash table */
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, inode);
+
+	file->f_path.mnt = mntget(anon_inode_mnt);
+	file->f_path.dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+
+	file->f_pos = 0;
+	file->f_flags = O_RDWR;
+	file->f_op = fops;
+	file->f_mode = FMODE_READ | FMODE_WRITE;
+	file->f_version = 0;
+	file->private_data = priv;
+
+	fd_install(fd, file);
+
+	*pfd = fd;
+	*pinode = inode;
+	*pfile = file;
+	return 0;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+err_iput:
+	iput(inode);
+err_put_filp:
+	put_filp(file);
+	return error;
+}
+
+/*
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no per-instance data, so we can avoid the
+ * allocation of multiple of them.
+ */
+static struct inode *anon_inode_mkinode(void)
+{
+	struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_fop = &anon_inode_fops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because mark_inode_dirty() will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+
+static int __init anon_inode_init(void)
+{
+	int error;
+
+	error = register_filesystem(&anon_inode_fs_type);
+	if (error)
+		goto err_exit;
+	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
+	if (IS_ERR(anon_inode_mnt)) {
+		error = PTR_ERR(anon_inode_mnt);
+		goto err_unregister_filesystem;
+	}
+	anon_inode_inode = anon_inode_mkinode();
+	if (IS_ERR(anon_inode_inode)) {
+		error = PTR_ERR(anon_inode_inode);
+		goto err_mntput;
+	}
+
+	return 0;
+
+err_mntput:
+	mntput(anon_inode_mnt);
+err_unregister_filesystem:
+	unregister_filesystem(&anon_inode_fs_type);
+err_exit:
+	panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+}
+
+fs_initcall(anon_inode_init);
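
A hypothetical caller sketch (not part of this patch) showing how a facility such as eventfd or signalfd consumes this API: allocate per-fd state, then let anon_inode_getfd() wire it to a new file backed by the shared anonymous inode. The my_* names are made up for illustration:

	struct my_ctx {
		wait_queue_head_t wqh;	/* whatever per-fd state is needed */
	};

	static const struct file_operations my_fops;	/* read/poll/release defined elsewhere */

	static int my_create_fd(void)
	{
		struct my_ctx *ctx;
		struct inode *inode;
		struct file *file;
		int fd, error;

		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			return -ENOMEM;
		init_waitqueue_head(&ctx->wqh);

		/* on success the fd is already installed in the caller's
		 * descriptor table and ctx is reachable through
		 * file->private_data in the my_fops methods */
		error = anon_inode_getfd(&fd, &inode, &file, "[my_ctx]",
					 &my_fops, ctx);
		if (error) {
			kfree(ctx);
			return error;
		}
		return fd;
	}
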
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 4ef544434b51..8b4cca3c4705 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -101,7 +101,7 @@ struct autofs_symlink {
 struct autofs_sb_info {
 	u32 magic;
 	struct file *pipe;
-	pid_t oz_pgrp;
+	struct pid *oz_pgrp;
 	int catatonic;
 	struct super_block *sb;
 	unsigned long exp_timeout;
@@ -122,7 +122,7 @@ static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
    filesystem without "magic".) */
 
 static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || process_group(current) == sbi->oz_pgrp;
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
 /* Hash operations */
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index aa0b61ff8270..e7204d71acc9 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -34,12 +34,14 @@ void autofs_kill_sb(struct super_block *sb)
 	if (!sbi)
 		goto out_kill_sb;
 
-	if ( !sbi->catatonic )
+	if (!sbi->catatonic)
 		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
 
+	put_pid(sbi->oz_pgrp);
+
 	autofs_hash_nuke(sbi);
-	for ( n = 0 ; n < AUTOFS_MAX_SYMLINKS ; n++ ) {
-		if ( test_bit(n, sbi->symlink_bitmap) )
+	for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
+		if (test_bit(n, sbi->symlink_bitmap))
 			kfree(sbi->symlink[n].data);
 	}
 
@@ -69,7 +71,8 @@ static match_table_t autofs_tokens = {
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, pid_t *pgrp, int *minproto, int *maxproto)
+static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
+			 pid_t *pgrp, int *minproto, int *maxproto)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -138,9 +141,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	int minproto, maxproto;
+	pid_t pgid;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if ( !sbi )
+	if (!sbi)
 		goto fail_unlock;
 	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
 
@@ -149,7 +153,6 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipe = NULL;
 	sbi->catatonic = 1;
 	sbi->exp_timeout = 0;
-	sbi->oz_pgrp = process_group(current);
 	autofs_initialize_hash(&sbi->dirhash);
 	sbi->queues = NULL;
 	memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
@@ -169,26 +172,36 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 
 	/* Can this call block? - WTF cares? s is locked. */
-	if ( parse_options(data,&pipefd,&root_inode->i_uid,&root_inode->i_gid,&sbi->oz_pgrp,&minproto,&maxproto) ) {
+	if (parse_options(data, &pipefd, &root_inode->i_uid,
+			  &root_inode->i_gid, &pgid, &minproto,
+			  &maxproto)) {
 		printk("autofs: called with bogus options\n");
 		goto fail_dput;
 	}
 
 	/* Couldn't this be tested earlier? */
-	if ( minproto > AUTOFS_PROTO_VERSION ||
-	     maxproto < AUTOFS_PROTO_VERSION ) {
+	if (minproto > AUTOFS_PROTO_VERSION ||
+	    maxproto < AUTOFS_PROTO_VERSION) {
 		printk("autofs: kernel does not match daemon version\n");
 		goto fail_dput;
 	}
 
-	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, sbi->oz_pgrp));
+	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
+	sbi->oz_pgrp = find_get_pid(pgid);
+
+	if (!sbi->oz_pgrp) {
+		printk("autofs: could not find process group %d\n", pgid);
+		goto fail_dput;
+	}
+
 	pipe = fget(pipefd);
 
-	if ( !pipe ) {
+	if (!pipe) {
 		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_dput;
+		goto fail_put_pid;
 	}
-	if ( !pipe->f_op || !pipe->f_op->write )
+
+	if (!pipe->f_op || !pipe->f_op->write)
 		goto fail_fput;
 	sbi->pipe = pipe;
 	sbi->catatonic = 0;
@@ -202,6 +215,8 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 fail_fput:
 	printk("autofs: pipe file descriptor does not contain proper ops\n");
 	fput(pipe);
+fail_put_pid:
+	put_pid(sbi->oz_pgrp);
 fail_dput:
 	dput(root);
 	goto fail_free;
@@ -230,7 +245,7 @@ static void autofs_read_inode(struct inode *inode)
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
 
-	if ( ino == AUTOFS_ROOT_INO ) {
+	if (ino == AUTOFS_ROOT_INO) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 		inode->i_op = &autofs_root_inode_operations;
 		inode->i_fop = &autofs_root_operations;
@@ -241,12 +256,12 @@ static void autofs_read_inode(struct inode *inode)
 	inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
 	inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
 
-	if ( ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO ) {
+	if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
 		/* Symlink inode - should be in symlink list */
 		struct autofs_symlink *sl;
 
 		n = ino - AUTOFS_FIRST_SYMLINK;
-		if ( n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
+		if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
 			printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
 			return;
 		}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index f2597205939d..c1489533277a 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -67,8 +67,8 @@ static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldi
 		filp->f_pos = ++nr;
 		/* fall through */
 	default:
-		while ( onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent) ) {
-			if ( !ent->dentry || d_mountpoint(ent->dentry) ) {
+		while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
+			if (!ent->dentry || d_mountpoint(ent->dentry)) {
 				if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
 					goto out;
 				filp->f_pos = nr;
@@ -88,10 +88,10 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str
 	struct autofs_dir_ent *ent;
 	int status = 0;
 
-	if ( !(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)) ) {
+	if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
 		do {
-			if ( status && dentry->d_inode ) {
-				if ( status != -ENOENT )
+			if (status && dentry->d_inode) {
+				if (status != -ENOENT)
 					printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
 				return 0; /* Try to get the kernel to invalidate this dentry */
 			}
@@ -106,7 +106,7 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str
 			return 1;
 		}
 		status = autofs_wait(sbi, &dentry->d_name);
-	} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)) );
+	} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
 	}
 
 	/* Abuse this field as a pointer to the directory entry, used to
@@ -124,13 +124,13 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str
 
 	/* If this is a directory that isn't a mount point, bitch at the
 	   daemon and fix it in user space */
-	if ( S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) ) {
+	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
 		return !autofs_wait(sbi, &dentry->d_name);
 	}
 
 	/* We don't update the usages for the autofs daemon itself, this
 	   is necessary for recursive autofs mounts */
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		autofs_update_usage(&sbi->dirhash,ent);
 	}
 
@@ -157,7 +157,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	sbi = autofs_sbi(dir->i_sb);
 
 	/* Pending dentry */
-	if ( dentry->d_flags & DCACHE_AUTOFS_PENDING ) {
+	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
 		if (autofs_oz_mode(sbi))
 			res = 1;
 		else
@@ -173,7 +173,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	}
 
 	/* Check for a non-mountpoint directory */
-	if ( S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) ) {
+	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
 		if (autofs_oz_mode(sbi))
 			res = 1;
 		else
@@ -183,9 +183,9 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	}
 
 	/* Update the usage list */
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		ent = (struct autofs_dir_ent *) dentry->d_time;
-		if ( ent )
+		if (ent)
 			autofs_update_usage(&sbi->dirhash,ent);
 	}
 	unlock_kernel();
@@ -213,8 +213,10 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr
 	sbi = autofs_sbi(dir->i_sb);
 
 	oz_mode = autofs_oz_mode(sbi);
-	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
-		 current->pid, process_group(current), sbi->catatonic, oz_mode));
+	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
+		 "oz_mode = %d\n", pid_nr(task_pid(current)),
+		 process_group(current), sbi->catatonic,
+		 oz_mode));
 
 	/*
 	 * Mark the dentry incomplete, but add it. This is needed so
@@ -258,7 +260,7 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr
 	 * doesn't do the right thing for all system calls, but it should
 	 * be OK for the operations we permit from an autofs.
 	 */
-	if ( dentry->d_inode && d_unhashed(dentry) )
+	if (dentry->d_inode && d_unhashed(dentry))
 		return ERR_PTR(-ENOENT);
 
 	return NULL;
@@ -277,18 +279,18 @@ static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const c
 	autofs_say(dentry->d_name.name,dentry->d_name.len);
 
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
-	if ( autofs_hash_lookup(dh, &dentry->d_name) ) {
+	if (autofs_hash_lookup(dh, &dentry->d_name)) {
 		unlock_kernel();
 		return -EEXIST;
 	}
 
 	n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
-	if ( n >= AUTOFS_MAX_SYMLINKS ) {
+	if (n >= AUTOFS_MAX_SYMLINKS) {
 		unlock_kernel();
 		return -ENOSPC;
 	}
@@ -297,14 +299,14 @@ static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const c
 	sl = &sbi->symlink[n];
 	sl->len = strlen(symname);
 	sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
-	if ( !sl->data ) {
+	if (!sl->data) {
 		clear_bit(n,sbi->symlink_bitmap);
 		unlock_kernel();
 		return -ENOSPC;
 	}
 
 	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if ( !ent ) {
+	if (!ent) {
 		kfree(sl->data);
 		clear_bit(n,sbi->symlink_bitmap);
 		unlock_kernel();
@@ -312,7 +314,7 @@ static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const c
 	}
 
 	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if ( !ent->name ) {
+	if (!ent->name) {
 		kfree(sl->data);
 		kfree(ent);
 		clear_bit(n,sbi->symlink_bitmap);
@@ -354,23 +356,23 @@ static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* This allows root to remove symlinks */
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) {
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
 	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if ( !ent ) {
+	if (!ent) {
 		unlock_kernel();
 		return -ENOENT;
 	}
 
 	n = ent->ino - AUTOFS_FIRST_SYMLINK;
-	if ( n >= AUTOFS_MAX_SYMLINKS ) {
+	if (n >= AUTOFS_MAX_SYMLINKS) {
 		unlock_kernel();
 		return -EISDIR; /* It's a directory, dummy */
 	}
-	if ( !test_bit(n,sbi->symlink_bitmap) ) {
+	if (!test_bit(n,sbi->symlink_bitmap)) {
 		unlock_kernel();
 		return -EINVAL; /* Nonexistent symlink? Shouldn't happen */
 	}
@@ -392,23 +394,23 @@ static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
 	struct autofs_dir_ent *ent;
 
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
 	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if ( !ent ) {
+	if (!ent) {
 		unlock_kernel();
 		return -ENOENT;
 	}
 
-	if ( (unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO ) {
+	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
 		unlock_kernel();
 		return -ENOTDIR; /* Not a directory */
 	}
 
-	if ( ent->dentry != dentry ) {
+	if (ent->dentry != dentry) {
 		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
 	}
 
@@ -429,18 +431,18 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ino_t ino;
 
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
 	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if ( ent ) {
+	if (ent) {
 		unlock_kernel();
 		return -EEXIST;
 	}
 
-	if ( sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO ) {
+	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
 		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
 		unlock_kernel();
 		return -ENOSPC;
@@ -448,13 +450,13 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ino = sbi->next_dir_ino++;
 
 	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if ( !ent ) {
+	if (!ent) {
 		unlock_kernel();
 		return -ENOSPC;
 	}
 
 	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if ( !ent->name ) {
+	if (!ent->name) {
 		kfree(ent);
 		unlock_kernel();
 		return -ENOSPC;
@@ -483,7 +485,7 @@ static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
 	    put_user(sbi->exp_timeout / HZ, p))
 		return -EFAULT;
 
-	if ( ntimeout > ULONG_MAX/HZ )
+	if (ntimeout > ULONG_MAX/HZ)
 		sbi->exp_timeout = 0;
 	else
 		sbi->exp_timeout = ntimeout * HZ;
@@ -511,15 +513,14 @@ static inline int autofs_expire_run(struct super_block *sb,
 	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
 	pkt.hdr.type = autofs_ptype_expire;
 
-	if ( !sbi->exp_timeout ||
-	     !(ent = autofs_expire(sb,sbi,mnt)) )
+	if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
 		return -EAGAIN;
 
 	pkt.len = ent->len;
 	memcpy(pkt.name, ent->name, pkt.len);
 	pkt.name[pkt.len] = '\0';
 
-	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		return -EFAULT;
 
 	return 0;
@@ -537,11 +538,11 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
537 538
538 DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,process_group(current))); 539 DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,process_group(current)));
539 540
540 if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || 541 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
541 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) 542 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
542 return -ENOTTY; 543 return -ENOTTY;
543 544
544 if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) 545 if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
545 return -EPERM; 546 return -EPERM;
546 547
547 switch(cmd) { 548 switch(cmd) {
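
Both autofs generations gate their ioctls with the same two-part check: the command's magic type must match, and its command number must fall inside the driver's window. A standalone sketch of that pattern, with the command range invented for illustration:

#include <linux/ioctl.h>
#include <linux/errno.h>

/* Hypothetical command range, standing in for AUTOFS_IOC_FIRST/AUTOFS_IOC_COUNT. */
#define EX_IOC_FIRST	_IO('e', 0x60)
#define EX_IOC_COUNT	8

/*
 * Reject any command whose magic byte or command number falls outside the
 * driver's window, mirroring the check in autofs_root_ioctl(). The unsigned
 * subtraction also catches commands numbered below EX_IOC_FIRST, since the
 * result wraps to a large value.
 */
static int ex_ioctl_in_range(unsigned int cmd)
{
	if (_IOC_TYPE(cmd) != _IOC_TYPE(EX_IOC_FIRST) ||
	    _IOC_NR(cmd) - _IOC_NR(EX_IOC_FIRST) >= EX_IOC_COUNT)
		return -ENOTTY;
	return 0;
}
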
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 5769a2f9ad60..692364e8ffc3 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -218,8 +218,7 @@ static match_table_t tokens = {
218}; 218};
219 219
220static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, 220static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
221 pid_t *pgrp, unsigned int *type, 221 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
222 int *minproto, int *maxproto)
223{ 222{
224 char *p; 223 char *p;
225 substring_t args[MAX_OPT_ARGS]; 224 substring_t args[MAX_OPT_ARGS];
@@ -314,7 +313,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
314 struct autofs_info *ino; 313 struct autofs_info *ino;
315 314
316 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); 315 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
317 if ( !sbi ) 316 if (!sbi)
318 goto fail_unlock; 317 goto fail_unlock;
319 DPRINTK("starting up, sbi = %p",sbi); 318 DPRINTK("starting up, sbi = %p",sbi);
320 319
@@ -363,10 +362,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
363 root->d_fsdata = ino; 362 root->d_fsdata = ino;
364 363
365 /* Can this call block? */ 364 /* Can this call block? */
366 if (parse_options(data, &pipefd, 365 if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
367 &root_inode->i_uid, &root_inode->i_gid, 366 &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
368 &sbi->oz_pgrp, &sbi->type, 367 &sbi->max_proto)) {
369 &sbi->min_proto, &sbi->max_proto)) {
370 printk("autofs: called with bogus options\n"); 368 printk("autofs: called with bogus options\n");
371 goto fail_dput; 369 goto fail_dput;
372 } 370 }
@@ -396,11 +394,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
396 DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); 394 DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
397 pipe = fget(pipefd); 395 pipe = fget(pipefd);
398 396
399 if ( !pipe ) { 397 if (!pipe) {
400 printk("autofs: could not open pipe file descriptor\n"); 398 printk("autofs: could not open pipe file descriptor\n");
401 goto fail_dput; 399 goto fail_dput;
402 } 400 }
403 if ( !pipe->f_op || !pipe->f_op->write ) 401 if (!pipe->f_op || !pipe->f_op->write)
404 goto fail_fput; 402 goto fail_fput;
405 sbi->pipe = pipe; 403 sbi->pipe = pipe;
406 sbi->pipefd = pipefd; 404 sbi->pipefd = pipefd;
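
autofs4_fill_super() only trusts the daemon's pipe once fget() succeeds and the file actually has a write method. The same validation, pulled out into a hypothetical helper (the real code does this inline):

#include <linux/file.h>
#include <linux/fs.h>

/*
 * Take a reference on the descriptor and make sure it is something
 * packets can be written to, dropping the reference on failure.
 */
static struct file *ex_grab_write_pipe(int pipefd)
{
	struct file *pipe = fget(pipefd);

	if (!pipe)
		return NULL;			/* not an open descriptor */
	if (!pipe->f_op || !pipe->f_op->write) {
		fput(pipe);			/* cannot write: drop the reference */
		return NULL;
	}
	return pipe;
}
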
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 15170f4e13a7..2d4c8a3e604e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -759,7 +759,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
759 struct autofs_info *p_ino; 759 struct autofs_info *p_ino;
760 760
761 /* This allows root to remove symlinks */ 761 /* This allows root to remove symlinks */
762 if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) 762 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
763 return -EACCES; 763 return -EACCES;
764 764
765 if (atomic_dec_and_test(&ino->count)) { 765 if (atomic_dec_and_test(&ino->count)) {
@@ -833,7 +833,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
833 struct autofs_info *p_ino; 833 struct autofs_info *p_ino;
834 struct inode *inode; 834 struct inode *inode;
835 835
836 if ( !autofs4_oz_mode(sbi) ) 836 if (!autofs4_oz_mode(sbi))
837 return -EACCES; 837 return -EACCES;
838 838
839 DPRINTK("dentry %p, creating %.*s", 839 DPRINTK("dentry %p, creating %.*s",
@@ -871,11 +871,11 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
871 int rv; 871 int rv;
872 unsigned long ntimeout; 872 unsigned long ntimeout;
873 873
874 if ( (rv = get_user(ntimeout, p)) || 874 if ((rv = get_user(ntimeout, p)) ||
875 (rv = put_user(sbi->exp_timeout/HZ, p)) ) 875 (rv = put_user(sbi->exp_timeout/HZ, p)))
876 return rv; 876 return rv;
877 877
878 if ( ntimeout > ULONG_MAX/HZ ) 878 if (ntimeout > ULONG_MAX/HZ)
879 sbi->exp_timeout = 0; 879 sbi->exp_timeout = 0;
880 else 880 else
881 sbi->exp_timeout = ntimeout * HZ; 881 sbi->exp_timeout = ntimeout * HZ;
@@ -906,7 +906,7 @@ static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p)
906 DPRINTK("returning %d", sbi->needs_reghost); 906 DPRINTK("returning %d", sbi->needs_reghost);
907 907
908 status = put_user(sbi->needs_reghost, p); 908 status = put_user(sbi->needs_reghost, p);
909 if ( status ) 909 if (status)
910 return status; 910 return status;
911 911
912 sbi->needs_reghost = 0; 912 sbi->needs_reghost = 0;
@@ -975,11 +975,11 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
975 DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", 975 DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
976 cmd,arg,sbi,process_group(current)); 976 cmd,arg,sbi,process_group(current));
977 977
978 if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || 978 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
979 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) 979 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
980 return -ENOTTY; 980 return -ENOTTY;
981 981
982 if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) 982 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
983 return -EPERM; 983 return -EPERM;
984 984
985 switch(cmd) { 985 switch(cmd) {
diff --git a/fs/compat.c b/fs/compat.c
index 9cf75df9b2bb..7b21b0a82596 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -46,6 +46,7 @@
46#include <linux/tsacct_kern.h> 46#include <linux/tsacct_kern.h>
47#include <linux/security.h> 47#include <linux/security.h>
48#include <linux/highmem.h> 48#include <linux/highmem.h>
49#include <linux/signal.h>
49#include <linux/poll.h> 50#include <linux/poll.h>
50#include <linux/mm.h> 51#include <linux/mm.h>
51#include <linux/eventpoll.h> 52#include <linux/eventpoll.h>
@@ -2199,3 +2200,51 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
2199#endif /* TIF_RESTORE_SIGMASK */ 2200#endif /* TIF_RESTORE_SIGMASK */
2200 2201
2201#endif /* CONFIG_EPOLL */ 2202#endif /* CONFIG_EPOLL */
2203
2204#ifdef CONFIG_SIGNALFD
2205
2206asmlinkage long compat_sys_signalfd(int ufd,
2207 const compat_sigset_t __user *sigmask,
2208 compat_size_t sigsetsize)
2209{
2210 compat_sigset_t ss32;
2211 sigset_t tmp;
2212 sigset_t __user *ksigmask;
2213
2214 if (sigsetsize != sizeof(compat_sigset_t))
2215 return -EINVAL;
2216 if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
2217 return -EFAULT;
2218 sigset_from_compat(&tmp, &ss32);
2219 ksigmask = compat_alloc_user_space(sizeof(sigset_t));
2220 if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
2221 return -EFAULT;
2222
2223 return sys_signalfd(ufd, ksigmask, sizeof(sigset_t));
2224}
2225
2226#endif /* CONFIG_SIGNALFD */
2227
2228#ifdef CONFIG_TIMERFD
2229
2230asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags,
2231 const struct compat_itimerspec __user *utmr)
2232{
2233 long res;
2234 struct itimerspec t;
2235 struct itimerspec __user *ut;
2236
2237 res = -EFAULT;
2238 if (get_compat_itimerspec(&t, utmr))
2239 goto err_exit;
2240 ut = compat_alloc_user_space(sizeof(*ut));
2241 if (copy_to_user(ut, &t, sizeof(t)))
2242 goto err_exit;
2243
2244 res = sys_timerfd(ufd, clockid, flags, ut);
2245err_exit:
2246 return res;
2247}
2248
2249#endif /* CONFIG_TIMERFD */
2250
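
Both compat wrappers share one shape: copy the 32-bit layout in, convert it, stage the native layout back in user-accessible memory with compat_alloc_user_space(), then call the native syscall. A sketch of that shape for an invented syscall (every foo_* name below is hypothetical):

#include <linux/linkage.h>
#include <linux/compat.h>
#include <asm/uaccess.h>

struct foo_spec { long a; long b; };		/* invented native layout */
struct compat_foo_spec { int a; int b; };	/* invented 32-bit layout */

asmlinkage long sys_foo(int ufd, struct foo_spec __user *spec);

/* Convert the 32-bit user structure to the native layout. */
static int get_compat_foo_spec(struct foo_spec *dst,
			       const struct compat_foo_spec __user *src)
{
	struct compat_foo_spec c;

	if (copy_from_user(&c, src, sizeof(c)))
		return -EFAULT;
	dst->a = c.a;
	dst->b = c.b;
	return 0;
}

asmlinkage long compat_sys_foo(int ufd, const struct compat_foo_spec __user *uspec)
{
	struct foo_spec spec;
	struct foo_spec __user *kspec;

	if (get_compat_foo_spec(&spec, uspec))
		return -EFAULT;

	/*
	 * The native syscall takes a __user pointer, so park the converted
	 * structure on the compat user stack instead of passing a kernel
	 * address down.
	 */
	kspec = compat_alloc_user_space(sizeof(*kspec));
	if (copy_to_user(kspec, &spec, sizeof(spec)))
		return -EFAULT;

	return sys_foo(ufd, kspec);
}
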
diff --git a/fs/eventfd.c b/fs/eventfd.c
new file mode 100644
index 000000000000..480e2b3c4166
--- /dev/null
+++ b/fs/eventfd.c
@@ -0,0 +1,228 @@
1/*
2 * fs/eventfd.c
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 *
6 */
7
8#include <linux/file.h>
9#include <linux/poll.h>
10#include <linux/init.h>
11#include <linux/fs.h>
12#include <linux/sched.h>
13#include <linux/kernel.h>
14#include <linux/list.h>
15#include <linux/spinlock.h>
16#include <linux/anon_inodes.h>
17#include <linux/eventfd.h>
18
19struct eventfd_ctx {
20 spinlock_t lock;
21 wait_queue_head_t wqh;
22 /*
23 * Every time that a write(2) is performed on an eventfd, the
24 * value of the __u64 being written is added to "count" and a
25 * wakeup is performed on "wqh". A read(2) will return the "count"
26 * value to userspace, and will reset "count" to zero. The
27 * kernel-side eventfd_signal() also adds to the "count" counter and
28 * issues a wakeup.
29 */
30 __u64 count;
31};
32
33/*
34 * Adds "n" to the eventfd counter "count". Returns "n" in case of
35 * success, or a value lower than "n" in case of counter overflow.
36 * This function is supposed to be called by the kernel in paths
37 * that do not allow sleeping. In this function we allow the counter
38 * to reach the ULLONG_MAX value, and we signal this overflow
39 * condition by returning POLLERR to poll(2).
40 */
41int eventfd_signal(struct file *file, int n)
42{
43 struct eventfd_ctx *ctx = file->private_data;
44 unsigned long flags;
45
46 if (n < 0)
47 return -EINVAL;
48 spin_lock_irqsave(&ctx->lock, flags);
49 if (ULLONG_MAX - ctx->count < n)
50 n = (int) (ULLONG_MAX - ctx->count);
51 ctx->count += n;
52 if (waitqueue_active(&ctx->wqh))
53 wake_up_locked(&ctx->wqh);
54 spin_unlock_irqrestore(&ctx->lock, flags);
55
56 return n;
57}
58
59static int eventfd_release(struct inode *inode, struct file *file)
60{
61 kfree(file->private_data);
62 return 0;
63}
64
65static unsigned int eventfd_poll(struct file *file, poll_table *wait)
66{
67 struct eventfd_ctx *ctx = file->private_data;
68 unsigned int events = 0;
69 unsigned long flags;
70
71 poll_wait(file, &ctx->wqh, wait);
72
73 spin_lock_irqsave(&ctx->lock, flags);
74 if (ctx->count > 0)
75 events |= POLLIN;
76 if (ctx->count == ULLONG_MAX)
77 events |= POLLERR;
78 if (ULLONG_MAX - 1 > ctx->count)
79 events |= POLLOUT;
80 spin_unlock_irqrestore(&ctx->lock, flags);
81
82 return events;
83}
84
85static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
86 loff_t *ppos)
87{
88 struct eventfd_ctx *ctx = file->private_data;
89 ssize_t res;
90 __u64 ucnt;
91 DECLARE_WAITQUEUE(wait, current);
92
93 if (count < sizeof(ucnt))
94 return -EINVAL;
95 spin_lock_irq(&ctx->lock);
96 res = -EAGAIN;
97 ucnt = ctx->count;
98 if (ucnt > 0)
99 res = sizeof(ucnt);
100 else if (!(file->f_flags & O_NONBLOCK)) {
101 __add_wait_queue(&ctx->wqh, &wait);
102 for (res = 0;;) {
103 set_current_state(TASK_INTERRUPTIBLE);
104 if (ctx->count > 0) {
105 ucnt = ctx->count;
106 res = sizeof(ucnt);
107 break;
108 }
109 if (signal_pending(current)) {
110 res = -ERESTARTSYS;
111 break;
112 }
113 spin_unlock_irq(&ctx->lock);
114 schedule();
115 spin_lock_irq(&ctx->lock);
116 }
117 __remove_wait_queue(&ctx->wqh, &wait);
118 __set_current_state(TASK_RUNNING);
119 }
120 if (res > 0) {
121 ctx->count = 0;
122 if (waitqueue_active(&ctx->wqh))
123 wake_up_locked(&ctx->wqh);
124 }
125 spin_unlock_irq(&ctx->lock);
126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
127 return -EFAULT;
128
129 return res;
130}
131
132static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
133 loff_t *ppos)
134{
135 struct eventfd_ctx *ctx = file->private_data;
136 ssize_t res;
137 __u64 ucnt;
138 DECLARE_WAITQUEUE(wait, current);
139
140 if (count < sizeof(ucnt))
141 return -EINVAL;
142 if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
143 return -EFAULT;
144 if (ucnt == ULLONG_MAX)
145 return -EINVAL;
146 spin_lock_irq(&ctx->lock);
147 res = -EAGAIN;
148 if (ULLONG_MAX - ctx->count > ucnt)
149 res = sizeof(ucnt);
150 else if (!(file->f_flags & O_NONBLOCK)) {
151 __add_wait_queue(&ctx->wqh, &wait);
152 for (res = 0;;) {
153 set_current_state(TASK_INTERRUPTIBLE);
154 if (ULLONG_MAX - ctx->count > ucnt) {
155 res = sizeof(ucnt);
156 break;
157 }
158 if (signal_pending(current)) {
159 res = -ERESTARTSYS;
160 break;
161 }
162 spin_unlock_irq(&ctx->lock);
163 schedule();
164 spin_lock_irq(&ctx->lock);
165 }
166 __remove_wait_queue(&ctx->wqh, &wait);
167 __set_current_state(TASK_RUNNING);
168 }
169 if (res > 0) {
170 ctx->count += ucnt;
171 if (waitqueue_active(&ctx->wqh))
172 wake_up_locked(&ctx->wqh);
173 }
174 spin_unlock_irq(&ctx->lock);
175
176 return res;
177}
178
179static const struct file_operations eventfd_fops = {
180 .release = eventfd_release,
181 .poll = eventfd_poll,
182 .read = eventfd_read,
183 .write = eventfd_write,
184};
185
186struct file *eventfd_fget(int fd)
187{
188 struct file *file;
189
190 file = fget(fd);
191 if (!file)
192 return ERR_PTR(-EBADF);
193 if (file->f_op != &eventfd_fops) {
194 fput(file);
195 return ERR_PTR(-EINVAL);
196 }
197
198 return file;
199}
200
201asmlinkage long sys_eventfd(unsigned int count)
202{
203 int error, fd;
204 struct eventfd_ctx *ctx;
205 struct file *file;
206 struct inode *inode;
207
208 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
209 if (!ctx)
210 return -ENOMEM;
211
212 init_waitqueue_head(&ctx->wqh);
213 spin_lock_init(&ctx->lock);
214 ctx->count = count;
215
216 /*
217 * When we call this, the initialization must be complete, since
218 * anon_inode_getfd() will install the fd.
219 */
220 error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]",
221 &eventfd_fops, ctx);
222 if (!error)
223 return fd;
224
225 kfree(ctx);
226 return error;
227}
228
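
A minimal userspace demo of the counter semantics documented above (add on write(2), return-and-reset on read(2), always in full 8-byte units). It assumes a kernel and headers with the new syscall wired up; there is no glibc wrapper at this point, so the raw syscall number is used:

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	uint64_t v;
	int fd = syscall(__NR_eventfd, 0);	/* start the counter at 0 */

	if (fd < 0) {
		perror("eventfd");
		return 1;
	}

	v = 3;
	write(fd, &v, sizeof(v));	/* count += 3 */
	v = 4;
	write(fd, &v, sizeof(v));	/* count += 4 */

	read(fd, &v, sizeof(v));	/* v becomes 7; the counter resets to 0 */
	printf("counter was %llu\n", (unsigned long long) v);

	close(fd);
	return 0;
}
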
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b5c7ca584939..1aad34ea61a4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -11,7 +11,6 @@
11 * 11 *
12 */ 12 */
13 13
14#include <linux/module.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/kernel.h> 15#include <linux/kernel.h>
17#include <linux/sched.h> 16#include <linux/sched.h>
@@ -34,6 +33,7 @@
34#include <linux/mount.h> 33#include <linux/mount.h>
35#include <linux/bitops.h> 34#include <linux/bitops.h>
36#include <linux/mutex.h> 35#include <linux/mutex.h>
36#include <linux/anon_inodes.h>
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/system.h> 38#include <asm/system.h>
39#include <asm/io.h> 39#include <asm/io.h>
@@ -41,7 +41,6 @@
41#include <asm/atomic.h> 41#include <asm/atomic.h>
42#include <asm/semaphore.h> 42#include <asm/semaphore.h>
43 43
44
45/* 44/*
46 * LOCKING: 45 * LOCKING:
47 * There are three level of locking required by epoll : 46 * There are three level of locking required by epoll :
@@ -74,9 +73,6 @@
74 * a greater scalability. 73 * a greater scalability.
75 */ 74 */
76 75
77
78#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */
79
80#define DEBUG_EPOLL 0 76#define DEBUG_EPOLL 0
81 77
82#if DEBUG_EPOLL > 0 78#if DEBUG_EPOLL > 0
@@ -106,7 +102,6 @@
106 102
107#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 103#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
108 104
109
110struct epoll_filefd { 105struct epoll_filefd {
111 struct file *file; 106 struct file *file;
112 int fd; 107 int fd;
@@ -224,43 +219,6 @@ struct ep_pqueue {
224 struct epitem *epi; 219 struct epitem *epi;
225}; 220};
226 221
227
228
229static void ep_poll_safewake_init(struct poll_safewake *psw);
230static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
231static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
232 struct eventpoll *ep);
233static int ep_alloc(struct eventpoll **pep);
234static void ep_free(struct eventpoll *ep);
235static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
236static void ep_use_epitem(struct epitem *epi);
237static void ep_release_epitem(struct epitem *epi);
238static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
239 poll_table *pt);
240static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
241static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
242 struct file *tfile, int fd);
243static int ep_modify(struct eventpoll *ep, struct epitem *epi,
244 struct epoll_event *event);
245static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
246static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
247static int ep_remove(struct eventpoll *ep, struct epitem *epi);
248static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
249static int ep_eventpoll_close(struct inode *inode, struct file *file);
250static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
251static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
252 struct epoll_event __user *events, int maxevents);
253static int ep_events_transfer(struct eventpoll *ep,
254 struct epoll_event __user *events,
255 int maxevents);
256static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
257 int maxevents, long timeout);
258static int eventpollfs_delete_dentry(struct dentry *dentry);
259static struct inode *ep_eventpoll_inode(void);
260static int eventpollfs_get_sb(struct file_system_type *fs_type,
261 int flags, const char *dev_name,
262 void *data, struct vfsmount *mnt);
263
264/* 222/*
265 * This semaphore is used to serialize ep_free() and eventpoll_release_file(). 223 * This semaphore is used to serialize ep_free() and eventpoll_release_file().
266 */ 224 */
@@ -275,37 +233,6 @@ static struct kmem_cache *epi_cache __read_mostly;
275/* Slab cache used to allocate "struct eppoll_entry" */ 233/* Slab cache used to allocate "struct eppoll_entry" */
276static struct kmem_cache *pwq_cache __read_mostly; 234static struct kmem_cache *pwq_cache __read_mostly;
277 235
278/* Virtual fs used to allocate inodes for eventpoll files */
279static struct vfsmount *eventpoll_mnt __read_mostly;
280
281/* File callbacks that implement the eventpoll file behaviour */
282static const struct file_operations eventpoll_fops = {
283 .release = ep_eventpoll_close,
284 .poll = ep_eventpoll_poll
285};
286
287/*
288 * This is used to register the virtual file system from where
289 * eventpoll inodes are allocated.
290 */
291static struct file_system_type eventpoll_fs_type = {
292 .name = "eventpollfs",
293 .get_sb = eventpollfs_get_sb,
294 .kill_sb = kill_anon_super,
295};
296
297/* Very basic directory entry operations for the eventpoll virtual file system */
298static struct dentry_operations eventpollfs_dentry_operations = {
299 .d_delete = eventpollfs_delete_dentry,
300};
301
302
303
304/* Fast test to see if the file is an evenpoll file */
305static inline int is_file_epoll(struct file *f)
306{
307 return f->f_op == &eventpoll_fops;
308}
309 236
310/* Setup the structure that is used as key for the rb-tree */ 237/* Setup the structure that is used as key for the rb-tree */
311static inline void ep_set_ffd(struct epoll_filefd *ffd, 238static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -374,7 +301,6 @@ static void ep_poll_safewake_init(struct poll_safewake *psw)
374 spin_lock_init(&psw->lock); 301 spin_lock_init(&psw->lock);
375} 302}
376 303
377
378/* 304/*
379 * Perform a safe wake up of the poll wait list. The problem is that 305 * Perform a safe wake up of the poll wait list. The problem is that
380 * with the new callback'd wake up system, it is possible that the 306 * with the new callback'd wake up system, it is possible that the
@@ -429,399 +355,144 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
429 spin_unlock_irqrestore(&psw->lock, flags); 355 spin_unlock_irqrestore(&psw->lock, flags);
430} 356}
431 357
432
433/* 358/*
434 * This is called from eventpoll_release() to unlink files from the eventpoll 359 * This function unregisters poll callbacks from the associated file descriptor.
435 * interface. We need to have this facility to cleanup correctly files that are 360 * Since this must be called without holding "ep->lock" the atomic exchange trick
436 * closed without being removed from the eventpoll interface. 361 * will protect us from multiple unregisters.
437 */ 362 */
438void eventpoll_release_file(struct file *file) 363static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
439{ 364{
440 struct list_head *lsthead = &file->f_ep_links; 365 int nwait;
441 struct eventpoll *ep; 366 struct list_head *lsthead = &epi->pwqlist;
442 struct epitem *epi; 367 struct eppoll_entry *pwq;
443 368
444 /* 369 /* This is called without locks, so we need the atomic exchange */
445 * We don't want to get "file->f_ep_lock" because it is not 370 nwait = xchg(&epi->nwait, 0);
446 * necessary. It is not necessary because we're in the "struct file"
447 * cleanup path, and this means that noone is using this file anymore.
448 * The only hit might come from ep_free() but by holding the semaphore
449 * will correctly serialize the operation. We do need to acquire
450 * "ep->sem" after "epmutex" because ep_remove() requires it when called
451 * from anywhere but ep_free().
452 */
453 mutex_lock(&epmutex);
454 371
455 while (!list_empty(lsthead)) { 372 if (nwait) {
456 epi = list_first_entry(lsthead, struct epitem, fllink); 373 while (!list_empty(lsthead)) {
374 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
457 375
458 ep = epi->ep; 376 list_del_init(&pwq->llink);
459 list_del_init(&epi->fllink); 377 remove_wait_queue(pwq->whead, &pwq->wait);
460 down_write(&ep->sem); 378 kmem_cache_free(pwq_cache, pwq);
461 ep_remove(ep, epi); 379 }
462 up_write(&ep->sem);
463 } 380 }
464
465 mutex_unlock(&epmutex);
466} 381}
467 382
468
469/* 383/*
470 * It opens an eventpoll file descriptor by suggesting a storage of "size" 384 * Unlink the "struct epitem" from all places it might have been hooked up.
471 * file descriptors. The size parameter is just an hint about how to size 385 * This function must be called with write IRQ lock on "ep->lock".
472 * data structures. It won't prevent the user to store more than "size"
473 * file descriptors inside the epoll interface. It is the kernel part of
474 * the userspace epoll_create(2).
475 */ 386 */
476asmlinkage long sys_epoll_create(int size) 387static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
477{ 388{
478 int error, fd = -1; 389 int error;
479 struct eventpoll *ep;
480 struct inode *inode;
481 struct file *file;
482
483 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
484 current, size));
485
486 /*
487 * Sanity check on the size parameter, and create the internal data
488 * structure ( "struct eventpoll" ).
489 */
490 error = -EINVAL;
491 if (size <= 0 || (error = ep_alloc(&ep)) != 0)
492 goto eexit_1;
493 390
494 /* 391 /*
495 * Creates all the items needed to setup an eventpoll file. That is, 392 * It can happen that this one is called for an item already unlinked.
496 * a file structure, and inode and a free file descriptor. 393 * The check protects us from doing a double unlink ( crash ).
497 */ 394 */
498 error = ep_getfd(&fd, &inode, &file, ep); 395 error = -ENOENT;
499 if (error) 396 if (!ep_rb_linked(&epi->rbn))
500 goto eexit_2; 397 goto error_return;
501
502 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
503 current, size, fd));
504
505 return fd;
506
507eexit_2:
508 ep_free(ep);
509 kfree(ep);
510eexit_1:
511 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
512 current, size, error));
513 return error;
514}
515
516
517/*
518 * The following function implements the controller interface for
519 * the eventpoll file that enables the insertion/removal/change of
520 * file descriptors inside the interest set. It represents
521 * the kernel part of the user space epoll_ctl(2).
522 */
523asmlinkage long
524sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
525{
526 int error;
527 struct file *file, *tfile;
528 struct eventpoll *ep;
529 struct epitem *epi;
530 struct epoll_event epds;
531
532 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
533 current, epfd, op, fd, event));
534
535 error = -EFAULT;
536 if (ep_op_has_event(op) &&
537 copy_from_user(&epds, event, sizeof(struct epoll_event)))
538 goto eexit_1;
539
540 /* Get the "struct file *" for the eventpoll file */
541 error = -EBADF;
542 file = fget(epfd);
543 if (!file)
544 goto eexit_1;
545
546 /* Get the "struct file *" for the target file */
547 tfile = fget(fd);
548 if (!tfile)
549 goto eexit_2;
550
551 /* The target file descriptor must support poll */
552 error = -EPERM;
553 if (!tfile->f_op || !tfile->f_op->poll)
554 goto eexit_3;
555 398
556 /* 399 /*
557 * We have to check that the file structure underneath the file descriptor 400 * Clear the event mask for the unlinked item. This will avoid item
558 * the user passed to us _is_ an eventpoll file. And also we do not permit 401 * notifications to be sent after the unlink operation from inside
559 * adding an epoll file descriptor inside itself. 402 * the kernel->userspace event transfer loop.
560 */ 403 */
561 error = -EINVAL; 404 epi->event.events = 0;
562 if (file == tfile || !is_file_epoll(file))
563 goto eexit_3;
564 405
565 /* 406 /*
566 * At this point it is safe to assume that the "private_data" contains 407 * At this point it is safe to do the job, unlink the item from our rb-tree.
567 * our own data structure. 408 * This operation together with the above check closes the door to
409 * double unlinks.
568 */ 410 */
569 ep = file->private_data; 411 ep_rb_erase(&epi->rbn, &ep->rbr);
570
571 down_write(&ep->sem);
572
573 /* Try to lookup the file inside our RB tree */
574 epi = ep_find(ep, tfile, fd);
575
576 error = -EINVAL;
577 switch (op) {
578 case EPOLL_CTL_ADD:
579 if (!epi) {
580 epds.events |= POLLERR | POLLHUP;
581
582 error = ep_insert(ep, &epds, tfile, fd);
583 } else
584 error = -EEXIST;
585 break;
586 case EPOLL_CTL_DEL:
587 if (epi)
588 error = ep_remove(ep, epi);
589 else
590 error = -ENOENT;
591 break;
592 case EPOLL_CTL_MOD:
593 if (epi) {
594 epds.events |= POLLERR | POLLHUP;
595 error = ep_modify(ep, epi, &epds);
596 } else
597 error = -ENOENT;
598 break;
599 }
600 412
601 /* 413 /*
602 * The function ep_find() increments the usage count of the structure 414 * If the item we are going to remove is inside the ready file descriptors,
603 * so, if this is not NULL, we need to release it. 415 * we want to remove it from this list to avoid stale events.
604 */ 416 */
605 if (epi) 417 if (ep_is_linked(&epi->rdllink))
606 ep_release_epitem(epi); 418 list_del_init(&epi->rdllink);
607 419
608 up_write(&ep->sem); 420 error = 0;
421error_return:
609 422
610eexit_3: 423 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
611 fput(tfile); 424 current, ep, epi->ffd.file, error));
612eexit_2:
613 fput(file);
614eexit_1:
615 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
616 current, epfd, op, fd, event, error));
617 425
618 return error; 426 return error;
619} 427}
620 428
621
622/* 429/*
623 * Implement the event wait interface for the eventpoll file. It is the kernel 430 * Increment the usage count of the "struct epitem", making sure
624 * part of the user space epoll_wait(2). 431 * that the user will have a valid pointer to reference.
625 */ 432 */
626asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 433static void ep_use_epitem(struct epitem *epi)
627 int maxevents, int timeout)
628{ 434{
629 int error; 435 atomic_inc(&epi->usecnt);
630 struct file *file;
631 struct eventpoll *ep;
632
633 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
634 current, epfd, events, maxevents, timeout));
635
636 /* The maximum number of event must be greater than zero */
637 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
638 return -EINVAL;
639
640 /* Verify that the area passed by the user is writeable */
641 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
642 error = -EFAULT;
643 goto eexit_1;
644 }
645
646 /* Get the "struct file *" for the eventpoll file */
647 error = -EBADF;
648 file = fget(epfd);
649 if (!file)
650 goto eexit_1;
651
652 /*
653 * We have to check that the file structure underneath the fd
654 * the user passed to us _is_ an eventpoll file.
655 */
656 error = -EINVAL;
657 if (!is_file_epoll(file))
658 goto eexit_2;
659
660 /*
661 * At this point it is safe to assume that the "private_data" contains
662 * our own data structure.
663 */
664 ep = file->private_data;
665
666 /* Time to fish for events ... */
667 error = ep_poll(ep, events, maxevents, timeout);
668
669eexit_2:
670 fput(file);
671eexit_1:
672 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
673 current, epfd, events, maxevents, timeout, error));
674
675 return error;
676} 436}
677 437
678
679#ifdef TIF_RESTORE_SIGMASK
680
681/* 438/*
682 * Implement the event wait interface for the eventpoll file. It is the kernel 439 * Decrement ( release ) the usage count by signaling that the user
683 * part of the user space epoll_pwait(2). 440 * has finished using the structure. It might lead to freeing the
441 * structure itself if the count goes to zero.
684 */ 442 */
685asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, 443static void ep_release_epitem(struct epitem *epi)
686 int maxevents, int timeout, const sigset_t __user *sigmask,
687 size_t sigsetsize)
688{ 444{
689 int error; 445 if (atomic_dec_and_test(&epi->usecnt))
690 sigset_t ksigmask, sigsaved; 446 kmem_cache_free(epi_cache, epi);
691
692 /*
693 * If the caller wants a certain signal mask to be set during the wait,
694 * we apply it here.
695 */
696 if (sigmask) {
697 if (sigsetsize != sizeof(sigset_t))
698 return -EINVAL;
699 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
700 return -EFAULT;
701 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
702 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
703 }
704
705 error = sys_epoll_wait(epfd, events, maxevents, timeout);
706
707 /*
708 * If we changed the signal mask, we need to restore the original one.
709 * In case we've got a signal while waiting, we do not restore the
710 * signal mask yet, and we allow do_signal() to deliver the signal on
711 * the way back to userspace, before the signal mask is restored.
712 */
713 if (sigmask) {
714 if (error == -EINTR) {
715 memcpy(&current->saved_sigmask, &sigsaved,
716 sizeof(sigsaved));
717 set_thread_flag(TIF_RESTORE_SIGMASK);
718 } else
719 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
720 }
721
722 return error;
723} 447}
724 448
725#endif /* #ifdef TIF_RESTORE_SIGMASK */
726
727
728/* 449/*
729 * Creates the file descriptor to be used by the epoll interface. 450 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
451 * all the associated resources.
730 */ 452 */
731static int ep_getfd(int *efd, struct inode **einode, struct file **efile, 453static int ep_remove(struct eventpoll *ep, struct epitem *epi)
732 struct eventpoll *ep)
733{ 454{
734 struct qstr this; 455 int error;
735 char name[32]; 456 unsigned long flags;
736 struct dentry *dentry; 457 struct file *file = epi->ffd.file;
737 struct inode *inode;
738 struct file *file;
739 int error, fd;
740
741 /* Get an ready to use file */
742 error = -ENFILE;
743 file = get_empty_filp();
744 if (!file)
745 goto eexit_1;
746
747 /* Allocates an inode from the eventpoll file system */
748 inode = ep_eventpoll_inode();
749 if (IS_ERR(inode)) {
750 error = PTR_ERR(inode);
751 goto eexit_2;
752 }
753
754 /* Allocates a free descriptor to plug the file onto */
755 error = get_unused_fd();
756 if (error < 0)
757 goto eexit_3;
758 fd = error;
759 458
760 /* 459 /*
761 * Link the inode to a directory entry by creating a unique name 460 * Removes poll wait queue hooks. We _have_ to do this without holding
762 * using the inode number. 461 * the "ep->lock" otherwise a deadlock might occur. This is because of the
462 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
463 * queue head lock when unregistering the wait queue. The wakeup callback
464 * will run by holding the wait queue head lock and will call our callback
465 * that will try to get "ep->lock".
763 */ 466 */
764 error = -ENOMEM; 467 ep_unregister_pollwait(ep, epi);
765 sprintf(name, "[%lu]", inode->i_ino);
766 this.name = name;
767 this.len = strlen(name);
768 this.hash = inode->i_ino;
769 dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
770 if (!dentry)
771 goto eexit_4;
772 dentry->d_op = &eventpollfs_dentry_operations;
773 d_add(dentry, inode);
774 file->f_path.mnt = mntget(eventpoll_mnt);
775 file->f_path.dentry = dentry;
776 file->f_mapping = inode->i_mapping;
777
778 file->f_pos = 0;
779 file->f_flags = O_RDONLY;
780 file->f_op = &eventpoll_fops;
781 file->f_mode = FMODE_READ;
782 file->f_version = 0;
783 file->private_data = ep;
784
785 /* Install the new setup file into the allocated fd. */
786 fd_install(fd, file);
787
788 *efd = fd;
789 *einode = inode;
790 *efile = file;
791 return 0;
792 468
793eexit_4: 469 /* Remove the current item from the list of epoll hooks */
794 put_unused_fd(fd); 470 spin_lock(&file->f_ep_lock);
795eexit_3: 471 if (ep_is_linked(&epi->fllink))
796 iput(inode); 472 list_del_init(&epi->fllink);
797eexit_2: 473 spin_unlock(&file->f_ep_lock);
798 put_filp(file);
799eexit_1:
800 return error;
801}
802 474
475 /* We need to acquire the write IRQ lock before calling ep_unlink() */
476 write_lock_irqsave(&ep->lock, flags);
803 477
804static int ep_alloc(struct eventpoll **pep) 478 /* Really unlink the item from the RB tree */
805{ 479 error = ep_unlink(ep, epi);
806 struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
807 480
808 if (!ep) 481 write_unlock_irqrestore(&ep->lock, flags);
809 return -ENOMEM;
810 482
811 rwlock_init(&ep->lock); 483 if (error)
812 init_rwsem(&ep->sem); 484 goto error_return;
813 init_waitqueue_head(&ep->wq);
814 init_waitqueue_head(&ep->poll_wait);
815 INIT_LIST_HEAD(&ep->rdllist);
816 ep->rbr = RB_ROOT;
817 485
818 *pep = ep; 486 /* At this point it is safe to free the eventpoll item */
487 ep_release_epitem(epi);
819 488
820 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", 489 error = 0;
821 current, ep)); 490error_return:
822 return 0; 491 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
823} 492 current, ep, file, error));
824 493
494 return error;
495}
825 496
826static void ep_free(struct eventpoll *ep) 497static void ep_free(struct eventpoll *ep)
827{ 498{
@@ -865,6 +536,104 @@ static void ep_free(struct eventpoll *ep)
865 mutex_unlock(&epmutex); 536 mutex_unlock(&epmutex);
866} 537}
867 538
539static int ep_eventpoll_release(struct inode *inode, struct file *file)
540{
541 struct eventpoll *ep = file->private_data;
542
543 if (ep) {
544 ep_free(ep);
545 kfree(ep);
546 }
547
548 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
549 return 0;
550}
551
552static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
553{
554 unsigned int pollflags = 0;
555 unsigned long flags;
556 struct eventpoll *ep = file->private_data;
557
558 /* Insert inside our poll wait queue */
559 poll_wait(file, &ep->poll_wait, wait);
560
561 /* Check our condition */
562 read_lock_irqsave(&ep->lock, flags);
563 if (!list_empty(&ep->rdllist))
564 pollflags = POLLIN | POLLRDNORM;
565 read_unlock_irqrestore(&ep->lock, flags);
566
567 return pollflags;
568}
569
570/* File callbacks that implement the eventpoll file behaviour */
571static const struct file_operations eventpoll_fops = {
572 .release = ep_eventpoll_release,
573 .poll = ep_eventpoll_poll
574};
575
576/* Fast test to see if the file is an eventpoll file */
577static inline int is_file_epoll(struct file *f)
578{
579 return f->f_op == &eventpoll_fops;
580}
581
582/*
583 * This is called from eventpoll_release() to unlink files from the eventpoll
584 * interface. We need to have this facility to correctly clean up files that are
585 * closed without being removed from the eventpoll interface.
586 */
587void eventpoll_release_file(struct file *file)
588{
589 struct list_head *lsthead = &file->f_ep_links;
590 struct eventpoll *ep;
591 struct epitem *epi;
592
593 /*
594 * We don't want to get "file->f_ep_lock" because it is not
595 * necessary. It is not necessary because we're in the "struct file"
596 * cleanup path, and this means that no one is using this file anymore.
597 * The only hit might come from ep_free(), but holding the semaphore
598 * will correctly serialize the operation. We do need to acquire
599 * "ep->sem" after "epmutex" because ep_remove() requires it when called
600 * from anywhere but ep_free().
601 */
602 mutex_lock(&epmutex);
603
604 while (!list_empty(lsthead)) {
605 epi = list_first_entry(lsthead, struct epitem, fllink);
606
607 ep = epi->ep;
608 list_del_init(&epi->fllink);
609 down_write(&ep->sem);
610 ep_remove(ep, epi);
611 up_write(&ep->sem);
612 }
613
614 mutex_unlock(&epmutex);
615}
616
617static int ep_alloc(struct eventpoll **pep)
618{
619 struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
620
621 if (!ep)
622 return -ENOMEM;
623
624 rwlock_init(&ep->lock);
625 init_rwsem(&ep->sem);
626 init_waitqueue_head(&ep->wq);
627 init_waitqueue_head(&ep->poll_wait);
628 INIT_LIST_HEAD(&ep->rdllist);
629 ep->rbr = RB_ROOT;
630
631 *pep = ep;
632
633 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
634 current, ep));
635 return 0;
636}
868 637
869/* 638/*
870 * Search the file inside the eventpoll tree. It adds a usage count to 639 * Search the file inside the eventpoll tree. It adds a usage count to
@@ -902,30 +671,58 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
902 return epir; 671 return epir;
903} 672}
904 673
905
906/* 674/*
907 * Increment the usage count of the "struct epitem" making it sure 675 * This is the callback that is passed to the wait queue wakeup
908 * that the user will have a valid pointer to reference. 676 * mechanism. It is called by the stored file descriptors when they
677 * have events to report.
909 */ 678 */
910static void ep_use_epitem(struct epitem *epi) 679static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
911{ 680{
681 int pwake = 0;
682 unsigned long flags;
683 struct epitem *epi = ep_item_from_wait(wait);
684 struct eventpoll *ep = epi->ep;
912 685
913 atomic_inc(&epi->usecnt); 686 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
914} 687 current, epi->ffd.file, epi, ep));
915 688
689 write_lock_irqsave(&ep->lock, flags);
916 690
917/* 691 /*
918 * Decrement ( release ) the usage count by signaling that the user 692 * If the event mask does not contain any poll(2) event, we consider the
919 * has finished using the structure. It might lead to freeing the 693 * descriptor to be disabled. This condition is likely the effect of the
920 * structure itself if the count goes to zero. 694 * EPOLLONESHOT bit that disables the descriptor when an event is received,
921 */ 695 * until the next EPOLL_CTL_MOD will be issued.
922static void ep_release_epitem(struct epitem *epi) 696 */
923{ 697 if (!(epi->event.events & ~EP_PRIVATE_BITS))
698 goto is_disabled;
924 699
925 if (atomic_dec_and_test(&epi->usecnt)) 700 /* If this file is already in the ready list we exit soon */
926 kmem_cache_free(epi_cache, epi); 701 if (ep_is_linked(&epi->rdllink))
927} 702 goto is_linked;
928 703
704 list_add_tail(&epi->rdllink, &ep->rdllist);
705
706is_linked:
707 /*
708 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
709 * wait list.
710 */
711 if (waitqueue_active(&ep->wq))
712 __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
713 TASK_INTERRUPTIBLE);
714 if (waitqueue_active(&ep->poll_wait))
715 pwake++;
716
717is_disabled:
718 write_unlock_irqrestore(&ep->lock, flags);
719
720 /* We have to call this outside the lock */
721 if (pwake)
722 ep_poll_safewake(&psw, &ep->poll_wait);
723
724 return 1;
725}
929 726
930/* 727/*
931 * This is the callback that is used to add our wait queue to the 728 * This is the callback that is used to add our wait queue to the
@@ -950,7 +747,6 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
950 } 747 }
951} 748}
952 749
953
954static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) 750static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
955{ 751{
956 int kcmp; 752 int kcmp;
@@ -970,7 +766,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
970 rb_insert_color(&epi->rbn, &ep->rbr); 766 rb_insert_color(&epi->rbn, &ep->rbr);
971} 767}
972 768
973
974static int ep_insert(struct eventpoll *ep, struct epoll_event *event, 769static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
975 struct file *tfile, int fd) 770 struct file *tfile, int fd)
976{ 771{
@@ -981,7 +776,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
981 776
982 error = -ENOMEM; 777 error = -ENOMEM;
983 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 778 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
984 goto eexit_1; 779 goto error_return;
985 780
986 /* Item initialization follows here ... */ 781 /* Item initialization follows here ... */
987 ep_rb_initnode(&epi->rbn); 782 ep_rb_initnode(&epi->rbn);
@@ -1011,7 +806,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1011 * high memory pressure. 806 * high memory pressure.
1012 */ 807 */
1013 if (epi->nwait < 0) 808 if (epi->nwait < 0)
1014 goto eexit_2; 809 goto error_unregister;
1015 810
1016 /* Add the current item to the list of active epoll hook for this file */ 811 /* Add the current item to the list of active epoll hook for this file */
1017 spin_lock(&tfile->f_ep_lock); 812 spin_lock(&tfile->f_ep_lock);
@@ -1046,7 +841,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1046 841
1047 return 0; 842 return 0;
1048 843
1049eexit_2: 844error_unregister:
1050 ep_unregister_pollwait(ep, epi); 845 ep_unregister_pollwait(ep, epi);
1051 846
1052 /* 847 /*
@@ -1059,11 +854,10 @@ eexit_2:
1059 write_unlock_irqrestore(&ep->lock, flags); 854 write_unlock_irqrestore(&ep->lock, flags);
1060 855
1061 kmem_cache_free(epi_cache, epi); 856 kmem_cache_free(epi_cache, epi);
1062eexit_1: 857error_return:
1063 return error; 858 return error;
1064} 859}
1065 860
1066
1067/* 861/*
1068 * Modify the interest event mask by dropping an event if the new mask 862 * Modify the interest event mask by dropping an event if the new mask
1069 * has a match in the current file status. 863 * has a match in the current file status.
@@ -1126,216 +920,6 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
1126 return 0; 920 return 0;
1127} 921}
1128 922
1129
1130/*
1131 * This function unregister poll callbacks from the associated file descriptor.
1132 * Since this must be called without holding "ep->lock" the atomic exchange trick
1133 * will protect us from multiple unregister.
1134 */
1135static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
1136{
1137 int nwait;
1138 struct list_head *lsthead = &epi->pwqlist;
1139 struct eppoll_entry *pwq;
1140
1141 /* This is called without locks, so we need the atomic exchange */
1142 nwait = xchg(&epi->nwait, 0);
1143
1144 if (nwait) {
1145 while (!list_empty(lsthead)) {
1146 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
1147
1148 list_del_init(&pwq->llink);
1149 remove_wait_queue(pwq->whead, &pwq->wait);
1150 kmem_cache_free(pwq_cache, pwq);
1151 }
1152 }
1153}
1154
1155
1156/*
1157 * Unlink the "struct epitem" from all places it might have been hooked up.
1158 * This function must be called with write IRQ lock on "ep->lock".
1159 */
1160static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
1161{
1162 int error;
1163
1164 /*
1165 * It can happen that this one is called for an item already unlinked.
1166 * The check protect us from doing a double unlink ( crash ).
1167 */
1168 error = -ENOENT;
1169 if (!ep_rb_linked(&epi->rbn))
1170 goto eexit_1;
1171
1172 /*
1173 * Clear the event mask for the unlinked item. This will avoid item
1174 * notifications to be sent after the unlink operation from inside
1175 * the kernel->userspace event transfer loop.
1176 */
1177 epi->event.events = 0;
1178
1179 /*
1180 * At this point is safe to do the job, unlink the item from our rb-tree.
1181 * This operation togheter with the above check closes the door to
1182 * double unlinks.
1183 */
1184 ep_rb_erase(&epi->rbn, &ep->rbr);
1185
1186 /*
1187 * If the item we are going to remove is inside the ready file descriptors
1188 * we want to remove it from this list to avoid stale events.
1189 */
1190 if (ep_is_linked(&epi->rdllink))
1191 list_del_init(&epi->rdllink);
1192
1193 error = 0;
1194eexit_1:
1195
1196 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
1197 current, ep, epi->ffd.file, error));
1198
1199 return error;
1200}
1201
1202
1203/*
1204 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
1205 * all the associated resources.
1206 */
1207static int ep_remove(struct eventpoll *ep, struct epitem *epi)
1208{
1209 int error;
1210 unsigned long flags;
1211 struct file *file = epi->ffd.file;
1212
1213 /*
1214 * Removes poll wait queue hooks. We _have_ to do this without holding
1215 * the "ep->lock" otherwise a deadlock might occur. This because of the
1216 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
1217 * queue head lock when unregistering the wait queue. The wakeup callback
1218 * will run by holding the wait queue head lock and will call our callback
1219 * that will try to get "ep->lock".
1220 */
1221 ep_unregister_pollwait(ep, epi);
1222
1223 /* Remove the current item from the list of epoll hooks */
1224 spin_lock(&file->f_ep_lock);
1225 if (ep_is_linked(&epi->fllink))
1226 list_del_init(&epi->fllink);
1227 spin_unlock(&file->f_ep_lock);
1228
1229 /* We need to acquire the write IRQ lock before calling ep_unlink() */
1230 write_lock_irqsave(&ep->lock, flags);
1231
1232 /* Really unlink the item from the RB tree */
1233 error = ep_unlink(ep, epi);
1234
1235 write_unlock_irqrestore(&ep->lock, flags);
1236
1237 if (error)
1238 goto eexit_1;
1239
1240 /* At this point it is safe to free the eventpoll item */
1241 ep_release_epitem(epi);
1242
1243 error = 0;
1244eexit_1:
1245 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
1246 current, ep, file, error));
1247
1248 return error;
1249}
1250
1251
1252/*
1253 * This is the callback that is passed to the wait queue wakeup
1254 * machanism. It is called by the stored file descriptors when they
1255 * have events to report.
1256 */
1257static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
1258{
1259 int pwake = 0;
1260 unsigned long flags;
1261 struct epitem *epi = ep_item_from_wait(wait);
1262 struct eventpoll *ep = epi->ep;
1263
1264 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
1265 current, epi->ffd.file, epi, ep));
1266
1267 write_lock_irqsave(&ep->lock, flags);
1268
1269 /*
1270 * If the event mask does not contain any poll(2) event, we consider the
1271 * descriptor to be disabled. This condition is likely the effect of the
1272 * EPOLLONESHOT bit that disables the descriptor when an event is received,
1273 * until the next EPOLL_CTL_MOD will be issued.
1274 */
1275 if (!(epi->event.events & ~EP_PRIVATE_BITS))
1276 goto is_disabled;
1277
1278 /* If this file is already in the ready list we exit soon */
1279 if (ep_is_linked(&epi->rdllink))
1280 goto is_linked;
1281
1282 list_add_tail(&epi->rdllink, &ep->rdllist);
1283
1284is_linked:
1285 /*
1286 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
1287 * wait list.
1288 */
1289 if (waitqueue_active(&ep->wq))
1290 __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
1291 TASK_INTERRUPTIBLE);
1292 if (waitqueue_active(&ep->poll_wait))
1293 pwake++;
1294
1295is_disabled:
1296 write_unlock_irqrestore(&ep->lock, flags);
1297
1298 /* We have to call this outside the lock */
1299 if (pwake)
1300 ep_poll_safewake(&psw, &ep->poll_wait);
1301
1302 return 1;
1303}
1304
1305
1306static int ep_eventpoll_close(struct inode *inode, struct file *file)
1307{
1308 struct eventpoll *ep = file->private_data;
1309
1310 if (ep) {
1311 ep_free(ep);
1312 kfree(ep);
1313 }
1314
1315 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
1316 return 0;
1317}
1318
1319
1320static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
1321{
1322 unsigned int pollflags = 0;
1323 unsigned long flags;
1324 struct eventpoll *ep = file->private_data;
1325
1326 /* Insert inside our poll wait queue */
1327 poll_wait(file, &ep->poll_wait, wait);
1328
1329 /* Check our condition */
1330 read_lock_irqsave(&ep->lock, flags);
1331 if (!list_empty(&ep->rdllist))
1332 pollflags = POLLIN | POLLRDNORM;
1333 read_unlock_irqrestore(&ep->lock, flags);
1334
1335 return pollflags;
1336}
1337
1338
1339/* 923/*
1340 * This function is called without holding the "ep->lock" since the call to 924 * This function is called without holding the "ep->lock" since the call to
1341 * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ 925 * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
@@ -1447,7 +1031,6 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
1447 return eventcnt == 0 ? error: eventcnt; 1031 return eventcnt == 0 ? error: eventcnt;
1448} 1032}
1449 1033
1450
1451/* 1034/*
1452 * Perform the transfer of events to user space. 1035 * Perform the transfer of events to user space.
1453 */ 1036 */
@@ -1483,7 +1066,6 @@ static int ep_events_transfer(struct eventpoll *ep,
1483 return eventcnt; 1066 return eventcnt;
1484} 1067}
1485 1068
1486
1487static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1069static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1488 int maxevents, long timeout) 1070 int maxevents, long timeout)
1489{ 1071{
@@ -1553,52 +1135,262 @@ retry:
1553 return res; 1135 return res;
1554} 1136}
1555 1137
1556static int eventpollfs_delete_dentry(struct dentry *dentry) 1138/*
1139 * It opens an eventpoll file descriptor by suggesting a storage of "size"
1140 * file descriptors. The size parameter is just a hint about how to size
1141 * data structures. It won't prevent the user from storing more than "size"
1142 * file descriptors inside the epoll interface. It is the kernel part of
1143 * the userspace epoll_create(2).
1144 */
1145asmlinkage long sys_epoll_create(int size)
1557{ 1146{
1147 int error, fd = -1;
1148 struct eventpoll *ep;
1149 struct inode *inode;
1150 struct file *file;
1558 1151
1559 return 1; 1152 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1153 current, size));
1154
1155 /*
1156 * Sanity check on the size parameter, and create the internal data
1157 * structure ( "struct eventpoll" ).
1158 */
1159 error = -EINVAL;
1160 if (size <= 0 || (error = ep_alloc(&ep)) != 0)
1161 goto error_return;
1162
1163 /*
1164 * Creates all the items needed to setup an eventpoll file. That is,
1165 * a file structure, an inode and a free file descriptor.
1166 */
1167 error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]",
1168 &eventpoll_fops, ep);
1169 if (error)
1170 goto error_free;
1171
1172 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1173 current, size, fd));
1174
1175 return fd;
1176
1177error_free:
1178 ep_free(ep);
1179 kfree(ep);
1180error_return:
1181 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1182 current, size, error));
1183 return error;
1560} 1184}
1561 1185
1562static struct inode *ep_eventpoll_inode(void) 1186/*
1187 * The following function implements the controller interface for
1188 * the eventpoll file that enables the insertion/removal/change of
1189 * file descriptors inside the interest set. It represents
1190 * the kernel part of the user space epoll_ctl(2).
1191 */
1192asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
1193 struct epoll_event __user *event)
1563{ 1194{
1564 int error = -ENOMEM; 1195 int error;
1565 struct inode *inode = new_inode(eventpoll_mnt->mnt_sb); 1196 struct file *file, *tfile;
1197 struct eventpoll *ep;
1198 struct epitem *epi;
1199 struct epoll_event epds;
1200
1201 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1202 current, epfd, op, fd, event));
1203
1204 error = -EFAULT;
1205 if (ep_op_has_event(op) &&
1206 copy_from_user(&epds, event, sizeof(struct epoll_event)))
1207 goto error_return;
1208
1209 /* Get the "struct file *" for the eventpoll file */
1210 error = -EBADF;
1211 file = fget(epfd);
1212 if (!file)
1213 goto error_return;
1214
1215 /* Get the "struct file *" for the target file */
1216 tfile = fget(fd);
1217 if (!tfile)
1218 goto error_fput;
1219
1220 /* The target file descriptor must support poll */
1221 error = -EPERM;
1222 if (!tfile->f_op || !tfile->f_op->poll)
1223 goto error_tgt_fput;
1224
1225 /*
1226 * We have to check that the file structure underneath the file descriptor
1227 * the user passed to us _is_ an eventpoll file. And also we do not permit
1228 * adding an epoll file descriptor inside itself.
1229 */
1230 error = -EINVAL;
1231 if (file == tfile || !is_file_epoll(file))
1232 goto error_tgt_fput;
1566 1233
1567 if (!inode) 1234 /*
1568 goto eexit_1; 1235 * At this point it is safe to assume that the "private_data" contains
1236 * our own data structure.
1237 */
1238 ep = file->private_data;
1239
1240 down_write(&ep->sem);
1569 1241
1570 inode->i_fop = &eventpoll_fops; 1242 /* Try to lookup the file inside our RB tree */
1243 epi = ep_find(ep, tfile, fd);
1244
1245 error = -EINVAL;
1246 switch (op) {
1247 case EPOLL_CTL_ADD:
1248 if (!epi) {
1249 epds.events |= POLLERR | POLLHUP;
1571 1250
1251 error = ep_insert(ep, &epds, tfile, fd);
1252 } else
1253 error = -EEXIST;
1254 break;
1255 case EPOLL_CTL_DEL:
1256 if (epi)
1257 error = ep_remove(ep, epi);
1258 else
1259 error = -ENOENT;
1260 break;
1261 case EPOLL_CTL_MOD:
1262 if (epi) {
1263 epds.events |= POLLERR | POLLHUP;
1264 error = ep_modify(ep, epi, &epds);
1265 } else
1266 error = -ENOENT;
1267 break;
1268 }
1572 /* 1269 /*
1573 * Mark the inode dirty from the very beginning, 1270 * The function ep_find() increments the usage count of the structure
1574 * that way it will never be moved to the dirty 1271 * so, if this is not NULL, we need to release it.
1575 * list because mark_inode_dirty() will think
1576 * that it already _is_ on the dirty list.
1577 */ 1272 */
1578 inode->i_state = I_DIRTY; 1273 if (epi)
1579 inode->i_mode = S_IRUSR | S_IWUSR; 1274 ep_release_epitem(epi);
1580 inode->i_uid = current->fsuid; 1275 up_write(&ep->sem);
1581 inode->i_gid = current->fsgid; 1276
1582 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1277error_tgt_fput:
1583 return inode; 1278 fput(tfile);
1584 1279error_fput:
1585eexit_1: 1280 fput(file);
1586 return ERR_PTR(error); 1281error_return:
1282 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1283 current, epfd, op, fd, event, error));
1284
1285 return error;
1587} 1286}
1588 1287
1589static int 1288/*
1590eventpollfs_get_sb(struct file_system_type *fs_type, int flags, 1289 * Implement the event wait interface for the eventpoll file. It is the kernel
1591 const char *dev_name, void *data, struct vfsmount *mnt) 1290 * part of the user space epoll_wait(2).
1291 */
1292asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
1293 int maxevents, int timeout)
1592{ 1294{
1593 return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC, 1295 int error;
1594 mnt); 1296 struct file *file;
1297 struct eventpoll *ep;
1298
1299 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1300 current, epfd, events, maxevents, timeout));
1301
1302 /* The maximum number of event must be greater than zero */
1303 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1304 return -EINVAL;
1305
1306 /* Verify that the area passed by the user is writeable */
1307 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
1308 error = -EFAULT;
1309 goto error_return;
1310 }
1311
1312 /* Get the "struct file *" for the eventpoll file */
1313 error = -EBADF;
1314 file = fget(epfd);
1315 if (!file)
1316 goto error_return;
1317
1318 /*
1319 * We have to check that the file structure underneath the fd
1320 * the user passed to us _is_ an eventpoll file.
1321 */
1322 error = -EINVAL;
1323 if (!is_file_epoll(file))
1324 goto error_fput;
1325
1326 /*
1327 * At this point it is safe to assume that the "private_data" contains
1328 * our own data structure.
1329 */
1330 ep = file->private_data;
1331
1332 /* Time to fish for events ... */
1333 error = ep_poll(ep, events, maxevents, timeout);
1334
1335error_fput:
1336 fput(file);
1337error_return:
1338 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1339 current, epfd, events, maxevents, timeout, error));
1340
1341 return error;
1595} 1342}
1596 1343
1344#ifdef TIF_RESTORE_SIGMASK
1597 1345
1598static int __init eventpoll_init(void) 1346/*
1347 * Implement the event wait interface for the eventpoll file. It is the kernel
1348 * part of the user space epoll_pwait(2).
1349 */
1350asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
1351 int maxevents, int timeout, const sigset_t __user *sigmask,
1352 size_t sigsetsize)
1599{ 1353{
1600 int error; 1354 int error;
1355 sigset_t ksigmask, sigsaved;
1356
1357 /*
1358 * If the caller wants a certain signal mask to be set during the wait,
1359 * we apply it here.
1360 */
1361 if (sigmask) {
1362 if (sigsetsize != sizeof(sigset_t))
1363 return -EINVAL;
1364 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1365 return -EFAULT;
1366 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
1367 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1368 }
1369
1370 error = sys_epoll_wait(epfd, events, maxevents, timeout);
1371
1372 /*
1373 * If we changed the signal mask, we need to restore the original one.
1374 * In case we've got a signal while waiting, we do not restore the
1375 * signal mask yet, and we allow do_signal() to deliver the signal on
1376 * the way back to userspace, before the signal mask is restored.
1377 */
1378 if (sigmask) {
1379 if (error == -EINTR) {
1380 memcpy(&current->saved_sigmask, &sigsaved,
1381 sizeof(sigsaved));
1382 set_thread_flag(TIF_RESTORE_SIGMASK);
1383 } else
1384 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1385 }
1386
1387 return error;
1388}
1601 1389
1390#endif /* #ifdef TIF_RESTORE_SIGMASK */
1391
1392static int __init eventpoll_init(void)
1393{
1602 mutex_init(&epmutex); 1394 mutex_init(&epmutex);
1603 1395
1604 /* Initialize the structure used to perform safe poll wait head wake ups */ 1396 /* Initialize the structure used to perform safe poll wait head wake ups */
@@ -1614,39 +1406,7 @@ static int __init eventpoll_init(void)
1614 sizeof(struct eppoll_entry), 0, 1406 sizeof(struct eppoll_entry), 0,
1615 EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL); 1407 EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL);
1616 1408
1617 /*
1618 * Register the virtual file system that will be the source of inodes
1619 * for the eventpoll files
1620 */
1621 error = register_filesystem(&eventpoll_fs_type);
1622 if (error)
1623 goto epanic;
1624
1625 /* Mount the above commented virtual file system */
1626 eventpoll_mnt = kern_mount(&eventpoll_fs_type);
1627 error = PTR_ERR(eventpoll_mnt);
1628 if (IS_ERR(eventpoll_mnt))
1629 goto epanic;
1630
1631 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
1632 current));
1633	return 0; 1409	return 0;
1634
1635epanic:
1636 panic("eventpoll_init() failed\n");
1637} 1410}
1411fs_initcall(eventpoll_init);
1638 1412
1639
1640static void __exit eventpoll_exit(void)
1641{
1642 /* Undo all operations done inside eventpoll_init() */
1643 unregister_filesystem(&eventpoll_fs_type);
1644 mntput(eventpoll_mnt);
1645 kmem_cache_destroy(pwq_cache);
1646 kmem_cache_destroy(epi_cache);
1647}
1648
1649module_init(eventpoll_init);
1650module_exit(eventpoll_exit);
1651
1652MODULE_LICENSE("GPL");
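
With the eventpoll pseudo-filesystem removed above, the user-visible addition in this file is sys_epoll_pwait(), the signal-aware sibling of sys_epoll_wait(). The following is a minimal userspace sketch of the new call, assuming a glibc recent enough to expose the epoll_pwait() wrapper; the watched fd (stdin) and the timeout are illustrative only.

#include <stdio.h>
#include <signal.h>
#include <sys/epoll.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN }, out;
	sigset_t blocked, none;
	int epfd, n;

	/* Keep SIGINT blocked during normal operation ... */
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGINT);
	sigprocmask(SIG_BLOCK, &blocked, NULL);

	epfd = epoll_create(1);
	if (epfd < 0)
		return 1;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, 0, &ev) < 0)	/* watch stdin */
		return 1;

	/* ... but let it interrupt the wait itself, set atomically. */
	sigemptyset(&none);
	n = epoll_pwait(epfd, &out, 1, 5000, &none);
	printf("epoll_pwait returned %d\n", n);
	return 0;
}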
diff --git a/fs/exec.c b/fs/exec.c
index 7cf078ec758e..70fa36554c14 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,6 +50,7 @@
50#include <linux/tsacct_kern.h> 50#include <linux/tsacct_kern.h>
51#include <linux/cn_proc.h> 51#include <linux/cn_proc.h>
52#include <linux/audit.h> 52#include <linux/audit.h>
53#include <linux/signalfd.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/mmu_context.h> 56#include <asm/mmu_context.h>
@@ -582,6 +583,13 @@ static int de_thread(struct task_struct *tsk)
582 int count; 583 int count;
583 584
584 /* 585 /*
586 * Tell all the sighand listeners that this sighand has
587 * been detached. The signalfd_detach() function grabs the
588 * sighand lock, if signal listeners are present on the sighand.
589 */
590 signalfd_detach(tsk);
591
592 /*
585 * If we don't share sighandlers, then we aren't sharing anything 593 * If we don't share sighandlers, then we aren't sharing anything
586 * and we can just re-use it all. 594 * and we can just re-use it all.
587 */ 595 */
@@ -702,7 +710,7 @@ static int de_thread(struct task_struct *tsk)
702 */ 710 */
703 detach_pid(tsk, PIDTYPE_PID); 711 detach_pid(tsk, PIDTYPE_PID);
704 tsk->pid = leader->pid; 712 tsk->pid = leader->pid;
705	attach_pid(tsk, PIDTYPE_PID, tsk->pid);
713	attach_pid(tsk, PIDTYPE_PID, find_pid(tsk->pid));
706 transfer_pid(leader, tsk, PIDTYPE_PGID); 714 transfer_pid(leader, tsk, PIDTYPE_PGID);
707 transfer_pid(leader, tsk, PIDTYPE_SID); 715 transfer_pid(leader, tsk, PIDTYPE_SID);
708 list_replace_rcu(&leader->tasks, &tsk->tasks); 716 list_replace_rcu(&leader->tasks, &tsk->tasks);
@@ -757,8 +765,7 @@ no_thread_group:
757 spin_unlock(&oldsighand->siglock); 765 spin_unlock(&oldsighand->siglock);
758 write_unlock_irq(&tasklist_lock); 766 write_unlock_irq(&tasklist_lock);
759 767
760	if (atomic_dec_and_test(&oldsighand->count))
761	kmem_cache_free(sighand_cachep, oldsighand);
768	__cleanup_sighand(oldsighand);
762 } 769 }
763 770
764 BUG_ON(!thread_group_leader(tsk)); 771 BUG_ON(!thread_group_leader(tsk));
diff --git a/fs/mpage.c b/fs/mpage.c
index 0fb914fc2ee0..c1698f2291aa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -454,11 +454,18 @@ EXPORT_SYMBOL(mpage_readpage);
454 * written, so it can intelligently allocate a suitably-sized BIO. For now, 454 * written, so it can intelligently allocate a suitably-sized BIO. For now,
455 * just allocate full-size (16-page) BIOs. 455 * just allocate full-size (16-page) BIOs.
456 */ 456 */
457static struct bio *
458__mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
459	sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc,
460	writepage_t writepage_fn)
457struct mpage_data {
458	struct bio *bio;
459	sector_t last_block_in_bio;
460	get_block_t *get_block;
461	unsigned use_writepage;
462};
463
464static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
465	void *data)
461{ 466{
467 struct mpage_data *mpd = data;
468 struct bio *bio = mpd->bio;
462 struct address_space *mapping = page->mapping; 469 struct address_space *mapping = page->mapping;
463 struct inode *inode = page->mapping->host; 470 struct inode *inode = page->mapping->host;
464 const unsigned blkbits = inode->i_blkbits; 471 const unsigned blkbits = inode->i_blkbits;
@@ -476,6 +483,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
476 int length; 483 int length;
477 struct buffer_head map_bh; 484 struct buffer_head map_bh;
478 loff_t i_size = i_size_read(inode); 485 loff_t i_size = i_size_read(inode);
486 int ret = 0;
479 487
480 if (page_has_buffers(page)) { 488 if (page_has_buffers(page)) {
481 struct buffer_head *head = page_buffers(page); 489 struct buffer_head *head = page_buffers(page);
@@ -538,7 +546,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
538 546
539 map_bh.b_state = 0; 547 map_bh.b_state = 0;
540 map_bh.b_size = 1 << blkbits; 548 map_bh.b_size = 1 << blkbits;
541	if (get_block(inode, block_in_file, &map_bh, 1))
549	if (mpd->get_block(inode, block_in_file, &map_bh, 1))
542 goto confused; 550 goto confused;
543 if (buffer_new(&map_bh)) 551 if (buffer_new(&map_bh))
544 unmap_underlying_metadata(map_bh.b_bdev, 552 unmap_underlying_metadata(map_bh.b_bdev,
@@ -584,7 +592,7 @@ page_is_mapped:
584 /* 592 /*
585 * This page will go to BIO. Do we need to send this BIO off first? 593 * This page will go to BIO. Do we need to send this BIO off first?
586 */ 594 */
587	if (bio && *last_block_in_bio != blocks[0] - 1)
595	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
588 bio = mpage_bio_submit(WRITE, bio); 596 bio = mpage_bio_submit(WRITE, bio);
589 597
590alloc_new: 598alloc_new:
@@ -641,7 +649,7 @@ alloc_new:
641 boundary_block, 1 << blkbits); 649 boundary_block, 1 << blkbits);
642 } 650 }
643 } else { 651 } else {
644	*last_block_in_bio = blocks[blocks_per_page - 1];
652	mpd->last_block_in_bio = blocks[blocks_per_page - 1];
645 } 653 }
646 goto out; 654 goto out;
647 655
@@ -649,18 +657,19 @@ confused:
649 if (bio) 657 if (bio)
650 bio = mpage_bio_submit(WRITE, bio); 658 bio = mpage_bio_submit(WRITE, bio);
651 659
652	if (writepage_fn) {
653	*ret = (*writepage_fn)(page, wbc);
660	if (mpd->use_writepage) {
661	ret = mapping->a_ops->writepage(page, wbc);
654	} else { 662	} else {
655	*ret = -EAGAIN;
663	ret = -EAGAIN;
656 goto out; 664 goto out;
657 } 665 }
658 /* 666 /*
659 * The caller has a ref on the inode, so *mapping is stable 667 * The caller has a ref on the inode, so *mapping is stable
660 */ 668 */
661	mapping_set_error(mapping, *ret);
669	mapping_set_error(mapping, ret);
662out: 670out:
663	return bio;
671	mpd->bio = bio;
672 return ret;
664} 673}
665 674
666/** 675/**
@@ -683,120 +692,27 @@ out:
683 * the call was made get new I/O started against them. If wbc->sync_mode is 692 * the call was made get new I/O started against them. If wbc->sync_mode is
684 * WB_SYNC_ALL then we were called for data integrity and we must wait for 693 * WB_SYNC_ALL then we were called for data integrity and we must wait for
685 * existing IO to complete. 694 * existing IO to complete.
686 *
687 * If you fix this you should check generic_writepages() also!
688 */ 695 */
689int 696int
690mpage_writepages(struct address_space *mapping, 697mpage_writepages(struct address_space *mapping,
691 struct writeback_control *wbc, get_block_t get_block) 698 struct writeback_control *wbc, get_block_t get_block)
692{ 699{
693	struct backing_dev_info *bdi = mapping->backing_dev_info;
694	struct bio *bio = NULL;
695	sector_t last_block_in_bio = 0;
696	int ret = 0;
697	int done = 0;
698	int (*writepage)(struct page *page, struct writeback_control *wbc);
699	struct pagevec pvec;
700	int nr_pages;
701	pgoff_t index;
702	pgoff_t end;		/* Inclusive */
703	int scanned = 0;
704	int range_whole = 0;
705
706	if (wbc->nonblocking && bdi_write_congested(bdi)) {
707	wbc->encountered_congestion = 1;
708	return 0;
709	}
710
711	writepage = NULL;
712	if (get_block == NULL)
713	writepage = mapping->a_ops->writepage;
714
715	pagevec_init(&pvec, 0);
716	if (wbc->range_cyclic) {
717	index = mapping->writeback_index; /* Start from prev offset */
718	end = -1;
719	} else {
720	index = wbc->range_start >> PAGE_CACHE_SHIFT;
721	end = wbc->range_end >> PAGE_CACHE_SHIFT;
722	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
723	range_whole = 1;
724	scanned = 1;
700	int ret;
701
702	if (!get_block)
703	ret = generic_writepages(mapping, wbc);
704	else {
705	struct mpage_data mpd = {
706	.bio = NULL,
707	.last_block_in_bio = 0,
708	.get_block = get_block,
709	.use_writepage = 1,
710	};
711
712	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
713	if (mpd.bio)
714	mpage_bio_submit(WRITE, mpd.bio);
725 } 715 }
726retry:
727 while (!done && (index <= end) &&
728 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
729 PAGECACHE_TAG_DIRTY,
730 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
731 unsigned i;
732
733 scanned = 1;
734 for (i = 0; i < nr_pages; i++) {
735 struct page *page = pvec.pages[i];
736
737 /*
738 * At this point we hold neither mapping->tree_lock nor
739 * lock on the page itself: the page may be truncated or
740 * invalidated (changing page->mapping to NULL), or even
741 * swizzled back from swapper_space to tmpfs file
742 * mapping
743 */
744
745 lock_page(page);
746
747 if (unlikely(page->mapping != mapping)) {
748 unlock_page(page);
749 continue;
750 }
751
752 if (!wbc->range_cyclic && page->index > end) {
753 done = 1;
754 unlock_page(page);
755 continue;
756 }
757
758 if (wbc->sync_mode != WB_SYNC_NONE)
759 wait_on_page_writeback(page);
760
761 if (PageWriteback(page) ||
762 !clear_page_dirty_for_io(page)) {
763 unlock_page(page);
764 continue;
765 }
766
767 if (writepage) {
768 ret = (*writepage)(page, wbc);
769 mapping_set_error(mapping, ret);
770 } else {
771 bio = __mpage_writepage(bio, page, get_block,
772 &last_block_in_bio, &ret, wbc,
773 page->mapping->a_ops->writepage);
774 }
775 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
776 unlock_page(page);
777 if (ret || (--(wbc->nr_to_write) <= 0))
778 done = 1;
779 if (wbc->nonblocking && bdi_write_congested(bdi)) {
780 wbc->encountered_congestion = 1;
781 done = 1;
782 }
783 }
784 pagevec_release(&pvec);
785 cond_resched();
786 }
787 if (!scanned && !done) {
788 /*
789 * We hit the last page and there is more work to be done: wrap
790 * back to the start of the file
791 */
792 scanned = 1;
793 index = 0;
794 goto retry;
795 }
796 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
797 mapping->writeback_index = index;
798 if (bio)
799 mpage_bio_submit(WRITE, bio);
800 return ret; 716 return ret;
801} 717}
802EXPORT_SYMBOL(mpage_writepages); 718EXPORT_SYMBOL(mpage_writepages);
@@ -804,15 +720,15 @@ EXPORT_SYMBOL(mpage_writepages);
804int mpage_writepage(struct page *page, get_block_t get_block, 720int mpage_writepage(struct page *page, get_block_t get_block,
805 struct writeback_control *wbc) 721 struct writeback_control *wbc)
806{ 722{
807 int ret = 0; 723 struct mpage_data mpd = {
808 struct bio *bio; 724 .bio = NULL,
809 sector_t last_block_in_bio = 0; 725 .last_block_in_bio = 0,
810 726 .get_block = get_block,
811 bio = __mpage_writepage(NULL, page, get_block, 727 .use_writepage = 0,
812 &last_block_in_bio, &ret, wbc, NULL); 728 };
813 if (bio) 729 int ret = __mpage_writepage(page, wbc, &mpd);
814 mpage_bio_submit(WRITE, bio); 730 if (mpd.bio)
815 731 mpage_bio_submit(WRITE, mpd.bio);
816 return ret; 732 return ret;
817} 733}
818EXPORT_SYMBOL(mpage_writepage); 734EXPORT_SYMBOL(mpage_writepage);
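
The conversion above replaces mpage's hand-rolled pagevec walk with write_cache_pages(), bundling per-writeback state in struct mpage_data and passing it through an opaque pointer instead of a long argument list. The userspace sketch below mirrors that callback-plus-state pattern; every name in it is illustrative, not kernel API.

#include <stdio.h>

struct walk_state {
	int pages_done;		/* plays the role of mpage_data's bookkeeping */
};

/* Generic iterator, analogous to write_cache_pages(). */
static int for_each_page(int npages,
			 int (*writepage)(int page, void *data), void *data)
{
	int i, ret;

	for (i = 0; i < npages; i++) {
		ret = writepage(i, data);
		if (ret)
			return ret;	/* stop on first error, as the kernel does */
	}
	return 0;
}

/* Per-page callback, analogous to __mpage_writepage(). */
static int write_one(int page, void *data)
{
	struct walk_state *ws = data;

	ws->pages_done++;
	printf("wrote page %d\n", page);
	return 0;
}

int main(void)
{
	struct walk_state ws = { .pages_done = 0 };

	for_each_page(4, write_one, &ws);
	printf("%d pages written\n", ws.pages_done);
	return 0;
}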
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 01207042048b..7638a1c42a7d 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -239,7 +239,7 @@ config EFI_PARTITION
239 239
240config SYSV68_PARTITION 240config SYSV68_PARTITION
241 bool "SYSV68 partition table support" if PARTITION_ADVANCED 241 bool "SYSV68 partition table support" if PARTITION_ADVANCED
242	default y if M68K
242	default y if VME
243 help 243 help
244 Say Y here if you would like to be able to read the hard disk 244 Say Y here if you would like to be able to read the hard disk
245 partition table format used by Motorola Delta machines (using 245 partition table format used by Motorola Delta machines (using
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 1bea610078b3..e7b07006bc41 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -152,7 +152,7 @@ last_lba(struct block_device *bdev)
152} 152}
153 153
154static inline int 154static inline int
155pmbr_part_valid(struct partition *part, u64 lastlba)
155pmbr_part_valid(struct partition *part)
156{ 156{
157 if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && 157 if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
158 le32_to_cpu(part->start_sect) == 1UL) 158 le32_to_cpu(part->start_sect) == 1UL)
@@ -163,7 +163,6 @@ pmbr_part_valid(struct partition *part, u64 lastlba)
163/** 163/**
164 * is_pmbr_valid(): test Protective MBR for validity 164 * is_pmbr_valid(): test Protective MBR for validity
165 * @mbr: pointer to a legacy mbr structure 165 * @mbr: pointer to a legacy mbr structure
166 * @lastlba: last_lba for the whole device
167 * 166 *
168 * Description: Returns 1 if PMBR is valid, 0 otherwise. 167 * Description: Returns 1 if PMBR is valid, 0 otherwise.
169 * Validity depends on two things: 168 * Validity depends on two things:
@@ -171,13 +170,13 @@ pmbr_part_valid(struct partition *part, u64 lastlba)
171 * 2) One partition of type 0xEE is found 170 * 2) One partition of type 0xEE is found
172 */ 171 */
173static int 172static int
174is_pmbr_valid(legacy_mbr *mbr, u64 lastlba)
173is_pmbr_valid(legacy_mbr *mbr)
175{ 174{
176 int i; 175 int i;
177 if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) 176 if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
178 return 0; 177 return 0;
179 for (i = 0; i < 4; i++) 178 for (i = 0; i < 4; i++)
180	if (pmbr_part_valid(&mbr->partition_record[i], lastlba))
179	if (pmbr_part_valid(&mbr->partition_record[i]))
181 return 1; 180 return 1;
182 return 0; 181 return 0;
183} 182}
@@ -516,7 +515,7 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
516 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; 515 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
517 gpt_header *pgpt = NULL, *agpt = NULL; 516 gpt_header *pgpt = NULL, *agpt = NULL;
518 gpt_entry *pptes = NULL, *aptes = NULL; 517 gpt_entry *pptes = NULL, *aptes = NULL;
519	legacy_mbr *legacymbr = NULL;
518	legacy_mbr *legacymbr;
520 u64 lastlba; 519 u64 lastlba;
521 if (!bdev || !gpt || !ptes) 520 if (!bdev || !gpt || !ptes)
522 return 0; 521 return 0;
@@ -528,9 +527,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
528 if (legacymbr) { 527 if (legacymbr) {
529 read_lba(bdev, 0, (u8 *) legacymbr, 528 read_lba(bdev, 0, (u8 *) legacymbr,
530 sizeof (*legacymbr)); 529 sizeof (*legacymbr));
531	good_pmbr = is_pmbr_valid(legacymbr, lastlba);
530	good_pmbr = is_pmbr_valid(legacymbr);
532 kfree(legacymbr); 531 kfree(legacymbr);
533 legacymbr=NULL;
534 } 532 }
535 if (!good_pmbr) 533 if (!good_pmbr)
536 goto fail; 534 goto fail;
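
For reference, the protective-MBR rule that pmbr_part_valid() enforces above (an 0xAA55 boot signature plus one type 0xEE partition starting at LBA 1) can also be checked from userspace. A rough sketch, assuming a little-endian host and the classic MBR layout; error handling is minimal:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static int pmbr_valid(const uint8_t sector[512])
{
	int i;

	if (sector[510] != 0x55 || sector[511] != 0xAA)
		return 0;
	for (i = 0; i < 4; i++) {
		const uint8_t *p = sector + 446 + 16 * i;	/* partition slot */
		uint32_t start_lba;

		memcpy(&start_lba, p + 8, sizeof(start_lba));	/* LE on x86 */
		if (p[4] == 0xEE && start_lba == 1)		/* GPT protective */
			return 1;
	}
	return 0;
}

int main(int argc, char **argv)
{
	uint8_t sector[512];
	FILE *f = fopen(argc > 1 ? argv[1] : "/dev/sda", "rb");

	if (!f || fread(sector, 1, sizeof(sector), f) != sizeof(sector))
		return 1;
	printf("protective MBR %s\n", pmbr_valid(sector) ? "found" : "absent");
	fclose(f);
	return 0;
}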
diff --git a/fs/signalfd.c b/fs/signalfd.c
new file mode 100644
index 000000000000..7cfeab412b45
--- /dev/null
+++ b/fs/signalfd.c
@@ -0,0 +1,349 @@
1/*
2 * fs/signalfd.c
3 *
4 * Copyright (C) 2003 Linus Torvalds
5 *
6 * Mon Mar 5, 2007: Davide Libenzi <davidel@xmailserver.org>
7 * Changed ->read() to return a siginfo structure instead of a signal number.
8 * Fixed locking in ->poll().
9 * Added sighand-detach notification.
10 * Added fd re-use in sys_signalfd() syscall.
11 * Now using anonymous inode source.
12 * Thanks to Oleg Nesterov for useful code review and suggestions.
13 * More comments and suggestions from Arnd Bergmann.
14 */
15
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/init.h>
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/kernel.h>
22#include <linux/signal.h>
23#include <linux/list.h>
24#include <linux/anon_inodes.h>
25#include <linux/signalfd.h>
26
27struct signalfd_ctx {
28 struct list_head lnk;
29 wait_queue_head_t wqh;
30 sigset_t sigmask;
31 struct task_struct *tsk;
32};
33
34struct signalfd_lockctx {
35 struct task_struct *tsk;
36 unsigned long flags;
37};
38
39/*
40 * Tries to acquire the sighand lock. We do not increment the sighand
41 * use count, and we do not even pin the task struct, so we need to
42 * do it inside an RCU read lock, and we must be prepared for the
43 * ctx->tsk going to NULL (in signalfd_deliver()), and for the sighand
44 * being detached. We return 0 if the sighand has been detached, or
45 * 1 if we were able to pin the sighand lock.
46 */
47static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
48{
49 struct sighand_struct *sighand = NULL;
50
51 rcu_read_lock();
52 lk->tsk = rcu_dereference(ctx->tsk);
53 if (likely(lk->tsk != NULL))
54 sighand = lock_task_sighand(lk->tsk, &lk->flags);
55 rcu_read_unlock();
56
57 if (sighand && !ctx->tsk) {
58 unlock_task_sighand(lk->tsk, &lk->flags);
59 sighand = NULL;
60 }
61
62 return sighand != NULL;
63}
64
65static void signalfd_unlock(struct signalfd_lockctx *lk)
66{
67 unlock_task_sighand(lk->tsk, &lk->flags);
68}
69
70/*
71 * This must be called with the sighand lock held.
72 */
73void signalfd_deliver(struct task_struct *tsk, int sig)
74{
75 struct sighand_struct *sighand = tsk->sighand;
76 struct signalfd_ctx *ctx, *tmp;
77
78 BUG_ON(!sig);
79 list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) {
80 /*
81 * We use a negative signal value as a way to broadcast that the
82 * sighand has been orphaned, so that we can notify all the
83 * listeners about this. Remember the ctx->sigmask is inverted,
84 * so if the user is interested in a signal, that corresponding
85 * bit will be zero.
86 */
87 if (sig < 0) {
88 if (ctx->tsk == tsk) {
89 ctx->tsk = NULL;
90 list_del_init(&ctx->lnk);
91 wake_up(&ctx->wqh);
92 }
93 } else {
94 if (!sigismember(&ctx->sigmask, sig))
95 wake_up(&ctx->wqh);
96 }
97 }
98}
99
100static void signalfd_cleanup(struct signalfd_ctx *ctx)
101{
102 struct signalfd_lockctx lk;
103
104 /*
105 * This is tricky. If the sighand is gone, we do not need to remove
106 * context from the list, the list itself won't be there anymore.
107 */
108 if (signalfd_lock(ctx, &lk)) {
109 list_del(&ctx->lnk);
110 signalfd_unlock(&lk);
111 }
112 kfree(ctx);
113}
114
115static int signalfd_release(struct inode *inode, struct file *file)
116{
117 signalfd_cleanup(file->private_data);
118 return 0;
119}
120
121static unsigned int signalfd_poll(struct file *file, poll_table *wait)
122{
123 struct signalfd_ctx *ctx = file->private_data;
124 unsigned int events = 0;
125 struct signalfd_lockctx lk;
126
127 poll_wait(file, &ctx->wqh, wait);
128
129 /*
130 * Let the caller get a POLLIN in this case, ala socket recv() when
131 * the peer disconnects.
132 */
133 if (signalfd_lock(ctx, &lk)) {
134 if (next_signal(&lk.tsk->pending, &ctx->sigmask) > 0 ||
135 next_signal(&lk.tsk->signal->shared_pending,
136 &ctx->sigmask) > 0)
137 events |= POLLIN;
138 signalfd_unlock(&lk);
139 } else
140 events |= POLLIN;
141
142 return events;
143}
144
145/*
146 * Copied from copy_siginfo_to_user() in kernel/signal.c
147 */
148static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
149 siginfo_t const *kinfo)
150{
151 long err;
152
153 BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
154
155 /*
156 * Unused members should be zero ...
157 */
158 err = __clear_user(uinfo, sizeof(*uinfo));
159
160 /*
161 * If you change siginfo_t structure, please be sure
162 * this code is fixed accordingly.
163 */
164 err |= __put_user(kinfo->si_signo, &uinfo->signo);
165 err |= __put_user(kinfo->si_errno, &uinfo->err);
166 err |= __put_user((short)kinfo->si_code, &uinfo->code);
167 switch (kinfo->si_code & __SI_MASK) {
168 case __SI_KILL:
169 err |= __put_user(kinfo->si_pid, &uinfo->pid);
170 err |= __put_user(kinfo->si_uid, &uinfo->uid);
171 break;
172 case __SI_TIMER:
173 err |= __put_user(kinfo->si_tid, &uinfo->tid);
174 err |= __put_user(kinfo->si_overrun, &uinfo->overrun);
175 err |= __put_user((long)kinfo->si_ptr, &uinfo->svptr);
176 break;
177 case __SI_POLL:
178 err |= __put_user(kinfo->si_band, &uinfo->band);
179 err |= __put_user(kinfo->si_fd, &uinfo->fd);
180 break;
181 case __SI_FAULT:
182 err |= __put_user((long)kinfo->si_addr, &uinfo->addr);
183#ifdef __ARCH_SI_TRAPNO
184 err |= __put_user(kinfo->si_trapno, &uinfo->trapno);
185#endif
186 break;
187 case __SI_CHLD:
188 err |= __put_user(kinfo->si_pid, &uinfo->pid);
189 err |= __put_user(kinfo->si_uid, &uinfo->uid);
190 err |= __put_user(kinfo->si_status, &uinfo->status);
191 err |= __put_user(kinfo->si_utime, &uinfo->utime);
192 err |= __put_user(kinfo->si_stime, &uinfo->stime);
193 break;
194 case __SI_RT: /* This is not generated by the kernel as of now. */
195 case __SI_MESGQ: /* But this is */
196 err |= __put_user(kinfo->si_pid, &uinfo->pid);
197 err |= __put_user(kinfo->si_uid, &uinfo->uid);
198 err |= __put_user((long)kinfo->si_ptr, &uinfo->svptr);
199 break;
200 default: /* this is just in case for now ... */
201 err |= __put_user(kinfo->si_pid, &uinfo->pid);
202 err |= __put_user(kinfo->si_uid, &uinfo->uid);
203 break;
204 }
205
206 return err ? -EFAULT: sizeof(*uinfo);
207}
208
209/*
210 * Returns either the size of a "struct signalfd_siginfo", or zero if the
211 * sighand we are attached to has been orphaned. The "count" parameter
212 * must be at least the size of a "struct signalfd_siginfo".
213 */
214static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
215 loff_t *ppos)
216{
217 struct signalfd_ctx *ctx = file->private_data;
218 ssize_t res = 0;
219 int locked, signo;
220 siginfo_t info;
221 struct signalfd_lockctx lk;
222 DECLARE_WAITQUEUE(wait, current);
223
224 if (count < sizeof(struct signalfd_siginfo))
225 return -EINVAL;
226 locked = signalfd_lock(ctx, &lk);
227 if (!locked)
228 return 0;
229 res = -EAGAIN;
230 signo = dequeue_signal(lk.tsk, &ctx->sigmask, &info);
231 if (signo == 0 && !(file->f_flags & O_NONBLOCK)) {
232 add_wait_queue(&ctx->wqh, &wait);
233 for (;;) {
234 set_current_state(TASK_INTERRUPTIBLE);
235 signo = dequeue_signal(lk.tsk, &ctx->sigmask, &info);
236 if (signo != 0)
237 break;
238 if (signal_pending(current)) {
239 res = -ERESTARTSYS;
240 break;
241 }
242 signalfd_unlock(&lk);
243 schedule();
244 locked = signalfd_lock(ctx, &lk);
245 if (unlikely(!locked)) {
246 /*
247 * Let the caller read zero byte, ala socket
248 * recv() when the peer disconnects. This test
249 * must be done before doing a dequeue_signal(),
250 * because if the sighand has been orphaned,
251 * the dequeue_signal() call is going to crash.
252 */
253 res = 0;
254 break;
255 }
256 }
257 remove_wait_queue(&ctx->wqh, &wait);
258 __set_current_state(TASK_RUNNING);
259 }
260 if (likely(locked))
261 signalfd_unlock(&lk);
262 if (likely(signo))
263 res = signalfd_copyinfo((struct signalfd_siginfo __user *) buf,
264 &info);
265
266 return res;
267}
268
269static const struct file_operations signalfd_fops = {
270 .release = signalfd_release,
271 .poll = signalfd_poll,
272 .read = signalfd_read,
273};
274
275/*
276 * Create a file descriptor that is associated with our signal
277 * state. We can pass it around to others if we want to, but
278 * it will always be _our_ signal state.
279 */
280asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
281{
282 int error;
283 sigset_t sigmask;
284 struct signalfd_ctx *ctx;
285 struct sighand_struct *sighand;
286 struct file *file;
287 struct inode *inode;
288 struct signalfd_lockctx lk;
289
290 if (sizemask != sizeof(sigset_t) ||
291 copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
292 return error = -EINVAL;
293 sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
294 signotset(&sigmask);
295
296 if (ufd == -1) {
297 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
298 if (!ctx)
299 return -ENOMEM;
300
301 init_waitqueue_head(&ctx->wqh);
302 ctx->sigmask = sigmask;
303 ctx->tsk = current;
304
305 sighand = current->sighand;
306 /*
307 * Add this fd to the list of signal listeners.
308 */
309 spin_lock_irq(&sighand->siglock);
310 list_add_tail(&ctx->lnk, &sighand->signalfd_list);
311 spin_unlock_irq(&sighand->siglock);
312
313 /*
314 * When we call this, the initialization must be complete, since
315 * anon_inode_getfd() will install the fd.
316 */
317 error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]",
318 &signalfd_fops, ctx);
319 if (error)
320 goto err_fdalloc;
321 } else {
322 file = fget(ufd);
323 if (!file)
324 return -EBADF;
325 ctx = file->private_data;
326 if (file->f_op != &signalfd_fops) {
327 fput(file);
328 return -EINVAL;
329 }
330 /*
331 * We need to be prepared for the fact that the sighand this fd
332 * is attached to has been detached. In that case signalfd_lock()
333 * will return 0, and we'll just skip setting the new mask.
334 */
335 if (signalfd_lock(ctx, &lk)) {
336 ctx->sigmask = sigmask;
337 signalfd_unlock(&lk);
338 }
339 wake_up(&ctx->wqh);
340 fput(file);
341 }
342
343 return ufd;
344
345err_fdalloc:
346 signalfd_cleanup(ctx);
347 return error;
348}
349
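
To see the new interface in action from userspace: the sketch below blocks SIGINT and then reads it synchronously through a signalfd instead of taking an asynchronous handler. It uses the glibc signalfd() wrapper and the field names from current <sys/signalfd.h> (ssi_signo and friends), which postdate and differ slightly from the raw syscall and struct signalfd_siginfo added by this patch.

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/signalfd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo ssi;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	/* Block normal delivery; the signal stays pending for the fd. */
	sigprocmask(SIG_BLOCK, &mask, NULL);

	sfd = signalfd(-1, &mask, 0);
	if (sfd < 0)
		return 1;
	/* Blocks until SIGINT arrives, then returns it as a record. */
	if (read(sfd, &ssi, sizeof(ssi)) != sizeof(ssi))
		return 1;
	printf("got signal %u from pid %u\n", ssi.ssi_signo, ssi.ssi_pid);
	return 0;
}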
diff --git a/fs/timerfd.c b/fs/timerfd.c
new file mode 100644
index 000000000000..e329e37f15a8
--- /dev/null
+++ b/fs/timerfd.c
@@ -0,0 +1,227 @@
1/*
2 * fs/timerfd.c
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 *
6 *
7 * Thanks to Thomas Gleixner for code reviews and useful comments.
8 *
9 */
10
11#include <linux/file.h>
12#include <linux/poll.h>
13#include <linux/init.h>
14#include <linux/fs.h>
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/list.h>
18#include <linux/spinlock.h>
19#include <linux/time.h>
20#include <linux/hrtimer.h>
21#include <linux/anon_inodes.h>
22#include <linux/timerfd.h>
23
24struct timerfd_ctx {
25 struct hrtimer tmr;
26 ktime_t tintv;
27 spinlock_t lock;
28 wait_queue_head_t wqh;
29 int expired;
30};
31
32/*
33 * This gets called when the timer event triggers. We set the "expired"
34 * flag, but we do not re-arm the timer (in case it's necessary,
35 * tintv.tv64 != 0) until the timer is read.
36 */
37static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
38{
39 struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
40 unsigned long flags;
41
42 spin_lock_irqsave(&ctx->lock, flags);
43 ctx->expired = 1;
44 wake_up_locked(&ctx->wqh);
45 spin_unlock_irqrestore(&ctx->lock, flags);
46
47 return HRTIMER_NORESTART;
48}
49
50static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags,
51 const struct itimerspec *ktmr)
52{
53 enum hrtimer_mode htmode;
54 ktime_t texp;
55
56 htmode = (flags & TFD_TIMER_ABSTIME) ?
57 HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
58
59 texp = timespec_to_ktime(ktmr->it_value);
60 ctx->expired = 0;
61 ctx->tintv = timespec_to_ktime(ktmr->it_interval);
62 hrtimer_init(&ctx->tmr, clockid, htmode);
63 ctx->tmr.expires = texp;
64 ctx->tmr.function = timerfd_tmrproc;
65 if (texp.tv64 != 0)
66 hrtimer_start(&ctx->tmr, texp, htmode);
67}
68
69static int timerfd_release(struct inode *inode, struct file *file)
70{
71 struct timerfd_ctx *ctx = file->private_data;
72
73 hrtimer_cancel(&ctx->tmr);
74 kfree(ctx);
75 return 0;
76}
77
78static unsigned int timerfd_poll(struct file *file, poll_table *wait)
79{
80 struct timerfd_ctx *ctx = file->private_data;
81 unsigned int events = 0;
82 unsigned long flags;
83
84 poll_wait(file, &ctx->wqh, wait);
85
86 spin_lock_irqsave(&ctx->lock, flags);
87 if (ctx->expired)
88 events |= POLLIN;
89 spin_unlock_irqrestore(&ctx->lock, flags);
90
91 return events;
92}
93
94static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
95 loff_t *ppos)
96{
97 struct timerfd_ctx *ctx = file->private_data;
98 ssize_t res;
99 u32 ticks = 0;
100 DECLARE_WAITQUEUE(wait, current);
101
102 if (count < sizeof(ticks))
103 return -EINVAL;
104 spin_lock_irq(&ctx->lock);
105 res = -EAGAIN;
106 if (!ctx->expired && !(file->f_flags & O_NONBLOCK)) {
107 __add_wait_queue(&ctx->wqh, &wait);
108 for (res = 0;;) {
109 set_current_state(TASK_INTERRUPTIBLE);
110 if (ctx->expired) {
111 res = 0;
112 break;
113 }
114 if (signal_pending(current)) {
115 res = -ERESTARTSYS;
116 break;
117 }
118 spin_unlock_irq(&ctx->lock);
119 schedule();
120 spin_lock_irq(&ctx->lock);
121 }
122 __remove_wait_queue(&ctx->wqh, &wait);
123 __set_current_state(TASK_RUNNING);
124 }
125 if (ctx->expired) {
126 ctx->expired = 0;
127 if (ctx->tintv.tv64 != 0) {
128 /*
129 * If tintv.tv64 != 0, this is a periodic timer that
130 * needs to be re-armed. We avoid doing it in the timer
131 * callback to avoid DoS attacks specifying a very
132 * short timer period.
133 */
134 ticks = (u32)
135 hrtimer_forward(&ctx->tmr,
136 hrtimer_cb_get_time(&ctx->tmr),
137 ctx->tintv);
138 hrtimer_restart(&ctx->tmr);
139 } else
140 ticks = 1;
141 }
142 spin_unlock_irq(&ctx->lock);
143 if (ticks)
144 res = put_user(ticks, buf) ? -EFAULT: sizeof(ticks);
145 return res;
146}
147
148static const struct file_operations timerfd_fops = {
149 .release = timerfd_release,
150 .poll = timerfd_poll,
151 .read = timerfd_read,
152};
153
154asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
155 const struct itimerspec __user *utmr)
156{
157 int error;
158 struct timerfd_ctx *ctx;
159 struct file *file;
160 struct inode *inode;
161 struct itimerspec ktmr;
162
163 if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
164 return -EFAULT;
165
166 if (clockid != CLOCK_MONOTONIC &&
167 clockid != CLOCK_REALTIME)
168 return -EINVAL;
169 if (!timespec_valid(&ktmr.it_value) ||
170 !timespec_valid(&ktmr.it_interval))
171 return -EINVAL;
172
173 if (ufd == -1) {
174 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
175 if (!ctx)
176 return -ENOMEM;
177
178 init_waitqueue_head(&ctx->wqh);
179 spin_lock_init(&ctx->lock);
180
181 timerfd_setup(ctx, clockid, flags, &ktmr);
182
183 /*
184 * When we call this, the initialization must be complete, since
185 * anon_inode_getfd() will install the fd.
186 */
187 error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]",
188 &timerfd_fops, ctx);
189 if (error)
190 goto err_tmrcancel;
191 } else {
192 file = fget(ufd);
193 if (!file)
194 return -EBADF;
195 ctx = file->private_data;
196 if (file->f_op != &timerfd_fops) {
197 fput(file);
198 return -EINVAL;
199 }
200 /*
201 * We need to stop the existing timer before reprogramming
202 * it to the new values.
203 */
204 for (;;) {
205 spin_lock_irq(&ctx->lock);
206 if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
207 break;
208 spin_unlock_irq(&ctx->lock);
209 cpu_relax();
210 }
211 /*
212 * Re-program the timer to the new value ...
213 */
214 timerfd_setup(ctx, clockid, flags, &ktmr);
215
216 spin_unlock_irq(&ctx->lock);
217 fput(file);
218 }
219
220 return ufd;
221
222err_tmrcancel:
223 hrtimer_cancel(&ctx->tmr);
224 kfree(ctx);
225 return error;
226}
227
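
A matching userspace sketch for the timer side follows. It uses the later timerfd_create()/timerfd_settime() glibc API that eventually superseded the single sys_timerfd() entry point shown here, and it reads a 64-bit tick count where this patch still returns a 32-bit one; the read protocol itself (block until expiry, return the number of expirations since the last read) is the same.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/timerfd.h>

int main(void)
{
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1, .tv_nsec = 0 },	/* first expiry */
		.it_interval = { .tv_sec = 1, .tv_nsec = 0 },	/* then periodic */
	};
	uint64_t ticks;
	int tfd, i;

	tfd = timerfd_create(CLOCK_MONOTONIC, 0);
	if (tfd < 0)
		return 1;
	if (timerfd_settime(tfd, 0, &its, NULL) < 0)
		return 1;
	for (i = 0; i < 3; i++) {
		/* Blocks until expiry; returns expirations since last read. */
		if (read(tfd, &ticks, sizeof(ticks)) != sizeof(ticks))
			return 1;
		printf("expired %llu time(s)\n", (unsigned long long)ticks);
	}
	return 0;
}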