author	Jon Mason <jdmason@us.ibm.com>	2006-07-29 15:42:43 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-07-29 23:59:55 -0400
commit	d2105b10fe0f460c388fe4e09226313f519d8c00 (patch)
tree	59ad2f99eeb124ecea6506801eb7f5c0a0a1395d /arch
parent	089bbbcb36979166131868a89ca5f4e695d6637d (diff)
[PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix
Calgary hits a NULL pointer dereference when booting on a multi-chassis NUMA system. See Red Hat bugzilla number 198498, found by Konrad Rzeszutek (konradr@redhat.com). Several issues had to be resolved to fix this problem.

Firstly, when I originally wrote the code to handle NUMA systems, I had a large misunderstanding that was not corrected until now: I thought the "number of nodes online" referred to the number of physical systems connected, so that if NUMA was disabled there would be only one node and only that node's PCI buses would be visible. In reality, if NUMA is disabled the system presents all of the connected chassis as one node and is merely ignorant of the differing delays in accessing main memory. Therefore, the references to num_online_nodes() and MAX_NUMNODES are incorrect and need to be replaced by the maximum number of chassis that can be accessed (which is 8). I created a define, MAX_NUM_CHASSIS, and set it to 8 to fix this.

Secondly, when walking the PCI buses in detect_calgary, the code only checked the first "slot" when looking to see if a device is present. This works in most cases, but unfortunately not always. In the NUMA MXE drawers there are USB devices present on the 3rd slot (with slot 1 being empty), so to work around this all slots (up to 8) are now scanned to see if any devices are present.

Lastly, the bus is enumerated on large systems in a different way than we originally thought, which throws our old, ugly bus-numbering logic out the window. To handle this more elegantly, I reorganized the kva array to be sparse (which removes the need for any bus-number-to-kva-slot logic in tce.c) and created a secondary array (bus_to_phb) to contain the bus number to PHB mapping.

With these changes Calgary boots on an x460 with 4 nodes, both with and without NUMA enabled.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
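For readers following the data-structure change described above, here is a minimal, self-contained userspace sketch of the idea: tce_table_kva becomes a sparse array indexed directly by dev->bus->number, and a parallel bus_to_phb[] array records which PHB (0-3) of a Calgary chip each bus number belongs to, replacing the old dense table_idx packing and the modulo-based busno_to_phbid(). This is not the kernel code itself; the array and function names mirror the patch, but the simplified types, remember_phb() helper and the demo main() are illustrative assumptions only.

#include <stdio.h>

/* Constants as introduced by the patch (see the first hunk below). */
#define MAX_NUM_OF_PHBS   8                                        /* PHBs per chassis            */
#define MAX_NUM_CHASSIS   8                                        /* max chassis the code may see */
#define MAX_PHB_BUS_NUM   (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)  /* 8 * 8 * 2 = 128 bus numbers  */
#define PHBS_PER_CALGARY  4

/* Sparse bookkeeping, indexed by PCI bus number (illustrative stand-ins
 * for the kernel arrays of the same names). */
static void *tce_table_kva[MAX_PHB_BUS_NUM];
static char  bus_to_phb[MAX_PHB_BUS_NUM];

/* After the patch the lookup is a plain table read, not arithmetic. */
static inline int busno_to_phbid(unsigned char num)
{
	return bus_to_phb[num];
}

/* Hypothetical helper: record that a Calgary PHB was found on "bus";
 * "phb" cycles 0..3 per Calgary chip, as detect_calgary() does. */
static void remember_phb(int bus, int phb, void *tbl)
{
	tce_table_kva[bus] = tbl;
	bus_to_phb[bus] = (char)phb;
}

int main(void)
{
	int dummy_table[2];

	/* Suppose bus 0x21 turned out to be PHB 1 of some Calgary chip. */
	remember_phb(0x21, 1 % PHBS_PER_CALGARY, dummy_table);

	printf("bus 0x21 -> phb %d, table %p\n",
	       busno_to_phbid(0x21), tce_table_kva[0x21]);
	return 0;
}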
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86_64/kernel/pci-calgary.c	76
-rw-r--r--	arch/x86_64/kernel/tce.c	4
2 files changed, 45 insertions, 35 deletions
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 92744abff133..146924ba5df5 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -85,7 +85,8 @@
 #define CSR_AGENT_MASK 0xffe0ffff
 
 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
-#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */
+#define MAX_NUM_CHASSIS 8 /* max number of chassis */
+#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */
 #define PHBS_PER_CALGARY 4
 
 /* register offsets in Calgary's internal register space */
@@ -110,7 +111,8 @@ static const unsigned long phb_offsets[] = {
 	0xB000 /* PHB3 */
 };
 
-void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES];
+static char bus_to_phb[MAX_PHB_BUS_NUM];
+void* tce_table_kva[MAX_PHB_BUS_NUM];
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
@@ -119,7 +121,7 @@ static int calgary_detected __read_mostly = 0;
  * the bitmap of PHBs the user requested that we disable
  * translation on.
  */
-static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM);
+static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM);
 
 static void tce_cache_blast(struct iommu_table *tbl);
 
@@ -452,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
 
 static inline int busno_to_phbid(unsigned char num)
 {
-	return bus_to_phb(num) % PHBS_PER_CALGARY;
+	return bus_to_phb[num];
 }
 
 static inline unsigned long split_queue_offset(unsigned char num)
@@ -812,7 +814,7 @@ static int __init calgary_init(void)
 	int i, ret = -ENODEV;
 	struct pci_dev *dev = NULL;
 
-	for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) {
+	for (i = 0; i < MAX_PHB_BUS_NUM; i++) {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
 				     PCI_DEVICE_ID_IBM_CALGARY,
 				     dev);
@@ -822,7 +824,7 @@ static int __init calgary_init(void)
 			calgary_init_one_nontraslated(dev);
 			continue;
 		}
-		if (!tce_table_kva[i] && !translate_empty_slots) {
+		if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) {
 			pci_dev_put(dev);
 			continue;
 		}
@@ -842,7 +844,7 @@ error:
 			pci_dev_put(dev);
 			continue;
 		}
-		if (!tce_table_kva[i] && !translate_empty_slots)
+		if (!tce_table_kva[dev->bus->number] && !translate_empty_slots)
 			continue;
 		calgary_disable_translation(dev);
 		calgary_free_tar(dev);
@@ -876,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram)
 void __init detect_calgary(void)
 {
 	u32 val;
-	int bus, table_idx;
+	int bus;
 	void *tbl;
-	int detected = 0;
+	int calgary_found = 0;
+	int phb = -1;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -889,37 +892,46 @@ void __init detect_calgary(void)
 
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
-	for (bus = 0, table_idx = 0;
-	     bus < num_online_nodes() * MAX_PHB_BUS_NUM;
-	     bus++) {
+	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+		int dev;
+
+		tce_table_kva[bus] = NULL;
+		bus_to_phb[bus] = -1;
+
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
+
+		/*
+		 * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3)
+		 * it is connected to releative to the clagary chip.
+		 */
+		phb = (phb + 1) % PHBS_PER_CALGARY;
+
 		if (test_bit(bus, translation_disabled)) {
 			printk(KERN_INFO "Calgary: translation is disabled for "
 			       "PHB 0x%x\n", bus);
 			/* skip this phb, don't allocate a tbl for it */
-			tce_table_kva[table_idx] = NULL;
-			table_idx++;
 			continue;
 		}
 		/*
-		 * scan the first slot of the PCI bus to see if there
-		 * are any devices present
+		 * Scan the slots of the PCI bus to see if there is a device present.
+		 * The parent bus will be the zero-ith device, so start at 1.
 		 */
-		val = read_pci_config(bus, 1, 0, 0);
-		if (val != 0xffffffff || translate_empty_slots) {
-			tbl = alloc_tce_table();
-			if (!tbl)
-				goto cleanup;
-			detected = 1;
-		} else
-			tbl = NULL;
-
-		tce_table_kva[table_idx] = tbl;
-		table_idx++;
+		for (dev = 1; dev < 8; dev++) {
+			val = read_pci_config(bus, dev, 0, 0);
+			if (val != 0xffffffff || translate_empty_slots) {
+				tbl = alloc_tce_table();
+				if (!tbl)
+					goto cleanup;
+				tce_table_kva[bus] = tbl;
+				bus_to_phb[bus] = phb;
+				calgary_found = 1;
+				break;
+			}
+		}
 	}
 
-	if (detected) {
+	if (calgary_found) {
 		iommu_detected = 1;
 		calgary_detected = 1;
 		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. "
@@ -928,9 +940,9 @@ void __init detect_calgary(void)
 		return;
 
 cleanup:
-	for (--table_idx; table_idx >= 0; --table_idx)
-		if (tce_table_kva[table_idx])
-			free_tce_table(tce_table_kva[table_idx]);
+	for (--bus; bus >= 0; --bus)
+		if (tce_table_kva[bus])
+			free_tce_table(tce_table_kva[bus]);
 }
 
 int __init calgary_iommu_init(void)
@@ -1001,7 +1013,7 @@ static int __init calgary_parse_options(char *p)
 		if (p == endp)
 			break;
 
-		if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) {
+		if (bridge < MAX_PHB_BUS_NUM) {
 			printk(KERN_INFO "Calgary: disabling "
 			       "translation for PHB 0x%x\n", bridge);
 			set_bit(bridge, translation_disabled);
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index d3a9e79e954c..5530dda3f27a 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -96,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size)
 static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
 {
 	unsigned int bitmapsz;
-	unsigned int tce_table_index;
 	unsigned long bmppages;
 	int ret;
 
@@ -105,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
 	/* set the tce table size - measured in entries */
 	tbl->it_size = table_size_to_number_of_entries(specified_table_size);
 
-	tce_table_index = bus_to_phb(tbl->it_busno);
-	tbl->it_base = (unsigned long)tce_table_kva[tce_table_index];
+	tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
 	if (!tbl->it_base) {
 		printk(KERN_ERR "Calgary: iommu_table_setparms: "
 		       "no table allocated?!\n");