author    Anton Ivanov <anton.ivanov@cambridgegreys.com>    2017-11-20 16:17:58 -0500
committer Richard Weinberger <richard@nod.at>               2018-02-19 13:38:51 -0500
commit    ff6a17989c08b0bb0fd490cc500b084581b3a9b9 (patch)
tree      1f87a8f21ad3659e61e4958faece57994a842cde
parent    4d1a535b8ec5e74b42dfd9dc809142653b2597f6 (diff)
Epoll based IRQ controller
1. Removes the need to walk the IRQ/Device list to determine
   who triggered the IRQ.
2. Improves scalability (up to several times performance improvement
   for cases with 10s of devices).
3. Improves UML baseline IO performance for one disk + one NIC
   use case by up to 10%.
4. Introduces write poll triggered IRQs.
5. Prerequisite for introducing high performance mmesg family
   of functions in network IO.
6. Fixes RNG shutdown which was leaking a file descriptor.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
 arch/um/drivers/chan_kern.c       |  53
 arch/um/drivers/line.c            |   2
 arch/um/drivers/random.c          |  11
 arch/um/drivers/ubd_kern.c        |   4
 arch/um/include/shared/irq_user.h |  12
 arch/um/include/shared/os.h       |  17
 arch/um/kernel/irq.c              | 460
 arch/um/os-Linux/irq.c            | 202
 8 files changed, 444 insertions(+), 317 deletions(-)
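
The mechanism the whole patch leans on is that epoll carries an opaque back-reference in epoll_event.data.ptr: activate_fd() registers a pointer to the per-fd irq_entry, and epoll_wait() hands the same pointer back, so sigio_handler() no longer walks a device list to find out who fired. A minimal stand-alone sketch of that pattern (hypothetical names, not the driver code itself):

/* Sketch of the data.ptr back-reference pattern used by the patch.
 * Hypothetical stand-alone example, not UML code.
 */
#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

struct conn { int fd; const char *name; };	/* stand-in for struct irq_entry */

int main(void)
{
	struct conn c = { .fd = STDIN_FILENO, .name = "console" };
	struct epoll_event ev = { .events = EPOLLIN | EPOLLET, .data.ptr = &c };
	struct epoll_event hit[8];
	int epfd = epoll_create1(0), i, n;

	epoll_ctl(epfd, EPOLL_CTL_ADD, c.fd, &ev);
	n = epoll_wait(epfd, hit, 8, -1);
	for (i = 0; i < n; i++) {
		/* epoll hands back the pointer we registered - no list walk */
		struct conn *who = hit[i].data.ptr;
		printf("event 0x%x on %s\n", hit[i].events, who->name);
	}
	close(epfd);
	return 0;
}
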
diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index acbe6c67afba..05588f9466c7 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -171,56 +171,19 @@ int enable_chan(struct line *line)
 	return err;
 }
 
-/* Items are added in IRQ context, when free_irq can't be called, and
- * removed in process context, when it can.
- * This handles interrupt sources which disappear, and which need to
- * be permanently disabled. This is discovered in IRQ context, but
- * the freeing of the IRQ must be done later.
- */
-static DEFINE_SPINLOCK(irqs_to_free_lock);
-static LIST_HEAD(irqs_to_free);
-
-void free_irqs(void)
-{
-	struct chan *chan;
-	LIST_HEAD(list);
-	struct list_head *ele;
-	unsigned long flags;
-
-	spin_lock_irqsave(&irqs_to_free_lock, flags);
-	list_splice_init(&irqs_to_free, &list);
-	spin_unlock_irqrestore(&irqs_to_free_lock, flags);
-
-	list_for_each(ele, &list) {
-		chan = list_entry(ele, struct chan, free_list);
-
-		if (chan->input && chan->enabled)
-			um_free_irq(chan->line->driver->read_irq, chan);
-		if (chan->output && chan->enabled)
-			um_free_irq(chan->line->driver->write_irq, chan);
-		chan->enabled = 0;
-	}
-}
-
 static void close_one_chan(struct chan *chan, int delay_free_irq)
 {
-	unsigned long flags;
-
 	if (!chan->opened)
 		return;
 
-	if (delay_free_irq) {
-		spin_lock_irqsave(&irqs_to_free_lock, flags);
-		list_add(&chan->free_list, &irqs_to_free);
-		spin_unlock_irqrestore(&irqs_to_free_lock, flags);
-	}
-	else {
-		if (chan->input && chan->enabled)
-			um_free_irq(chan->line->driver->read_irq, chan);
-		if (chan->output && chan->enabled)
-			um_free_irq(chan->line->driver->write_irq, chan);
-		chan->enabled = 0;
-	}
+	/* we can safely call free now - it will be marked
+	 * as free and freed once the IRQ stopped processing
+	 */
+	if (chan->input && chan->enabled)
+		um_free_irq(chan->line->driver->read_irq, chan);
+	if (chan->output && chan->enabled)
+		um_free_irq(chan->line->driver->write_irq, chan);
+	chan->enabled = 0;
 	if (chan->ops->close != NULL)
 		(*chan->ops->close)(chan->fd, chan->data);
 
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 366e57f5e8d6..8d80b27502e6 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -284,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
 	if (err)
 		return err;
 	if (output)
-		err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
+		err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
 				     line_write_interrupt, IRQF_SHARED,
 				     driver->write_irq_name, data);
 	return err;
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 37c51a6be690..778a0e52d5a5 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -13,6 +13,7 @@
 #include <linux/miscdevice.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <init.h>
 #include <irq_kern.h>
 #include <os.h>
 
@@ -154,7 +155,14 @@ err_out_cleanup_hw:
 /*
  * rng_cleanup - shutdown RNG module
  */
-static void __exit rng_cleanup (void)
+
+static void cleanup(void)
+{
+	free_irq_by_fd(random_fd);
+	os_close_file(random_fd);
+}
+
+static void __exit rng_cleanup(void)
 {
 	os_close_file(random_fd);
 	misc_deregister (&rng_miscdev);
@@ -162,6 +170,7 @@ static void __exit rng_cleanup (void)
 
 module_init (rng_init);
 module_exit (rng_cleanup);
+__uml_exitcall(cleanup);
 
 MODULE_DESCRIPTION("UML Host Random Number Generator (RNG) driver");
 MODULE_LICENSE("GPL");
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index b55fe9bf5d3e..d4e8c497ae86 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1587,11 +1587,11 @@ int io_thread(void *arg)
 
 		do {
 			res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n);
-			if (res > 0) {
+			if (res >= 0) {
 				written += res;
 			} else {
 				if (res != -EAGAIN) {
-					printk("io_thread - read failed, fd = %d, "
+					printk("io_thread - write failed, fd = %d, "
 					       "err = %d\n", kernel_fd, -n);
 				}
 			}
diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h
index df5633053957..a7a6120f19d5 100644
--- a/arch/um/include/shared/irq_user.h
+++ b/arch/um/include/shared/irq_user.h
@@ -7,6 +7,7 @@
 #define __IRQ_USER_H__
 
 #include <sysdep/ptrace.h>
+#include <stdbool.h>
 
 struct irq_fd {
 	struct irq_fd *next;
@@ -15,10 +16,17 @@ struct irq_fd {
 	int type;
 	int irq;
 	int events;
-	int current_events;
+	bool active;
+	bool pending;
+	bool purge;
 };
 
-enum { IRQ_READ, IRQ_WRITE };
+#define IRQ_READ 0
+#define IRQ_WRITE 1
+#define IRQ_NONE 2
+#define MAX_IRQ_TYPE (IRQ_NONE + 1)
+
+
 
 struct siginfo;
 extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index d8ddaf9790d2..048ae37eb5aa 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -290,15 +290,16 @@ extern void halt_skas(void);
 extern void reboot_skas(void);
 
 /* irq.c */
-extern int os_waiting_for_events(struct irq_fd *active_fds);
-extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
-extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
-		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
-extern void os_free_irq_later(struct irq_fd *active_fds,
-		int irq, void *dev_id);
-extern int os_get_pollfd(int i);
-extern void os_set_pollfd(int i, int fd);
+extern int os_waiting_for_events_epoll(void);
+extern void *os_epoll_get_data_pointer(int index);
+extern int os_epoll_triggered(int index, int events);
+extern int os_event_mask(int irq_type);
+extern int os_setup_epoll(void);
+extern int os_add_epoll_fd(int events, int fd, void *data);
+extern int os_mod_epoll_fd(int events, int fd, void *data);
+extern int os_del_epoll_fd(int fd);
 extern void os_set_ioignore(void);
+extern void os_close_epoll_fd(void);
 
 /* sigio.c */
 extern int add_sigio_fd(int fd);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 23cb9350d47e..980148d56537 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -1,4 +1,6 @@
 /*
+ * Copyright (C) 2017 - Cambridge Greys Ltd
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
@@ -16,243 +18,361 @@
 #include <as-layout.h>
 #include <kern_util.h>
 #include <os.h>
+#include <irq_user.h>
 
-/*
- * This list is accessed under irq_lock, except in sigio_handler,
- * where it is safe from being modified. IRQ handlers won't change it -
- * if an IRQ source has vanished, it will be freed by free_irqs just
- * before returning from sigio_handler. That will process a separate
- * list of irqs to free, with its own locking, coming back here to
- * remove list elements, taking the irq_lock to do so.
+
+/* When epoll triggers we do not know why it did so
+ * we can also have different IRQs for read and write.
+ * This is why we keep a small irq_fd array for each fd -
+ * one entry per IRQ type
  */
-static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
 
-extern void free_irqs(void);
+struct irq_entry {
+	struct irq_entry *next;
+	int fd;
+	struct irq_fd *irq_array[MAX_IRQ_TYPE + 1];
+};
+
+static struct irq_entry *active_fds;
+
+static DEFINE_SPINLOCK(irq_lock);
+
+static void irq_io_loop(struct irq_fd *irq, struct uml_pt_regs *regs)
+{
+/*
+ * irq->active guards against reentry
+ * irq->pending accumulates pending requests
+ * if pending is raised the irq_handler is re-run
+ * until pending is cleared
+ */
+	if (irq->active) {
+		irq->active = false;
+		do {
+			irq->pending = false;
+			do_IRQ(irq->irq, regs);
+		} while (irq->pending && (!irq->purge));
+		if (!irq->purge)
+			irq->active = true;
+	} else {
+		irq->pending = true;
+	}
+}
 
 void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
-	struct irq_fd *irq_fd;
-	int n;
+	struct irq_entry *irq_entry;
+	struct irq_fd *irq;
+
+	int n, i, j;
 
 	while (1) {
-		n = os_waiting_for_events(active_fds);
+		/* This is now lockless - epoll keeps back-referencesto the irqs
+		 * which have trigger it so there is no need to walk the irq
+		 * list and lock it every time. We avoid locking by turning off
+		 * IO for a specific fd by executing os_del_epoll_fd(fd) before
+		 * we do any changes to the actual data structures
+		 */
+		n = os_waiting_for_events_epoll();
+
 		if (n <= 0) {
 			if (n == -EINTR)
 				continue;
-			else break;
+			else
+				break;
 		}
 
-		for (irq_fd = active_fds; irq_fd != NULL;
-		     irq_fd = irq_fd->next) {
-			if (irq_fd->current_events != 0) {
-				irq_fd->current_events = 0;
-				do_IRQ(irq_fd->irq, regs);
+		for (i = 0; i < n ; i++) {
+			/* Epoll back reference is the entry with 3 irq_fd
+			 * leaves - one for each irq type.
+			 */
+			irq_entry = (struct irq_entry *)
+				os_epoll_get_data_pointer(i);
+			for (j = 0; j < MAX_IRQ_TYPE ; j++) {
+				irq = irq_entry->irq_array[j];
+				if (irq == NULL)
+					continue;
+				if (os_epoll_triggered(i, irq->events) > 0)
+					irq_io_loop(irq, regs);
+				if (irq->purge) {
+					irq_entry->irq_array[j] = NULL;
+					kfree(irq);
+				}
 			}
 		}
 	}
+}
+
+static int assign_epoll_events_to_irq(struct irq_entry *irq_entry)
+{
+	int i;
+	int events = 0;
+	struct irq_fd *irq;
 
-	free_irqs();
+	for (i = 0; i < MAX_IRQ_TYPE ; i++) {
+		irq = irq_entry->irq_array[i];
+		if (irq != NULL)
+			events = irq->events | events;
+	}
+	if (events > 0) {
+		/* os_add_epoll will call os_mod_epoll if this already exists */
+		return os_add_epoll_fd(events, irq_entry->fd, irq_entry);
+	}
+	/* No events - delete */
+	return os_del_epoll_fd(irq_entry->fd);
 }
 
-static DEFINE_SPINLOCK(irq_lock);
+
 
 static int activate_fd(int irq, int fd, int type, void *dev_id)
 {
-	struct pollfd *tmp_pfd;
-	struct irq_fd *new_fd, *irq_fd;
+	struct irq_fd *new_fd;
+	struct irq_entry *irq_entry;
+	int i, err, events;
 	unsigned long flags;
-	int events, err, n;
 
 	err = os_set_fd_async(fd);
 	if (err < 0)
 		goto out;
 
-	err = -ENOMEM;
-	new_fd = kmalloc(sizeof(struct irq_fd), GFP_KERNEL);
-	if (new_fd == NULL)
-		goto out;
+	spin_lock_irqsave(&irq_lock, flags);
 
-	if (type == IRQ_READ)
-		events = UM_POLLIN | UM_POLLPRI;
-	else events = UM_POLLOUT;
-	*new_fd = ((struct irq_fd) { .next 		= NULL,
-				     .id 		= dev_id,
-				     .fd 		= fd,
-				     .type 		= type,
-				     .irq 		= irq,
-				     .events 		= events,
-				     .current_events 	= 0 } );
+	/* Check if we have an entry for this fd */
 
 	err = -EBUSY;
-	spin_lock_irqsave(&irq_lock, flags);
-	for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
-		if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
-			printk(KERN_ERR "Registering fd %d twice\n", fd);
-			printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
-			printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
-			       dev_id);
+	for (irq_entry = active_fds;
+		irq_entry != NULL; irq_entry = irq_entry->next) {
+		if (irq_entry->fd == fd)
+			break;
+	}
+
+	if (irq_entry == NULL) {
+		/* This needs to be atomic as it may be called from an
+		 * IRQ context.
+		 */
+		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
+		if (irq_entry == NULL) {
+			printk(KERN_ERR
+				"Failed to allocate new IRQ entry\n");
 			goto out_unlock;
 		}
+		irq_entry->fd = fd;
+		for (i = 0; i < MAX_IRQ_TYPE; i++)
+			irq_entry->irq_array[i] = NULL;
+		irq_entry->next = active_fds;
+		active_fds = irq_entry;
 	}
 
-	if (type == IRQ_WRITE)
-		fd = -1;
-
-	tmp_pfd = NULL;
-	n = 0;
+	/* Check if we are trying to re-register an interrupt for a
+	 * particular fd
+	 */
 
-	while (1) {
-		n = os_create_pollfd(fd, events, tmp_pfd, n);
-		if (n == 0)
-			break;
+	if (irq_entry->irq_array[type] != NULL) {
+		printk(KERN_ERR
+			"Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
+			irq, fd, type, dev_id
+		);
+		goto out_unlock;
+	} else {
+		/* New entry for this fd */
+
+		err = -ENOMEM;
+		new_fd = kmalloc(sizeof(struct irq_fd), GFP_ATOMIC);
+		if (new_fd == NULL)
+			goto out_unlock;
 
-		/*
-		 * n > 0
-		 * It means we couldn't put new pollfd to current pollfds
-		 * and tmp_fds is NULL or too small for new pollfds array.
-		 * Needed size is equal to n as minimum.
-		 *
-		 * Here we have to drop the lock in order to call
-		 * kmalloc, which might sleep.
-		 * If something else came in and changed the pollfds array
-		 * so we will not be able to put new pollfd struct to pollfds
-		 * then we free the buffer tmp_fds and try again.
+		events = os_event_mask(type);
+
+		*new_fd = ((struct irq_fd) {
+			.id		= dev_id,
+			.irq		= irq,
+			.type		= type,
+			.events		= events,
+			.active		= true,
+			.pending	= false,
+			.purge		= false
+		});
+		/* Turn off any IO on this fd - allows us to
+		 * avoid locking the IRQ loop
 		 */
-		spin_unlock_irqrestore(&irq_lock, flags);
-		kfree(tmp_pfd);
-
-		tmp_pfd = kmalloc(n, GFP_KERNEL);
-		if (tmp_pfd == NULL)
-			goto out_kfree;
-
-		spin_lock_irqsave(&irq_lock, flags);
+		os_del_epoll_fd(irq_entry->fd);
+		irq_entry->irq_array[type] = new_fd;
 	}
 
-	*last_irq_ptr = new_fd;
-	last_irq_ptr = &new_fd->next;
-
+	/* Turn back IO on with the correct (new) IO event mask */
+	assign_epoll_events_to_irq(irq_entry);
 	spin_unlock_irqrestore(&irq_lock, flags);
-
-	/*
-	 * This calls activate_fd, so it has to be outside the critical
-	 * section.
-	 */
-	maybe_sigio_broken(fd, (type == IRQ_READ));
+	maybe_sigio_broken(fd, (type != IRQ_NONE));
 
 	return 0;
-
- out_unlock:
+out_unlock:
 	spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
-	kfree(new_fd);
- out:
+out:
 	return err;
 }
 
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
+/*
+ * Walk the IRQ list and dispose of any unused entries.
+ * Should be done under irq_lock.
+ */
+
+static void garbage_collect_irq_entries(void)
 {
-	unsigned long flags;
+	int i;
+	bool reap;
+	struct irq_entry *walk;
+	struct irq_entry *previous = NULL;
+	struct irq_entry *to_free;
 
-	spin_lock_irqsave(&irq_lock, flags);
-	os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
-	spin_unlock_irqrestore(&irq_lock, flags);
+	if (active_fds == NULL)
+		return;
+	walk = active_fds;
+	while (walk != NULL) {
+		reap = true;
+		for (i = 0; i < MAX_IRQ_TYPE ; i++) {
+			if (walk->irq_array[i] != NULL) {
+				reap = false;
+				break;
+			}
+		}
+		if (reap) {
+			if (previous == NULL)
+				active_fds = walk->next;
+			else
+				previous->next = walk->next;
+			to_free = walk;
+		} else {
+			to_free = NULL;
+		}
+		walk = walk->next;
+		if (to_free != NULL)
+			kfree(to_free);
+	}
 }
 
-struct irq_and_dev {
-	int irq;
-	void *dev;
-};
+/*
+ * Walk the IRQ list and get the descriptor for our FD
+ */
 
-static int same_irq_and_dev(struct irq_fd *irq, void *d)
+static struct irq_entry *get_irq_entry_by_fd(int fd)
 {
-	struct irq_and_dev *data = d;
+	struct irq_entry *walk = active_fds;
 
-	return ((irq->irq == data->irq) && (irq->id == data->dev));
+	while (walk != NULL) {
+		if (walk->fd == fd)
+			return walk;
+		walk = walk->next;
+	}
+	return NULL;
 }
 
-static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
-{
-	struct irq_and_dev data = ((struct irq_and_dev) { .irq  = irq,
-							  .dev  = dev });
 
-	free_irq_by_cb(same_irq_and_dev, &data);
-}
+/*
+ * Walk the IRQ list and dispose of an entry for a specific
+ * device, fd and number. Note - if sharing an IRQ for read
+ * and writefor the same FD it will be disposed in either case.
+ * If this behaviour is undesirable use different IRQ ids.
+ */
 
-static int same_fd(struct irq_fd *irq, void *fd)
-{
-	return (irq->fd == *((int *)fd));
-}
+#define IGNORE_IRQ 1
+#define IGNORE_DEV (1<<1)
 
-void free_irq_by_fd(int fd)
+static void do_free_by_irq_and_dev(
+	struct irq_entry *irq_entry,
+	unsigned int irq,
+	void *dev,
+	int flags
+)
 {
-	free_irq_by_cb(same_fd, &fd);
+	int i;
+	struct irq_fd *to_free;
+
+	for (i = 0; i < MAX_IRQ_TYPE ; i++) {
+		if (irq_entry->irq_array[i] != NULL) {
+			if (
+			((flags & IGNORE_IRQ) ||
+				(irq_entry->irq_array[i]->irq == irq)) &&
+			((flags & IGNORE_DEV) ||
+				(irq_entry->irq_array[i]->id == dev))
+			) {
+				/* Turn off any IO on this fd - allows us to
+				 * avoid locking the IRQ loop
+				 */
+				os_del_epoll_fd(irq_entry->fd);
+				to_free = irq_entry->irq_array[i];
+				irq_entry->irq_array[i] = NULL;
+				assign_epoll_events_to_irq(irq_entry);
+				if (to_free->active)
+					to_free->purge = true;
+				else
+					kfree(to_free);
+			}
+		}
+	}
 }
 
-/* Must be called with irq_lock held */
-static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
+void free_irq_by_fd(int fd)
 {
-	struct irq_fd *irq;
-	int i = 0;
-	int fdi;
+	struct irq_entry *to_free;
+	unsigned long flags;
 
-	for (irq = active_fds; irq != NULL; irq = irq->next) {
-		if ((irq->fd == fd) && (irq->irq == irqnum))
-			break;
-		i++;
-	}
-	if (irq == NULL) {
-		printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
-		       fd);
-		goto out;
-	}
-	fdi = os_get_pollfd(i);
-	if ((fdi != -1) && (fdi != fd)) {
-		printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds "
-		       "and pollfds, fd %d vs %d, need %d\n", irq->fd,
-		       fdi, fd);
-		irq = NULL;
-		goto out;
+	spin_lock_irqsave(&irq_lock, flags);
+	to_free = get_irq_entry_by_fd(fd);
+	if (to_free != NULL) {
+		do_free_by_irq_and_dev(
+			to_free,
+			-1,
+			NULL,
+			IGNORE_IRQ | IGNORE_DEV
+		);
 	}
-	*index_out = i;
- out:
-	return irq;
+	garbage_collect_irq_entries();
+	spin_unlock_irqrestore(&irq_lock, flags);
 }
 
-void reactivate_fd(int fd, int irqnum)
+static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
 {
-	struct irq_fd *irq;
+	struct irq_entry *to_free;
 	unsigned long flags;
-	int i;
 
 	spin_lock_irqsave(&irq_lock, flags);
-	irq = find_irq_by_fd(fd, irqnum, &i);
-	if (irq == NULL) {
-		spin_unlock_irqrestore(&irq_lock, flags);
-		return;
+	to_free = active_fds;
+	while (to_free != NULL) {
+		do_free_by_irq_and_dev(
+			to_free,
+			irq,
+			dev,
+			0
+		);
+		to_free = to_free->next;
 	}
-	os_set_pollfd(i, irq->fd);
+	garbage_collect_irq_entries();
 	spin_unlock_irqrestore(&irq_lock, flags);
+}
 
-	add_sigio_fd(fd);
+
+void reactivate_fd(int fd, int irqnum)
+{
+	/** NOP - we do auto-EOI now **/
 }
 
 void deactivate_fd(int fd, int irqnum)
 {
-	struct irq_fd *irq;
+	struct irq_entry *to_free;
 	unsigned long flags;
-	int i;
 
+	os_del_epoll_fd(fd);
 	spin_lock_irqsave(&irq_lock, flags);
-	irq = find_irq_by_fd(fd, irqnum, &i);
-	if (irq == NULL) {
-		spin_unlock_irqrestore(&irq_lock, flags);
-		return;
+	to_free = get_irq_entry_by_fd(fd);
+	if (to_free != NULL) {
+		do_free_by_irq_and_dev(
+			to_free,
+			irqnum,
+			NULL,
+			IGNORE_DEV
+		);
 	}
-
-	os_set_pollfd(i, -1);
+	garbage_collect_irq_entries();
 	spin_unlock_irqrestore(&irq_lock, flags);
-
 	ignore_sigio_fd(fd);
 }
 EXPORT_SYMBOL(deactivate_fd);
@@ -265,17 +385,28 @@ EXPORT_SYMBOL(deactivate_fd);
  */
 int deactivate_all_fds(void)
 {
-	struct irq_fd *irq;
-	int err;
+	unsigned long flags;
+	struct irq_entry *to_free;
 
-	for (irq = active_fds; irq != NULL; irq = irq->next) {
-		err = os_clear_fd_async(irq->fd);
-		if (err)
-			return err;
-	}
-	/* If there is a signal already queued, after unblocking ignore it */
+	spin_lock_irqsave(&irq_lock, flags);
+	/* Stop IO. The IRQ loop has no lock so this is our
+	 * only way of making sure we are safe to dispose
+	 * of all IRQ handlers
+	 */
 	os_set_ioignore();
-
+	to_free = active_fds;
+	while (to_free != NULL) {
+		do_free_by_irq_and_dev(
+			to_free,
+			-1,
+			NULL,
+			IGNORE_IRQ | IGNORE_DEV
+		);
+		to_free = to_free->next;
+	}
+	garbage_collect_irq_entries();
+	spin_unlock_irqrestore(&irq_lock, flags);
+	os_close_epoll_fd();
 	return 0;
 }
 
@@ -353,8 +484,11 @@ void __init init_IRQ(void)
 
 	irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
 
+
 	for (i = 1; i < NR_IRQS; i++)
 		irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
+	/* Initialize EPOLL Loop */
+	os_setup_epoll();
 }
 
 /*
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index b9afb74b79ad..365823010346 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,135 +1,147 @@
 /*
+ * Copyright (C) 2017 - Cambridge Greys Ltd
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
 #include <stdlib.h>
 #include <errno.h>
-#include <poll.h>
+#include <sys/epoll.h>
 #include <signal.h>
 #include <string.h>
 #include <irq_user.h>
 #include <os.h>
 #include <um_malloc.h>
 
+/* Epoll support */
+
+static int epollfd = -1;
+
+#define MAX_EPOLL_EVENTS 64
+
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+
+/* Helper to return an Epoll data pointer from an epoll event structure.
+ * We need to keep this one on the userspace side to keep includes separate
+ */
+
+void *os_epoll_get_data_pointer(int index)
+{
+	return epoll_events[index].data.ptr;
+}
+
+/* Helper to compare events versus the events in the epoll structure.
+ * Same as above - needs to be on the userspace side
+ */
+
+
+int os_epoll_triggered(int index, int events)
+{
+	return epoll_events[index].events & events;
+}
+/* Helper to set the event mask.
+ * The event mask is opaque to the kernel side, because it does not have
+ * access to the right includes/defines for EPOLL constants.
+ */
+
+int os_event_mask(int irq_type)
+{
+	if (irq_type == IRQ_READ)
+		return EPOLLIN | EPOLLPRI;
+	if (irq_type == IRQ_WRITE)
+		return EPOLLOUT;
+	return 0;
+}
+
 /*
- * Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd
- * and os_free_irq_by_cb, which are called under irq_lock.
+ * Initial Epoll Setup
  */
-static struct pollfd *pollfds = NULL;
-static int pollfds_num = 0;
-static int pollfds_size = 0;
+int os_setup_epoll(void)
+{
+	epollfd = epoll_create(MAX_EPOLL_EVENTS);
+	return epollfd;
+}
 
-int os_waiting_for_events(struct irq_fd *active_fds)
+/*
+ * Helper to run the actual epoll_wait
+ */
+int os_waiting_for_events_epoll(void)
 {
-	struct irq_fd *irq_fd;
-	int i, n, err;
+	int n, err;
 
-	n = poll(pollfds, pollfds_num, 0);
+	n = epoll_wait(epollfd,
+		(struct epoll_event *) &epoll_events, MAX_EPOLL_EVENTS, 0);
 	if (n < 0) {
 		err = -errno;
 		if (errno != EINTR)
-			printk(UM_KERN_ERR "os_waiting_for_events:"
-			       " poll returned %d, errno = %d\n", n, errno);
+			printk(
+				UM_KERN_ERR "os_waiting_for_events:"
+				" epoll returned %d, error = %s\n", n,
+				strerror(errno)
+			);
 		return err;
 	}
-
-	if (n == 0)
-		return 0;
-
-	irq_fd = active_fds;
-
-	for (i = 0; i < pollfds_num; i++) {
-		if (pollfds[i].revents != 0) {
-			irq_fd->current_events = pollfds[i].revents;
-			pollfds[i].fd = -1;
-		}
-		irq_fd = irq_fd->next;
-	}
 	return n;
 }
 
-int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
-{
-	if (pollfds_num == pollfds_size) {
-		if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
-			/* return min size needed for new pollfds area */
-			return (pollfds_size + 1) * sizeof(pollfds[0]);
-		}
-
-		if (pollfds != NULL) {
-			memcpy(tmp_pfd, pollfds,
-			       sizeof(pollfds[0]) * pollfds_size);
-			/* remove old pollfds */
-			kfree(pollfds);
-		}
-		pollfds = tmp_pfd;
-		pollfds_size++;
-	} else
-		kfree(tmp_pfd);	/* remove not used tmp_pfd */
-
-	pollfds[pollfds_num] = ((struct pollfd) { .fd		= fd,
-						  .events	= events,
-						  .revents	= 0 });
-	pollfds_num++;
-
-	return 0;
-}
 
-void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
-		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
+/*
+ * Helper to add a fd to epoll
+ */
+int os_add_epoll_fd(int events, int fd, void *data)
 {
-	struct irq_fd **prev;
-	int i = 0;
+	struct epoll_event event;
+	int result;
 
-	prev = &active_fds;
-	while (*prev != NULL) {
-		if ((*test)(*prev, arg)) {
-			struct irq_fd *old_fd = *prev;
-			if ((pollfds[i].fd != -1) &&
-			    (pollfds[i].fd != (*prev)->fd)) {
-				printk(UM_KERN_ERR "os_free_irq_by_cb - "
-				       "mismatch between active_fds and "
-				       "pollfds, fd %d vs %d\n",
-				       (*prev)->fd, pollfds[i].fd);
-				goto out;
-			}
-
-			pollfds_num--;
-
-			/*
-			 * This moves the *whole* array after pollfds[i]
-			 * (though it doesn't spot as such)!
-			 */
-			memmove(&pollfds[i], &pollfds[i + 1],
-			       (pollfds_num - i) * sizeof(pollfds[0]));
-			if (*last_irq_ptr2 == &old_fd->next)
-				*last_irq_ptr2 = prev;
-
-			*prev = (*prev)->next;
-			if (old_fd->type == IRQ_WRITE)
-				ignore_sigio_fd(old_fd->fd);
-			kfree(old_fd);
-			continue;
-		}
-		prev = &(*prev)->next;
-		i++;
-	}
- out:
-	return;
+	event.data.ptr = data;
+	event.events = events | EPOLLET;
+	result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
+	if ((result) && (errno == EEXIST))
+		result = os_mod_epoll_fd(events, fd, data);
+	if (result)
+		printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
+	return result;
 }
 
-int os_get_pollfd(int i)
+/*
+ * Helper to mod the fd event mask and/or data backreference
+ */
+int os_mod_epoll_fd(int events, int fd, void *data)
 {
-	return pollfds[i].fd;
+	struct epoll_event event;
+	int result;
+
+	event.data.ptr = data;
+	event.events = events;
+	result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
+	if (result)
+		printk(UM_KERN_ERR
+			"epollctl mod err fd %d, %s\n", fd, strerror(errno));
+	return result;
 }
 
-void os_set_pollfd(int i, int fd)
+/*
+ * Helper to delete the epoll fd
+ */
+int os_del_epoll_fd(int fd)
 {
-	pollfds[i].fd = fd;
+	struct epoll_event event;
+	int result;
+	/* This is quiet as we use this as IO ON/OFF - so it is often
+	 * invoked on a non-existent fd
+	 */
+	result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
+	return result;
 }
 
 void os_set_ioignore(void)
 {
 	signal(SIGIO, SIG_IGN);
 }
+
+void os_close_epoll_fd(void)
+{
+	/* Needed so we do not leak an fd when rebooting */
+	os_close_file(epollfd);
+}
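
For reference, the irq_io_loop() helper added in arch/um/kernel/irq.c coalesces re-entrant deliveries with just the active/pending pair: while a handler runs, further triggers only set pending, and the loop re-runs the handler until pending stays clear (purge aborts the loop when the descriptor is being torn down). A stripped-down sketch of that pattern outside the driver (hypothetical names, single-threaded re-entry assumed):

/* Stand-alone sketch of the active/pending coalescing used by irq_io_loop()
 * (hypothetical example; "trigger" stands in for an epoll event delivery). */
#include <stdbool.h>
#include <stdio.h>

struct evt { bool active; bool pending; int runs; };

static void trigger(struct evt *e);

static void handler(struct evt *e)
{
	e->runs++;
	printf("handler run %d\n", e->runs);
	if (e->runs == 1)
		trigger(e);	/* a new event arrives while we are still handling */
}

static void trigger(struct evt *e)
{
	if (e->active) {
		e->active = false;	/* block nested handler invocation */
		do {
			e->pending = false;
			handler(e);	/* may set pending again via trigger() */
		} while (e->pending);
		e->active = true;
	} else {
		e->pending = true;	/* remember it; the running loop will pick it up */
	}
}

int main(void)
{
	struct evt e = { .active = true, .pending = false, .runs = 0 };

	trigger(&e);	/* prints two handler runs: one direct, one coalesced */
	return 0;
}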