aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/xen
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/xen')
-rw-r--r--drivers/xen/Kconfig19
-rw-r--r--drivers/xen/Makefile4
-rw-r--r--drivers/xen/balloon.c712
-rw-r--r--drivers/xen/events.c674
-rw-r--r--drivers/xen/features.c29
-rw-r--r--drivers/xen/grant-table.c37
-rw-r--r--drivers/xen/xenbus/xenbus_client.c6
-rw-r--r--drivers/xen/xenbus/xenbus_probe.c32
-rw-r--r--drivers/xen/xencomm.c232
9 files changed, 1705 insertions, 40 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
new file mode 100644
index 000000000000..4b75a16de009
--- /dev/null
+++ b/drivers/xen/Kconfig
@@ -0,0 +1,19 @@
1config XEN_BALLOON
2 bool "Xen memory balloon driver"
3 depends on XEN
4 default y
5 help
6 The balloon driver allows the Xen domain to request more memory from
7 the system to expand the domain's memory allocation, or alternatively
8 return unneeded memory to the system.
9
10config XEN_SCRUB_PAGES
11 bool "Scrub pages before returning them to system"
12 depends on XEN_BALLOON
13 default y
14 help
15 Scrub pages before returning them to the system for reuse by
16 other domains. This makes sure that any confidential data
17 is not accidentally visible to other domains. Is it more
18 secure, but slightly less efficient.
19 If in doubt, say yes.
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 56592f0d6cef..37af04f1ffd9 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,2 +1,4 @@
1obj-y += grant-table.o 1obj-y += grant-table.o features.o events.o
2obj-y += xenbus/ 2obj-y += xenbus/
3obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
4obj-$(CONFIG_XEN_BALLOON) += balloon.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
new file mode 100644
index 000000000000..ab25ba6cbbb9
--- /dev/null
+++ b/drivers/xen/balloon.c
@@ -0,0 +1,712 @@
1/******************************************************************************
2 * balloon.c
3 *
4 * Xen balloon driver - enables returning/claiming memory to/from Xen.
5 *
6 * Copyright (c) 2003, B Dragovic
7 * Copyright (c) 2003-2004, M Williamson, K Fraser
8 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation; or, when distributed
13 * separately from the Linux kernel or incorporated into other
14 * software packages, subject to the following license:
15 *
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this source file (the "Software"), to deal in the Software without
18 * restriction, including without limitation the rights to use, copy, modify,
19 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
20 * and to permit persons to whom the Software is furnished to do so, subject to
21 * the following conditions:
22 *
23 * The above copyright notice and this permission notice shall be included in
24 * all copies or substantial portions of the Software.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
32 * IN THE SOFTWARE.
33 */
34
35#include <linux/kernel.h>
36#include <linux/module.h>
37#include <linux/sched.h>
38#include <linux/errno.h>
39#include <linux/mm.h>
40#include <linux/bootmem.h>
41#include <linux/pagemap.h>
42#include <linux/highmem.h>
43#include <linux/mutex.h>
44#include <linux/highmem.h>
45#include <linux/list.h>
46#include <linux/sysdev.h>
47
48#include <asm/xen/hypervisor.h>
49#include <asm/page.h>
50#include <asm/pgalloc.h>
51#include <asm/pgtable.h>
52#include <asm/uaccess.h>
53#include <asm/tlb.h>
54
55#include <xen/interface/memory.h>
56#include <xen/balloon.h>
57#include <xen/xenbus.h>
58#include <xen/features.h>
59#include <xen/page.h>
60
61#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
62
63#define BALLOON_CLASS_NAME "memory"
64
65struct balloon_stats {
66 /* We aim for 'current allocation' == 'target allocation'. */
67 unsigned long current_pages;
68 unsigned long target_pages;
69 /* We may hit the hard limit in Xen. If we do then we remember it. */
70 unsigned long hard_limit;
71 /*
72 * Drivers may alter the memory reservation independently, but they
73 * must inform the balloon driver so we avoid hitting the hard limit.
74 */
75 unsigned long driver_pages;
76 /* Number of pages in high- and low-memory balloons. */
77 unsigned long balloon_low;
78 unsigned long balloon_high;
79};
80
81static DEFINE_MUTEX(balloon_mutex);
82
83static struct sys_device balloon_sysdev;
84
85static int register_balloon(struct sys_device *sysdev);
86
87/*
88 * Protects atomic reservation decrease/increase against concurrent increases.
89 * Also protects non-atomic updates of current_pages and driver_pages, and
90 * balloon lists.
91 */
92static DEFINE_SPINLOCK(balloon_lock);
93
94static struct balloon_stats balloon_stats;
95
96/* We increase/decrease in batches which fit in a page */
97static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
98
99/* VM /proc information for memory */
100extern unsigned long totalram_pages;
101
102#ifdef CONFIG_HIGHMEM
103extern unsigned long totalhigh_pages;
104#define inc_totalhigh_pages() (totalhigh_pages++)
105#define dec_totalhigh_pages() (totalhigh_pages--)
106#else
107#define inc_totalhigh_pages() do {} while(0)
108#define dec_totalhigh_pages() do {} while(0)
109#endif
110
111/* List of ballooned pages, threaded through the mem_map array. */
112static LIST_HEAD(ballooned_pages);
113
114/* Main work function, always executed in process context. */
115static void balloon_process(struct work_struct *work);
116static DECLARE_WORK(balloon_worker, balloon_process);
117static struct timer_list balloon_timer;
118
119/* When ballooning out (allocating memory to return to Xen) we don't really
120 want the kernel to try too hard since that can trigger the oom killer. */
121#define GFP_BALLOON \
122 (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
123
124static void scrub_page(struct page *page)
125{
126#ifdef CONFIG_XEN_SCRUB_PAGES
127 if (PageHighMem(page)) {
128 void *v = kmap(page);
129 clear_page(v);
130 kunmap(v);
131 } else {
132 void *v = page_address(page);
133 clear_page(v);
134 }
135#endif
136}
137
138/* balloon_append: add the given page to the balloon. */
139static void balloon_append(struct page *page)
140{
141 /* Lowmem is re-populated first, so highmem pages go at list tail. */
142 if (PageHighMem(page)) {
143 list_add_tail(&page->lru, &ballooned_pages);
144 balloon_stats.balloon_high++;
145 dec_totalhigh_pages();
146 } else {
147 list_add(&page->lru, &ballooned_pages);
148 balloon_stats.balloon_low++;
149 }
150}
151
152/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
153static struct page *balloon_retrieve(void)
154{
155 struct page *page;
156
157 if (list_empty(&ballooned_pages))
158 return NULL;
159
160 page = list_entry(ballooned_pages.next, struct page, lru);
161 list_del(&page->lru);
162
163 if (PageHighMem(page)) {
164 balloon_stats.balloon_high--;
165 inc_totalhigh_pages();
166 }
167 else
168 balloon_stats.balloon_low--;
169
170 return page;
171}
172
173static struct page *balloon_first_page(void)
174{
175 if (list_empty(&ballooned_pages))
176 return NULL;
177 return list_entry(ballooned_pages.next, struct page, lru);
178}
179
180static struct page *balloon_next_page(struct page *page)
181{
182 struct list_head *next = page->lru.next;
183 if (next == &ballooned_pages)
184 return NULL;
185 return list_entry(next, struct page, lru);
186}
187
188static void balloon_alarm(unsigned long unused)
189{
190 schedule_work(&balloon_worker);
191}
192
193static unsigned long current_target(void)
194{
195 unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit);
196
197 target = min(target,
198 balloon_stats.current_pages +
199 balloon_stats.balloon_low +
200 balloon_stats.balloon_high);
201
202 return target;
203}
204
205static int increase_reservation(unsigned long nr_pages)
206{
207 unsigned long pfn, i, flags;
208 struct page *page;
209 long rc;
210 struct xen_memory_reservation reservation = {
211 .address_bits = 0,
212 .extent_order = 0,
213 .domid = DOMID_SELF
214 };
215
216 if (nr_pages > ARRAY_SIZE(frame_list))
217 nr_pages = ARRAY_SIZE(frame_list);
218
219 spin_lock_irqsave(&balloon_lock, flags);
220
221 page = balloon_first_page();
222 for (i = 0; i < nr_pages; i++) {
223 BUG_ON(page == NULL);
224 frame_list[i] = page_to_pfn(page);;
225 page = balloon_next_page(page);
226 }
227
228 reservation.extent_start = (unsigned long)frame_list;
229 reservation.nr_extents = nr_pages;
230 rc = HYPERVISOR_memory_op(
231 XENMEM_populate_physmap, &reservation);
232 if (rc < nr_pages) {
233 if (rc > 0) {
234 int ret;
235
236 /* We hit the Xen hard limit: reprobe. */
237 reservation.nr_extents = rc;
238 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
239 &reservation);
240 BUG_ON(ret != rc);
241 }
242 if (rc >= 0)
243 balloon_stats.hard_limit = (balloon_stats.current_pages + rc -
244 balloon_stats.driver_pages);
245 goto out;
246 }
247
248 for (i = 0; i < nr_pages; i++) {
249 page = balloon_retrieve();
250 BUG_ON(page == NULL);
251
252 pfn = page_to_pfn(page);
253 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
254 phys_to_machine_mapping_valid(pfn));
255
256 set_phys_to_machine(pfn, frame_list[i]);
257
258 /* Link back into the page tables if not highmem. */
259 if (pfn < max_low_pfn) {
260 int ret;
261 ret = HYPERVISOR_update_va_mapping(
262 (unsigned long)__va(pfn << PAGE_SHIFT),
263 mfn_pte(frame_list[i], PAGE_KERNEL),
264 0);
265 BUG_ON(ret);
266 }
267
268 /* Relinquish the page back to the allocator. */
269 ClearPageReserved(page);
270 init_page_count(page);
271 __free_page(page);
272 }
273
274 balloon_stats.current_pages += nr_pages;
275 totalram_pages = balloon_stats.current_pages;
276
277 out:
278 spin_unlock_irqrestore(&balloon_lock, flags);
279
280 return 0;
281}
282
283static int decrease_reservation(unsigned long nr_pages)
284{
285 unsigned long pfn, i, flags;
286 struct page *page;
287 int need_sleep = 0;
288 int ret;
289 struct xen_memory_reservation reservation = {
290 .address_bits = 0,
291 .extent_order = 0,
292 .domid = DOMID_SELF
293 };
294
295 if (nr_pages > ARRAY_SIZE(frame_list))
296 nr_pages = ARRAY_SIZE(frame_list);
297
298 for (i = 0; i < nr_pages; i++) {
299 if ((page = alloc_page(GFP_BALLOON)) == NULL) {
300 nr_pages = i;
301 need_sleep = 1;
302 break;
303 }
304
305 pfn = page_to_pfn(page);
306 frame_list[i] = pfn_to_mfn(pfn);
307
308 scrub_page(page);
309 }
310
311 /* Ensure that ballooned highmem pages don't have kmaps. */
312 kmap_flush_unused();
313 flush_tlb_all();
314
315 spin_lock_irqsave(&balloon_lock, flags);
316
317 /* No more mappings: invalidate P2M and add to balloon. */
318 for (i = 0; i < nr_pages; i++) {
319 pfn = mfn_to_pfn(frame_list[i]);
320 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
321 balloon_append(pfn_to_page(pfn));
322 }
323
324 reservation.extent_start = (unsigned long)frame_list;
325 reservation.nr_extents = nr_pages;
326 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
327 BUG_ON(ret != nr_pages);
328
329 balloon_stats.current_pages -= nr_pages;
330 totalram_pages = balloon_stats.current_pages;
331
332 spin_unlock_irqrestore(&balloon_lock, flags);
333
334 return need_sleep;
335}
336
337/*
338 * We avoid multiple worker processes conflicting via the balloon mutex.
339 * We may of course race updates of the target counts (which are protected
340 * by the balloon lock), or with changes to the Xen hard limit, but we will
341 * recover from these in time.
342 */
343static void balloon_process(struct work_struct *work)
344{
345 int need_sleep = 0;
346 long credit;
347
348 mutex_lock(&balloon_mutex);
349
350 do {
351 credit = current_target() - balloon_stats.current_pages;
352 if (credit > 0)
353 need_sleep = (increase_reservation(credit) != 0);
354 if (credit < 0)
355 need_sleep = (decrease_reservation(-credit) != 0);
356
357#ifndef CONFIG_PREEMPT
358 if (need_resched())
359 schedule();
360#endif
361 } while ((credit != 0) && !need_sleep);
362
363 /* Schedule more work if there is some still to be done. */
364 if (current_target() != balloon_stats.current_pages)
365 mod_timer(&balloon_timer, jiffies + HZ);
366
367 mutex_unlock(&balloon_mutex);
368}
369
370/* Resets the Xen limit, sets new target, and kicks off processing. */
371void balloon_set_new_target(unsigned long target)
372{
373 /* No need for lock. Not read-modify-write updates. */
374 balloon_stats.hard_limit = ~0UL;
375 balloon_stats.target_pages = target;
376 schedule_work(&balloon_worker);
377}
378
379static struct xenbus_watch target_watch =
380{
381 .node = "memory/target"
382};
383
384/* React to a change in the target key */
385static void watch_target(struct xenbus_watch *watch,
386 const char **vec, unsigned int len)
387{
388 unsigned long long new_target;
389 int err;
390
391 err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
392 if (err != 1) {
393 /* This is ok (for domain0 at least) - so just return */
394 return;
395 }
396
397 /* The given memory/target value is in KiB, so it needs converting to
398 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
399 */
400 balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
401}
402
403static int balloon_init_watcher(struct notifier_block *notifier,
404 unsigned long event,
405 void *data)
406{
407 int err;
408
409 err = register_xenbus_watch(&target_watch);
410 if (err)
411 printk(KERN_ERR "Failed to set balloon watcher\n");
412
413 return NOTIFY_DONE;
414}
415
416static struct notifier_block xenstore_notifier;
417
418static int __init balloon_init(void)
419{
420 unsigned long pfn;
421 struct page *page;
422
423 if (!is_running_on_xen())
424 return -ENODEV;
425
426 pr_info("xen_balloon: Initialising balloon driver.\n");
427
428 balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
429 totalram_pages = balloon_stats.current_pages;
430 balloon_stats.target_pages = balloon_stats.current_pages;
431 balloon_stats.balloon_low = 0;
432 balloon_stats.balloon_high = 0;
433 balloon_stats.driver_pages = 0UL;
434 balloon_stats.hard_limit = ~0UL;
435
436 init_timer(&balloon_timer);
437 balloon_timer.data = 0;
438 balloon_timer.function = balloon_alarm;
439
440 register_balloon(&balloon_sysdev);
441
442 /* Initialise the balloon with excess memory space. */
443 for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
444 page = pfn_to_page(pfn);
445 if (!PageReserved(page))
446 balloon_append(page);
447 }
448
449 target_watch.callback = watch_target;
450 xenstore_notifier.notifier_call = balloon_init_watcher;
451
452 register_xenstore_notifier(&xenstore_notifier);
453
454 return 0;
455}
456
457subsys_initcall(balloon_init);
458
459static void balloon_exit(void)
460{
461 /* XXX - release balloon here */
462 return;
463}
464
465module_exit(balloon_exit);
466
467static void balloon_update_driver_allowance(long delta)
468{
469 unsigned long flags;
470
471 spin_lock_irqsave(&balloon_lock, flags);
472 balloon_stats.driver_pages += delta;
473 spin_unlock_irqrestore(&balloon_lock, flags);
474}
475
476static int dealloc_pte_fn(
477 pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
478{
479 unsigned long mfn = pte_mfn(*pte);
480 int ret;
481 struct xen_memory_reservation reservation = {
482 .nr_extents = 1,
483 .extent_order = 0,
484 .domid = DOMID_SELF
485 };
486 reservation.extent_start = (unsigned long)&mfn;
487 set_pte_at(&init_mm, addr, pte, __pte_ma(0ull));
488 set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
489 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
490 BUG_ON(ret != 1);
491 return 0;
492}
493
494static struct page **alloc_empty_pages_and_pagevec(int nr_pages)
495{
496 unsigned long vaddr, flags;
497 struct page *page, **pagevec;
498 int i, ret;
499
500 pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
501 if (pagevec == NULL)
502 return NULL;
503
504 for (i = 0; i < nr_pages; i++) {
505 page = pagevec[i] = alloc_page(GFP_KERNEL);
506 if (page == NULL)
507 goto err;
508
509 vaddr = (unsigned long)page_address(page);
510
511 scrub_page(page);
512
513 spin_lock_irqsave(&balloon_lock, flags);
514
515 if (xen_feature(XENFEAT_auto_translated_physmap)) {
516 unsigned long gmfn = page_to_pfn(page);
517 struct xen_memory_reservation reservation = {
518 .nr_extents = 1,
519 .extent_order = 0,
520 .domid = DOMID_SELF
521 };
522 reservation.extent_start = (unsigned long)&gmfn;
523 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
524 &reservation);
525 if (ret == 1)
526 ret = 0; /* success */
527 } else {
528 ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
529 dealloc_pte_fn, NULL);
530 }
531
532 if (ret != 0) {
533 spin_unlock_irqrestore(&balloon_lock, flags);
534 __free_page(page);
535 goto err;
536 }
537
538 totalram_pages = --balloon_stats.current_pages;
539
540 spin_unlock_irqrestore(&balloon_lock, flags);
541 }
542
543 out:
544 schedule_work(&balloon_worker);
545 flush_tlb_all();
546 return pagevec;
547
548 err:
549 spin_lock_irqsave(&balloon_lock, flags);
550 while (--i >= 0)
551 balloon_append(pagevec[i]);
552 spin_unlock_irqrestore(&balloon_lock, flags);
553 kfree(pagevec);
554 pagevec = NULL;
555 goto out;
556}
557
558static void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
559{
560 unsigned long flags;
561 int i;
562
563 if (pagevec == NULL)
564 return;
565
566 spin_lock_irqsave(&balloon_lock, flags);
567 for (i = 0; i < nr_pages; i++) {
568 BUG_ON(page_count(pagevec[i]) != 1);
569 balloon_append(pagevec[i]);
570 }
571 spin_unlock_irqrestore(&balloon_lock, flags);
572
573 kfree(pagevec);
574
575 schedule_work(&balloon_worker);
576}
577
578static void balloon_release_driver_page(struct page *page)
579{
580 unsigned long flags;
581
582 spin_lock_irqsave(&balloon_lock, flags);
583 balloon_append(page);
584 balloon_stats.driver_pages--;
585 spin_unlock_irqrestore(&balloon_lock, flags);
586
587 schedule_work(&balloon_worker);
588}
589
590
591#define BALLOON_SHOW(name, format, args...) \
592 static ssize_t show_##name(struct sys_device *dev, \
593 char *buf) \
594 { \
595 return sprintf(buf, format, ##args); \
596 } \
597 static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
598
599BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
600BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
601BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
602BALLOON_SHOW(hard_limit_kb,
603 (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n",
604 (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
605BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
606
607static ssize_t show_target_kb(struct sys_device *dev, char *buf)
608{
609 return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
610}
611
612static ssize_t store_target_kb(struct sys_device *dev,
613 const char *buf,
614 size_t count)
615{
616 char memstring[64], *endchar;
617 unsigned long long target_bytes;
618
619 if (!capable(CAP_SYS_ADMIN))
620 return -EPERM;
621
622 if (count <= 1)
623 return -EBADMSG; /* runt */
624 if (count > sizeof(memstring))
625 return -EFBIG; /* too long */
626 strcpy(memstring, buf);
627
628 target_bytes = memparse(memstring, &endchar);
629 balloon_set_new_target(target_bytes >> PAGE_SHIFT);
630
631 return count;
632}
633
634static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
635 show_target_kb, store_target_kb);
636
637static struct sysdev_attribute *balloon_attrs[] = {
638 &attr_target_kb,
639};
640
641static struct attribute *balloon_info_attrs[] = {
642 &attr_current_kb.attr,
643 &attr_low_kb.attr,
644 &attr_high_kb.attr,
645 &attr_hard_limit_kb.attr,
646 &attr_driver_kb.attr,
647 NULL
648};
649
650static struct attribute_group balloon_info_group = {
651 .name = "info",
652 .attrs = balloon_info_attrs,
653};
654
655static struct sysdev_class balloon_sysdev_class = {
656 .name = BALLOON_CLASS_NAME,
657};
658
659static int register_balloon(struct sys_device *sysdev)
660{
661 int i, error;
662
663 error = sysdev_class_register(&balloon_sysdev_class);
664 if (error)
665 return error;
666
667 sysdev->id = 0;
668 sysdev->cls = &balloon_sysdev_class;
669
670 error = sysdev_register(sysdev);
671 if (error) {
672 sysdev_class_unregister(&balloon_sysdev_class);
673 return error;
674 }
675
676 for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
677 error = sysdev_create_file(sysdev, balloon_attrs[i]);
678 if (error)
679 goto fail;
680 }
681
682 error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
683 if (error)
684 goto fail;
685
686 return 0;
687
688 fail:
689 while (--i >= 0)
690 sysdev_remove_file(sysdev, balloon_attrs[i]);
691 sysdev_unregister(sysdev);
692 sysdev_class_unregister(&balloon_sysdev_class);
693 return error;
694}
695
696static void unregister_balloon(struct sys_device *sysdev)
697{
698 int i;
699
700 sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
701 for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
702 sysdev_remove_file(sysdev, balloon_attrs[i]);
703 sysdev_unregister(sysdev);
704 sysdev_class_unregister(&balloon_sysdev_class);
705}
706
707static void balloon_sysfs_exit(void)
708{
709 unregister_balloon(&balloon_sysdev);
710}
711
712MODULE_LICENSE("GPL");
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
new file mode 100644
index 000000000000..4f0f22b020ea
--- /dev/null
+++ b/drivers/xen/events.c
@@ -0,0 +1,674 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQ is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is recieved, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34#include <asm/xen/hypervisor.h>
35
36#include <xen/xen-ops.h>
37#include <xen/events.h>
38#include <xen/interface/xen.h>
39#include <xen/interface/event_channel.h>
40
41/*
42 * This lock protects updates to the following mapping and reference-count
43 * arrays. The lock does not need to be acquired to read the mapping tables.
44 */
45static DEFINE_SPINLOCK(irq_mapping_update_lock);
46
47/* IRQ <-> VIRQ mapping. */
48static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
49
50/* IRQ <-> IPI mapping */
51static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
52
53/* Packed IRQ information: binding type, sub-type index, and event channel. */
54struct packed_irq
55{
56 unsigned short evtchn;
57 unsigned char index;
58 unsigned char type;
59};
60
61static struct packed_irq irq_info[NR_IRQS];
62
63/* Binding types. */
64enum {
65 IRQT_UNBOUND,
66 IRQT_PIRQ,
67 IRQT_VIRQ,
68 IRQT_IPI,
69 IRQT_EVTCHN
70};
71
72/* Convenient shorthand for packed representation of an unbound IRQ. */
73#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
74
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1
77};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
79static u8 cpu_evtchn[NR_EVENT_CHANNELS];
80
81/* Reference counts for bindings to IRQs. */
82static int irq_bindcount[NR_IRQS];
83
84/* Xen will never allocate port zero for any purpose. */
85#define VALID_EVTCHN(chn) ((chn) != 0)
86
87/*
88 * Force a proper event-channel callback from Xen after clearing the
89 * callback mask. We do this in a very simple manner, by making a call
90 * down into Xen. The pending flag will be checked by Xen on return.
91 */
92void force_evtchn_callback(void)
93{
94 (void)HYPERVISOR_xen_version(0, NULL);
95}
96EXPORT_SYMBOL_GPL(force_evtchn_callback);
97
98static struct irq_chip xen_dynamic_chip;
99
100/* Constructor for packed IRQ information. */
101static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
102{
103 return (struct packed_irq) { evtchn, index, type };
104}
105
106/*
107 * Accessors for packed IRQ information.
108 */
109static inline unsigned int evtchn_from_irq(int irq)
110{
111 return irq_info[irq].evtchn;
112}
113
114static inline unsigned int index_from_irq(int irq)
115{
116 return irq_info[irq].index;
117}
118
119static inline unsigned int type_from_irq(int irq)
120{
121 return irq_info[irq].type;
122}
123
124static inline unsigned long active_evtchns(unsigned int cpu,
125 struct shared_info *sh,
126 unsigned int idx)
127{
128 return (sh->evtchn_pending[idx] &
129 cpu_evtchn_mask[cpu][idx] &
130 ~sh->evtchn_mask[idx]);
131}
132
133static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
134{
135 int irq = evtchn_to_irq[chn];
136
137 BUG_ON(irq == -1);
138#ifdef CONFIG_SMP
139 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
140#endif
141
142 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
143 __set_bit(chn, cpu_evtchn_mask[cpu]);
144
145 cpu_evtchn[chn] = cpu;
146}
147
148static void init_evtchn_cpu_bindings(void)
149{
150#ifdef CONFIG_SMP
151 int i;
152 /* By default all event channels notify CPU#0. */
153 for (i = 0; i < NR_IRQS; i++)
154 irq_desc[i].affinity = cpumask_of_cpu(0);
155#endif
156
157 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
158 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
159}
160
161static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
162{
163 return cpu_evtchn[evtchn];
164}
165
166static inline void clear_evtchn(int port)
167{
168 struct shared_info *s = HYPERVISOR_shared_info;
169 sync_clear_bit(port, &s->evtchn_pending[0]);
170}
171
172static inline void set_evtchn(int port)
173{
174 struct shared_info *s = HYPERVISOR_shared_info;
175 sync_set_bit(port, &s->evtchn_pending[0]);
176}
177
178
179/**
180 * notify_remote_via_irq - send event to remote end of event channel via irq
181 * @irq: irq of event channel to send event to
182 *
183 * Unlike notify_remote_via_evtchn(), this is safe to use across
184 * save/restore. Notifications on a broken connection are silently
185 * dropped.
186 */
187void notify_remote_via_irq(int irq)
188{
189 int evtchn = evtchn_from_irq(irq);
190
191 if (VALID_EVTCHN(evtchn))
192 notify_remote_via_evtchn(evtchn);
193}
194EXPORT_SYMBOL_GPL(notify_remote_via_irq);
195
196static void mask_evtchn(int port)
197{
198 struct shared_info *s = HYPERVISOR_shared_info;
199 sync_set_bit(port, &s->evtchn_mask[0]);
200}
201
202static void unmask_evtchn(int port)
203{
204 struct shared_info *s = HYPERVISOR_shared_info;
205 unsigned int cpu = get_cpu();
206
207 BUG_ON(!irqs_disabled());
208
209 /* Slow path (hypercall) if this is a non-local port. */
210 if (unlikely(cpu != cpu_from_evtchn(port))) {
211 struct evtchn_unmask unmask = { .port = port };
212 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
213 } else {
214 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
215
216 sync_clear_bit(port, &s->evtchn_mask[0]);
217
218 /*
219 * The following is basically the equivalent of
220 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
221 * the interrupt edge' if the channel is masked.
222 */
223 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
224 !sync_test_and_set_bit(port / BITS_PER_LONG,
225 &vcpu_info->evtchn_pending_sel))
226 vcpu_info->evtchn_upcall_pending = 1;
227 }
228
229 put_cpu();
230}
231
232static int find_unbound_irq(void)
233{
234 int irq;
235
236 /* Only allocate from dynirq range */
237 for (irq = 0; irq < NR_IRQS; irq++)
238 if (irq_bindcount[irq] == 0)
239 break;
240
241 if (irq == NR_IRQS)
242 panic("No available IRQ to bind to: increase NR_IRQS!\n");
243
244 return irq;
245}
246
247int bind_evtchn_to_irq(unsigned int evtchn)
248{
249 int irq;
250
251 spin_lock(&irq_mapping_update_lock);
252
253 irq = evtchn_to_irq[evtchn];
254
255 if (irq == -1) {
256 irq = find_unbound_irq();
257
258 dynamic_irq_init(irq);
259 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
260 handle_level_irq, "event");
261
262 evtchn_to_irq[evtchn] = irq;
263 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
264 }
265
266 irq_bindcount[irq]++;
267
268 spin_unlock(&irq_mapping_update_lock);
269
270 return irq;
271}
272EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
273
274static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
275{
276 struct evtchn_bind_ipi bind_ipi;
277 int evtchn, irq;
278
279 spin_lock(&irq_mapping_update_lock);
280
281 irq = per_cpu(ipi_to_irq, cpu)[ipi];
282 if (irq == -1) {
283 irq = find_unbound_irq();
284 if (irq < 0)
285 goto out;
286
287 dynamic_irq_init(irq);
288 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
289 handle_level_irq, "ipi");
290
291 bind_ipi.vcpu = cpu;
292 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
293 &bind_ipi) != 0)
294 BUG();
295 evtchn = bind_ipi.port;
296
297 evtchn_to_irq[evtchn] = irq;
298 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
299
300 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
301
302 bind_evtchn_to_cpu(evtchn, cpu);
303 }
304
305 irq_bindcount[irq]++;
306
307 out:
308 spin_unlock(&irq_mapping_update_lock);
309 return irq;
310}
311
312
313static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
314{
315 struct evtchn_bind_virq bind_virq;
316 int evtchn, irq;
317
318 spin_lock(&irq_mapping_update_lock);
319
320 irq = per_cpu(virq_to_irq, cpu)[virq];
321
322 if (irq == -1) {
323 bind_virq.virq = virq;
324 bind_virq.vcpu = cpu;
325 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
326 &bind_virq) != 0)
327 BUG();
328 evtchn = bind_virq.port;
329
330 irq = find_unbound_irq();
331
332 dynamic_irq_init(irq);
333 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
334 handle_level_irq, "virq");
335
336 evtchn_to_irq[evtchn] = irq;
337 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
338
339 per_cpu(virq_to_irq, cpu)[virq] = irq;
340
341 bind_evtchn_to_cpu(evtchn, cpu);
342 }
343
344 irq_bindcount[irq]++;
345
346 spin_unlock(&irq_mapping_update_lock);
347
348 return irq;
349}
350
351static void unbind_from_irq(unsigned int irq)
352{
353 struct evtchn_close close;
354 int evtchn = evtchn_from_irq(irq);
355
356 spin_lock(&irq_mapping_update_lock);
357
358 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
359 close.port = evtchn;
360 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
361 BUG();
362
363 switch (type_from_irq(irq)) {
364 case IRQT_VIRQ:
365 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
366 [index_from_irq(irq)] = -1;
367 break;
368 default:
369 break;
370 }
371
372 /* Closed ports are implicitly re-bound to VCPU0. */
373 bind_evtchn_to_cpu(evtchn, 0);
374
375 evtchn_to_irq[evtchn] = -1;
376 irq_info[irq] = IRQ_UNBOUND;
377
378 dynamic_irq_init(irq);
379 }
380
381 spin_unlock(&irq_mapping_update_lock);
382}
383
384int bind_evtchn_to_irqhandler(unsigned int evtchn,
385 irq_handler_t handler,
386 unsigned long irqflags,
387 const char *devname, void *dev_id)
388{
389 unsigned int irq;
390 int retval;
391
392 irq = bind_evtchn_to_irq(evtchn);
393 retval = request_irq(irq, handler, irqflags, devname, dev_id);
394 if (retval != 0) {
395 unbind_from_irq(irq);
396 return retval;
397 }
398
399 return irq;
400}
401EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
402
403int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
404 irq_handler_t handler,
405 unsigned long irqflags, const char *devname, void *dev_id)
406{
407 unsigned int irq;
408 int retval;
409
410 irq = bind_virq_to_irq(virq, cpu);
411 retval = request_irq(irq, handler, irqflags, devname, dev_id);
412 if (retval != 0) {
413 unbind_from_irq(irq);
414 return retval;
415 }
416
417 return irq;
418}
419EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
420
421int bind_ipi_to_irqhandler(enum ipi_vector ipi,
422 unsigned int cpu,
423 irq_handler_t handler,
424 unsigned long irqflags,
425 const char *devname,
426 void *dev_id)
427{
428 int irq, retval;
429
430 irq = bind_ipi_to_irq(ipi, cpu);
431 if (irq < 0)
432 return irq;
433
434 retval = request_irq(irq, handler, irqflags, devname, dev_id);
435 if (retval != 0) {
436 unbind_from_irq(irq);
437 return retval;
438 }
439
440 return irq;
441}
442
443void unbind_from_irqhandler(unsigned int irq, void *dev_id)
444{
445 free_irq(irq, dev_id);
446 unbind_from_irq(irq);
447}
448EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
449
450void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
451{
452 int irq = per_cpu(ipi_to_irq, cpu)[vector];
453 BUG_ON(irq < 0);
454 notify_remote_via_irq(irq);
455}
456
457irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
458{
459 struct shared_info *sh = HYPERVISOR_shared_info;
460 int cpu = smp_processor_id();
461 int i;
462 unsigned long flags;
463 static DEFINE_SPINLOCK(debug_lock);
464
465 spin_lock_irqsave(&debug_lock, flags);
466
467 printk("vcpu %d\n ", cpu);
468
469 for_each_online_cpu(i) {
470 struct vcpu_info *v = per_cpu(xen_vcpu, i);
471 printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
472 (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
473 v->evtchn_upcall_pending,
474 v->evtchn_pending_sel);
475 }
476 printk("pending:\n ");
477 for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
478 printk("%08lx%s", sh->evtchn_pending[i],
479 i % 8 == 0 ? "\n " : " ");
480 printk("\nmasks:\n ");
481 for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
482 printk("%08lx%s", sh->evtchn_mask[i],
483 i % 8 == 0 ? "\n " : " ");
484
485 printk("\nunmasked:\n ");
486 for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
487 printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
488 i % 8 == 0 ? "\n " : " ");
489
490 printk("\npending list:\n");
491 for(i = 0; i < NR_EVENT_CHANNELS; i++) {
492 if (sync_test_bit(i, sh->evtchn_pending)) {
493 printk(" %d: event %d -> irq %d\n",
494 cpu_evtchn[i], i,
495 evtchn_to_irq[i]);
496 }
497 }
498
499 spin_unlock_irqrestore(&debug_lock, flags);
500
501 return IRQ_HANDLED;
502}
503
504
505/*
506 * Search the CPUs pending events bitmasks. For each one found, map
507 * the event number to an irq, and feed it into do_IRQ() for
508 * handling.
509 *
510 * Xen uses a two-level bitmap to speed searching. The first level is
511 * a bitset of words which contain pending event bits. The second
512 * level is a bitset of pending events themselves.
513 */
514void xen_evtchn_do_upcall(struct pt_regs *regs)
515{
516 int cpu = get_cpu();
517 struct shared_info *s = HYPERVISOR_shared_info;
518 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
519 static DEFINE_PER_CPU(unsigned, nesting_count);
520 unsigned count;
521
522 do {
523 unsigned long pending_words;
524
525 vcpu_info->evtchn_upcall_pending = 0;
526
527 if (__get_cpu_var(nesting_count)++)
528 goto out;
529
530#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
531 /* Clear master flag /before/ clearing selector flag. */
532 rmb();
533#endif
534 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
535 while (pending_words != 0) {
536 unsigned long pending_bits;
537 int word_idx = __ffs(pending_words);
538 pending_words &= ~(1UL << word_idx);
539
540 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
541 int bit_idx = __ffs(pending_bits);
542 int port = (word_idx * BITS_PER_LONG) + bit_idx;
543 int irq = evtchn_to_irq[port];
544
545 if (irq != -1)
546 xen_do_IRQ(irq, regs);
547 }
548 }
549
550 BUG_ON(!irqs_disabled());
551
552 count = __get_cpu_var(nesting_count);
553 __get_cpu_var(nesting_count) = 0;
554 } while(count != 1);
555
556out:
557 put_cpu();
558}
559
560/* Rebind an evtchn so that it gets delivered to a specific cpu */
561static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
562{
563 struct evtchn_bind_vcpu bind_vcpu;
564 int evtchn = evtchn_from_irq(irq);
565
566 if (!VALID_EVTCHN(evtchn))
567 return;
568
569 /* Send future instances of this interrupt to other vcpu. */
570 bind_vcpu.port = evtchn;
571 bind_vcpu.vcpu = tcpu;
572
573 /*
574 * If this fails, it usually just indicates that we're dealing with a
575 * virq or IPI channel, which don't actually need to be rebound. Ignore
576 * it, but don't do the xenlinux-level rebind in that case.
577 */
578 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
579 bind_evtchn_to_cpu(evtchn, tcpu);
580}
581
582
583static void set_affinity_irq(unsigned irq, cpumask_t dest)
584{
585 unsigned tcpu = first_cpu(dest);
586 rebind_irq_to_cpu(irq, tcpu);
587}
588
589int resend_irq_on_evtchn(unsigned int irq)
590{
591 int masked, evtchn = evtchn_from_irq(irq);
592 struct shared_info *s = HYPERVISOR_shared_info;
593
594 if (!VALID_EVTCHN(evtchn))
595 return 1;
596
597 masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
598 sync_set_bit(evtchn, s->evtchn_pending);
599 if (!masked)
600 unmask_evtchn(evtchn);
601
602 return 1;
603}
604
605static void enable_dynirq(unsigned int irq)
606{
607 int evtchn = evtchn_from_irq(irq);
608
609 if (VALID_EVTCHN(evtchn))
610 unmask_evtchn(evtchn);
611}
612
613static void disable_dynirq(unsigned int irq)
614{
615 int evtchn = evtchn_from_irq(irq);
616
617 if (VALID_EVTCHN(evtchn))
618 mask_evtchn(evtchn);
619}
620
621static void ack_dynirq(unsigned int irq)
622{
623 int evtchn = evtchn_from_irq(irq);
624
625 move_native_irq(irq);
626
627 if (VALID_EVTCHN(evtchn))
628 clear_evtchn(evtchn);
629}
630
631static int retrigger_dynirq(unsigned int irq)
632{
633 int evtchn = evtchn_from_irq(irq);
634 struct shared_info *sh = HYPERVISOR_shared_info;
635 int ret = 0;
636
637 if (VALID_EVTCHN(evtchn)) {
638 int masked;
639
640 masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
641 sync_set_bit(evtchn, sh->evtchn_pending);
642 if (!masked)
643 unmask_evtchn(evtchn);
644 ret = 1;
645 }
646
647 return ret;
648}
649
650static struct irq_chip xen_dynamic_chip __read_mostly = {
651 .name = "xen-dyn",
652 .mask = disable_dynirq,
653 .unmask = enable_dynirq,
654 .ack = ack_dynirq,
655 .set_affinity = set_affinity_irq,
656 .retrigger = retrigger_dynirq,
657};
658
659void __init xen_init_IRQ(void)
660{
661 int i;
662
663 init_evtchn_cpu_bindings();
664
665 /* No event channels are 'live' right now. */
666 for (i = 0; i < NR_EVENT_CHANNELS; i++)
667 mask_evtchn(i);
668
669 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
670 for (i = 0; i < NR_IRQS; i++)
671 irq_bindcount[i] = 0;
672
673 irq_ctx_init(smp_processor_id());
674}
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/drivers/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index d85dc6d41c2a..52b6b41b909d 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -439,24 +439,6 @@ static inline unsigned int max_nr_grant_frames(void)
439 return xen_max; 439 return xen_max;
440} 440}
441 441
442static int map_pte_fn(pte_t *pte, struct page *pmd_page,
443 unsigned long addr, void *data)
444{
445 unsigned long **frames = (unsigned long **)data;
446
447 set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
448 (*frames)++;
449 return 0;
450}
451
452static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
453 unsigned long addr, void *data)
454{
455
456 set_pte_at(&init_mm, addr, pte, __pte(0));
457 return 0;
458}
459
460static int gnttab_map(unsigned int start_idx, unsigned int end_idx) 442static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
461{ 443{
462 struct gnttab_setup_table setup; 444 struct gnttab_setup_table setup;
@@ -470,7 +452,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
470 452
471 setup.dom = DOMID_SELF; 453 setup.dom = DOMID_SELF;
472 setup.nr_frames = nr_gframes; 454 setup.nr_frames = nr_gframes;
473 setup.frame_list = frames; 455 set_xen_guest_handle(setup.frame_list, frames);
474 456
475 rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); 457 rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
476 if (rc == -ENOSYS) { 458 if (rc == -ENOSYS) {
@@ -480,17 +462,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
480 462
481 BUG_ON(rc || setup.status); 463 BUG_ON(rc || setup.status);
482 464
483 if (shared == NULL) { 465 rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
484 struct vm_struct *area; 466 &shared);
485 area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
486 BUG_ON(area == NULL);
487 shared = area->addr;
488 }
489 rc = apply_to_page_range(&init_mm, (unsigned long)shared,
490 PAGE_SIZE * nr_gframes,
491 map_pte_fn, &frames);
492 BUG_ON(rc); 467 BUG_ON(rc);
493 frames -= nr_gframes; /* adjust after map_pte_fn() */
494 468
495 kfree(frames); 469 kfree(frames);
496 470
@@ -506,10 +480,7 @@ static int gnttab_resume(void)
506 480
507static int gnttab_suspend(void) 481static int gnttab_suspend(void)
508{ 482{
509 apply_to_page_range(&init_mm, (unsigned long)shared, 483 arch_gnttab_unmap_shared(shared, nr_grant_frames);
510 PAGE_SIZE * nr_grant_frames,
511 unmap_pte_fn, NULL);
512
513 return 0; 484 return 0;
514} 485}
515 486
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 9fd2f70ab46d..0f86b0ff7879 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -399,7 +399,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
399 399
400 *vaddr = NULL; 400 *vaddr = NULL;
401 401
402 area = alloc_vm_area(PAGE_SIZE); 402 area = xen_alloc_vm_area(PAGE_SIZE);
403 if (!area) 403 if (!area)
404 return -ENOMEM; 404 return -ENOMEM;
405 405
@@ -409,7 +409,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
409 BUG(); 409 BUG();
410 410
411 if (op.status != GNTST_okay) { 411 if (op.status != GNTST_okay) {
412 free_vm_area(area); 412 xen_free_vm_area(area);
413 xenbus_dev_fatal(dev, op.status, 413 xenbus_dev_fatal(dev, op.status,
414 "mapping in shared page %d from domain %d", 414 "mapping in shared page %d from domain %d",
415 gnt_ref, dev->otherend_id); 415 gnt_ref, dev->otherend_id);
@@ -508,7 +508,7 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
508 BUG(); 508 BUG();
509 509
510 if (op.status == GNTST_okay) 510 if (op.status == GNTST_okay)
511 free_vm_area(area); 511 xen_free_vm_area(area);
512 else 512 else
513 xenbus_dev_error(dev, op.status, 513 xenbus_dev_error(dev, op.status,
514 "unmapping page at handle %d error %d", 514 "unmapping page at handle %d error %d",
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 4750de316ad3..57ceb5346b74 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -88,6 +88,16 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
88 return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; 88 return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
89} 89}
90 90
91static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
92{
93 struct xenbus_device *dev = to_xenbus_device(_dev);
94
95 if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
96 return -ENOMEM;
97
98 return 0;
99}
100
91/* device/<type>/<id> => <type>-<id> */ 101/* device/<type>/<id> => <type>-<id> */
92static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) 102static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
93{ 103{
@@ -166,6 +176,7 @@ static struct xen_bus_type xenbus_frontend = {
166 .bus = { 176 .bus = {
167 .name = "xen", 177 .name = "xen",
168 .match = xenbus_match, 178 .match = xenbus_match,
179 .uevent = xenbus_uevent,
169 .probe = xenbus_dev_probe, 180 .probe = xenbus_dev_probe,
170 .remove = xenbus_dev_remove, 181 .remove = xenbus_dev_remove,
171 .shutdown = xenbus_dev_shutdown, 182 .shutdown = xenbus_dev_shutdown,
@@ -438,6 +449,12 @@ static ssize_t xendev_show_devtype(struct device *dev,
438} 449}
439DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); 450DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
440 451
452static ssize_t xendev_show_modalias(struct device *dev,
453 struct device_attribute *attr, char *buf)
454{
455 return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
456}
457DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
441 458
442int xenbus_probe_node(struct xen_bus_type *bus, 459int xenbus_probe_node(struct xen_bus_type *bus,
443 const char *type, 460 const char *type,
@@ -492,10 +509,16 @@ int xenbus_probe_node(struct xen_bus_type *bus,
492 509
493 err = device_create_file(&xendev->dev, &dev_attr_devtype); 510 err = device_create_file(&xendev->dev, &dev_attr_devtype);
494 if (err) 511 if (err)
495 goto fail_remove_file; 512 goto fail_remove_nodename;
513
514 err = device_create_file(&xendev->dev, &dev_attr_modalias);
515 if (err)
516 goto fail_remove_devtype;
496 517
497 return 0; 518 return 0;
498fail_remove_file: 519fail_remove_devtype:
520 device_remove_file(&xendev->dev, &dev_attr_devtype);
521fail_remove_nodename:
499 device_remove_file(&xendev->dev, &dev_attr_nodename); 522 device_remove_file(&xendev->dev, &dev_attr_nodename);
500fail_unregister: 523fail_unregister:
501 device_unregister(&xendev->dev); 524 device_unregister(&xendev->dev);
@@ -846,6 +869,7 @@ static int is_disconnected_device(struct device *dev, void *data)
846{ 869{
847 struct xenbus_device *xendev = to_xenbus_device(dev); 870 struct xenbus_device *xendev = to_xenbus_device(dev);
848 struct device_driver *drv = data; 871 struct device_driver *drv = data;
872 struct xenbus_driver *xendrv;
849 873
850 /* 874 /*
851 * A device with no driver will never connect. We care only about 875 * A device with no driver will never connect. We care only about
@@ -858,7 +882,9 @@ static int is_disconnected_device(struct device *dev, void *data)
858 if (drv && (dev->driver != drv)) 882 if (drv && (dev->driver != drv))
859 return 0; 883 return 0;
860 884
861 return (xendev->state != XenbusStateConnected); 885 xendrv = to_xenbus_driver(dev->driver);
886 return (xendev->state != XenbusStateConnected ||
887 (xendrv->is_ready && !xendrv->is_ready(xendev)));
862} 888}
863 889
864static int exists_disconnected_device(struct device_driver *drv) 890static int exists_disconnected_device(struct device_driver *drv)
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
new file mode 100644
index 000000000000..797cb4e31f07
--- /dev/null
+++ b/drivers/xen/xencomm.c
@@ -0,0 +1,232 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
15 *
16 * Copyright (C) IBM Corp. 2006
17 *
18 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
19 */
20
21#include <linux/gfp.h>
22#include <linux/mm.h>
23#include <asm/page.h>
24#include <xen/xencomm.h>
25#include <xen/interface/xen.h>
26#ifdef __ia64__
27#include <asm/xen/xencomm.h> /* for is_kern_addr() */
28#endif
29
30#ifdef HAVE_XEN_PLATFORM_COMPAT_H
31#include <xen/platform-compat.h>
32#endif
33
34static int xencomm_init(struct xencomm_desc *desc,
35 void *buffer, unsigned long bytes)
36{
37 unsigned long recorded = 0;
38 int i = 0;
39
40 while ((recorded < bytes) && (i < desc->nr_addrs)) {
41 unsigned long vaddr = (unsigned long)buffer + recorded;
42 unsigned long paddr;
43 int offset;
44 int chunksz;
45
46 offset = vaddr % PAGE_SIZE; /* handle partial pages */
47 chunksz = min(PAGE_SIZE - offset, bytes - recorded);
48
49 paddr = xencomm_vtop(vaddr);
50 if (paddr == ~0UL) {
51 printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n",
52 __func__, vaddr);
53 return -EINVAL;
54 }
55
56 desc->address[i++] = paddr;
57 recorded += chunksz;
58 }
59
60 if (recorded < bytes) {
61 printk(KERN_DEBUG
62 "%s: could only translate %ld of %ld bytes\n",
63 __func__, recorded, bytes);
64 return -ENOSPC;
65 }
66
67 /* mark remaining addresses invalid (just for safety) */
68 while (i < desc->nr_addrs)
69 desc->address[i++] = XENCOMM_INVALID;
70
71 desc->magic = XENCOMM_MAGIC;
72
73 return 0;
74}
75
76static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask,
77 void *buffer, unsigned long bytes)
78{
79 struct xencomm_desc *desc;
80 unsigned long buffer_ulong = (unsigned long)buffer;
81 unsigned long start = buffer_ulong & PAGE_MASK;
82 unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK;
83 unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT;
84 unsigned long size = sizeof(*desc) +
85 sizeof(desc->address[0]) * nr_addrs;
86
87 /*
88 * slab allocator returns at least sizeof(void*) aligned pointer.
89 * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might
90 * cross page boundary.
91 */
92 if (sizeof(*desc) > sizeof(void *)) {
93 unsigned long order = get_order(size);
94 desc = (struct xencomm_desc *)__get_free_pages(gfp_mask,
95 order);
96 if (desc == NULL)
97 return NULL;
98
99 desc->nr_addrs =
100 ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) /
101 sizeof(*desc->address);
102 } else {
103 desc = kmalloc(size, gfp_mask);
104 if (desc == NULL)
105 return NULL;
106
107 desc->nr_addrs = nr_addrs;
108 }
109 return desc;
110}
111
112void xencomm_free(struct xencomm_handle *desc)
113{
114 if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) {
115 struct xencomm_desc *desc__ = (struct xencomm_desc *)desc;
116 if (sizeof(*desc__) > sizeof(void *)) {
117 unsigned long size = sizeof(*desc__) +
118 sizeof(desc__->address[0]) * desc__->nr_addrs;
119 unsigned long order = get_order(size);
120 free_pages((unsigned long)__va(desc), order);
121 } else
122 kfree(__va(desc));
123 }
124}
125
126static int xencomm_create(void *buffer, unsigned long bytes,
127 struct xencomm_desc **ret, gfp_t gfp_mask)
128{
129 struct xencomm_desc *desc;
130 int rc;
131
132 pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes);
133
134 if (bytes == 0) {
135 /* don't create a descriptor; Xen recognizes NULL. */
136 BUG_ON(buffer != NULL);
137 *ret = NULL;
138 return 0;
139 }
140
141 BUG_ON(buffer == NULL); /* 'bytes' is non-zero */
142
143 desc = xencomm_alloc(gfp_mask, buffer, bytes);
144 if (!desc) {
145 printk(KERN_DEBUG "%s failure\n", "xencomm_alloc");
146 return -ENOMEM;
147 }
148
149 rc = xencomm_init(desc, buffer, bytes);
150 if (rc) {
151 printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc);
152 xencomm_free((struct xencomm_handle *)__pa(desc));
153 return rc;
154 }
155
156 *ret = desc;
157 return 0;
158}
159
160/* check if memory address is within VMALLOC region */
161static int is_phys_contiguous(unsigned long addr)
162{
163 if (!is_kernel_addr(addr))
164 return 0;
165
166 return (addr < VMALLOC_START) || (addr >= VMALLOC_END);
167}
168
169static struct xencomm_handle *xencomm_create_inline(void *ptr)
170{
171 unsigned long paddr;
172
173 BUG_ON(!is_phys_contiguous((unsigned long)ptr));
174
175 paddr = (unsigned long)xencomm_pa(ptr);
176 BUG_ON(paddr & XENCOMM_INLINE_FLAG);
177 return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
178}
179
180/* "mini" routine, for stack-based communications: */
181static int xencomm_create_mini(void *buffer,
182 unsigned long bytes, struct xencomm_mini *xc_desc,
183 struct xencomm_desc **ret)
184{
185 int rc = 0;
186 struct xencomm_desc *desc;
187 BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0);
188
189 desc = (void *)xc_desc;
190
191 desc->nr_addrs = XENCOMM_MINI_ADDRS;
192
193 rc = xencomm_init(desc, buffer, bytes);
194 if (!rc)
195 *ret = desc;
196
197 return rc;
198}
199
200struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes)
201{
202 int rc;
203 struct xencomm_desc *desc;
204
205 if (is_phys_contiguous((unsigned long)ptr))
206 return xencomm_create_inline(ptr);
207
208 rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL);
209
210 if (rc || desc == NULL)
211 return NULL;
212
213 return xencomm_pa(desc);
214}
215
216struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes,
217 struct xencomm_mini *xc_desc)
218{
219 int rc;
220 struct xencomm_desc *desc = NULL;
221
222 if (is_phys_contiguous((unsigned long)ptr))
223 return xencomm_create_inline(ptr);
224
225 rc = xencomm_create_mini(ptr, bytes, xc_desc,
226 &desc);
227
228 if (rc)
229 return NULL;
230
231 return xencomm_pa(desc);
232}