89 files changed, 900 insertions, 1364 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 09027a9fece5..ddf4f93967a9 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -480,7 +480,9 @@ memory.stat file includes following statistics | |||
480 | 480 | ||
481 | # per-memory cgroup local status | 481 | # per-memory cgroup local status |
482 | cache - # of bytes of page cache memory. | 482 | cache - # of bytes of page cache memory. |
483 | rss - # of bytes of anonymous and swap cache memory. | 483 | rss - # of bytes of anonymous and swap cache memory (includes |
484 | transparent hugepages). | ||
485 | rss_huge - # of bytes of anonymous transparent hugepages. | ||
484 | mapped_file - # of bytes of mapped file (includes tmpfs/shmem) | 486 | mapped_file - # of bytes of mapped file (includes tmpfs/shmem) |
485 | pgpgin - # of charging events to the memory cgroup. The charging | 487 | pgpgin - # of charging events to the memory cgroup. The charging |
486 | event happens each time a page is accounted as either mapped | 488 | event happens each time a page is accounted as either mapped |
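The new counters nest: rss already includes the transparent-hugepage bytes, and rss_huge reports only that THP portion, so the 4k-page anonymous footprint is rss - rss_huge. A minimal user-space sketch of that calculation follows; it is not part of the patch, and the /sys/fs/cgroup/memory mount point and the "mygroup" cgroup name are illustrative assumptions.

/* Hypothetical helper: report the non-THP anonymous bytes of a cgroup
 * as rss - rss_huge, using the counters documented above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.stat", "r");
	char key[64];
	unsigned long long val, rss = 0, rss_huge = 0;

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "rss"))
			rss = val;
		else if (!strcmp(key, "rss_huge"))
			rss_huge = val;
	}
	fclose(f);
	printf("anon (4k pages only): %llu bytes\n", rss - rss_huge);
	return 0;
}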
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 5f7d7ba2874c..7a539f4f5e30 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/mount.h> | 23 | #include <linux/mount.h> |
24 | #include <linux/aio.h> | ||
24 | #include <asm/ebcdic.h> | 25 | #include <asm/ebcdic.h> |
25 | #include "hypfs.h" | 26 | #include "hypfs.h" |
26 | 27 | ||
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index 9b40c9c12a0c..6cfc1b09ec25 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c | |||
@@ -253,24 +253,15 @@ void __init leon_smp_done(void) | |||
253 | 253 | ||
254 | /* Free unneeded trap tables */ | 254 | /* Free unneeded trap tables */ |
255 | if (!cpu_present(1)) { | 255 | if (!cpu_present(1)) { |
256 | ClearPageReserved(virt_to_page(&trapbase_cpu1)); | 256 | free_reserved_page(virt_to_page(&trapbase_cpu1)); |
257 | init_page_count(virt_to_page(&trapbase_cpu1)); | ||
258 | free_page((unsigned long)&trapbase_cpu1); | ||
259 | totalram_pages++; | ||
260 | num_physpages++; | 257 | num_physpages++; |
261 | } | 258 | } |
262 | if (!cpu_present(2)) { | 259 | if (!cpu_present(2)) { |
263 | ClearPageReserved(virt_to_page(&trapbase_cpu2)); | 260 | free_reserved_page(virt_to_page(&trapbase_cpu2)); |
264 | init_page_count(virt_to_page(&trapbase_cpu2)); | ||
265 | free_page((unsigned long)&trapbase_cpu2); | ||
266 | totalram_pages++; | ||
267 | num_physpages++; | 261 | num_physpages++; |
268 | } | 262 | } |
269 | if (!cpu_present(3)) { | 263 | if (!cpu_present(3)) { |
270 | ClearPageReserved(virt_to_page(&trapbase_cpu3)); | 264 | free_reserved_page(virt_to_page(&trapbase_cpu3)); |
271 | init_page_count(virt_to_page(&trapbase_cpu3)); | ||
272 | free_page((unsigned long)&trapbase_cpu3); | ||
273 | totalram_pages++; | ||
274 | num_physpages++; | 265 | num_physpages++; |
275 | } | 266 | } |
276 | /* Ok, they are spinning and ready to go. */ | 267 | /* Ok, they are spinning and ready to go. */ |
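Each of the three branches above collapses the same four-line open-coded sequence into free_reserved_page(); only num_physpages++ stays with the caller, since the helper does not touch that architecture-level counter. Judging from the lines being deleted, the helper amounts to roughly the sketch below; the canonical definition lives in include/linux/mm.h and may differ in detail.

/* Sketch only: reconstructed from the sequence this patch deletes. */
static inline void free_reserved_page_sketch(struct page *page)
{
	ClearPageReserved(page);	/* page may go back to the allocator */
	init_page_count(page);		/* reset the refcount before freeing */
	__free_page(page);		/* hand it to the buddy allocator */
	totalram_pages++;		/* it now counts as managed RAM */
}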
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 4490c397bb5b..af472cf7c69a 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c | |||
@@ -366,45 +366,14 @@ void __init mem_init(void) | |||
366 | 366 | ||
367 | void free_initmem (void) | 367 | void free_initmem (void) |
368 | { | 368 | { |
369 | unsigned long addr; | 369 | num_physpages += free_initmem_default(POISON_FREE_INITMEM); |
370 | unsigned long freed; | ||
371 | |||
372 | addr = (unsigned long)(&__init_begin); | ||
373 | freed = (unsigned long)(&__init_end) - addr; | ||
374 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | ||
375 | struct page *p; | ||
376 | |||
377 | memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); | ||
378 | p = virt_to_page(addr); | ||
379 | |||
380 | ClearPageReserved(p); | ||
381 | init_page_count(p); | ||
382 | __free_page(p); | ||
383 | totalram_pages++; | ||
384 | num_physpages++; | ||
385 | } | ||
386 | printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n", | ||
387 | freed >> 10); | ||
388 | } | 370 | } |
389 | 371 | ||
390 | #ifdef CONFIG_BLK_DEV_INITRD | 372 | #ifdef CONFIG_BLK_DEV_INITRD |
391 | void free_initrd_mem(unsigned long start, unsigned long end) | 373 | void free_initrd_mem(unsigned long start, unsigned long end) |
392 | { | 374 | { |
393 | if (start < end) | 375 | num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, |
394 | printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", | 376 | "initrd"); |
395 | (end - start) >> 10); | ||
396 | for (; start < end; start += PAGE_SIZE) { | ||
397 | struct page *p; | ||
398 | |||
399 | memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE); | ||
400 | p = virt_to_page(start); | ||
401 | |||
402 | ClearPageReserved(p); | ||
403 | init_page_count(p); | ||
404 | __free_page(p); | ||
405 | totalram_pages++; | ||
406 | num_physpages++; | ||
407 | } | ||
408 | } | 377 | } |
409 | #endif | 378 | #endif |
410 | 379 | ||
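free_initmem_default() and free_reserved_area() return the number of pages they released, which is why both callers simply accumulate the result into num_physpages. A sketch of what free_reserved_area() stands in for, pieced together from the loop deleted above, follows; the canonical version is in mm/page_alloc.c and also prints the "Freeing ... memory" message using the name string ("initrd" here).

/* Sketch only: behaviour inferred from the loop this patch removes. */
static unsigned long free_reserved_area_sketch(unsigned long start,
					       unsigned long end, int poison)
{
	unsigned long pages = 0;

	for (; start < end; start += PAGE_SIZE) {
		struct page *p = virt_to_page(start);

		if (poison)		/* POISON_FREE_INITMEM at these call sites */
			memset((void *)start, poison, PAGE_SIZE);
		ClearPageReserved(p);
		init_page_count(p);
		__free_page(p);
		pages++;
	}
	return pages;			/* callers add this to num_physpages */
}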
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index cf72a8a5b3aa..a7171997adfd 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
@@ -2059,8 +2059,7 @@ void __init mem_init(void) | |||
2059 | /* We subtract one to account for the mem_map_zero page | 2059 | /* We subtract one to account for the mem_map_zero page |
2060 | * allocated below. | 2060 | * allocated below. |
2061 | */ | 2061 | */ |
2062 | totalram_pages -= 1; | 2062 | num_physpages = totalram_pages - 1; |
2063 | num_physpages = totalram_pages; | ||
2064 | 2063 | ||
2065 | /* | 2064 | /* |
2066 | * Set up the zero page, mark it reserved, so that page count | 2065 | * Set up the zero page, mark it reserved, so that page count |
@@ -2071,7 +2070,7 @@ void __init mem_init(void) | |||
2071 | prom_printf("paging_init: Cannot alloc zero page.\n"); | 2070 | prom_printf("paging_init: Cannot alloc zero page.\n"); |
2072 | prom_halt(); | 2071 | prom_halt(); |
2073 | } | 2072 | } |
2074 | SetPageReserved(mem_map_zero); | 2073 | mark_page_reserved(mem_map_zero); |
2075 | 2074 | ||
2076 | codepages = (((unsigned long) _etext) - ((unsigned long) _start)); | 2075 | codepages = (((unsigned long) _etext) - ((unsigned long) _start)); |
2077 | codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT; | 2076 | codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT; |
@@ -2111,37 +2110,22 @@ void free_initmem(void) | |||
2111 | initend = (unsigned long)(__init_end) & PAGE_MASK; | 2110 | initend = (unsigned long)(__init_end) & PAGE_MASK; |
2112 | for (; addr < initend; addr += PAGE_SIZE) { | 2111 | for (; addr < initend; addr += PAGE_SIZE) { |
2113 | unsigned long page; | 2112 | unsigned long page; |
2114 | struct page *p; | ||
2115 | 2113 | ||
2116 | page = (addr + | 2114 | page = (addr + |
2117 | ((unsigned long) __va(kern_base)) - | 2115 | ((unsigned long) __va(kern_base)) - |
2118 | ((unsigned long) KERNBASE)); | 2116 | ((unsigned long) KERNBASE)); |
2119 | memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); | 2117 | memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); |
2120 | 2118 | ||
2121 | if (do_free) { | 2119 | if (do_free) |
2122 | p = virt_to_page(page); | 2120 | free_reserved_page(virt_to_page(page)); |
2123 | |||
2124 | ClearPageReserved(p); | ||
2125 | init_page_count(p); | ||
2126 | __free_page(p); | ||
2127 | totalram_pages++; | ||
2128 | } | ||
2129 | } | 2121 | } |
2130 | } | 2122 | } |
2131 | 2123 | ||
2132 | #ifdef CONFIG_BLK_DEV_INITRD | 2124 | #ifdef CONFIG_BLK_DEV_INITRD |
2133 | void free_initrd_mem(unsigned long start, unsigned long end) | 2125 | void free_initrd_mem(unsigned long start, unsigned long end) |
2134 | { | 2126 | { |
2135 | if (start < end) | 2127 | num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, |
2136 | printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | 2128 | "initrd"); |
2137 | for (; start < end; start += PAGE_SIZE) { | ||
2138 | struct page *p = virt_to_page(start); | ||
2139 | |||
2140 | ClearPageReserved(p); | ||
2141 | init_page_count(p); | ||
2142 | __free_page(p); | ||
2143 | totalram_pages++; | ||
2144 | } | ||
2145 | } | 2129 | } |
2146 | #endif | 2130 | #endif |
2147 | 2131 | ||
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9a87daa6f4fb..a5ffcc988f0b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/ratelimit.h> | 27 | #include <linux/ratelimit.h> |
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/times.h> | 29 | #include <linux/times.h> |
30 | #include <linux/uio.h> | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | 32 | ||
32 | #include <scsi/scsi.h> | 33 | #include <scsi/scsi.h> |
diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 2c644afbcdd4..1ccbe9482faa 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
29 | #include <linux/export.h> | 29 | #include <linux/export.h> |
30 | #include <linux/io.h> | 30 | #include <linux/io.h> |
31 | #include <linux/aio.h> | ||
31 | 32 | ||
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
33 | 34 | ||
@@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf, | |||
627 | return count; | 628 | return count; |
628 | } | 629 | } |
629 | 630 | ||
631 | static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov, | ||
632 | unsigned long nr_segs, loff_t pos) | ||
633 | { | ||
634 | return 0; | ||
635 | } | ||
636 | |||
637 | static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov, | ||
638 | unsigned long nr_segs, loff_t pos) | ||
639 | { | ||
640 | return iov_length(iov, nr_segs); | ||
641 | } | ||
642 | |||
630 | static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf, | 643 | static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf, |
631 | struct splice_desc *sd) | 644 | struct splice_desc *sd) |
632 | { | 645 | { |
@@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf, | |||
670 | return written ? written : -EFAULT; | 683 | return written ? written : -EFAULT; |
671 | } | 684 | } |
672 | 685 | ||
686 | static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov, | ||
687 | unsigned long nr_segs, loff_t pos) | ||
688 | { | ||
689 | size_t written = 0; | ||
690 | unsigned long i; | ||
691 | ssize_t ret; | ||
692 | |||
693 | for (i = 0; i < nr_segs; i++) { | ||
694 | ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len, | ||
695 | &pos); | ||
696 | if (ret < 0) | ||
697 | break; | ||
698 | written += ret; | ||
699 | } | ||
700 | |||
701 | return written ? written : -EFAULT; | ||
702 | } | ||
703 | |||
673 | static int mmap_zero(struct file *file, struct vm_area_struct *vma) | 704 | static int mmap_zero(struct file *file, struct vm_area_struct *vma) |
674 | { | 705 | { |
675 | #ifndef CONFIG_MMU | 706 | #ifndef CONFIG_MMU |
@@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp) | |||
738 | #define full_lseek null_lseek | 769 | #define full_lseek null_lseek |
739 | #define write_zero write_null | 770 | #define write_zero write_null |
740 | #define read_full read_zero | 771 | #define read_full read_zero |
772 | #define aio_write_zero aio_write_null | ||
741 | #define open_mem open_port | 773 | #define open_mem open_port |
742 | #define open_kmem open_mem | 774 | #define open_kmem open_mem |
743 | #define open_oldmem open_mem | 775 | #define open_oldmem open_mem |
@@ -766,6 +798,8 @@ static const struct file_operations null_fops = { | |||
766 | .llseek = null_lseek, | 798 | .llseek = null_lseek, |
767 | .read = read_null, | 799 | .read = read_null, |
768 | .write = write_null, | 800 | .write = write_null, |
801 | .aio_read = aio_read_null, | ||
802 | .aio_write = aio_write_null, | ||
769 | .splice_write = splice_write_null, | 803 | .splice_write = splice_write_null, |
770 | }; | 804 | }; |
771 | 805 | ||
@@ -782,6 +816,8 @@ static const struct file_operations zero_fops = { | |||
782 | .llseek = zero_lseek, | 816 | .llseek = zero_lseek, |
783 | .read = read_zero, | 817 | .read = read_zero, |
784 | .write = write_zero, | 818 | .write = write_zero, |
819 | .aio_read = aio_read_zero, | ||
820 | .aio_write = aio_write_zero, | ||
785 | .mmap = mmap_zero, | 821 | .mmap = mmap_zero, |
786 | }; | 822 | }; |
787 | 823 | ||
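With .aio_read/.aio_write wired up, io_submit() against /dev/null and /dev/zero no longer bounces with -EINVAL. A hypothetical user-space check using the raw AIO syscalls (no libaio dependency) is sketched below; it simply expects the asynchronous write to /dev/null to complete with the full length.

/* Hypothetical test, not part of the patch: submit one async write to
 * /dev/null and expect res == length. */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	char buf[] = "hello";
	int fd = open("/dev/null", O_WRONLY);

	if (fd < 0 || syscall(__NR_io_setup, 1, &ctx))
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PWRITE;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf) - 1;

	if (syscall(__NR_io_submit, ctx, 1, cbs) != 1)
		return 1;
	if (syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
		printf("res = %lld\n", (long long)ev.res);	/* expect 5 */

	syscall(__NR_io_destroy, ctx);
	return 0;
}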
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 31f9201b2980..c40088ecf9f3 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c | |||
@@ -62,13 +62,13 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, | |||
62 | kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); | 62 | kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); |
63 | if (random) { | 63 | if (random) { |
64 | j = 0; | 64 | j = 0; |
65 | random_bytes = random32(); | 65 | random_bytes = prandom_u32(); |
66 | for (i = 0; i < RANDOM_SIZE; i++) | 66 | for (i = 0; i < RANDOM_SIZE; i++) |
67 | rarray[i] = i + skip_low; | 67 | rarray[i] = i + skip_low; |
68 | for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { | 68 | for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { |
69 | if (j >= RANDOM_SIZE) { | 69 | if (j >= RANDOM_SIZE) { |
70 | j = 0; | 70 | j = 0; |
71 | random_bytes = random32(); | 71 | random_bytes = prandom_u32(); |
72 | } | 72 | } |
73 | idx = (random_bytes >> (j * 2)) & 0xF; | 73 | idx = (random_bytes >> (j * 2)) & 0xF; |
74 | kfifo_in(fifo, | 74 | kfifo_in(fifo, |
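random32() is being renamed to prandom_u32() throughout this series; it is the same non-cryptographic Tausworthe generator, so the conversions are mechanical. The typical bounded-value idioms used by the converted call sites look like the sketch below (helper names are illustrative); the small modulo bias does not matter for the jitter and backoff these drivers need.

#include <linux/random.h>	/* prandom_u32(), formerly random32() */

/* Illustrative helpers in the style of the converted call sites;
 * fine for jitter/backoff, never for anything security-sensitive. */
static inline u32 rand_below(u32 n)
{
	return prandom_u32() % n;		/* e.g. "% 256" vs. ppersist */
}

static inline u32 rand_range(u32 lo, u32 hi)	/* result in [lo, hi) */
{
	return lo + prandom_u32() % (hi - lo);
}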
diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c index f95e5df30db2..0161ae6ad629 100644 --- a/drivers/infiniband/hw/cxgb4/id_table.c +++ b/drivers/infiniband/hw/cxgb4/id_table.c | |||
@@ -54,7 +54,7 @@ u32 c4iw_id_alloc(struct c4iw_id_table *alloc) | |||
54 | 54 | ||
55 | if (obj < alloc->max) { | 55 | if (obj < alloc->max) { |
56 | if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) | 56 | if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) |
57 | alloc->last += random32() % RANDOM_SKIP; | 57 | alloc->last += prandom_u32() % RANDOM_SKIP; |
58 | else | 58 | else |
59 | alloc->last = obj + 1; | 59 | alloc->last = obj + 1; |
60 | if (alloc->last >= alloc->max) | 60 | if (alloc->last >= alloc->max) |
@@ -88,7 +88,7 @@ int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, | |||
88 | alloc->start = start; | 88 | alloc->start = start; |
89 | alloc->flags = flags; | 89 | alloc->flags = flags; |
90 | if (flags & C4IW_ID_TABLE_F_RANDOM) | 90 | if (flags & C4IW_ID_TABLE_F_RANDOM) |
91 | alloc->last = random32() % RANDOM_SKIP; | 91 | alloc->last = prandom_u32() % RANDOM_SKIP; |
92 | else | 92 | else |
93 | alloc->last = 0; | 93 | alloc->last = 0; |
94 | alloc->max = num; | 94 | alloc->max = num; |
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index aed8afee56da..6d7f453b4d05 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
41 | #include <linux/highmem.h> | 41 | #include <linux/highmem.h> |
42 | #include <linux/io.h> | 42 | #include <linux/io.h> |
43 | #include <linux/aio.h> | ||
43 | #include <linux/jiffies.h> | 44 | #include <linux/jiffies.h> |
44 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
45 | #include <asm/pgtable.h> | 46 | #include <asm/pgtable.h> |
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 934792c477bc..4d599cedbb0b 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c | |||
@@ -93,7 +93,7 @@ static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, | |||
93 | __be64 mlx4_ib_gen_node_guid(void) | 93 | __be64 mlx4_ib_gen_node_guid(void) |
94 | { | 94 | { |
95 | #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) | 95 | #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) |
96 | return cpu_to_be64(NODE_GUID_HI | random32()); | 96 | return cpu_to_be64(NODE_GUID_HI | prandom_u32()); |
97 | } | 97 | } |
98 | 98 | ||
99 | __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) | 99 | __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) |
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 4f7aa301b3b1..b56c9428f3c5 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c | |||
@@ -39,7 +39,7 @@ | |||
39 | #include <linux/vmalloc.h> | 39 | #include <linux/vmalloc.h> |
40 | #include <linux/highmem.h> | 40 | #include <linux/highmem.h> |
41 | #include <linux/io.h> | 41 | #include <linux/io.h> |
42 | #include <linux/uio.h> | 42 | #include <linux/aio.h> |
43 | #include <linux/jiffies.h> | 43 | #include <linux/jiffies.h> |
44 | #include <asm/pgtable.h> | 44 | #include <asm/pgtable.h> |
45 | #include <linux/delay.h> | 45 | #include <linux/delay.h> |
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 1ef880de3a41..3eceb61e3532 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c | |||
@@ -460,7 +460,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even | |||
460 | goto err_qp; | 460 | goto err_qp; |
461 | } | 461 | } |
462 | 462 | ||
463 | psn = random32() & 0xffffff; | 463 | psn = prandom_u32() & 0xffffff; |
464 | ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); | 464 | ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); |
465 | if (ret) | 465 | if (ret) |
466 | goto err_modify; | 466 | goto err_modify; |
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 40649a8bf390..6b0dc131b20e 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c | |||
@@ -4085,7 +4085,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev) | |||
4085 | if (!cp->csk_tbl) | 4085 | if (!cp->csk_tbl) |
4086 | return -ENOMEM; | 4086 | return -ENOMEM; |
4087 | 4087 | ||
4088 | port_id = random32(); | 4088 | port_id = prandom_u32(); |
4089 | port_id %= CNIC_LOCAL_PORT_RANGE; | 4089 | port_id %= CNIC_LOCAL_PORT_RANGE; |
4090 | if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE, | 4090 | if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE, |
4091 | CNIC_LOCAL_PORT_MIN, port_id)) { | 4091 | CNIC_LOCAL_PORT_MIN, port_id)) { |
@@ -4145,7 +4145,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev) | |||
4145 | { | 4145 | { |
4146 | u32 seed; | 4146 | u32 seed; |
4147 | 4147 | ||
4148 | seed = random32(); | 4148 | seed = prandom_u32(); |
4149 | cnic_ctx_wr(dev, 45, 0, seed); | 4149 | cnic_ctx_wr(dev, 45, 0, seed); |
4150 | return 0; | 4150 | return 0; |
4151 | } | 4151 | } |
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 49b8b58fc5c6..484f77ec2ce1 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c | |||
@@ -449,7 +449,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat) | |||
449 | if ((--bc->hdlctx.slotcnt) > 0) | 449 | if ((--bc->hdlctx.slotcnt) > 0) |
450 | return 0; | 450 | return 0; |
451 | bc->hdlctx.slotcnt = bc->ch_params.slottime; | 451 | bc->hdlctx.slotcnt = bc->ch_params.slottime; |
452 | if ((random32() % 256) > bc->ch_params.ppersist) | 452 | if ((prandom_u32() % 256) > bc->ch_params.ppersist) |
453 | return 0; | 453 | return 0; |
454 | } | 454 | } |
455 | } | 455 | } |
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index a4a3516b6bbf..3169252613fa 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c | |||
@@ -389,7 +389,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s) | |||
389 | if ((--s->hdlctx.slotcnt) > 0) | 389 | if ((--s->hdlctx.slotcnt) > 0) |
390 | return; | 390 | return; |
391 | s->hdlctx.slotcnt = s->ch_params.slottime; | 391 | s->hdlctx.slotcnt = s->ch_params.slottime; |
392 | if ((random32() % 256) > s->ch_params.ppersist) | 392 | if ((prandom_u32() % 256) > s->ch_params.ppersist) |
393 | return; | 393 | return; |
394 | start_tx(dev, s); | 394 | start_tx(dev, s); |
395 | } | 395 | } |
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index b2d863f2ea42..0721e72f9299 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c | |||
@@ -638,7 +638,7 @@ static void yam_arbitrate(struct net_device *dev) | |||
638 | yp->slotcnt = yp->slot / 10; | 638 | yp->slotcnt = yp->slot / 10; |
639 | 639 | ||
640 | /* is random > persist ? */ | 640 | /* is random > persist ? */ |
641 | if ((random32() % 256) > yp->pers) | 641 | if ((prandom_u32() % 256) > yp->pers) |
642 | return; | 642 | return; |
643 | 643 | ||
644 | yam_start_tx(dev, yp); | 644 | yam_start_tx(dev, yp); |
diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c index 9eabfaa22f3e..5ca14d463ba7 100644 --- a/drivers/net/team/team_mode_random.c +++ b/drivers/net/team/team_mode_random.c | |||
@@ -18,7 +18,7 @@ | |||
18 | 18 | ||
19 | static u32 random_N(unsigned int N) | 19 | static u32 random_N(unsigned int N) |
20 | { | 20 | { |
21 | return reciprocal_divide(random32(), N); | 21 | return reciprocal_divide(prandom_u32(), N); |
22 | } | 22 | } |
23 | 23 | ||
24 | static bool rnd_transmit(struct team *team, struct sk_buff *skb) | 24 | static bool rnd_transmit(struct team *team, struct sk_buff *skb) |
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 2b90da0d85f3..e7a1a4770996 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c | |||
@@ -1117,7 +1117,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work) | |||
1117 | if (afx_hdl->is_listen && afx_hdl->my_listen_chan) | 1117 | if (afx_hdl->is_listen && afx_hdl->my_listen_chan) |
1118 | /* 100ms ~ 300ms */ | 1118 | /* 100ms ~ 300ms */ |
1119 | err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan, | 1119 | err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan, |
1120 | 100 * (1 + (random32() % 3))); | 1120 | 100 * (1 + prandom_u32() % 3)); |
1121 | else | 1121 | else |
1122 | err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan); | 1122 | err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan); |
1123 | 1123 | ||
diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index a0cb0770d319..d3c8ece980d8 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c | |||
@@ -216,7 +216,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, | |||
216 | mwifiex_form_mgmt_frame(skb, buf, len); | 216 | mwifiex_form_mgmt_frame(skb, buf, len); |
217 | mwifiex_queue_tx_pkt(priv, skb); | 217 | mwifiex_queue_tx_pkt(priv, skb); |
218 | 218 | ||
219 | *cookie = random32() | 1; | 219 | *cookie = prandom_u32() | 1; |
220 | cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC); | 220 | cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC); |
221 | 221 | ||
222 | wiphy_dbg(wiphy, "info: management frame transmitted\n"); | 222 | wiphy_dbg(wiphy, "info: management frame transmitted\n"); |
@@ -271,7 +271,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy, | |||
271 | duration); | 271 | duration); |
272 | 272 | ||
273 | if (!ret) { | 273 | if (!ret) { |
274 | *cookie = random32() | 1; | 274 | *cookie = prandom_u32() | 1; |
275 | priv->roc_cfg.cookie = *cookie; | 275 | priv->roc_cfg.cookie = *cookie; |
276 | priv->roc_cfg.chan = *chan; | 276 | priv->roc_cfg.chan = *chan; |
277 | 277 | ||
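The "prandom_u32() | 1" idiom in both hunks above forces the cookie to be non-zero, presumably so that a zero cookie can keep meaning "no operation outstanding". As a one-line sketch:

/* Cookie idiom from the hunks above: OR-ing in bit 0 guarantees a
 * non-zero value, so 0 stays usable as "nothing pending". */
static inline u64 new_nonzero_cookie(void)
{
	return prandom_u32() | 1;
}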
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 224d634322b4..ccf54f06396b 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c | |||
@@ -68,6 +68,7 @@ | |||
68 | enum rtc_type { | 68 | enum rtc_type { |
69 | rtc_undef = 0, | 69 | rtc_undef = 0, |
70 | rtc_r2025sd, | 70 | rtc_r2025sd, |
71 | rtc_r2221tl, | ||
71 | rtc_rs5c372a, | 72 | rtc_rs5c372a, |
72 | rtc_rs5c372b, | 73 | rtc_rs5c372b, |
73 | rtc_rv5c386, | 74 | rtc_rv5c386, |
@@ -76,6 +77,7 @@ enum rtc_type { | |||
76 | 77 | ||
77 | static const struct i2c_device_id rs5c372_id[] = { | 78 | static const struct i2c_device_id rs5c372_id[] = { |
78 | { "r2025sd", rtc_r2025sd }, | 79 | { "r2025sd", rtc_r2025sd }, |
80 | { "r2221tl", rtc_r2221tl }, | ||
79 | { "rs5c372a", rtc_rs5c372a }, | 81 | { "rs5c372a", rtc_rs5c372a }, |
80 | { "rs5c372b", rtc_rs5c372b }, | 82 | { "rs5c372b", rtc_rs5c372b }, |
81 | { "rv5c386", rtc_rv5c386 }, | 83 | { "rv5c386", rtc_rv5c386 }, |
@@ -529,6 +531,7 @@ static int rs5c_oscillator_setup(struct rs5c372 *rs5c372) | |||
529 | rs5c372->time24 = 1; | 531 | rs5c372->time24 = 1; |
530 | break; | 532 | break; |
531 | case rtc_r2025sd: | 533 | case rtc_r2025sd: |
534 | case rtc_r2221tl: | ||
532 | case rtc_rv5c386: | 535 | case rtc_rv5c386: |
533 | case rtc_rv5c387a: | 536 | case rtc_rv5c387a: |
534 | buf[0] |= RV5C387_CTRL1_24; | 537 | buf[0] |= RV5C387_CTRL1_24; |
@@ -609,6 +612,7 @@ static int rs5c372_probe(struct i2c_client *client, | |||
609 | rs5c372->time24 = 1; | 612 | rs5c372->time24 = 1; |
610 | break; | 613 | break; |
611 | case rtc_r2025sd: | 614 | case rtc_r2025sd: |
615 | case rtc_r2221tl: | ||
612 | case rtc_rv5c386: | 616 | case rtc_rv5c386: |
613 | case rtc_rv5c387a: | 617 | case rtc_rv5c387a: |
614 | if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24) | 618 | if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24) |
@@ -640,6 +644,7 @@ static int rs5c372_probe(struct i2c_client *client, | |||
640 | dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n", | 644 | dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n", |
641 | ({ char *s; switch (rs5c372->type) { | 645 | ({ char *s; switch (rs5c372->type) { |
642 | case rtc_r2025sd: s = "r2025sd"; break; | 646 | case rtc_r2025sd: s = "r2025sd"; break; |
647 | case rtc_r2221tl: s = "r2221tl"; break; | ||
643 | case rtc_rs5c372a: s = "rs5c372a"; break; | 648 | case rtc_rs5c372a: s = "rs5c372a"; break; |
644 | case rtc_rs5c372b: s = "rs5c372b"; break; | 649 | case rtc_rs5c372b: s = "rs5c372b"; break; |
645 | case rtc_rv5c386: s = "rv5c386"; break; | 650 | case rtc_rv5c386: s = "rv5c386"; break; |
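Adding rtc_r2221tl to the id table and to the two 24-hour-mode switch statements is all the driver needs; the chip is only picked up once board or platform code instantiates an I2C device with that name. A hypothetical hookup is sketched below, where the bus number and the 0x32 slave address are assumptions for illustration, not taken from this patch.

/* Hypothetical board-code registration for the newly recognised RTC. */
#include <linux/i2c.h>
#include <linux/init.h>

static struct i2c_board_info __initdata example_rtc_info = {
	I2C_BOARD_INFO("r2221tl", 0x32),
};

static int __init example_rtc_register(void)
{
	/* must run before the adapter for bus 1 is registered */
	return i2c_register_board_info(1, &example_rtc_info, 1);
}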
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9f0c46547459..df5e961484e1 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c | |||
@@ -35,6 +35,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */ | |||
35 | #include <linux/sched.h> | 35 | #include <linux/sched.h> |
36 | #include <linux/string.h> | 36 | #include <linux/string.h> |
37 | #include <linux/mm.h> | 37 | #include <linux/mm.h> |
38 | #include <linux/aio.h> | ||
38 | #include <linux/errno.h> | 39 | #include <linux/errno.h> |
39 | #include <linux/mtio.h> | 40 | #include <linux/mtio.h> |
40 | #include <linux/ioctl.h> | 41 | #include <linux/ioctl.h> |
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c index b14a55742559..b040200a5a55 100644 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/time.h> | 29 | #include <linux/time.h> |
30 | #include <linux/vmalloc.h> | 30 | #include <linux/vmalloc.h> |
31 | #include <linux/aio.h> | ||
31 | #include "logger.h" | 32 | #include "logger.h" |
32 | 33 | ||
33 | #include <asm/ioctls.h> | 34 | #include <asm/ioctls.h> |
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index dda0dc4a5567..570c005062ab 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c | |||
@@ -24,6 +24,8 @@ | |||
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/poll.h> | 26 | #include <linux/poll.h> |
27 | #include <linux/mmu_context.h> | ||
28 | #include <linux/aio.h> | ||
27 | 29 | ||
28 | #include <linux/device.h> | 30 | #include <linux/device.h> |
29 | #include <linux/moduleparam.h> | 31 | #include <linux/moduleparam.h> |
@@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) | |||
513 | struct kiocb_priv { | 515 | struct kiocb_priv { |
514 | struct usb_request *req; | 516 | struct usb_request *req; |
515 | struct ep_data *epdata; | 517 | struct ep_data *epdata; |
518 | struct kiocb *iocb; | ||
519 | struct mm_struct *mm; | ||
520 | struct work_struct work; | ||
516 | void *buf; | 521 | void *buf; |
517 | const struct iovec *iv; | 522 | const struct iovec *iv; |
518 | unsigned long nr_segs; | 523 | unsigned long nr_segs; |
@@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) | |||
528 | local_irq_disable(); | 533 | local_irq_disable(); |
529 | epdata = priv->epdata; | 534 | epdata = priv->epdata; |
530 | // spin_lock(&epdata->dev->lock); | 535 | // spin_lock(&epdata->dev->lock); |
531 | kiocbSetCancelled(iocb); | ||
532 | if (likely(epdata && epdata->ep && priv->req)) | 536 | if (likely(epdata && epdata->ep && priv->req)) |
533 | value = usb_ep_dequeue (epdata->ep, priv->req); | 537 | value = usb_ep_dequeue (epdata->ep, priv->req); |
534 | else | 538 | else |
@@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) | |||
540 | return value; | 544 | return value; |
541 | } | 545 | } |
542 | 546 | ||
543 | static ssize_t ep_aio_read_retry(struct kiocb *iocb) | 547 | static ssize_t ep_copy_to_user(struct kiocb_priv *priv) |
544 | { | 548 | { |
545 | struct kiocb_priv *priv = iocb->private; | ||
546 | ssize_t len, total; | 549 | ssize_t len, total; |
547 | void *to_copy; | 550 | void *to_copy; |
548 | int i; | 551 | int i; |
549 | 552 | ||
550 | /* we "retry" to get the right mm context for this: */ | ||
551 | |||
552 | /* copy stuff into user buffers */ | 553 | /* copy stuff into user buffers */ |
553 | total = priv->actual; | 554 | total = priv->actual; |
554 | len = 0; | 555 | len = 0; |
@@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb) | |||
568 | if (total == 0) | 569 | if (total == 0) |
569 | break; | 570 | break; |
570 | } | 571 | } |
572 | |||
573 | return len; | ||
574 | } | ||
575 | |||
576 | static void ep_user_copy_worker(struct work_struct *work) | ||
577 | { | ||
578 | struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); | ||
579 | struct mm_struct *mm = priv->mm; | ||
580 | struct kiocb *iocb = priv->iocb; | ||
581 | size_t ret; | ||
582 | |||
583 | use_mm(mm); | ||
584 | ret = ep_copy_to_user(priv); | ||
585 | unuse_mm(mm); | ||
586 | |||
587 | /* completing the iocb can drop the ctx and mm, don't touch mm after */ | ||
588 | aio_complete(iocb, ret, ret); | ||
589 | |||
571 | kfree(priv->buf); | 590 | kfree(priv->buf); |
572 | kfree(priv); | 591 | kfree(priv); |
573 | return len; | ||
574 | } | 592 | } |
575 | 593 | ||
576 | static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) | 594 | static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) |
@@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) | |||
596 | aio_complete(iocb, req->actual ? req->actual : req->status, | 614 | aio_complete(iocb, req->actual ? req->actual : req->status, |
597 | req->status); | 615 | req->status); |
598 | } else { | 616 | } else { |
599 | /* retry() won't report both; so we hide some faults */ | 617 | /* ep_copy_to_user() won't report both; we hide some faults */ |
600 | if (unlikely(0 != req->status)) | 618 | if (unlikely(0 != req->status)) |
601 | DBG(epdata->dev, "%s fault %d len %d\n", | 619 | DBG(epdata->dev, "%s fault %d len %d\n", |
602 | ep->name, req->status, req->actual); | 620 | ep->name, req->status, req->actual); |
603 | 621 | ||
604 | priv->buf = req->buf; | 622 | priv->buf = req->buf; |
605 | priv->actual = req->actual; | 623 | priv->actual = req->actual; |
606 | kick_iocb(iocb); | 624 | schedule_work(&priv->work); |
607 | } | 625 | } |
608 | spin_unlock(&epdata->dev->lock); | 626 | spin_unlock(&epdata->dev->lock); |
609 | 627 | ||
@@ -633,8 +651,10 @@ fail: | |||
633 | return value; | 651 | return value; |
634 | } | 652 | } |
635 | iocb->private = priv; | 653 | iocb->private = priv; |
654 | priv->iocb = iocb; | ||
636 | priv->iv = iv; | 655 | priv->iv = iv; |
637 | priv->nr_segs = nr_segs; | 656 | priv->nr_segs = nr_segs; |
657 | INIT_WORK(&priv->work, ep_user_copy_worker); | ||
638 | 658 | ||
639 | value = get_ready_ep(iocb->ki_filp->f_flags, epdata); | 659 | value = get_ready_ep(iocb->ki_filp->f_flags, epdata); |
640 | if (unlikely(value < 0)) { | 660 | if (unlikely(value < 0)) { |
@@ -642,10 +662,11 @@ fail: | |||
642 | goto fail; | 662 | goto fail; |
643 | } | 663 | } |
644 | 664 | ||
645 | iocb->ki_cancel = ep_aio_cancel; | 665 | kiocb_set_cancel_fn(iocb, ep_aio_cancel); |
646 | get_ep(epdata); | 666 | get_ep(epdata); |
647 | priv->epdata = epdata; | 667 | priv->epdata = epdata; |
648 | priv->actual = 0; | 668 | priv->actual = 0; |
669 | priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */ | ||
649 | 670 | ||
650 | /* each kiocb is coupled to one usb_request, but we can't | 671 | /* each kiocb is coupled to one usb_request, but we can't |
651 | * allocate or submit those if the host disconnected. | 672 | * allocate or submit those if the host disconnected. |
@@ -674,7 +695,7 @@ fail: | |||
674 | kfree(priv); | 695 | kfree(priv); |
675 | put_ep(epdata); | 696 | put_ep(epdata); |
676 | } else | 697 | } else |
677 | value = (iv ? -EIOCBRETRY : -EIOCBQUEUED); | 698 | value = -EIOCBQUEUED; |
678 | return value; | 699 | return value; |
679 | } | 700 | } |
680 | 701 | ||
@@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
692 | if (unlikely(!buf)) | 713 | if (unlikely(!buf)) |
693 | return -ENOMEM; | 714 | return -ENOMEM; |
694 | 715 | ||
695 | iocb->ki_retry = ep_aio_read_retry; | ||
696 | return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs); | 716 | return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs); |
697 | } | 717 | } |
698 | 718 | ||
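The functional change in this file: the old ki_retry/-EIOCBRETRY dance, which re-entered the submitting process so copy_to_user() was legal, is gone. The read path therefore records the submitter's mm at submit time and defers the user copy to a workqueue that temporarily adopts that mm. Condensed, the pattern the patch adopts looks like the sketch below (names follow the patch, error handling omitted); the USB completion callback, which may run in interrupt context, now only calls schedule_work(&priv->work) instead of kick_iocb().

/* Condensed from the patch above: do the user copy in process context
 * with the submitter's mm, then complete the iocb. */
static void ep_user_copy_worker_sketch(struct work_struct *work)
{
	struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
	struct mm_struct *mm = priv->mm;	/* saved as current->mm at submit */
	ssize_t ret;

	use_mm(mm);				/* copy_to_user() can now resolve user addresses */
	ret = ep_copy_to_user(priv);
	unuse_mm(mm);

	aio_complete(priv->iocb, ret, ret);	/* may drop ctx and mm; don't touch mm after */
	kfree(priv->buf);
	kfree(priv);
}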
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..055562c580b4 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/pagemap.h> | 33 | #include <linux/pagemap.h> |
34 | #include <linux/idr.h> | 34 | #include <linux/idr.h> |
35 | #include <linux/sched.h> | 35 | #include <linux/sched.h> |
36 | #include <linux/aio.h> | ||
36 | #include <net/9p/9p.h> | 37 | #include <net/9p/9p.h> |
37 | #include <net/9p/client.h> | 38 | #include <net/9p/client.h> |
38 | 39 | ||
diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/writeback.h> | 15 | #include <linux/writeback.h> |
16 | #include <linux/pagevec.h> | 16 | #include <linux/pagevec.h> |
17 | #include <linux/aio.h> | ||
17 | #include "internal.h" | 18 | #include "internal.h" |
18 | 19 | ||
19 | static int afs_write_back_from_locked_page(struct afs_writeback *wb, | 20 | static int afs_write_back_from_locked_page(struct afs_writeback *wb, |
diff --git a/fs/aio.c b/fs/aio.c | |||
@@ -8,6 +8,8 @@ | |||
8 | * | 8 | * |
9 | * See ../COPYING for licensing terms. | 9 | * See ../COPYING for licensing terms. |
10 | */ | 10 | */ |
11 | #define pr_fmt(fmt) "%s: " fmt, __func__ | ||
12 | |||
11 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
12 | #include <linux/init.h> | 14 | #include <linux/init.h> |
13 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
@@ -18,8 +20,6 @@ | |||
18 | #include <linux/backing-dev.h> | 20 | #include <linux/backing-dev.h> |
19 | #include <linux/uio.h> | 21 | #include <linux/uio.h> |
20 | 22 | ||
21 | #define DEBUG 0 | ||
22 | |||
23 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
24 | #include <linux/fs.h> | 24 | #include <linux/fs.h> |
25 | #include <linux/file.h> | 25 | #include <linux/file.h> |
@@ -39,11 +39,76 @@ | |||
39 | #include <asm/kmap_types.h> | 39 | #include <asm/kmap_types.h> |
40 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
41 | 41 | ||
42 | #if DEBUG > 1 | 42 | #define AIO_RING_MAGIC 0xa10a10a1 |
43 | #define dprintk printk | 43 | #define AIO_RING_COMPAT_FEATURES 1 |
44 | #else | 44 | #define AIO_RING_INCOMPAT_FEATURES 0 |
45 | #define dprintk(x...) do { ; } while (0) | 45 | struct aio_ring { |
46 | #endif | 46 | unsigned id; /* kernel internal index number */ |
47 | unsigned nr; /* number of io_events */ | ||
48 | unsigned head; | ||
49 | unsigned tail; | ||
50 | |||
51 | unsigned magic; | ||
52 | unsigned compat_features; | ||
53 | unsigned incompat_features; | ||
54 | unsigned header_length; /* size of aio_ring */ | ||
55 | |||
56 | |||
57 | struct io_event io_events[0]; | ||
58 | }; /* 128 bytes + ring size */ | ||
59 | |||
60 | #define AIO_RING_PAGES 8 | ||
61 | |||
62 | struct kioctx { | ||
63 | atomic_t users; | ||
64 | atomic_t dead; | ||
65 | |||
66 | /* This needs improving */ | ||
67 | unsigned long user_id; | ||
68 | struct hlist_node list; | ||
69 | |||
70 | /* | ||
71 | * This is what userspace passed to io_setup(), it's not used for | ||
72 | * anything but counting against the global max_reqs quota. | ||
73 | * | ||
74 | * The real limit is nr_events - 1, which will be larger (see | ||
75 | * aio_setup_ring()) | ||
76 | */ | ||
77 | unsigned max_reqs; | ||
78 | |||
79 | /* Size of ringbuffer, in units of struct io_event */ | ||
80 | unsigned nr_events; | ||
81 | |||
82 | unsigned long mmap_base; | ||
83 | unsigned long mmap_size; | ||
84 | |||
85 | struct page **ring_pages; | ||
86 | long nr_pages; | ||
87 | |||
88 | struct rcu_head rcu_head; | ||
89 | struct work_struct rcu_work; | ||
90 | |||
91 | struct { | ||
92 | atomic_t reqs_active; | ||
93 | } ____cacheline_aligned_in_smp; | ||
94 | |||
95 | struct { | ||
96 | spinlock_t ctx_lock; | ||
97 | struct list_head active_reqs; /* used for cancellation */ | ||
98 | } ____cacheline_aligned_in_smp; | ||
99 | |||
100 | struct { | ||
101 | struct mutex ring_lock; | ||
102 | wait_queue_head_t wait; | ||
103 | } ____cacheline_aligned_in_smp; | ||
104 | |||
105 | struct { | ||
106 | unsigned tail; | ||
107 | spinlock_t completion_lock; | ||
108 | } ____cacheline_aligned_in_smp; | ||
109 | |||
110 | struct page *internal_pages[AIO_RING_PAGES]; | ||
111 | }; | ||
47 | 112 | ||
48 | /*------ sysctl variables----*/ | 113 | /*------ sysctl variables----*/ |
49 | static DEFINE_SPINLOCK(aio_nr_lock); | 114 | static DEFINE_SPINLOCK(aio_nr_lock); |
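The aio_ring layout and the kioctx bookkeeping move from the private header into aio.c. The ring header is also what user space sees: io_setup() hands back ctx->user_id, which is the mmap address of the ring, so completions can in principle be inspected without a syscall. A hypothetical peek at that mapping is sketched below; it mirrors the struct above, is only meaningful while incompat_features is 0, and is not a stable ABI promise.

/* Hypothetical user-space peek at the ring returned by io_setup();
 * mirrors the struct aio_ring layout shown above. */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

struct user_aio_ring {
	unsigned id, nr, head, tail;
	unsigned magic, compat_features, incompat_features, header_length;
	struct io_event events[];	/* io_events follow the header */
};

int main(void)
{
	aio_context_t ctx = 0;

	if (syscall(__NR_io_setup, 8, &ctx))
		return 1;

	struct user_aio_ring *ring = (void *)ctx;	/* ctx value == mmap_base */
	printf("magic=%#x nr=%u head=%u tail=%u\n",
	       ring->magic, ring->nr, ring->head, ring->tail);

	syscall(__NR_io_destroy, ctx);
	return 0;
}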
@@ -54,11 +119,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request | |||
54 | static struct kmem_cache *kiocb_cachep; | 119 | static struct kmem_cache *kiocb_cachep; |
55 | static struct kmem_cache *kioctx_cachep; | 120 | static struct kmem_cache *kioctx_cachep; |
56 | 121 | ||
57 | static struct workqueue_struct *aio_wq; | ||
58 | |||
59 | static void aio_kick_handler(struct work_struct *); | ||
60 | static void aio_queue_work(struct kioctx *); | ||
61 | |||
62 | /* aio_setup | 122 | /* aio_setup |
63 | * Creates the slab caches used by the aio routines, panic on | 123 | * Creates the slab caches used by the aio routines, panic on |
64 | * failure as this is done early during the boot sequence. | 124 | * failure as this is done early during the boot sequence. |
@@ -68,10 +128,7 @@ static int __init aio_setup(void) | |||
68 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 128 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
69 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 129 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
70 | 130 | ||
71 | aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ | 131 | pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); |
72 | BUG_ON(!aio_wq); | ||
73 | |||
74 | pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); | ||
75 | 132 | ||
76 | return 0; | 133 | return 0; |
77 | } | 134 | } |
@@ -79,28 +136,23 @@ __initcall(aio_setup); | |||
79 | 136 | ||
80 | static void aio_free_ring(struct kioctx *ctx) | 137 | static void aio_free_ring(struct kioctx *ctx) |
81 | { | 138 | { |
82 | struct aio_ring_info *info = &ctx->ring_info; | ||
83 | long i; | 139 | long i; |
84 | 140 | ||
85 | for (i=0; i<info->nr_pages; i++) | 141 | for (i = 0; i < ctx->nr_pages; i++) |
86 | put_page(info->ring_pages[i]); | 142 | put_page(ctx->ring_pages[i]); |
87 | 143 | ||
88 | if (info->mmap_size) { | 144 | if (ctx->mmap_size) |
89 | BUG_ON(ctx->mm != current->mm); | 145 | vm_munmap(ctx->mmap_base, ctx->mmap_size); |
90 | vm_munmap(info->mmap_base, info->mmap_size); | ||
91 | } | ||
92 | 146 | ||
93 | if (info->ring_pages && info->ring_pages != info->internal_pages) | 147 | if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) |
94 | kfree(info->ring_pages); | 148 | kfree(ctx->ring_pages); |
95 | info->ring_pages = NULL; | ||
96 | info->nr = 0; | ||
97 | } | 149 | } |
98 | 150 | ||
99 | static int aio_setup_ring(struct kioctx *ctx) | 151 | static int aio_setup_ring(struct kioctx *ctx) |
100 | { | 152 | { |
101 | struct aio_ring *ring; | 153 | struct aio_ring *ring; |
102 | struct aio_ring_info *info = &ctx->ring_info; | ||
103 | unsigned nr_events = ctx->max_reqs; | 154 | unsigned nr_events = ctx->max_reqs; |
155 | struct mm_struct *mm = current->mm; | ||
104 | unsigned long size, populate; | 156 | unsigned long size, populate; |
105 | int nr_pages; | 157 | int nr_pages; |
106 | 158 | ||
@@ -116,46 +168,44 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
116 | 168 | ||
117 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); | 169 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); |
118 | 170 | ||
119 | info->nr = 0; | 171 | ctx->nr_events = 0; |
120 | info->ring_pages = info->internal_pages; | 172 | ctx->ring_pages = ctx->internal_pages; |
121 | if (nr_pages > AIO_RING_PAGES) { | 173 | if (nr_pages > AIO_RING_PAGES) { |
122 | info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | 174 | ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), |
123 | if (!info->ring_pages) | 175 | GFP_KERNEL); |
176 | if (!ctx->ring_pages) | ||
124 | return -ENOMEM; | 177 | return -ENOMEM; |
125 | } | 178 | } |
126 | 179 | ||
127 | info->mmap_size = nr_pages * PAGE_SIZE; | 180 | ctx->mmap_size = nr_pages * PAGE_SIZE; |
128 | dprintk("attempting mmap of %lu bytes\n", info->mmap_size); | 181 | pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); |
129 | down_write(&ctx->mm->mmap_sem); | 182 | down_write(&mm->mmap_sem); |
130 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, | 183 | ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, |
131 | PROT_READ|PROT_WRITE, | 184 | PROT_READ|PROT_WRITE, |
132 | MAP_ANONYMOUS|MAP_PRIVATE, 0, | 185 | MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); |
133 | &populate); | 186 | if (IS_ERR((void *)ctx->mmap_base)) { |
134 | if (IS_ERR((void *)info->mmap_base)) { | 187 | up_write(&mm->mmap_sem); |
135 | up_write(&ctx->mm->mmap_sem); | 188 | ctx->mmap_size = 0; |
136 | info->mmap_size = 0; | ||
137 | aio_free_ring(ctx); | 189 | aio_free_ring(ctx); |
138 | return -EAGAIN; | 190 | return -EAGAIN; |
139 | } | 191 | } |
140 | 192 | ||
141 | dprintk("mmap address: 0x%08lx\n", info->mmap_base); | 193 | pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); |
142 | info->nr_pages = get_user_pages(current, ctx->mm, | 194 | ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, |
143 | info->mmap_base, nr_pages, | 195 | 1, 0, ctx->ring_pages, NULL); |
144 | 1, 0, info->ring_pages, NULL); | 196 | up_write(&mm->mmap_sem); |
145 | up_write(&ctx->mm->mmap_sem); | ||
146 | 197 | ||
147 | if (unlikely(info->nr_pages != nr_pages)) { | 198 | if (unlikely(ctx->nr_pages != nr_pages)) { |
148 | aio_free_ring(ctx); | 199 | aio_free_ring(ctx); |
149 | return -EAGAIN; | 200 | return -EAGAIN; |
150 | } | 201 | } |
151 | if (populate) | 202 | if (populate) |
152 | mm_populate(info->mmap_base, populate); | 203 | mm_populate(ctx->mmap_base, populate); |
153 | 204 | ||
154 | ctx->user_id = info->mmap_base; | 205 | ctx->user_id = ctx->mmap_base; |
206 | ctx->nr_events = nr_events; /* trusted copy */ | ||
155 | 207 | ||
156 | info->nr = nr_events; /* trusted copy */ | 208 | ring = kmap_atomic(ctx->ring_pages[0]); |
157 | |||
158 | ring = kmap_atomic(info->ring_pages[0]); | ||
159 | ring->nr = nr_events; /* user copy */ | 209 | ring->nr = nr_events; /* user copy */ |
160 | ring->id = ctx->user_id; | 210 | ring->id = ctx->user_id; |
161 | ring->head = ring->tail = 0; | 211 | ring->head = ring->tail = 0; |
@@ -164,72 +214,133 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
164 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; | 214 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; |
165 | ring->header_length = sizeof(struct aio_ring); | 215 | ring->header_length = sizeof(struct aio_ring); |
166 | kunmap_atomic(ring); | 216 | kunmap_atomic(ring); |
217 | flush_dcache_page(ctx->ring_pages[0]); | ||
167 | 218 | ||
168 | return 0; | 219 | return 0; |
169 | } | 220 | } |
170 | 221 | ||
171 | |||
172 | /* aio_ring_event: returns a pointer to the event at the given index from | ||
173 | * kmap_atomic(). Release the pointer with put_aio_ring_event(); | ||
174 | */ | ||
175 | #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) | 222 | #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) |
176 | #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) | 223 | #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) |
177 | #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) | 224 | #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) |
178 | 225 | ||
179 | #define aio_ring_event(info, nr) ({ \ | 226 | void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) |
180 | unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ | 227 | { |
181 | struct io_event *__event; \ | 228 | struct kioctx *ctx = req->ki_ctx; |
182 | __event = kmap_atomic( \ | 229 | unsigned long flags; |
183 | (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ | 230 | |
184 | __event += pos % AIO_EVENTS_PER_PAGE; \ | 231 | spin_lock_irqsave(&ctx->ctx_lock, flags); |
185 | __event; \ | 232 | |
186 | }) | 233 | if (!req->ki_list.next) |
187 | 234 | list_add(&req->ki_list, &ctx->active_reqs); | |
188 | #define put_aio_ring_event(event) do { \ | 235 | |
189 | struct io_event *__event = (event); \ | 236 | req->ki_cancel = cancel; |
190 | (void)__event; \ | 237 | |
191 | kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ | 238 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); |
192 | } while(0) | 239 | } |
193 | 240 | EXPORT_SYMBOL(kiocb_set_cancel_fn); | |
194 | static void ctx_rcu_free(struct rcu_head *head) | 241 | |
242 | static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, | ||
243 | struct io_event *res) | ||
244 | { | ||
245 | kiocb_cancel_fn *old, *cancel; | ||
246 | int ret = -EINVAL; | ||
247 | |||
248 | /* | ||
249 | * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it | ||
250 | * actually has a cancel function, hence the cmpxchg() | ||
251 | */ | ||
252 | |||
253 | cancel = ACCESS_ONCE(kiocb->ki_cancel); | ||
254 | do { | ||
255 | if (!cancel || cancel == KIOCB_CANCELLED) | ||
256 | return ret; | ||
257 | |||
258 | old = cancel; | ||
259 | cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); | ||
260 | } while (cancel != old); | ||
261 | |||
262 | atomic_inc(&kiocb->ki_users); | ||
263 | spin_unlock_irq(&ctx->ctx_lock); | ||
264 | |||
265 | memset(res, 0, sizeof(*res)); | ||
266 | res->obj = (u64)(unsigned long)kiocb->ki_obj.user; | ||
267 | res->data = kiocb->ki_user_data; | ||
268 | ret = cancel(kiocb, res); | ||
269 | |||
270 | spin_lock_irq(&ctx->ctx_lock); | ||
271 | |||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | static void free_ioctx_rcu(struct rcu_head *head) | ||
195 | { | 276 | { |
196 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | 277 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); |
197 | kmem_cache_free(kioctx_cachep, ctx); | 278 | kmem_cache_free(kioctx_cachep, ctx); |
198 | } | 279 | } |
199 | 280 | ||
200 | /* __put_ioctx | 281 | /* |
201 | * Called when the last user of an aio context has gone away, | 282 | * When this function runs, the kioctx has been removed from the "hash table" |
202 | * and the struct needs to be freed. | 283 | * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - |
284 | * now it's safe to cancel any that need to be. | ||
203 | */ | 285 | */ |
204 | static void __put_ioctx(struct kioctx *ctx) | 286 | static void free_ioctx(struct kioctx *ctx) |
205 | { | 287 | { |
206 | unsigned nr_events = ctx->max_reqs; | 288 | struct aio_ring *ring; |
207 | BUG_ON(ctx->reqs_active); | 289 | struct io_event res; |
290 | struct kiocb *req; | ||
291 | unsigned head, avail; | ||
208 | 292 | ||
209 | cancel_delayed_work_sync(&ctx->wq); | 293 | spin_lock_irq(&ctx->ctx_lock); |
210 | aio_free_ring(ctx); | 294 | |
211 | mmdrop(ctx->mm); | 295 | while (!list_empty(&ctx->active_reqs)) { |
212 | ctx->mm = NULL; | 296 | req = list_first_entry(&ctx->active_reqs, |
213 | if (nr_events) { | 297 | struct kiocb, ki_list); |
214 | spin_lock(&aio_nr_lock); | 298 | |
215 | BUG_ON(aio_nr - nr_events > aio_nr); | 299 | list_del_init(&req->ki_list); |
216 | aio_nr -= nr_events; | 300 | kiocb_cancel(ctx, req, &res); |
217 | spin_unlock(&aio_nr_lock); | ||
218 | } | 301 | } |
219 | pr_debug("__put_ioctx: freeing %p\n", ctx); | ||
220 | call_rcu(&ctx->rcu_head, ctx_rcu_free); | ||
221 | } | ||
222 | 302 | ||
223 | static inline int try_get_ioctx(struct kioctx *kioctx) | 303 | spin_unlock_irq(&ctx->ctx_lock); |
224 | { | 304 | |
225 | return atomic_inc_not_zero(&kioctx->users); | 305 | ring = kmap_atomic(ctx->ring_pages[0]); |
306 | head = ring->head; | ||
307 | kunmap_atomic(ring); | ||
308 | |||
309 | while (atomic_read(&ctx->reqs_active) > 0) { | ||
310 | wait_event(ctx->wait, head != ctx->tail); | ||
311 | |||
312 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; | ||
313 | |||
314 | atomic_sub(avail, &ctx->reqs_active); | ||
315 | head += avail; | ||
316 | head %= ctx->nr_events; | ||
317 | } | ||
318 | |||
319 | WARN_ON(atomic_read(&ctx->reqs_active) < 0); | ||
320 | |||
321 | aio_free_ring(ctx); | ||
322 | |||
323 | spin_lock(&aio_nr_lock); | ||
324 | BUG_ON(aio_nr - ctx->max_reqs > aio_nr); | ||
325 | aio_nr -= ctx->max_reqs; | ||
326 | spin_unlock(&aio_nr_lock); | ||
327 | |||
328 | pr_debug("freeing %p\n", ctx); | ||
329 | |||
330 | /* | ||
331 | * Here the call_rcu() is between the wait_event() for reqs_active to | ||
332 | * hit 0, and freeing the ioctx. | ||
333 | * | ||
334 | * aio_complete() decrements reqs_active, but it has to touch the ioctx | ||
335 | * after to issue a wakeup so we use rcu. | ||
336 | */ | ||
337 | call_rcu(&ctx->rcu_head, free_ioctx_rcu); | ||
226 | } | 338 | } |
227 | 339 | ||
228 | static inline void put_ioctx(struct kioctx *kioctx) | 340 | static void put_ioctx(struct kioctx *ctx) |
229 | { | 341 | { |
230 | BUG_ON(atomic_read(&kioctx->users) <= 0); | 342 | if (unlikely(atomic_dec_and_test(&ctx->users))) |
231 | if (unlikely(atomic_dec_and_test(&kioctx->users))) | 343 | free_ioctx(ctx); |
232 | __put_ioctx(kioctx); | ||
233 | } | 344 | } |
234 | 345 | ||
235 | /* ioctx_alloc | 346 | /* ioctx_alloc |
@@ -237,7 +348,7 @@ static inline void put_ioctx(struct kioctx *kioctx) | |||
237 | */ | 348 | */ |
238 | static struct kioctx *ioctx_alloc(unsigned nr_events) | 349 | static struct kioctx *ioctx_alloc(unsigned nr_events) |
239 | { | 350 | { |
240 | struct mm_struct *mm; | 351 | struct mm_struct *mm = current->mm; |
241 | struct kioctx *ctx; | 352 | struct kioctx *ctx; |
242 | int err = -ENOMEM; | 353 | int err = -ENOMEM; |
243 | 354 | ||
@@ -256,17 +367,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
256 | return ERR_PTR(-ENOMEM); | 367 | return ERR_PTR(-ENOMEM); |
257 | 368 | ||
258 | ctx->max_reqs = nr_events; | 369 | ctx->max_reqs = nr_events; |
259 | mm = ctx->mm = current->mm; | ||
260 | atomic_inc(&mm->mm_count); | ||
261 | 370 | ||
262 | atomic_set(&ctx->users, 2); | 371 | atomic_set(&ctx->users, 2); |
372 | atomic_set(&ctx->dead, 0); | ||
263 | spin_lock_init(&ctx->ctx_lock); | 373 | spin_lock_init(&ctx->ctx_lock); |
264 | spin_lock_init(&ctx->ring_info.ring_lock); | 374 | spin_lock_init(&ctx->completion_lock); |
375 | mutex_init(&ctx->ring_lock); | ||
265 | init_waitqueue_head(&ctx->wait); | 376 | init_waitqueue_head(&ctx->wait); |
266 | 377 | ||
267 | INIT_LIST_HEAD(&ctx->active_reqs); | 378 | INIT_LIST_HEAD(&ctx->active_reqs); |
268 | INIT_LIST_HEAD(&ctx->run_list); | ||
269 | INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); | ||
270 | 379 | ||
271 | if (aio_setup_ring(ctx) < 0) | 380 | if (aio_setup_ring(ctx) < 0) |
272 | goto out_freectx; | 381 | goto out_freectx; |
@@ -286,64 +395,56 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
286 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); | 395 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); |
287 | spin_unlock(&mm->ioctx_lock); | 396 | spin_unlock(&mm->ioctx_lock); |
288 | 397 | ||
289 | dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", | 398 | pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", |
290 | ctx, ctx->user_id, current->mm, ctx->ring_info.nr); | 399 | ctx, ctx->user_id, mm, ctx->nr_events); |
291 | return ctx; | 400 | return ctx; |
292 | 401 | ||
293 | out_cleanup: | 402 | out_cleanup: |
294 | err = -EAGAIN; | 403 | err = -EAGAIN; |
295 | aio_free_ring(ctx); | 404 | aio_free_ring(ctx); |
296 | out_freectx: | 405 | out_freectx: |
297 | mmdrop(mm); | ||
298 | kmem_cache_free(kioctx_cachep, ctx); | 406 | kmem_cache_free(kioctx_cachep, ctx); |
299 | dprintk("aio: error allocating ioctx %d\n", err); | 407 | pr_debug("error allocating ioctx %d\n", err); |
300 | return ERR_PTR(err); | 408 | return ERR_PTR(err); |
301 | } | 409 | } |
302 | 410 | ||
303 | /* kill_ctx | 411 | static void kill_ioctx_work(struct work_struct *work) |
304 | * Cancels all outstanding aio requests on an aio context. Used | ||
305 | * when the processes owning a context have all exited to encourage | ||
306 | * the rapid destruction of the kioctx. | ||
307 | */ | ||
308 | static void kill_ctx(struct kioctx *ctx) | ||
309 | { | 412 | { |
310 | int (*cancel)(struct kiocb *, struct io_event *); | 413 | struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); |
311 | struct task_struct *tsk = current; | ||
312 | DECLARE_WAITQUEUE(wait, tsk); | ||
313 | struct io_event res; | ||
314 | 414 | ||
315 | spin_lock_irq(&ctx->ctx_lock); | 415 | wake_up_all(&ctx->wait); |
316 | ctx->dead = 1; | 416 | put_ioctx(ctx); |
317 | while (!list_empty(&ctx->active_reqs)) { | 417 | } |
318 | struct list_head *pos = ctx->active_reqs.next; | ||
319 | struct kiocb *iocb = list_kiocb(pos); | ||
320 | list_del_init(&iocb->ki_list); | ||
321 | cancel = iocb->ki_cancel; | ||
322 | kiocbSetCancelled(iocb); | ||
323 | if (cancel) { | ||
324 | iocb->ki_users++; | ||
325 | spin_unlock_irq(&ctx->ctx_lock); | ||
326 | cancel(iocb, &res); | ||
327 | spin_lock_irq(&ctx->ctx_lock); | ||
328 | } | ||
329 | } | ||
330 | 418 | ||
331 | if (!ctx->reqs_active) | 419 | static void kill_ioctx_rcu(struct rcu_head *head) |
332 | goto out; | 420 | { |
421 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | ||
333 | 422 | ||
334 | add_wait_queue(&ctx->wait, &wait); | 423 | INIT_WORK(&ctx->rcu_work, kill_ioctx_work); |
335 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 424 | schedule_work(&ctx->rcu_work); |
336 | while (ctx->reqs_active) { | 425 | } |
337 | spin_unlock_irq(&ctx->ctx_lock); | ||
338 | io_schedule(); | ||
339 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
340 | spin_lock_irq(&ctx->ctx_lock); | ||
341 | } | ||
342 | __set_task_state(tsk, TASK_RUNNING); | ||
343 | remove_wait_queue(&ctx->wait, &wait); | ||
344 | 426 | ||
345 | out: | 427 | /* kill_ioctx |
346 | spin_unlock_irq(&ctx->ctx_lock); | 428 | * Cancels all outstanding aio requests on an aio context. Used |
429 | * when the processes owning a context have all exited to encourage | ||
430 | * the rapid destruction of the kioctx. | ||
431 | */ | ||
432 | static void kill_ioctx(struct kioctx *ctx) | ||
433 | { | ||
434 | if (!atomic_xchg(&ctx->dead, 1)) { | ||
435 | hlist_del_rcu(&ctx->list); | ||
436 | /* Between hlist_del_rcu() and dropping the initial ref */ | ||
437 | synchronize_rcu(); | ||
438 | |||
439 | /* | ||
440 | * We can't punt to workqueue here because put_ioctx() -> | ||
441 | * free_ioctx() will unmap the ringbuffer, and that has to be | ||
442 | * done in the original process's context. kill_ioctx_rcu/work() | ||
443 | * exist for exit_aio(), as in that path free_ioctx() won't do | ||
444 | * the unmap. | ||
445 | */ | ||
446 | kill_ioctx_work(&ctx->rcu_work); | ||
447 | } | ||
347 | } | 448 | } |
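Aside on the pattern used above: the !atomic_xchg(&ctx->dead, 1) test lets exactly one caller win the right to tear the context down, however many racers reach kill_ioctx(). A minimal userspace sketch of the same idiom using C11 atomics (the names here are illustrative, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>

struct ctx {
        atomic_bool dead;
        /* ... rest of the context ... */
};

/* Returns true only for the single caller that flips dead from 0 to 1;
 * everyone else sees the old value and backs off. */
static bool claim_teardown(struct ctx *c)
{
        return !atomic_exchange(&c->dead, true);
}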
348 | 449 | ||
349 | /* wait_on_sync_kiocb: | 450 | /* wait_on_sync_kiocb: |
@@ -351,9 +452,9 @@ out: | |||
351 | */ | 452 | */ |
352 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | 453 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) |
353 | { | 454 | { |
354 | while (iocb->ki_users) { | 455 | while (atomic_read(&iocb->ki_users)) { |
355 | set_current_state(TASK_UNINTERRUPTIBLE); | 456 | set_current_state(TASK_UNINTERRUPTIBLE); |
356 | if (!iocb->ki_users) | 457 | if (!atomic_read(&iocb->ki_users)) |
357 | break; | 458 | break; |
358 | io_schedule(); | 459 | io_schedule(); |
359 | } | 460 | } |
@@ -362,28 +463,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | |||
362 | } | 463 | } |
363 | EXPORT_SYMBOL(wait_on_sync_kiocb); | 464 | EXPORT_SYMBOL(wait_on_sync_kiocb); |
364 | 465 | ||
365 | /* exit_aio: called when the last user of mm goes away. At this point, | 466 | /* |
366 | * there is no way for any new requests to be submitted or any of the | 467 | * exit_aio: called when the last user of mm goes away. At this point, there is |
367 | * io_* syscalls to be called on the context. However, there may be | 468 | * no way for any new requests to be submitted or any of the io_* syscalls to be |
368 | * outstanding requests which hold references to the context; as they | 469 | * called on the context. |
369 | * go away, they will call put_ioctx and release any pinned memory | 470 | * |
370 | * associated with the request (held via struct page * references). | 471 | * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on |
472 | * them. | ||
371 | */ | 473 | */ |
372 | void exit_aio(struct mm_struct *mm) | 474 | void exit_aio(struct mm_struct *mm) |
373 | { | 475 | { |
374 | struct kioctx *ctx; | 476 | struct kioctx *ctx; |
477 | struct hlist_node *n; | ||
375 | 478 | ||
376 | while (!hlist_empty(&mm->ioctx_list)) { | 479 | hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { |
377 | ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); | ||
378 | hlist_del_rcu(&ctx->list); | ||
379 | |||
380 | kill_ctx(ctx); | ||
381 | |||
382 | if (1 != atomic_read(&ctx->users)) | 480 | if (1 != atomic_read(&ctx->users)) |
383 | printk(KERN_DEBUG | 481 | printk(KERN_DEBUG |
384 | "exit_aio:ioctx still alive: %d %d %d\n", | 482 | "exit_aio:ioctx still alive: %d %d %d\n", |
385 | atomic_read(&ctx->users), ctx->dead, | 483 | atomic_read(&ctx->users), |
386 | ctx->reqs_active); | 484 | atomic_read(&ctx->dead), |
485 | atomic_read(&ctx->reqs_active)); | ||
387 | /* | 486 | /* |
388 | * We don't need to bother with munmap() here - | 487 | * We don't need to bother with munmap() here - |
389 | * exit_mmap(mm) is coming and it'll unmap everything. | 488 | * exit_mmap(mm) is coming and it'll unmap everything. |
@@ -391,150 +490,53 @@ void exit_aio(struct mm_struct *mm) | |||
391 | * as indicator that it needs to unmap the area, | 490 | * as indicator that it needs to unmap the area, |
392 | * just set it to 0; aio_free_ring() is the only | 491 | * just set it to 0; aio_free_ring() is the only |
393 | * place that uses ->mmap_size, so it's safe. | 492 | * place that uses ->mmap_size, so it's safe. |
394 | * That way we get all munmap done to current->mm - | ||
395 | * all other callers have ctx->mm == current->mm. | ||
396 | */ | 493 | */ |
397 | ctx->ring_info.mmap_size = 0; | 494 | ctx->mmap_size = 0; |
398 | put_ioctx(ctx); | 495 | |
496 | if (!atomic_xchg(&ctx->dead, 1)) { | ||
497 | hlist_del_rcu(&ctx->list); | ||
498 | call_rcu(&ctx->rcu_head, kill_ioctx_rcu); | ||
499 | } | ||
399 | } | 500 | } |
400 | } | 501 | } |
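The hlist_for_each_entry_safe() loop above is the standard removal-safe traversal: the next pointer is captured before the current entry can be unlinked or freed. A stand-alone sketch of the same shape on a plain singly linked list (illustrative only, not kernel code):

struct node {
        struct node *next;
        /* payload */
};

/* Dispose of every node; safe because 'next' is saved before dispose()
 * is allowed to free the current node. */
static void drain(struct node **head, void (*dispose)(struct node *))
{
        struct node *n = *head, *next;

        while (n) {
                next = n->next;
                dispose(n);
                n = next;
        }
        *head = NULL;
}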
401 | 502 | ||
402 | /* aio_get_req | 503 | /* aio_get_req |
403 | * Allocate a slot for an aio request. Increments the users count | 504 | * Allocate a slot for an aio request. Increments the ki_users count |
404 | * of the kioctx so that the kioctx stays around until all requests are | 505 | * of the kioctx so that the kioctx stays around until all requests are |
405 | * complete. Returns NULL if no requests are free. | 506 | * complete. Returns NULL if no requests are free. |
406 | * | 507 | * |
407 | * Returns with kiocb->users set to 2. The io submit code path holds | 508 | * Returns with kiocb->ki_users set to 2. The io submit code path holds |
408 | * an extra reference while submitting the i/o. | 509 | * an extra reference while submitting the i/o. |
409 | * This prevents races between the aio code path referencing the | 510 | * This prevents races between the aio code path referencing the |
410 | * req (after submitting it) and aio_complete() freeing the req. | 511 | * req (after submitting it) and aio_complete() freeing the req. |
411 | */ | 512 | */ |
412 | static struct kiocb *__aio_get_req(struct kioctx *ctx) | 513 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) |
413 | { | 514 | { |
414 | struct kiocb *req = NULL; | 515 | struct kiocb *req; |
415 | 516 | ||
416 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); | 517 | if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) |
417 | if (unlikely(!req)) | ||
418 | return NULL; | 518 | return NULL; |
419 | 519 | ||
420 | req->ki_flags = 0; | 520 | if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) |
421 | req->ki_users = 2; | 521 | goto out_put; |
422 | req->ki_key = 0; | ||
423 | req->ki_ctx = ctx; | ||
424 | req->ki_cancel = NULL; | ||
425 | req->ki_retry = NULL; | ||
426 | req->ki_dtor = NULL; | ||
427 | req->private = NULL; | ||
428 | req->ki_iovec = NULL; | ||
429 | INIT_LIST_HEAD(&req->ki_run_list); | ||
430 | req->ki_eventfd = NULL; | ||
431 | |||
432 | return req; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * struct kiocb's are allocated in batches to reduce the number of | ||
437 | * times the ctx lock is acquired and released. | ||
438 | */ | ||
439 | #define KIOCB_BATCH_SIZE 32L | ||
440 | struct kiocb_batch { | ||
441 | struct list_head head; | ||
442 | long count; /* number of requests left to allocate */ | ||
443 | }; | ||
444 | |||
445 | static void kiocb_batch_init(struct kiocb_batch *batch, long total) | ||
446 | { | ||
447 | INIT_LIST_HEAD(&batch->head); | ||
448 | batch->count = total; | ||
449 | } | ||
450 | |||
451 | static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) | ||
452 | { | ||
453 | struct kiocb *req, *n; | ||
454 | |||
455 | if (list_empty(&batch->head)) | ||
456 | return; | ||
457 | |||
458 | spin_lock_irq(&ctx->ctx_lock); | ||
459 | list_for_each_entry_safe(req, n, &batch->head, ki_batch) { | ||
460 | list_del(&req->ki_batch); | ||
461 | list_del(&req->ki_list); | ||
462 | kmem_cache_free(kiocb_cachep, req); | ||
463 | ctx->reqs_active--; | ||
464 | } | ||
465 | if (unlikely(!ctx->reqs_active && ctx->dead)) | ||
466 | wake_up_all(&ctx->wait); | ||
467 | spin_unlock_irq(&ctx->ctx_lock); | ||
468 | } | ||
469 | |||
470 | /* | ||
471 | * Allocate a batch of kiocbs. This avoids taking and dropping the | ||
472 | * context lock a lot during setup. | ||
473 | */ | ||
474 | static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) | ||
475 | { | ||
476 | unsigned short allocated, to_alloc; | ||
477 | long avail; | ||
478 | struct kiocb *req, *n; | ||
479 | struct aio_ring *ring; | ||
480 | |||
481 | to_alloc = min(batch->count, KIOCB_BATCH_SIZE); | ||
482 | for (allocated = 0; allocated < to_alloc; allocated++) { | ||
483 | req = __aio_get_req(ctx); | ||
484 | if (!req) | ||
485 | /* allocation failed, go with what we've got */ | ||
486 | break; | ||
487 | list_add(&req->ki_batch, &batch->head); | ||
488 | } | ||
489 | |||
490 | if (allocated == 0) | ||
491 | goto out; | ||
492 | |||
493 | spin_lock_irq(&ctx->ctx_lock); | ||
494 | ring = kmap_atomic(ctx->ring_info.ring_pages[0]); | ||
495 | |||
496 | avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; | ||
497 | BUG_ON(avail < 0); | ||
498 | if (avail < allocated) { | ||
499 | /* Trim back the number of requests. */ | ||
500 | list_for_each_entry_safe(req, n, &batch->head, ki_batch) { | ||
501 | list_del(&req->ki_batch); | ||
502 | kmem_cache_free(kiocb_cachep, req); | ||
503 | if (--allocated <= avail) | ||
504 | break; | ||
505 | } | ||
506 | } | ||
507 | |||
508 | batch->count -= allocated; | ||
509 | list_for_each_entry(req, &batch->head, ki_batch) { | ||
510 | list_add(&req->ki_list, &ctx->active_reqs); | ||
511 | ctx->reqs_active++; | ||
512 | } | ||
513 | 522 | ||
514 | kunmap_atomic(ring); | 523 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); |
515 | spin_unlock_irq(&ctx->ctx_lock); | 524 | if (unlikely(!req)) |
516 | 525 | goto out_put; | |
517 | out: | ||
518 | return allocated; | ||
519 | } | ||
520 | 526 | ||
521 | static inline struct kiocb *aio_get_req(struct kioctx *ctx, | 527 | atomic_set(&req->ki_users, 2); |
522 | struct kiocb_batch *batch) | 528 | req->ki_ctx = ctx; |
523 | { | ||
524 | struct kiocb *req; | ||
525 | 529 | ||
526 | if (list_empty(&batch->head)) | ||
527 | if (kiocb_batch_refill(ctx, batch) == 0) | ||
528 | return NULL; | ||
529 | req = list_first_entry(&batch->head, struct kiocb, ki_batch); | ||
530 | list_del(&req->ki_batch); | ||
531 | return req; | 530 | return req; |
531 | out_put: | ||
532 | atomic_dec(&ctx->reqs_active); | ||
533 | return NULL; | ||
532 | } | 534 | } |
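For reference, the lockless slot reservation above follows the usual optimistic pattern: check, increment, and undo the increment if a racing submitter pushed the count past the bound. A hedged userspace sketch with C11 atomics (get_slot() and the fixed-size allocation are placeholders standing in for the kiocb, not kernel code):

#include <stdatomic.h>
#include <stdlib.h>

/* Try to reserve one of 'nr' slots tracked by *in_flight. */
static void *get_slot(atomic_int *in_flight, int nr)
{
        if (atomic_load(in_flight) >= nr)
                return NULL;

        /* Optimistic increment: several racers may overshoot 'nr'. */
        if (atomic_fetch_add(in_flight, 1) + 1 > nr) {
                atomic_fetch_sub(in_flight, 1);         /* give the slot back */
                return NULL;
        }

        return calloc(1, 64);                           /* stand-in for the kiocb */
}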
533 | 535 | ||
534 | static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | 536 | static void kiocb_free(struct kiocb *req) |
535 | { | 537 | { |
536 | assert_spin_locked(&ctx->ctx_lock); | 538 | if (req->ki_filp) |
537 | 539 | fput(req->ki_filp); | |
538 | if (req->ki_eventfd != NULL) | 540 | if (req->ki_eventfd != NULL) |
539 | eventfd_ctx_put(req->ki_eventfd); | 541 | eventfd_ctx_put(req->ki_eventfd); |
540 | if (req->ki_dtor) | 542 | if (req->ki_dtor) |
@@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | |||
542 | if (req->ki_iovec != &req->ki_inline_vec) | 544 | if (req->ki_iovec != &req->ki_inline_vec) |
543 | kfree(req->ki_iovec); | 545 | kfree(req->ki_iovec); |
544 | kmem_cache_free(kiocb_cachep, req); | 546 | kmem_cache_free(kiocb_cachep, req); |
545 | ctx->reqs_active--; | ||
546 | |||
547 | if (unlikely(!ctx->reqs_active && ctx->dead)) | ||
548 | wake_up_all(&ctx->wait); | ||
549 | } | 547 | } |
550 | 548 | ||
551 | /* __aio_put_req | 549 | void aio_put_req(struct kiocb *req) |
552 | * Returns true if this put was the last user of the request. | ||
553 | */ | ||
554 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) | ||
555 | { | 550 | { |
556 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", | 551 | if (atomic_dec_and_test(&req->ki_users)) |
557 | req, atomic_long_read(&req->ki_filp->f_count)); | 552 | kiocb_free(req); |
558 | |||
559 | assert_spin_locked(&ctx->ctx_lock); | ||
560 | |||
561 | req->ki_users--; | ||
562 | BUG_ON(req->ki_users < 0); | ||
563 | if (likely(req->ki_users)) | ||
564 | return 0; | ||
565 | list_del(&req->ki_list); /* remove from active_reqs */ | ||
566 | req->ki_cancel = NULL; | ||
567 | req->ki_retry = NULL; | ||
568 | |||
569 | fput(req->ki_filp); | ||
570 | req->ki_filp = NULL; | ||
571 | really_put_req(ctx, req); | ||
572 | return 1; | ||
573 | } | ||
574 | |||
575 | /* aio_put_req | ||
576 | * Returns true if this put was the last user of the kiocb, | ||
577 | * false if the request is still in use. | ||
578 | */ | ||
579 | int aio_put_req(struct kiocb *req) | ||
580 | { | ||
581 | struct kioctx *ctx = req->ki_ctx; | ||
582 | int ret; | ||
583 | spin_lock_irq(&ctx->ctx_lock); | ||
584 | ret = __aio_put_req(ctx, req); | ||
585 | spin_unlock_irq(&ctx->ctx_lock); | ||
586 | return ret; | ||
587 | } | 553 | } |
588 | EXPORT_SYMBOL(aio_put_req); | 554 | EXPORT_SYMBOL(aio_put_req); |
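aio_put_req() is now a plain reference drop: whichever caller decrements the count to zero frees the request, with no context lock involved. The equivalent userspace idiom, for illustration only:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
        atomic_int refs;
        /* payload */
};

static void obj_put(struct obj *o)
{
        /* atomic_fetch_sub() returns the old value, so old == 1 means we
         * just dropped the last reference and must free the object. */
        if (atomic_fetch_sub(&o->refs, 1) == 1)
                free(o);
}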
589 | 555 | ||
@@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) | |||
595 | rcu_read_lock(); | 561 | rcu_read_lock(); |
596 | 562 | ||
597 | hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { | 563 | hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { |
598 | /* | 564 | if (ctx->user_id == ctx_id) { |
599 | * RCU protects us against accessing freed memory but | 565 | atomic_inc(&ctx->users); |
600 | * we have to be careful not to get a reference when the | ||
601 | * reference count already dropped to 0 (ctx->dead test | ||
602 | * is unreliable because of races). | ||
603 | */ | ||
604 | if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ | ||
605 | ret = ctx; | 566 | ret = ctx; |
606 | break; | 567 | break; |
607 | } | 568 | } |
@@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) | |||
611 | return ret; | 572 | return ret; |
612 | } | 573 | } |
613 | 574 | ||
614 | /* | ||
615 | * Queue up a kiocb to be retried. Assumes that the kiocb | ||
616 | * has already been marked as kicked, and places it on | ||
617 | * the retry run list for the corresponding ioctx, if it | ||
618 | * isn't already queued. Returns 1 if it actually queued | ||
619 | * the kiocb (to tell the caller to activate the work | ||
620 | * queue to process it), or 0, if it found that it was | ||
621 | * already queued. | ||
622 | */ | ||
623 | static inline int __queue_kicked_iocb(struct kiocb *iocb) | ||
624 | { | ||
625 | struct kioctx *ctx = iocb->ki_ctx; | ||
626 | |||
627 | assert_spin_locked(&ctx->ctx_lock); | ||
628 | |||
629 | if (list_empty(&iocb->ki_run_list)) { | ||
630 | list_add_tail(&iocb->ki_run_list, | ||
631 | &ctx->run_list); | ||
632 | return 1; | ||
633 | } | ||
634 | return 0; | ||
635 | } | ||
636 | |||
637 | /* aio_run_iocb | ||
638 | * This is the core aio execution routine. It is | ||
639 | * invoked both for initial i/o submission and | ||
640 | * subsequent retries via the aio_kick_handler. | ||
641 | * Expects to be invoked with iocb->ki_ctx->lock | ||
642 | * already held. The lock is released and reacquired | ||
643 | * as needed during processing. | ||
644 | * | ||
645 | * Calls the iocb retry method (already setup for the | ||
646 | * iocb on initial submission) for operation specific | ||
647 | * handling, but takes care of most of common retry | ||
648 | * execution details for a given iocb. The retry method | ||
649 | * needs to be non-blocking as far as possible, to avoid | ||
650 | * holding up other iocbs waiting to be serviced by the | ||
651 | * retry kernel thread. | ||
652 | * | ||
653 | * The trickier parts in this code have to do with | ||
654 | * ensuring that only one retry instance is in progress | ||
655 | * for a given iocb at any time. Providing that guarantee | ||
656 | * simplifies the coding of individual aio operations as | ||
657 | * it avoids various potential races. | ||
658 | */ | ||
659 | static ssize_t aio_run_iocb(struct kiocb *iocb) | ||
660 | { | ||
661 | struct kioctx *ctx = iocb->ki_ctx; | ||
662 | ssize_t (*retry)(struct kiocb *); | ||
663 | ssize_t ret; | ||
664 | |||
665 | if (!(retry = iocb->ki_retry)) { | ||
666 | printk("aio_run_iocb: iocb->ki_retry = NULL\n"); | ||
667 | return 0; | ||
668 | } | ||
669 | |||
670 | /* | ||
671 | * We don't want the next retry iteration for this | ||
672 | * operation to start until this one has returned and | ||
673 | * updated the iocb state. However, wait_queue functions | ||
674 | * can trigger a kick_iocb from interrupt context in the | ||
675 | * meantime, indicating that data is available for the next | ||
676 | * iteration. We want to remember that and enable the | ||
677 | * next retry iteration _after_ we are through with | ||
678 | * this one. | ||
679 | * | ||
680 | * So, in order to be able to register a "kick", but | ||
681 | * prevent it from being queued now, we clear the kick | ||
682 | * flag, but make the kick code *think* that the iocb is | ||
683 | * still on the run list until we are actually done. | ||
684 | * When we are done with this iteration, we check if | ||
685 | * the iocb was kicked in the meantime and if so, queue | ||
686 | * it up afresh. | ||
687 | */ | ||
688 | |||
689 | kiocbClearKicked(iocb); | ||
690 | |||
691 | /* | ||
692 | * This is so that aio_complete knows it doesn't need to | ||
693 | * pull the iocb off the run list (We can't just call | ||
694 | * INIT_LIST_HEAD because we don't want a kick_iocb to | ||
695 | * queue this on the run list yet) | ||
696 | */ | ||
697 | iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; | ||
698 | spin_unlock_irq(&ctx->ctx_lock); | ||
699 | |||
700 | /* Quit retrying if the i/o has been cancelled */ | ||
701 | if (kiocbIsCancelled(iocb)) { | ||
702 | ret = -EINTR; | ||
703 | aio_complete(iocb, ret, 0); | ||
704 | /* must not access the iocb after this */ | ||
705 | goto out; | ||
706 | } | ||
707 | |||
708 | /* | ||
709 | * Now we are all set to call the retry method in async | ||
710 | * context. | ||
711 | */ | ||
712 | ret = retry(iocb); | ||
713 | |||
714 | if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { | ||
715 | /* | ||
716 | * There's no easy way to restart the syscall since other AIO's | ||
717 | * may be already running. Just fail this IO with EINTR. | ||
718 | */ | ||
719 | if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || | ||
720 | ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) | ||
721 | ret = -EINTR; | ||
722 | aio_complete(iocb, ret, 0); | ||
723 | } | ||
724 | out: | ||
725 | spin_lock_irq(&ctx->ctx_lock); | ||
726 | |||
727 | if (-EIOCBRETRY == ret) { | ||
728 | /* | ||
729 | * OK, now that we are done with this iteration | ||
730 | * and know that there is more left to go, | ||
731 | * this is where we let go so that a subsequent | ||
732 | * "kick" can start the next iteration | ||
733 | */ | ||
734 | |||
735 | /* will make __queue_kicked_iocb succeed from here on */ | ||
736 | INIT_LIST_HEAD(&iocb->ki_run_list); | ||
737 | /* we must queue the next iteration ourselves, if it | ||
738 | * has already been kicked */ | ||
739 | if (kiocbIsKicked(iocb)) { | ||
740 | __queue_kicked_iocb(iocb); | ||
741 | |||
742 | /* | ||
743 | * __queue_kicked_iocb will always return 1 here, because | ||
744 | * iocb->ki_run_list is empty at this point so it should | ||
745 | * be safe to unconditionally queue the context into the | ||
746 | * work queue. | ||
747 | */ | ||
748 | aio_queue_work(ctx); | ||
749 | } | ||
750 | } | ||
751 | return ret; | ||
752 | } | ||
753 | |||
754 | /* | ||
755 | * __aio_run_iocbs: | ||
756 | * Process all pending retries queued on the ioctx | ||
757 | * run list. | ||
758 | * Assumes it is operating within the aio issuer's mm | ||
759 | * context. | ||
760 | */ | ||
761 | static int __aio_run_iocbs(struct kioctx *ctx) | ||
762 | { | ||
763 | struct kiocb *iocb; | ||
764 | struct list_head run_list; | ||
765 | |||
766 | assert_spin_locked(&ctx->ctx_lock); | ||
767 | |||
768 | list_replace_init(&ctx->run_list, &run_list); | ||
769 | while (!list_empty(&run_list)) { | ||
770 | iocb = list_entry(run_list.next, struct kiocb, | ||
771 | ki_run_list); | ||
772 | list_del(&iocb->ki_run_list); | ||
773 | /* | ||
774 | * Hold an extra reference while retrying i/o. | ||
775 | */ | ||
776 | iocb->ki_users++; /* grab extra reference */ | ||
777 | aio_run_iocb(iocb); | ||
778 | __aio_put_req(ctx, iocb); | ||
779 | } | ||
780 | if (!list_empty(&ctx->run_list)) | ||
781 | return 1; | ||
782 | return 0; | ||
783 | } | ||
784 | |||
785 | static void aio_queue_work(struct kioctx * ctx) | ||
786 | { | ||
787 | unsigned long timeout; | ||
788 | /* | ||
789 | * if someone is waiting, get the work started right | ||
790 | * away, otherwise, use a longer delay | ||
791 | */ | ||
792 | smp_mb(); | ||
793 | if (waitqueue_active(&ctx->wait)) | ||
794 | timeout = 1; | ||
795 | else | ||
796 | timeout = HZ/10; | ||
797 | queue_delayed_work(aio_wq, &ctx->wq, timeout); | ||
798 | } | ||
799 | |||
800 | /* | ||
801 | * aio_run_all_iocbs: | ||
802 | * Process all pending retries queued on the ioctx | ||
803 | * run list, and keep running them until the list | ||
804 | * stays empty. | ||
805 | * Assumes it is operating within the aio issuer's mm context. | ||
806 | */ | ||
807 | static inline void aio_run_all_iocbs(struct kioctx *ctx) | ||
808 | { | ||
809 | spin_lock_irq(&ctx->ctx_lock); | ||
810 | while (__aio_run_iocbs(ctx)) | ||
811 | ; | ||
812 | spin_unlock_irq(&ctx->ctx_lock); | ||
813 | } | ||
814 | |||
815 | /* | ||
816 | * aio_kick_handler: | ||
817 | * Work queue handler triggered to process pending | ||
818 | * retries on an ioctx. Takes on the aio issuer's | ||
819 | * mm context before running the iocbs, so that | ||
820 | * copy_xxx_user operates on the issuer's address | ||
821 | * space. | ||
822 | * Run on aiod's context. | ||
823 | */ | ||
824 | static void aio_kick_handler(struct work_struct *work) | ||
825 | { | ||
826 | struct kioctx *ctx = container_of(work, struct kioctx, wq.work); | ||
827 | mm_segment_t oldfs = get_fs(); | ||
828 | struct mm_struct *mm; | ||
829 | int requeue; | ||
830 | |||
831 | set_fs(USER_DS); | ||
832 | use_mm(ctx->mm); | ||
833 | spin_lock_irq(&ctx->ctx_lock); | ||
834 | requeue =__aio_run_iocbs(ctx); | ||
835 | mm = ctx->mm; | ||
836 | spin_unlock_irq(&ctx->ctx_lock); | ||
837 | unuse_mm(mm); | ||
838 | set_fs(oldfs); | ||
839 | /* | ||
840 | * we're in a worker thread already; no point using non-zero delay | ||
841 | */ | ||
842 | if (requeue) | ||
843 | queue_delayed_work(aio_wq, &ctx->wq, 0); | ||
844 | } | ||
845 | |||
846 | |||
847 | /* | ||
848 | * Called by kick_iocb to queue the kiocb for retry | ||
849 | * and if required activate the aio work queue to process | ||
850 | * it | ||
851 | */ | ||
852 | static void try_queue_kicked_iocb(struct kiocb *iocb) | ||
853 | { | ||
854 | struct kioctx *ctx = iocb->ki_ctx; | ||
855 | unsigned long flags; | ||
856 | int run = 0; | ||
857 | |||
858 | spin_lock_irqsave(&ctx->ctx_lock, flags); | ||
859 | /* set this inside the lock so that we can't race with aio_run_iocb() | ||
860 | * testing it and putting the iocb on the run list under the lock */ | ||
861 | if (!kiocbTryKick(iocb)) | ||
862 | run = __queue_kicked_iocb(iocb); | ||
863 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | ||
864 | if (run) | ||
865 | aio_queue_work(ctx); | ||
866 | } | ||
867 | |||
868 | /* | ||
869 | * kick_iocb: | ||
870 | * Called typically from a wait queue callback context | ||
871 | * to trigger a retry of the iocb. | ||
872 | * The retry is usually executed by aio workqueue | ||
873 | * threads (See aio_kick_handler). | ||
874 | */ | ||
875 | void kick_iocb(struct kiocb *iocb) | ||
876 | { | ||
877 | /* sync iocbs are easy: they can only ever be executing from a | ||
878 | * single context. */ | ||
879 | if (is_sync_kiocb(iocb)) { | ||
880 | kiocbSetKicked(iocb); | ||
881 | wake_up_process(iocb->ki_obj.tsk); | ||
882 | return; | ||
883 | } | ||
884 | |||
885 | try_queue_kicked_iocb(iocb); | ||
886 | } | ||
887 | EXPORT_SYMBOL(kick_iocb); | ||
888 | |||
889 | /* aio_complete | 575 | /* aio_complete |
890 | * Called when the io request on the given iocb is complete. | 576 | * Called when the io request on the given iocb is complete. |
891 | * Returns true if this is the last user of the request. The | ||
892 | * only other user of the request can be the cancellation code. | ||
893 | */ | 577 | */ |
894 | int aio_complete(struct kiocb *iocb, long res, long res2) | 578 | void aio_complete(struct kiocb *iocb, long res, long res2) |
895 | { | 579 | { |
896 | struct kioctx *ctx = iocb->ki_ctx; | 580 | struct kioctx *ctx = iocb->ki_ctx; |
897 | struct aio_ring_info *info; | ||
898 | struct aio_ring *ring; | 581 | struct aio_ring *ring; |
899 | struct io_event *event; | 582 | struct io_event *ev_page, *event; |
900 | unsigned long flags; | 583 | unsigned long flags; |
901 | unsigned long tail; | 584 | unsigned tail, pos; |
902 | int ret; | ||
903 | 585 | ||
904 | /* | 586 | /* |
905 | * Special case handling for sync iocbs: | 587 | * Special case handling for sync iocbs: |
@@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2) | |||
909 | * - the sync task helpfully left a reference to itself in the iocb | 591 | * - the sync task helpfully left a reference to itself in the iocb |
910 | */ | 592 | */ |
911 | if (is_sync_kiocb(iocb)) { | 593 | if (is_sync_kiocb(iocb)) { |
912 | BUG_ON(iocb->ki_users != 1); | 594 | BUG_ON(atomic_read(&iocb->ki_users) != 1); |
913 | iocb->ki_user_data = res; | 595 | iocb->ki_user_data = res; |
914 | iocb->ki_users = 0; | 596 | atomic_set(&iocb->ki_users, 0); |
915 | wake_up_process(iocb->ki_obj.tsk); | 597 | wake_up_process(iocb->ki_obj.tsk); |
916 | return 1; | 598 | return; |
917 | } | 599 | } |
918 | 600 | ||
919 | info = &ctx->ring_info; | 601 | /* |
920 | 602 | * Take rcu_read_lock() in case the kioctx is being destroyed, as we | |
921 | /* add a completion event to the ring buffer. | 603 | * need to issue a wakeup after decrementing reqs_active. |
922 | * must be done holding ctx->ctx_lock to prevent | ||
923 | * other code from messing with the tail | ||
924 | * pointer since we might be called from irq | ||
925 | * context. | ||
926 | */ | 604 | */ |
927 | spin_lock_irqsave(&ctx->ctx_lock, flags); | 605 | rcu_read_lock(); |
928 | 606 | ||
929 | if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) | 607 | if (iocb->ki_list.next) { |
930 | list_del_init(&iocb->ki_run_list); | 608 | unsigned long flags; |
609 | |||
610 | spin_lock_irqsave(&ctx->ctx_lock, flags); | ||
611 | list_del(&iocb->ki_list); | ||
612 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | ||
613 | } | ||
931 | 614 | ||
932 | /* | 615 | /* |
933 | * cancelled requests don't get events, userland was given one | 616 | * cancelled requests don't get events, userland was given one |
934 | * when the event got cancelled. | 617 | * when the event got cancelled. |
935 | */ | 618 | */ |
936 | if (kiocbIsCancelled(iocb)) | 619 | if (unlikely(xchg(&iocb->ki_cancel, |
620 | KIOCB_CANCELLED) == KIOCB_CANCELLED)) { | ||
621 | atomic_dec(&ctx->reqs_active); | ||
622 | /* Still need the wake_up in case free_ioctx is waiting */ | ||
937 | goto put_rq; | 623 | goto put_rq; |
624 | } | ||
938 | 625 | ||
939 | ring = kmap_atomic(info->ring_pages[0]); | 626 | /* |
627 | * Add a completion event to the ring buffer. Must be done holding | ||
628 | * ctx->ctx_lock to prevent other code from messing with the tail | ||
629 | * pointer since we might be called from irq context. | ||
630 | */ | ||
631 | spin_lock_irqsave(&ctx->completion_lock, flags); | ||
940 | 632 | ||
941 | tail = info->tail; | 633 | tail = ctx->tail; |
942 | event = aio_ring_event(info, tail); | 634 | pos = tail + AIO_EVENTS_OFFSET; |
943 | if (++tail >= info->nr) | 635 | |
636 | if (++tail >= ctx->nr_events) | ||
944 | tail = 0; | 637 | tail = 0; |
945 | 638 | ||
639 | ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); | ||
640 | event = ev_page + pos % AIO_EVENTS_PER_PAGE; | ||
641 | |||
946 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; | 642 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; |
947 | event->data = iocb->ki_user_data; | 643 | event->data = iocb->ki_user_data; |
948 | event->res = res; | 644 | event->res = res; |
949 | event->res2 = res2; | 645 | event->res2 = res2; |
950 | 646 | ||
951 | dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", | 647 | kunmap_atomic(ev_page); |
952 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, | 648 | flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); |
953 | res, res2); | 649 | |
650 | pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", | ||
651 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, | ||
652 | res, res2); | ||
954 | 653 | ||
955 | /* after flagging the request as done, we | 654 | /* after flagging the request as done, we |
956 | * must never even look at it again | 655 | * must never even look at it again |
957 | */ | 656 | */ |
958 | smp_wmb(); /* make event visible before updating tail */ | 657 | smp_wmb(); /* make event visible before updating tail */ |
959 | 658 | ||
960 | info->tail = tail; | 659 | ctx->tail = tail; |
961 | ring->tail = tail; | ||
962 | 660 | ||
963 | put_aio_ring_event(event); | 661 | ring = kmap_atomic(ctx->ring_pages[0]); |
662 | ring->tail = tail; | ||
964 | kunmap_atomic(ring); | 663 | kunmap_atomic(ring); |
664 | flush_dcache_page(ctx->ring_pages[0]); | ||
665 | |||
666 | spin_unlock_irqrestore(&ctx->completion_lock, flags); | ||
965 | 667 | ||
966 | pr_debug("added to ring %p at [%lu]\n", iocb, tail); | 668 | pr_debug("added to ring %p at [%u]\n", iocb, tail); |
967 | 669 | ||
968 | /* | 670 | /* |
969 | * Check if the user asked us to deliver the result through an | 671 | * Check if the user asked us to deliver the result through an |
@@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) | |||
975 | 677 | ||
976 | put_rq: | 678 | put_rq: |
977 | /* everything turned out well, dispose of the aiocb. */ | 679 | /* everything turned out well, dispose of the aiocb. */ |
978 | ret = __aio_put_req(ctx, iocb); | 680 | aio_put_req(iocb); |
979 | 681 | ||
980 | /* | 682 | /* |
981 | * We have to order our ring_info tail store above and test | 683 | * We have to order our ring_info tail store above and test |
@@ -988,233 +690,133 @@ put_rq: | |||
988 | if (waitqueue_active(&ctx->wait)) | 690 | if (waitqueue_active(&ctx->wait)) |
989 | wake_up(&ctx->wait); | 691 | wake_up(&ctx->wait); |
990 | 692 | ||
991 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | 693 | rcu_read_unlock(); |
992 | return ret; | ||
993 | } | 694 | } |
994 | EXPORT_SYMBOL(aio_complete); | 695 | EXPORT_SYMBOL(aio_complete); |
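The completion path above indexes the event ring per page rather than through one flat mapping: the slot number is offset past the aio_ring header and then split into a page index and an in-page slot. A small illustrative calculation, assuming 4096-byte pages and 32-byte io_events (the real constants come from PAGE_SIZE, sizeof(struct io_event) and the ring header size, not from these defines):

#include <stdio.h>

#define EVENTS_PER_PAGE (4096 / 32)     /* assumed page and event sizes */
#define EVENTS_OFFSET   4               /* assumed slots taken by the header */

static void locate_event(unsigned tail, unsigned *page, unsigned *slot)
{
        unsigned pos = tail + EVENTS_OFFSET;    /* skip the ring header */

        *page = pos / EVENTS_PER_PAGE;
        *slot = pos % EVENTS_PER_PAGE;
}

int main(void)
{
        unsigned page, slot;

        locate_event(200, &page, &slot);
        printf("event 200 -> page %u, slot %u\n", page, slot);
        return 0;
}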
995 | 696 | ||
996 | /* aio_read_evt | 697 | /* aio_read_events |
997 | * Pull an event off of the ioctx's event ring. Returns the number of | 698 | * Pull an event off of the ioctx's event ring. Returns the number of |
998 | * events fetched (0 or 1 ;-) | 699 | * events fetched |
999 | * FIXME: make this use cmpxchg. | ||
1000 | * TODO: make the ringbuffer user mmap()able (requires FIXME). | ||
1001 | */ | 700 | */ |
1002 | static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) | 701 | static long aio_read_events_ring(struct kioctx *ctx, |
702 | struct io_event __user *event, long nr) | ||
1003 | { | 703 | { |
1004 | struct aio_ring_info *info = &ioctx->ring_info; | ||
1005 | struct aio_ring *ring; | 704 | struct aio_ring *ring; |
1006 | unsigned long head; | 705 | unsigned head, pos; |
1007 | int ret = 0; | 706 | long ret = 0; |
1008 | 707 | int copy_ret; | |
1009 | ring = kmap_atomic(info->ring_pages[0]); | ||
1010 | dprintk("in aio_read_evt h%lu t%lu m%lu\n", | ||
1011 | (unsigned long)ring->head, (unsigned long)ring->tail, | ||
1012 | (unsigned long)ring->nr); | ||
1013 | |||
1014 | if (ring->head == ring->tail) | ||
1015 | goto out; | ||
1016 | 708 | ||
1017 | spin_lock(&info->ring_lock); | 709 | mutex_lock(&ctx->ring_lock); |
1018 | |||
1019 | head = ring->head % info->nr; | ||
1020 | if (head != ring->tail) { | ||
1021 | struct io_event *evp = aio_ring_event(info, head); | ||
1022 | *ent = *evp; | ||
1023 | head = (head + 1) % info->nr; | ||
1024 | smp_mb(); /* finish reading the event before updating the head */ | ||
1025 | ring->head = head; | ||
1026 | ret = 1; | ||
1027 | put_aio_ring_event(evp); | ||
1028 | } | ||
1029 | spin_unlock(&info->ring_lock); | ||
1030 | 710 | ||
1031 | out: | 711 | ring = kmap_atomic(ctx->ring_pages[0]); |
1032 | dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, | 712 | head = ring->head; |
1033 | (unsigned long)ring->head, (unsigned long)ring->tail); | ||
1034 | kunmap_atomic(ring); | 713 | kunmap_atomic(ring); |
1035 | return ret; | ||
1036 | } | ||
1037 | 714 | ||
1038 | struct aio_timeout { | 715 | pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); |
1039 | struct timer_list timer; | ||
1040 | int timed_out; | ||
1041 | struct task_struct *p; | ||
1042 | }; | ||
1043 | 716 | ||
1044 | static void timeout_func(unsigned long data) | 717 | if (head == ctx->tail) |
1045 | { | 718 | goto out; |
1046 | struct aio_timeout *to = (struct aio_timeout *)data; | ||
1047 | 719 | ||
1048 | to->timed_out = 1; | 720 | while (ret < nr) { |
1049 | wake_up_process(to->p); | 721 | long avail; |
1050 | } | 722 | struct io_event *ev; |
723 | struct page *page; | ||
1051 | 724 | ||
1052 | static inline void init_timeout(struct aio_timeout *to) | 725 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; |
1053 | { | 726 | if (head == ctx->tail) |
1054 | setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); | 727 | break; |
1055 | to->timed_out = 0; | ||
1056 | to->p = current; | ||
1057 | } | ||
1058 | 728 | ||
1059 | static inline void set_timeout(long start_jiffies, struct aio_timeout *to, | 729 | avail = min(avail, nr - ret); |
1060 | const struct timespec *ts) | 730 | avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - |
1061 | { | 731 | ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); |
1062 | to->timer.expires = start_jiffies + timespec_to_jiffies(ts); | ||
1063 | if (time_after(to->timer.expires, jiffies)) | ||
1064 | add_timer(&to->timer); | ||
1065 | else | ||
1066 | to->timed_out = 1; | ||
1067 | } | ||
1068 | 732 | ||
1069 | static inline void clear_timeout(struct aio_timeout *to) | 733 | pos = head + AIO_EVENTS_OFFSET; |
1070 | { | 734 | page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; |
1071 | del_singleshot_timer_sync(&to->timer); | 735 | pos %= AIO_EVENTS_PER_PAGE; |
1072 | } | ||
1073 | 736 | ||
1074 | static int read_events(struct kioctx *ctx, | 737 | ev = kmap(page); |
1075 | long min_nr, long nr, | 738 | copy_ret = copy_to_user(event + ret, ev + pos, |
1076 | struct io_event __user *event, | 739 | sizeof(*ev) * avail); |
1077 | struct timespec __user *timeout) | 740 | kunmap(page); |
1078 | { | ||
1079 | long start_jiffies = jiffies; | ||
1080 | struct task_struct *tsk = current; | ||
1081 | DECLARE_WAITQUEUE(wait, tsk); | ||
1082 | int ret; | ||
1083 | int i = 0; | ||
1084 | struct io_event ent; | ||
1085 | struct aio_timeout to; | ||
1086 | int retry = 0; | ||
1087 | |||
1088 | /* needed to zero any padding within an entry (there shouldn't be | ||
1089 | * any, but C is fun! | ||
1090 | */ | ||
1091 | memset(&ent, 0, sizeof(ent)); | ||
1092 | retry: | ||
1093 | ret = 0; | ||
1094 | while (likely(i < nr)) { | ||
1095 | ret = aio_read_evt(ctx, &ent); | ||
1096 | if (unlikely(ret <= 0)) | ||
1097 | break; | ||
1098 | |||
1099 | dprintk("read event: %Lx %Lx %Lx %Lx\n", | ||
1100 | ent.data, ent.obj, ent.res, ent.res2); | ||
1101 | 741 | ||
1102 | /* Could we split the check in two? */ | 742 | if (unlikely(copy_ret)) { |
1103 | ret = -EFAULT; | 743 | ret = -EFAULT; |
1104 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { | 744 | goto out; |
1105 | dprintk("aio: lost an event due to EFAULT.\n"); | ||
1106 | break; | ||
1107 | } | 745 | } |
1108 | ret = 0; | ||
1109 | 746 | ||
1110 | /* Good, event copied to userland, update counts. */ | 747 | ret += avail; |
1111 | event ++; | 748 | head += avail; |
1112 | i ++; | 749 | head %= ctx->nr_events; |
1113 | } | 750 | } |
1114 | 751 | ||
1115 | if (min_nr <= i) | 752 | ring = kmap_atomic(ctx->ring_pages[0]); |
1116 | return i; | 753 | ring->head = head; |
1117 | if (ret) | 754 | kunmap_atomic(ring); |
1118 | return ret; | 755 | flush_dcache_page(ctx->ring_pages[0]); |
1119 | 756 | ||
1120 | /* End fast path */ | 757 | pr_debug("%li h%u t%u\n", ret, head, ctx->tail); |
1121 | 758 | ||
1122 | /* racey check, but it gets redone */ | 759 | atomic_sub(ret, &ctx->reqs_active); |
1123 | if (!retry && unlikely(!list_empty(&ctx->run_list))) { | 760 | out: |
1124 | retry = 1; | 761 | mutex_unlock(&ctx->ring_lock); |
1125 | aio_run_all_iocbs(ctx); | ||
1126 | goto retry; | ||
1127 | } | ||
1128 | 762 | ||
1129 | init_timeout(&to); | 763 | return ret; |
1130 | if (timeout) { | 764 | } |
1131 | struct timespec ts; | ||
1132 | ret = -EFAULT; | ||
1133 | if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) | ||
1134 | goto out; | ||
1135 | 765 | ||
1136 | set_timeout(start_jiffies, &to, &ts); | 766 | static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, |
1137 | } | 767 | struct io_event __user *event, long *i) |
768 | { | ||
769 | long ret = aio_read_events_ring(ctx, event + *i, nr - *i); | ||
1138 | 770 | ||
1139 | while (likely(i < nr)) { | 771 | if (ret > 0) |
1140 | add_wait_queue_exclusive(&ctx->wait, &wait); | 772 | *i += ret; |
1141 | do { | ||
1142 | set_task_state(tsk, TASK_INTERRUPTIBLE); | ||
1143 | ret = aio_read_evt(ctx, &ent); | ||
1144 | if (ret) | ||
1145 | break; | ||
1146 | if (min_nr <= i) | ||
1147 | break; | ||
1148 | if (unlikely(ctx->dead)) { | ||
1149 | ret = -EINVAL; | ||
1150 | break; | ||
1151 | } | ||
1152 | if (to.timed_out) /* Only check after read evt */ | ||
1153 | break; | ||
1154 | /* Try to only show up in io wait if there are ops | ||
1155 | * in flight */ | ||
1156 | if (ctx->reqs_active) | ||
1157 | io_schedule(); | ||
1158 | else | ||
1159 | schedule(); | ||
1160 | if (signal_pending(tsk)) { | ||
1161 | ret = -EINTR; | ||
1162 | break; | ||
1163 | } | ||
1164 | /*ret = aio_read_evt(ctx, &ent);*/ | ||
1165 | } while (1) ; | ||
1166 | |||
1167 | set_task_state(tsk, TASK_RUNNING); | ||
1168 | remove_wait_queue(&ctx->wait, &wait); | ||
1169 | |||
1170 | if (unlikely(ret <= 0)) | ||
1171 | break; | ||
1172 | 773 | ||
1173 | ret = -EFAULT; | 774 | if (unlikely(atomic_read(&ctx->dead))) |
1174 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { | 775 | ret = -EINVAL; |
1175 | dprintk("aio: lost an event due to EFAULT.\n"); | ||
1176 | break; | ||
1177 | } | ||
1178 | 776 | ||
1179 | /* Good, event copied to userland, update counts. */ | 777 | if (!*i) |
1180 | event ++; | 778 | *i = ret; |
1181 | i ++; | ||
1182 | } | ||
1183 | 779 | ||
1184 | if (timeout) | 780 | return ret < 0 || *i >= min_nr; |
1185 | clear_timeout(&to); | ||
1186 | out: | ||
1187 | destroy_timer_on_stack(&to.timer); | ||
1188 | return i ? i : ret; | ||
1189 | } | 781 | } |
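Worth spelling out from the aio_read_events_ring() copy loop above: each pass copies at most the run of events that is contiguous in both the ring (no wrap past nr_events) and the current ring page. A self-contained sketch of that clamping arithmetic, with illustrative parameter names:

/* How many events can be copied in one contiguous chunk, given the ring
 * head/tail, the ring size 'nr', the caller's remaining budget 'wanted',
 * the number of events per page, and the header offset of slot 0. */
static long contiguous_events(unsigned head, unsigned tail, unsigned nr,
                              long wanted, long per_page, unsigned offset)
{
        long avail = (head <= tail ? tail : nr) - head; /* stop at the wrap */

        if (avail > wanted)
                avail = wanted;
        if (avail > per_page - ((head + offset) % per_page))
                avail = per_page - ((head + offset) % per_page);

        return avail;
}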
1190 | 782 | ||
1191 | /* Take an ioctx and remove it from the list of ioctx's. Protects | 783 | static long read_events(struct kioctx *ctx, long min_nr, long nr, |
1192 | * against races with itself via ->dead. | 784 | struct io_event __user *event, |
1193 | */ | 785 | struct timespec __user *timeout) |
1194 | static void io_destroy(struct kioctx *ioctx) | ||
1195 | { | 786 | { |
1196 | struct mm_struct *mm = current->mm; | 787 | ktime_t until = { .tv64 = KTIME_MAX }; |
1197 | int was_dead; | 788 | long ret = 0; |
1198 | 789 | ||
1199 | /* delete the entry from the list if someone else hasn't already */ | 790 | if (timeout) { |
1200 | spin_lock(&mm->ioctx_lock); | 791 | struct timespec ts; |
1201 | was_dead = ioctx->dead; | ||
1202 | ioctx->dead = 1; | ||
1203 | hlist_del_rcu(&ioctx->list); | ||
1204 | spin_unlock(&mm->ioctx_lock); | ||
1205 | 792 | ||
1206 | dprintk("aio_release(%p)\n", ioctx); | 793 | if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) |
1207 | if (likely(!was_dead)) | 794 | return -EFAULT; |
1208 | put_ioctx(ioctx); /* twice for the list */ | ||
1209 | 795 | ||
1210 | kill_ctx(ioctx); | 796 | until = timespec_to_ktime(ts); |
797 | } | ||
1211 | 798 | ||
1212 | /* | 799 | /* |
1213 | * Wake up any waiters. The setting of ctx->dead must be seen | 800 | * Note that aio_read_events() is being called as the conditional - i.e. |
1214 | * by other CPUs at this point. Right now, we rely on the | 801 | * we're calling it after prepare_to_wait() has set task state to |
1215 | * locking done by the above calls to ensure this consistency. | 802 | * TASK_INTERRUPTIBLE. |
803 | * | ||
804 | * But aio_read_events() can block, and if it blocks it's going to flip | ||
805 | * the task state back to TASK_RUNNING. | ||
806 | * | ||
807 | * This should be ok, provided it doesn't flip the state back to | ||
808 | * TASK_RUNNING and return 0 too much - that causes us to spin. That | ||
809 | * will only happen if the mutex_lock() call blocks, and we then find | ||
810 | * the ringbuffer empty. So in practice we should be ok, but it's | ||
811 | * something to be aware of when touching this code. | ||
1216 | */ | 812 | */ |
1217 | wake_up_all(&ioctx->wait); | 813 | wait_event_interruptible_hrtimeout(ctx->wait, |
814 | aio_read_events(ctx, min_nr, nr, event, &ret), until); | ||
815 | |||
816 | if (!ret && signal_pending(current)) | ||
817 | ret = -EINTR; | ||
818 | |||
819 | return ret; | ||
1218 | } | 820 | } |
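read_events() now leans on wait_event_interruptible_hrtimeout(), with the event-consuming predicate evaluated as the wait condition, exactly as the comment above warns. As a loose userspace analogue of "sleep until the predicate succeeds or a deadline passes" (pthreads, purely illustrative and not the kernel mechanism):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

/* Sleep until consume_enough() reports success or 'deadline' passes.
 * The predicate may itself consume events, mirroring aio_read_events(). */
static bool wait_for_events(pthread_mutex_t *lock, pthread_cond_t *cond,
                            bool (*consume_enough)(void *), void *arg,
                            const struct timespec *deadline)
{
        bool done;

        pthread_mutex_lock(lock);
        while (!(done = consume_enough(arg))) {
                if (pthread_cond_timedwait(cond, lock, deadline) == ETIMEDOUT)
                        break;
        }
        pthread_mutex_unlock(lock);
        return done;
}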
1219 | 821 | ||
1220 | /* sys_io_setup: | 822 | /* sys_io_setup: |
@@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) | |||
1252 | if (!IS_ERR(ioctx)) { | 854 | if (!IS_ERR(ioctx)) { |
1253 | ret = put_user(ioctx->user_id, ctxp); | 855 | ret = put_user(ioctx->user_id, ctxp); |
1254 | if (ret) | 856 | if (ret) |
1255 | io_destroy(ioctx); | 857 | kill_ioctx(ioctx); |
1256 | put_ioctx(ioctx); | 858 | put_ioctx(ioctx); |
1257 | } | 859 | } |
1258 | 860 | ||
@@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) | |||
1270 | { | 872 | { |
1271 | struct kioctx *ioctx = lookup_ioctx(ctx); | 873 | struct kioctx *ioctx = lookup_ioctx(ctx); |
1272 | if (likely(NULL != ioctx)) { | 874 | if (likely(NULL != ioctx)) { |
1273 | io_destroy(ioctx); | 875 | kill_ioctx(ioctx); |
1274 | put_ioctx(ioctx); | 876 | put_ioctx(ioctx); |
1275 | return 0; | 877 | return 0; |
1276 | } | 878 | } |
@@ -1301,30 +903,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) | |||
1301 | BUG_ON(ret > 0 && iocb->ki_left == 0); | 903 | BUG_ON(ret > 0 && iocb->ki_left == 0); |
1302 | } | 904 | } |
1303 | 905 | ||
1304 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | 906 | typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, |
907 | unsigned long, loff_t); | ||
908 | |||
909 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) | ||
1305 | { | 910 | { |
1306 | struct file *file = iocb->ki_filp; | 911 | struct file *file = iocb->ki_filp; |
1307 | struct address_space *mapping = file->f_mapping; | 912 | struct address_space *mapping = file->f_mapping; |
1308 | struct inode *inode = mapping->host; | 913 | struct inode *inode = mapping->host; |
1309 | ssize_t (*rw_op)(struct kiocb *, const struct iovec *, | ||
1310 | unsigned long, loff_t); | ||
1311 | ssize_t ret = 0; | 914 | ssize_t ret = 0; |
1312 | unsigned short opcode; | ||
1313 | |||
1314 | if ((iocb->ki_opcode == IOCB_CMD_PREADV) || | ||
1315 | (iocb->ki_opcode == IOCB_CMD_PREAD)) { | ||
1316 | rw_op = file->f_op->aio_read; | ||
1317 | opcode = IOCB_CMD_PREADV; | ||
1318 | } else { | ||
1319 | rw_op = file->f_op->aio_write; | ||
1320 | opcode = IOCB_CMD_PWRITEV; | ||
1321 | } | ||
1322 | 915 | ||
1323 | /* This matches the pread()/pwrite() logic */ | 916 | /* This matches the pread()/pwrite() logic */ |
1324 | if (iocb->ki_pos < 0) | 917 | if (iocb->ki_pos < 0) |
1325 | return -EINVAL; | 918 | return -EINVAL; |
1326 | 919 | ||
1327 | if (opcode == IOCB_CMD_PWRITEV) | 920 | if (rw == WRITE) |
1328 | file_start_write(file); | 921 | file_start_write(file); |
1329 | do { | 922 | do { |
1330 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], | 923 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], |
@@ -1336,9 +929,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | |||
1336 | /* retry all partial writes. retry partial reads as long as it's a | 929 | /* retry all partial writes. retry partial reads as long as it's a |
1337 | * regular file. */ | 930 | * regular file. */ |
1338 | } while (ret > 0 && iocb->ki_left > 0 && | 931 | } while (ret > 0 && iocb->ki_left > 0 && |
1339 | (opcode == IOCB_CMD_PWRITEV || | 932 | (rw == WRITE || |
1340 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); | 933 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); |
1341 | if (opcode == IOCB_CMD_PWRITEV) | 934 | if (rw == WRITE) |
1342 | file_end_write(file); | 935 | file_end_write(file); |
1343 | 936 | ||
1344 | /* This means we must have transferred all that we could */ | 937 | /* This means we must have transferred all that we could */ |
@@ -1348,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | |||
1348 | 941 | ||
1349 | /* If we managed to write some out we return that, rather than | 942 | /* If we managed to write some out we return that, rather than |
1350 | * the eventual error. */ | 943 | * the eventual error. */ |
1351 | if (opcode == IOCB_CMD_PWRITEV | 944 | if (rw == WRITE |
1352 | && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY | 945 | && ret < 0 && ret != -EIOCBQUEUED |
1353 | && iocb->ki_nbytes - iocb->ki_left) | 946 | && iocb->ki_nbytes - iocb->ki_left) |
1354 | ret = iocb->ki_nbytes - iocb->ki_left; | 947 | ret = iocb->ki_nbytes - iocb->ki_left; |
1355 | 948 | ||
1356 | return ret; | 949 | return ret; |
1357 | } | 950 | } |
1358 | 951 | ||
1359 | static ssize_t aio_fdsync(struct kiocb *iocb) | 952 | static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) |
1360 | { | ||
1361 | struct file *file = iocb->ki_filp; | ||
1362 | ssize_t ret = -EINVAL; | ||
1363 | |||
1364 | if (file->f_op->aio_fsync) | ||
1365 | ret = file->f_op->aio_fsync(iocb, 1); | ||
1366 | return ret; | ||
1367 | } | ||
1368 | |||
1369 | static ssize_t aio_fsync(struct kiocb *iocb) | ||
1370 | { | ||
1371 | struct file *file = iocb->ki_filp; | ||
1372 | ssize_t ret = -EINVAL; | ||
1373 | |||
1374 | if (file->f_op->aio_fsync) | ||
1375 | ret = file->f_op->aio_fsync(iocb, 0); | ||
1376 | return ret; | ||
1377 | } | ||
1378 | |||
1379 | static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) | ||
1380 | { | 953 | { |
1381 | ssize_t ret; | 954 | ssize_t ret; |
1382 | 955 | ||
956 | kiocb->ki_nr_segs = kiocb->ki_nbytes; | ||
957 | |||
1383 | #ifdef CONFIG_COMPAT | 958 | #ifdef CONFIG_COMPAT |
1384 | if (compat) | 959 | if (compat) |
1385 | ret = compat_rw_copy_check_uvector(type, | 960 | ret = compat_rw_copy_check_uvector(rw, |
1386 | (struct compat_iovec __user *)kiocb->ki_buf, | 961 | (struct compat_iovec __user *)kiocb->ki_buf, |
1387 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, | 962 | kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, |
1388 | &kiocb->ki_iovec); | 963 | &kiocb->ki_iovec); |
1389 | else | 964 | else |
1390 | #endif | 965 | #endif |
1391 | ret = rw_copy_check_uvector(type, | 966 | ret = rw_copy_check_uvector(rw, |
1392 | (struct iovec __user *)kiocb->ki_buf, | 967 | (struct iovec __user *)kiocb->ki_buf, |
1393 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, | 968 | kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, |
1394 | &kiocb->ki_iovec); | 969 | &kiocb->ki_iovec); |
1395 | if (ret < 0) | 970 | if (ret < 0) |
1396 | goto out; | 971 | return ret; |
1397 | |||
1398 | ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); | ||
1399 | if (ret < 0) | ||
1400 | goto out; | ||
1401 | 972 | ||
1402 | kiocb->ki_nr_segs = kiocb->ki_nbytes; | 973 | /* ki_nbytes now reflect bytes instead of segs */ |
1403 | kiocb->ki_cur_seg = 0; | ||
1404 | /* ki_nbytes/left now reflect bytes instead of segs */ | ||
1405 | kiocb->ki_nbytes = ret; | 974 | kiocb->ki_nbytes = ret; |
1406 | kiocb->ki_left = ret; | 975 | return 0; |
1407 | |||
1408 | ret = 0; | ||
1409 | out: | ||
1410 | return ret; | ||
1411 | } | 976 | } |
1412 | 977 | ||
1413 | static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) | 978 | static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) |
1414 | { | 979 | { |
1415 | int bytes; | 980 | if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) |
1416 | 981 | return -EFAULT; | |
1417 | bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); | ||
1418 | if (bytes < 0) | ||
1419 | return bytes; | ||
1420 | 982 | ||
1421 | kiocb->ki_iovec = &kiocb->ki_inline_vec; | 983 | kiocb->ki_iovec = &kiocb->ki_inline_vec; |
1422 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; | 984 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; |
1423 | kiocb->ki_iovec->iov_len = bytes; | 985 | kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; |
1424 | kiocb->ki_nr_segs = 1; | 986 | kiocb->ki_nr_segs = 1; |
1425 | kiocb->ki_cur_seg = 0; | ||
1426 | return 0; | 987 | return 0; |
1427 | } | 988 | } |
1428 | 989 | ||
@@ -1431,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc | |||
1431 | * Performs the initial checks and aio retry method | 992 | * Performs the initial checks and aio retry method |
1432 | * setup for the kiocb at the time of io submission. | 993 | * setup for the kiocb at the time of io submission. |
1433 | */ | 994 | */ |
1434 | static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) | 995 | static ssize_t aio_run_iocb(struct kiocb *req, bool compat) |
1435 | { | 996 | { |
1436 | struct file *file = kiocb->ki_filp; | 997 | struct file *file = req->ki_filp; |
1437 | ssize_t ret = 0; | 998 | ssize_t ret; |
999 | int rw; | ||
1000 | fmode_t mode; | ||
1001 | aio_rw_op *rw_op; | ||
1438 | 1002 | ||
1439 | switch (kiocb->ki_opcode) { | 1003 | switch (req->ki_opcode) { |
1440 | case IOCB_CMD_PREAD: | 1004 | case IOCB_CMD_PREAD: |
1441 | ret = -EBADF; | ||
1442 | if (unlikely(!(file->f_mode & FMODE_READ))) | ||
1443 | break; | ||
1444 | ret = -EFAULT; | ||
1445 | if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, | ||
1446 | kiocb->ki_left))) | ||
1447 | break; | ||
1448 | ret = aio_setup_single_vector(READ, file, kiocb); | ||
1449 | if (ret) | ||
1450 | break; | ||
1451 | ret = -EINVAL; | ||
1452 | if (file->f_op->aio_read) | ||
1453 | kiocb->ki_retry = aio_rw_vect_retry; | ||
1454 | break; | ||
1455 | case IOCB_CMD_PWRITE: | ||
1456 | ret = -EBADF; | ||
1457 | if (unlikely(!(file->f_mode & FMODE_WRITE))) | ||
1458 | break; | ||
1459 | ret = -EFAULT; | ||
1460 | if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, | ||
1461 | kiocb->ki_left))) | ||
1462 | break; | ||
1463 | ret = aio_setup_single_vector(WRITE, file, kiocb); | ||
1464 | if (ret) | ||
1465 | break; | ||
1466 | ret = -EINVAL; | ||
1467 | if (file->f_op->aio_write) | ||
1468 | kiocb->ki_retry = aio_rw_vect_retry; | ||
1469 | break; | ||
1470 | case IOCB_CMD_PREADV: | 1005 | case IOCB_CMD_PREADV: |
1471 | ret = -EBADF; | 1006 | mode = FMODE_READ; |
1472 | if (unlikely(!(file->f_mode & FMODE_READ))) | 1007 | rw = READ; |
1473 | break; | 1008 | rw_op = file->f_op->aio_read; |
1474 | ret = aio_setup_vectored_rw(READ, kiocb, compat); | 1009 | goto rw_common; |
1475 | if (ret) | 1010 | |
1476 | break; | 1011 | case IOCB_CMD_PWRITE: |
1477 | ret = -EINVAL; | ||
1478 | if (file->f_op->aio_read) | ||
1479 | kiocb->ki_retry = aio_rw_vect_retry; | ||
1480 | break; | ||
1481 | case IOCB_CMD_PWRITEV: | 1012 | case IOCB_CMD_PWRITEV: |
1482 | ret = -EBADF; | 1013 | mode = FMODE_WRITE; |
1483 | if (unlikely(!(file->f_mode & FMODE_WRITE))) | 1014 | rw = WRITE; |
1484 | break; | 1015 | rw_op = file->f_op->aio_write; |
1485 | ret = aio_setup_vectored_rw(WRITE, kiocb, compat); | 1016 | goto rw_common; |
1017 | rw_common: | ||
1018 | if (unlikely(!(file->f_mode & mode))) | ||
1019 | return -EBADF; | ||
1020 | |||
1021 | if (!rw_op) | ||
1022 | return -EINVAL; | ||
1023 | |||
1024 | ret = (req->ki_opcode == IOCB_CMD_PREADV || | ||
1025 | req->ki_opcode == IOCB_CMD_PWRITEV) | ||
1026 | ? aio_setup_vectored_rw(rw, req, compat) | ||
1027 | : aio_setup_single_vector(rw, req); | ||
1486 | if (ret) | 1028 | if (ret) |
1487 | break; | 1029 | return ret; |
1488 | ret = -EINVAL; | 1030 | |
1489 | if (file->f_op->aio_write) | 1031 | ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); |
1490 | kiocb->ki_retry = aio_rw_vect_retry; | 1032 | if (ret < 0) |
1033 | return ret; | ||
1034 | |||
1035 | req->ki_nbytes = ret; | ||
1036 | req->ki_left = ret; | ||
1037 | |||
1038 | ret = aio_rw_vect_retry(req, rw, rw_op); | ||
1491 | break; | 1039 | break; |
1040 | |||
1492 | case IOCB_CMD_FDSYNC: | 1041 | case IOCB_CMD_FDSYNC: |
1493 | ret = -EINVAL; | 1042 | if (!file->f_op->aio_fsync) |
1494 | if (file->f_op->aio_fsync) | 1043 | return -EINVAL; |
1495 | kiocb->ki_retry = aio_fdsync; | 1044 | |
1045 | ret = file->f_op->aio_fsync(req, 1); | ||
1496 | break; | 1046 | break; |
1047 | |||
1497 | case IOCB_CMD_FSYNC: | 1048 | case IOCB_CMD_FSYNC: |
1498 | ret = -EINVAL; | 1049 | if (!file->f_op->aio_fsync) |
1499 | if (file->f_op->aio_fsync) | 1050 | return -EINVAL; |
1500 | kiocb->ki_retry = aio_fsync; | 1051 | |
1052 | ret = file->f_op->aio_fsync(req, 0); | ||
1501 | break; | 1053 | break; |
1054 | |||
1502 | default: | 1055 | default: |
1503 | dprintk("EINVAL: io_submit: no operation provided\n"); | 1056 | pr_debug("EINVAL: no operation provided\n"); |
1504 | ret = -EINVAL; | 1057 | return -EINVAL; |
1505 | } | 1058 | } |
1506 | 1059 | ||
1507 | if (!kiocb->ki_retry) | 1060 | if (ret != -EIOCBQUEUED) { |
1508 | return ret; | 1061 | /* |
1062 | * There's no easy way to restart the syscall since other AIO's | ||
1063 | * may be already running. Just fail this IO with EINTR. | ||
1064 | */ | ||
1065 | if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || | ||
1066 | ret == -ERESTARTNOHAND || | ||
1067 | ret == -ERESTART_RESTARTBLOCK)) | ||
1068 | ret = -EINTR; | ||
1069 | aio_complete(req, ret, 0); | ||
1070 | } | ||
1509 | 1071 | ||
1510 | return 0; | 1072 | return 0; |
1511 | } | 1073 | } |
1512 | 1074 | ||
1513 | static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | 1075 | static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, |
1514 | struct iocb *iocb, struct kiocb_batch *batch, | 1076 | struct iocb *iocb, bool compat) |
1515 | bool compat) | ||
1516 | { | 1077 | { |
1517 | struct kiocb *req; | 1078 | struct kiocb *req; |
1518 | struct file *file; | ||
1519 | ssize_t ret; | 1079 | ssize_t ret; |
1520 | 1080 | ||
1521 | /* enforce forwards compatibility on users */ | 1081 | /* enforce forwards compatibility on users */ |
1522 | if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { | 1082 | if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { |
1523 | pr_debug("EINVAL: io_submit: reserve field set\n"); | 1083 | pr_debug("EINVAL: reserve field set\n"); |
1524 | return -EINVAL; | 1084 | return -EINVAL; |
1525 | } | 1085 | } |
1526 | 1086 | ||
@@ -1534,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
1534 | return -EINVAL; | 1094 | return -EINVAL; |
1535 | } | 1095 | } |
1536 | 1096 | ||
1537 | file = fget(iocb->aio_fildes); | 1097 | req = aio_get_req(ctx); |
1538 | if (unlikely(!file)) | 1098 | if (unlikely(!req)) |
1539 | return -EBADF; | ||
1540 | |||
1541 | req = aio_get_req(ctx, batch); /* returns with 2 references to req */ | ||
1542 | if (unlikely(!req)) { | ||
1543 | fput(file); | ||
1544 | return -EAGAIN; | 1099 | return -EAGAIN; |
1100 | |||
1101 | req->ki_filp = fget(iocb->aio_fildes); | ||
1102 | if (unlikely(!req->ki_filp)) { | ||
1103 | ret = -EBADF; | ||
1104 | goto out_put_req; | ||
1545 | } | 1105 | } |
1546 | req->ki_filp = file; | 1106 | |
1547 | if (iocb->aio_flags & IOCB_FLAG_RESFD) { | 1107 | if (iocb->aio_flags & IOCB_FLAG_RESFD) { |
1548 | /* | 1108 | /* |
1549 | * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an | 1109 | * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an |
@@ -1559,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
1559 | } | 1119 | } |
1560 | } | 1120 | } |
1561 | 1121 | ||
1562 | ret = put_user(req->ki_key, &user_iocb->aio_key); | 1122 | ret = put_user(KIOCB_KEY, &user_iocb->aio_key); |
1563 | if (unlikely(ret)) { | 1123 | if (unlikely(ret)) { |
1564 | dprintk("EFAULT: aio_key\n"); | 1124 | pr_debug("EFAULT: aio_key\n"); |
1565 | goto out_put_req; | 1125 | goto out_put_req; |
1566 | } | 1126 | } |
1567 | 1127 | ||
@@ -1573,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
1573 | req->ki_left = req->ki_nbytes = iocb->aio_nbytes; | 1133 | req->ki_left = req->ki_nbytes = iocb->aio_nbytes; |
1574 | req->ki_opcode = iocb->aio_lio_opcode; | 1134 | req->ki_opcode = iocb->aio_lio_opcode; |
1575 | 1135 | ||
1576 | ret = aio_setup_iocb(req, compat); | 1136 | ret = aio_run_iocb(req, compat); |
1577 | |||
1578 | if (ret) | 1137 | if (ret) |
1579 | goto out_put_req; | 1138 | goto out_put_req; |
1580 | 1139 | ||
1581 | spin_lock_irq(&ctx->ctx_lock); | ||
1582 | /* | ||
1583 | * We could have raced with io_destroy() and are currently holding a | ||
1584 | * reference to ctx which should be destroyed. We cannot submit IO | ||
1585 | * since ctx gets freed as soon as io_submit() puts its reference. The | ||
1586 | * check here is reliable: io_destroy() sets ctx->dead before waiting | ||
1587 | * for outstanding IO and the barrier between these two is realized by | ||
1588 | * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we | ||
1589 | * increment ctx->reqs_active before checking for ctx->dead and the | ||
1590 | * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we | ||
1591 | * don't see ctx->dead set here, io_destroy() waits for our IO to | ||
1592 | * finish. | ||
1593 | */ | ||
1594 | if (ctx->dead) { | ||
1595 | spin_unlock_irq(&ctx->ctx_lock); | ||
1596 | ret = -EINVAL; | ||
1597 | goto out_put_req; | ||
1598 | } | ||
1599 | aio_run_iocb(req); | ||
1600 | if (!list_empty(&ctx->run_list)) { | ||
1601 | /* drain the run list */ | ||
1602 | while (__aio_run_iocbs(ctx)) | ||
1603 | ; | ||
1604 | } | ||
1605 | spin_unlock_irq(&ctx->ctx_lock); | ||
1606 | |||
1607 | aio_put_req(req); /* drop extra ref to req */ | 1140 | aio_put_req(req); /* drop extra ref to req */ |
1608 | return 0; | 1141 | return 0; |
1609 | |||
1610 | out_put_req: | 1142 | out_put_req: |
1143 | atomic_dec(&ctx->reqs_active); | ||
1611 | aio_put_req(req); /* drop extra ref to req */ | 1144 | aio_put_req(req); /* drop extra ref to req */ |
1612 | aio_put_req(req); /* drop i/o ref to req */ | 1145 | aio_put_req(req); /* drop i/o ref to req */ |
1613 | return ret; | 1146 | return ret; |
@@ -1620,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
1620 | long ret = 0; | 1153 | long ret = 0; |
1621 | int i = 0; | 1154 | int i = 0; |
1622 | struct blk_plug plug; | 1155 | struct blk_plug plug; |
1623 | struct kiocb_batch batch; | ||
1624 | 1156 | ||
1625 | if (unlikely(nr < 0)) | 1157 | if (unlikely(nr < 0)) |
1626 | return -EINVAL; | 1158 | return -EINVAL; |
@@ -1633,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
1633 | 1165 | ||
1634 | ctx = lookup_ioctx(ctx_id); | 1166 | ctx = lookup_ioctx(ctx_id); |
1635 | if (unlikely(!ctx)) { | 1167 | if (unlikely(!ctx)) { |
1636 | pr_debug("EINVAL: io_submit: invalid context id\n"); | 1168 | pr_debug("EINVAL: invalid context id\n"); |
1637 | return -EINVAL; | 1169 | return -EINVAL; |
1638 | } | 1170 | } |
1639 | 1171 | ||
1640 | kiocb_batch_init(&batch, nr); | ||
1641 | |||
1642 | blk_start_plug(&plug); | 1172 | blk_start_plug(&plug); |
1643 | 1173 | ||
1644 | /* | 1174 | /* |
@@ -1659,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
1659 | break; | 1189 | break; |
1660 | } | 1190 | } |
1661 | 1191 | ||
1662 | ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); | 1192 | ret = io_submit_one(ctx, user_iocb, &tmp, compat); |
1663 | if (ret) | 1193 | if (ret) |
1664 | break; | 1194 | break; |
1665 | } | 1195 | } |
1666 | blk_finish_plug(&plug); | 1196 | blk_finish_plug(&plug); |
1667 | 1197 | ||
1668 | kiocb_batch_free(ctx, &batch); | ||
1669 | put_ioctx(ctx); | 1198 | put_ioctx(ctx); |
1670 | return i ? i : ret; | 1199 | return i ? i : ret; |
1671 | } | 1200 | } |
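For reference, a minimal userspace sketch of driving do_io_submit() through the raw syscall; it assumes an io_setup()-created context in ctx and is not taken from this patch.

	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>

	static int submit_one_pread(aio_context_t ctx, int fd, void *buf, size_t len)
	{
		struct iocb cb, *cbs[1] = { &cb };

		memset(&cb, 0, sizeof(cb));
		cb.aio_fildes     = fd;
		cb.aio_lio_opcode = IOCB_CMD_PREAD;
		cb.aio_buf        = (unsigned long)buf;
		cb.aio_nbytes     = len;
		cb.aio_offset     = 0;
		/* aio_reserved1/aio_reserved2 stay zero; io_submit_one() rejects them */

		return syscall(__NR_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
	}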
@@ -1698,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | |||
1698 | 1227 | ||
1699 | assert_spin_locked(&ctx->ctx_lock); | 1228 | assert_spin_locked(&ctx->ctx_lock); |
1700 | 1229 | ||
1230 | if (key != KIOCB_KEY) | ||
1231 | return NULL; | ||
1232 | |||
1701 | /* TODO: use a hash or array, this sucks. */ | 1233 | /* TODO: use a hash or array, this sucks. */ |
1702 | list_for_each(pos, &ctx->active_reqs) { | 1234 | list_for_each(pos, &ctx->active_reqs) { |
1703 | struct kiocb *kiocb = list_kiocb(pos); | 1235 | struct kiocb *kiocb = list_kiocb(pos); |
1704 | if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) | 1236 | if (kiocb->ki_obj.user == iocb) |
1705 | return kiocb; | 1237 | return kiocb; |
1706 | } | 1238 | } |
1707 | return NULL; | 1239 | return NULL; |
@@ -1720,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | |||
1720 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | 1252 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, |
1721 | struct io_event __user *, result) | 1253 | struct io_event __user *, result) |
1722 | { | 1254 | { |
1723 | int (*cancel)(struct kiocb *iocb, struct io_event *res); | 1255 | struct io_event res; |
1724 | struct kioctx *ctx; | 1256 | struct kioctx *ctx; |
1725 | struct kiocb *kiocb; | 1257 | struct kiocb *kiocb; |
1726 | u32 key; | 1258 | u32 key; |
@@ -1735,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | |||
1735 | return -EINVAL; | 1267 | return -EINVAL; |
1736 | 1268 | ||
1737 | spin_lock_irq(&ctx->ctx_lock); | 1269 | spin_lock_irq(&ctx->ctx_lock); |
1738 | ret = -EAGAIN; | 1270 | |
1739 | kiocb = lookup_kiocb(ctx, iocb, key); | 1271 | kiocb = lookup_kiocb(ctx, iocb, key); |
1740 | if (kiocb && kiocb->ki_cancel) { | 1272 | if (kiocb) |
1741 | cancel = kiocb->ki_cancel; | 1273 | ret = kiocb_cancel(ctx, kiocb, &res); |
1742 | kiocb->ki_users ++; | 1274 | else |
1743 | kiocbSetCancelled(kiocb); | 1275 | ret = -EINVAL; |
1744 | } else | 1276 | |
1745 | cancel = NULL; | ||
1746 | spin_unlock_irq(&ctx->ctx_lock); | 1277 | spin_unlock_irq(&ctx->ctx_lock); |
1747 | 1278 | ||
1748 | if (NULL != cancel) { | 1279 | if (!ret) { |
1749 | struct io_event tmp; | 1280 | /* Cancellation succeeded -- copy the result |
1750 | pr_debug("calling cancel\n"); | 1281 | * into the user's buffer. |
1751 | memset(&tmp, 0, sizeof(tmp)); | 1282 | */ |
1752 | tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; | 1283 | if (copy_to_user(result, &res, sizeof(res))) |
1753 | tmp.data = kiocb->ki_user_data; | 1284 | ret = -EFAULT; |
1754 | ret = cancel(kiocb, &tmp); | 1285 | } |
1755 | if (!ret) { | ||
1756 | /* Cancellation succeeded -- copy the result | ||
1757 | * into the user's buffer. | ||
1758 | */ | ||
1759 | if (copy_to_user(result, &tmp, sizeof(tmp))) | ||
1760 | ret = -EFAULT; | ||
1761 | } | ||
1762 | } else | ||
1763 | ret = -EINVAL; | ||
1764 | 1286 | ||
1765 | put_ioctx(ctx); | 1287 | put_ioctx(ctx); |
1766 | 1288 | ||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/bio.h> | 20 | #include <linux/bio.h> |
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/uio.h> | ||
22 | #include <linux/iocontext.h> | 23 | #include <linux/iocontext.h> |
23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
24 | #include <linux/init.h> | 25 | #include <linux/init.h> |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 3823d3ffb760..d9871c1f0894 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/namei.h> | 27 | #include <linux/namei.h> |
28 | #include <linux/log2.h> | 28 | #include <linux/log2.h> |
29 | #include <linux/cleancache.h> | 29 | #include <linux/cleancache.h> |
30 | #include <linux/aio.h> | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include "internal.h" | 32 | #include "internal.h" |
32 | 33 | ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb8b7a0e28a6..bc4d54c465a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/string.h> | 24 | #include <linux/string.h> |
25 | #include <linux/backing-dev.h> | 25 | #include <linux/backing-dev.h> |
26 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
27 | #include <linux/aio.h> | ||
27 | #include <linux/falloc.h> | 28 | #include <linux/falloc.h> |
28 | #include <linux/swap.h> | 29 | #include <linux/swap.h> |
29 | #include <linux/writeback.h> | 30 | #include <linux/writeback.h> |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09c58a35b429..898da0a01e04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/writeback.h> | 32 | #include <linux/writeback.h> |
33 | #include <linux/statfs.h> | 33 | #include <linux/statfs.h> |
34 | #include <linux/compat.h> | 34 | #include <linux/compat.h> |
35 | #include <linux/aio.h> | ||
35 | #include <linux/bit_spinlock.h> | 36 | #include <linux/bit_spinlock.h> |
36 | #include <linux/xattr.h> | 37 | #include <linux/xattr.h> |
37 | #include <linux/posix_acl.h> | 38 | #include <linux/posix_acl.h> |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d70830c66833..656e16907430 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/mount.h> | 7 | #include <linux/mount.h> |
8 | #include <linux/namei.h> | 8 | #include <linux/namei.h> |
9 | #include <linux/writeback.h> | 9 | #include <linux/writeback.h> |
10 | #include <linux/aio.h> | ||
10 | 11 | ||
11 | #include "super.h" | 12 | #include "super.h" |
12 | #include "mds_client.h" | 13 | #include "mds_client.h" |
diff --git a/fs/compat.c b/fs/compat.c index 93f7d021b716..fc3b55dce184 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/fs_struct.h> | 47 | #include <linux/fs_struct.h> |
48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
49 | #include <linux/pagemap.h> | 49 | #include <linux/pagemap.h> |
50 | #include <linux/aio.h> | ||
50 | 51 | ||
51 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
52 | #include <asm/mmu_context.h> | 53 | #include <asm/mmu_context.h> |
diff --git a/fs/direct-io.c b/fs/direct-io.c index cfb816dc6d9f..51d16e067d68 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/uio.h> | 37 | #include <linux/uio.h> |
38 | #include <linux/atomic.h> | 38 | #include <linux/atomic.h> |
39 | #include <linux/prefetch.h> | 39 | #include <linux/prefetch.h> |
40 | #include <linux/aio.h> | ||
40 | 41 | ||
41 | /* | 42 | /* |
42 | * How many user pages to map in one call to get_user_pages(). This determines | 43 | * How many user pages to map in one call to get_user_pages(). This determines |
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/compat.h> | 32 | #include <linux/compat.h> |
33 | #include <linux/fs_stack.h> | 33 | #include <linux/fs_stack.h> |
34 | #include <linux/aio.h> | ||
34 | #include "ecryptfs_kernel.h" | 35 | #include "ecryptfs_kernel.h" |
35 | 36 | ||
36 | /** | 37 | /** |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/mpage.h> | 31 | #include <linux/mpage.h> |
32 | #include <linux/fiemap.h> | 32 | #include <linux/fiemap.h> |
33 | #include <linux/namei.h> | 33 | #include <linux/namei.h> |
34 | #include <linux/aio.h> | ||
34 | #include "ext2.h" | 35 | #include "ext2.h" |
35 | #include "acl.h" | 36 | #include "acl.h" |
36 | #include "xip.h" | 37 | #include "xip.h" |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d706dbfa6220..23c712825640 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/mpage.h> | 28 | #include <linux/mpage.h> |
29 | #include <linux/namei.h> | 29 | #include <linux/namei.h> |
30 | #include <linux/aio.h> | ||
30 | #include "ext3.h" | 31 | #include "ext3.h" |
31 | #include "xattr.h" | 32 | #include "xattr.h" |
32 | #include "acl.h" | 33 | #include "acl.h" |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..4959e29573b6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/jbd2.h> | 23 | #include <linux/jbd2.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | #include <linux/path.h> | 25 | #include <linux/path.h> |
26 | #include <linux/aio.h> | ||
26 | #include <linux/quotaops.h> | 27 | #include <linux/quotaops.h> |
27 | #include <linux/pagevec.h> | 28 | #include <linux/pagevec.h> |
28 | #include "ext4.h" | 29 | #include "ext4.h" |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 98be6f697463..b8d5d351e24f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -20,6 +20,7 @@ | |||
20 | * (sct@redhat.com), 1993, 1998 | 20 | * (sct@redhat.com), 1993, 1998 |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/aio.h> | ||
23 | #include "ext4_jbd2.h" | 24 | #include "ext4_jbd2.h" |
24 | #include "truncate.h" | 25 | #include "truncate.h" |
25 | #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ | 26 | #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 793d44b84d7f..0723774bdfb5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/printk.h> | 37 | #include <linux/printk.h> |
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/ratelimit.h> | 39 | #include <linux/ratelimit.h> |
40 | #include <linux/aio.h> | ||
40 | 41 | ||
41 | #include "ext4_jbd2.h" | 42 | #include "ext4_jbd2.h" |
42 | #include "xattr.h" | 43 | #include "xattr.h" |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5929cd0baa20..19599bded62a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/mpage.h> | 19 | #include <linux/mpage.h> |
20 | #include <linux/namei.h> | 20 | #include <linux/namei.h> |
21 | #include <linux/aio.h> | ||
21 | #include <linux/uio.h> | 22 | #include <linux/uio.h> |
22 | #include <linux/bio.h> | 23 | #include <linux/bio.h> |
23 | #include <linux/workqueue.h> | 24 | #include <linux/workqueue.h> |
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a201125..d0ed4ba4b61b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/f2fs_fs.h> | 12 | #include <linux/f2fs_fs.h> |
13 | #include <linux/buffer_head.h> | 13 | #include <linux/buffer_head.h> |
14 | #include <linux/mpage.h> | 14 | #include <linux/mpage.h> |
15 | #include <linux/aio.h> | ||
15 | #include <linux/writeback.h> | 16 | #include <linux/writeback.h> |
16 | #include <linux/backing-dev.h> | 17 | #include <linux/backing-dev.h> |
17 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4ff901632b26..dfce656ddb33 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/mpage.h> | 19 | #include <linux/mpage.h> |
20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
21 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
22 | #include <linux/aio.h> | ||
22 | #include <linux/vfs.h> | 23 | #include <linux/vfs.h> |
23 | #include <linux/parser.h> | 24 | #include <linux/parser.h> |
24 | #include <linux/uio.h> | 25 | #include <linux/uio.h> |
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b3aaf7b3578b..aef34b1e635e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/device.h> | 38 | #include <linux/device.h> |
39 | #include <linux/file.h> | 39 | #include <linux/file.h> |
40 | #include <linux/fs.h> | 40 | #include <linux/fs.h> |
41 | #include <linux/aio.h> | ||
41 | #include <linux/kdev_t.h> | 42 | #include <linux/kdev_t.h> |
42 | #include <linux/kthread.h> | 43 | #include <linux/kthread.h> |
43 | #include <linux/list.h> | 44 | #include <linux/list.h> |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a6c1664e330b..1d55f9465400 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/pipe_fs_i.h> | 19 | #include <linux/pipe_fs_i.h> |
20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
21 | #include <linux/splice.h> | 21 | #include <linux/splice.h> |
22 | #include <linux/aio.h> | ||
22 | 23 | ||
23 | MODULE_ALIAS_MISCDEV(FUSE_MINOR); | 24 | MODULE_ALIAS_MISCDEV(FUSE_MINOR); |
24 | MODULE_ALIAS("devname:fuse"); | 25 | MODULE_ALIAS("devname:fuse"); |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4655e59d545b..d1c9b85b3f58 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/compat.h> | 16 | #include <linux/compat.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/aio.h> | ||
18 | 19 | ||
19 | static const struct file_operations fuse_direct_io_file_operations; | 20 | static const struct file_operations fuse_direct_io_file_operations; |
20 | 21 | ||
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9883694f1e7c..0bad69ed6336 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
21 | #include <linux/gfs2_ondisk.h> | 21 | #include <linux/gfs2_ondisk.h> |
22 | #include <linux/backing-dev.h> | 22 | #include <linux/backing-dev.h> |
23 | #include <linux/aio.h> | ||
23 | 24 | ||
24 | #include "gfs2.h" | 25 | #include "gfs2.h" |
25 | #include "incore.h" | 26 | #include "incore.h" |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <linux/dlm.h> | 26 | #include <linux/dlm.h> |
27 | #include <linux/dlm_plock.h> | 27 | #include <linux/dlm_plock.h> |
28 | #include <linux/aio.h> | ||
28 | 29 | ||
29 | #include "gfs2.h" | 30 | #include "gfs2.h" |
30 | #include "incore.h" | 31 | #include "incore.h" |
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 716e1aafb2e2..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/mpage.h> | 15 | #include <linux/mpage.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/aio.h> | ||
17 | 18 | ||
18 | #include "hfs_fs.h" | 19 | #include "hfs_fs.h" |
19 | #include "btree.h" | 20 | #include "btree.h" |
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7faaa964968e..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/mpage.h> | 15 | #include <linux/mpage.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/aio.h> | ||
17 | 18 | ||
18 | #include "hfsplus_fs.h" | 19 | #include "hfsplus_fs.h" |
19 | #include "hfsplus_raw.h" | 20 | #include "hfsplus_raw.h" |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 523464e62849..a3f868ae3fd4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void) | |||
909 | 909 | ||
910 | static int get_hstate_idx(int page_size_log) | 910 | static int get_hstate_idx(int page_size_log) |
911 | { | 911 | { |
912 | struct hstate *h; | 912 | struct hstate *h = hstate_sizelog(page_size_log); |
913 | 913 | ||
914 | if (!page_size_log) | ||
915 | return default_hstate_idx; | ||
916 | h = size_to_hstate(1 << page_size_log); | ||
917 | if (!h) | 914 | if (!h) |
918 | return -1; | 915 | return -1; |
919 | return h - hstates; | 916 | return h - hstates; |
@@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = { | |||
929 | .d_dname = hugetlb_dname | 926 | .d_dname = hugetlb_dname |
930 | }; | 927 | }; |
931 | 928 | ||
932 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 929 | /* |
933 | size_t size, vm_flags_t acctflag, | 930 | * Note that size should be aligned to the proper hugepage size by the caller,
934 | struct user_struct **user, | 931 | * otherwise hugetlb_reserve_pages() reserves fewer hugepages than intended.
932 | */ | ||
933 | struct file *hugetlb_file_setup(const char *name, size_t size, | ||
934 | vm_flags_t acctflag, struct user_struct **user, | ||
935 | int creat_flags, int page_size_log) | 935 | int creat_flags, int page_size_log) |
936 | { | 936 | { |
937 | struct file *file = ERR_PTR(-ENOMEM); | 937 | struct file *file = ERR_PTR(-ENOMEM); |
@@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
939 | struct path path; | 939 | struct path path; |
940 | struct super_block *sb; | 940 | struct super_block *sb; |
941 | struct qstr quick_string; | 941 | struct qstr quick_string; |
942 | struct hstate *hstate; | ||
943 | unsigned long num_pages; | ||
944 | int hstate_idx; | 942 | int hstate_idx; |
945 | 943 | ||
946 | hstate_idx = get_hstate_idx(page_size_log); | 944 | hstate_idx = get_hstate_idx(page_size_log); |
@@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
980 | if (!inode) | 978 | if (!inode) |
981 | goto out_dentry; | 979 | goto out_dentry; |
982 | 980 | ||
983 | hstate = hstate_inode(inode); | ||
984 | size += addr & ~huge_page_mask(hstate); | ||
985 | num_pages = ALIGN(size, huge_page_size(hstate)) >> | ||
986 | huge_page_shift(hstate); | ||
987 | file = ERR_PTR(-ENOMEM); | 981 | file = ERR_PTR(-ENOMEM); |
988 | if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) | 982 | if (hugetlb_reserve_pages(inode, 0, |
983 | size >> huge_page_shift(hstate_inode(inode)), NULL, | ||
984 | acctflag)) | ||
989 | goto out_inode; | 985 | goto out_inode; |
990 | 986 | ||
991 | d_instantiate(path.dentry, inode); | 987 | d_instantiate(path.dentry, inode); |
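The new comment above puts the hugepage-size rounding on the caller. A sketch of that rounding, assuming the hstate_sizelog() helper added later in this diff; the example_ name and the -ENODEV choice are illustrative only.

	static struct file *example_hugetlb_open(const char *name, size_t size,
						 int page_size_log,
						 struct user_struct **user)
	{
		struct hstate *hs = hstate_sizelog(page_size_log);
		size_t hugesize;

		if (!hs)
			return ERR_PTR(-ENODEV);
		hugesize = ALIGN(size, huge_page_size(hs));
		return hugetlb_file_setup(name, hugesize, VM_NORESERVE, user,
					  HUGETLB_SHMFS_INODE, page_size_log);
	}

The ipc/shm.c hunk near the end of this diff performs the same rounding before calling hugetlb_file_setup().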
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77554b61d124..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
24 | #include <linux/quotaops.h> | 24 | #include <linux/quotaops.h> |
25 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
26 | #include <linux/aio.h> | ||
26 | #include "jfs_incore.h" | 27 | #include "jfs_incore.h" |
27 | #include "jfs_inode.h" | 28 | #include "jfs_inode.h" |
28 | #include "jfs_filsys.h" | 29 | #include "jfs_filsys.h" |
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf02f5530713..689fb608648e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
26 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/uio.h> | 28 | #include <linux/aio.h> |
29 | #include "nilfs.h" | 29 | #include "nilfs.h" |
30 | #include "btnode.h" | 30 | #include "btnode.h" |
31 | #include "segment.h" | 31 | #include "segment.h" |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da4b81e6f76..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/swap.h> | 27 | #include <linux/swap.h> |
28 | #include <linux/uio.h> | 28 | #include <linux/uio.h> |
29 | #include <linux/writeback.h> | 29 | #include <linux/writeback.h> |
30 | #include <linux/aio.h> | ||
30 | 31 | ||
31 | #include <asm/page.h> | 32 | #include <asm/page.h> |
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/quotaops.h> | 28 | #include <linux/quotaops.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/log2.h> | 30 | #include <linux/log2.h> |
31 | #include <linux/aio.h> | ||
31 | 32 | ||
32 | #include "aops.h" | 33 | #include "aops.h" |
33 | #include "attrib.h" | 34 | #include "attrib.h" |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -22,6 +22,8 @@ | |||
22 | #ifndef OCFS2_AOPS_H | 22 | #ifndef OCFS2_AOPS_H |
23 | #define OCFS2_AOPS_H | 23 | #define OCFS2_AOPS_H |
24 | 24 | ||
25 | #include <linux/aio.h> | ||
26 | |||
25 | handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | 27 | handle_t *ocfs2_start_walk_page_trans(struct inode *inode, |
26 | struct page *page, | 28 | struct page *page, |
27 | unsigned from, | 29 | unsigned from, |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, | |||
2322 | status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, | 2322 | status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, |
2323 | arg_flags, subclass, _RET_IP_); | 2323 | arg_flags, subclass, _RET_IP_); |
2324 | if (status < 0) { | 2324 | if (status < 0) { |
2325 | if (status != -EAGAIN && status != -EIOCBRETRY) | 2325 | if (status != -EAGAIN) |
2326 | mlog_errno(status); | 2326 | mlog_errno(status); |
2327 | goto bail; | 2327 | goto bail; |
2328 | } | 2328 | } |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..621fc73bf23d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode, | |||
147 | int ocfs2_mark_inode_dirty(handle_t *handle, | 147 | int ocfs2_mark_inode_dirty(handle_t *handle, |
148 | struct inode *inode, | 148 | struct inode *inode, |
149 | struct buffer_head *bh); | 149 | struct buffer_head *bh); |
150 | int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); | ||
151 | int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); | ||
152 | struct buffer_head *ocfs2_bread(struct inode *inode, | 150 | struct buffer_head *ocfs2_bread(struct inode *inode, |
153 | int block, int *err, int reada); | 151 | int block, int *err, int reada); |
154 | 152 | ||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/audit.h> | 21 | #include <linux/audit.h> |
22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
23 | #include <linux/fcntl.h> | 23 | #include <linux/fcntl.h> |
24 | #include <linux/aio.h> | ||
24 | 25 | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | #include <asm/ioctls.h> | 27 | #include <asm/ioctls.h> |
diff --git a/fs/read_write.c b/fs/read_write.c index 90ba3b350e50..03430008704e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/fcntl.h> | 9 | #include <linux/fcntl.h> |
10 | #include <linux/file.h> | 10 | #include <linux/file.h> |
11 | #include <linux/uio.h> | 11 | #include <linux/uio.h> |
12 | #include <linux/aio.h> | ||
12 | #include <linux/fsnotify.h> | 13 | #include <linux/fsnotify.h> |
13 | #include <linux/security.h> | 14 | #include <linux/security.h> |
14 | #include <linux/export.h> | 15 | #include <linux/export.h> |
@@ -329,16 +330,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count | |||
329 | return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; | 330 | return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; |
330 | } | 331 | } |
331 | 332 | ||
332 | static void wait_on_retry_sync_kiocb(struct kiocb *iocb) | ||
333 | { | ||
334 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
335 | if (!kiocbIsKicked(iocb)) | ||
336 | schedule(); | ||
337 | else | ||
338 | kiocbClearKicked(iocb); | ||
339 | __set_current_state(TASK_RUNNING); | ||
340 | } | ||
341 | |||
342 | ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | 333 | ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) |
343 | { | 334 | { |
344 | struct iovec iov = { .iov_base = buf, .iov_len = len }; | 335 | struct iovec iov = { .iov_base = buf, .iov_len = len }; |
@@ -350,13 +341,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp | |||
350 | kiocb.ki_left = len; | 341 | kiocb.ki_left = len; |
351 | kiocb.ki_nbytes = len; | 342 | kiocb.ki_nbytes = len; |
352 | 343 | ||
353 | for (;;) { | 344 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); |
354 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); | ||
355 | if (ret != -EIOCBRETRY) | ||
356 | break; | ||
357 | wait_on_retry_sync_kiocb(&kiocb); | ||
358 | } | ||
359 | |||
360 | if (-EIOCBQUEUED == ret) | 345 | if (-EIOCBQUEUED == ret) |
361 | ret = wait_on_sync_kiocb(&kiocb); | 346 | ret = wait_on_sync_kiocb(&kiocb); |
362 | *ppos = kiocb.ki_pos; | 347 | *ppos = kiocb.ki_pos; |
@@ -406,13 +391,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof | |||
406 | kiocb.ki_left = len; | 391 | kiocb.ki_left = len; |
407 | kiocb.ki_nbytes = len; | 392 | kiocb.ki_nbytes = len; |
408 | 393 | ||
409 | for (;;) { | 394 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); |
410 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); | ||
411 | if (ret != -EIOCBRETRY) | ||
412 | break; | ||
413 | wait_on_retry_sync_kiocb(&kiocb); | ||
414 | } | ||
415 | |||
416 | if (-EIOCBQUEUED == ret) | 395 | if (-EIOCBQUEUED == ret) |
417 | ret = wait_on_sync_kiocb(&kiocb); | 396 | ret = wait_on_sync_kiocb(&kiocb); |
418 | *ppos = kiocb.ki_pos; | 397 | *ppos = kiocb.ki_pos; |
@@ -592,13 +571,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, | |||
592 | kiocb.ki_left = len; | 571 | kiocb.ki_left = len; |
593 | kiocb.ki_nbytes = len; | 572 | kiocb.ki_nbytes = len; |
594 | 573 | ||
595 | for (;;) { | 574 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); |
596 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); | ||
597 | if (ret != -EIOCBRETRY) | ||
598 | break; | ||
599 | wait_on_retry_sync_kiocb(&kiocb); | ||
600 | } | ||
601 | |||
602 | if (ret == -EIOCBQUEUED) | 575 | if (ret == -EIOCBQUEUED) |
603 | ret = wait_on_sync_kiocb(&kiocb); | 576 | ret = wait_on_sync_kiocb(&kiocb); |
604 | *ppos = kiocb.ki_pos; | 577 | *ppos = kiocb.ki_pos; |
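With the -EIOCBRETRY loop gone, do_sync_read()/do_sync_write() call the aio method exactly once and only wait when it returns -EIOCBQUEUED. A sketch of the common wiring that relies on this, using the generic helpers; nothing here is specific to this patch.

	static const struct file_operations example_fops = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.write		= do_sync_write,
		.aio_read	= generic_file_aio_read,
		.aio_write	= generic_file_aio_write,
	};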
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
19 | #include <linux/quotaops.h> | 19 | #include <linux/quotaops.h> |
20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
21 | #include <linux/aio.h> | ||
21 | 22 | ||
22 | int reiserfs_commit_write(struct file *f, struct page *page, | 23 | int reiserfs_commit_write(struct file *f, struct page *page, |
23 | unsigned from, unsigned to); | 24 | unsigned from, unsigned to); |
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
@@ -50,6 +50,7 @@ | |||
50 | */ | 50 | */ |
51 | 51 | ||
52 | #include "ubifs.h" | 52 | #include "ubifs.h" |
53 | #include <linux/aio.h> | ||
53 | #include <linux/mount.h> | 54 | #include <linux/mount.h> |
54 | #include <linux/namei.h> | 55 | #include <linux/namei.h> |
55 | #include <linux/slab.h> | 56 | #include <linux/slab.h> |
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/crc-itu-t.h> | 39 | #include <linux/crc-itu-t.h> |
40 | #include <linux/mpage.h> | 40 | #include <linux/mpage.h> |
41 | #include <linux/aio.h> | ||
41 | 42 | ||
42 | #include "udf_i.h" | 43 | #include "udf_i.h" |
43 | #include "udf_sb.h" | 44 | #include "udf_sb.h" |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3244c988d379..2b2691b73428 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include "xfs_vnodeops.h" | 31 | #include "xfs_vnodeops.h" |
32 | #include "xfs_trace.h" | 32 | #include "xfs_trace.h" |
33 | #include "xfs_bmap.h" | 33 | #include "xfs_bmap.h" |
34 | #include <linux/aio.h> | ||
34 | #include <linux/gfp.h> | 35 | #include <linux/gfp.h> |
35 | #include <linux/mpage.h> | 36 | #include <linux/mpage.h> |
36 | #include <linux/pagevec.h> | 37 | #include <linux/pagevec.h> |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 054d60c0ac57..a5f2042aec8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include "xfs_ioctl.h" | 36 | #include "xfs_ioctl.h" |
37 | #include "xfs_trace.h" | 37 | #include "xfs_trace.h" |
38 | 38 | ||
39 | #include <linux/aio.h> | ||
39 | #include <linux/dcache.h> | 40 | #include <linux/dcache.h> |
40 | #include <linux/falloc.h> | 41 | #include <linux/falloc.h> |
41 | #include <linux/pagevec.h> | 42 | #include <linux/pagevec.h> |
diff --git a/include/linux/aio.h b/include/linux/aio.h index 31ff6dba4872..1bdf965339f9 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h | |||
@@ -9,91 +9,32 @@ | |||
9 | 9 | ||
10 | #include <linux/atomic.h> | 10 | #include <linux/atomic.h> |
11 | 11 | ||
12 | #define AIO_MAXSEGS 4 | ||
13 | #define AIO_KIOGRP_NR_ATOMIC 8 | ||
14 | |||
15 | struct kioctx; | 12 | struct kioctx; |
13 | struct kiocb; | ||
16 | 14 | ||
17 | /* Notes on cancelling a kiocb: | 15 | #define KIOCB_KEY 0 |
18 | * If a kiocb is cancelled, aio_complete may return 0 to indicate | ||
19 | * that cancel has not yet disposed of the kiocb. All cancel | ||
20 | * operations *must* call aio_put_req to dispose of the kiocb | ||
21 | * to guard against races with the completion code. | ||
22 | */ | ||
23 | #define KIOCB_C_CANCELLED 0x01 | ||
24 | #define KIOCB_C_COMPLETE 0x02 | ||
25 | |||
26 | #define KIOCB_SYNC_KEY (~0U) | ||
27 | 16 | ||
28 | /* ki_flags bits */ | ||
29 | /* | 17 | /* |
30 | * This may be used for cancel/retry serialization in the future, but | 18 | * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either |
31 | * for now it's unused and we probably don't want modules to even | 19 | * cancelled or completed (this makes a certain amount of sense because |
32 | * think they can use it. | 20 | * successful cancellation - io_cancel() - does deliver the completion to |
21 | * userspace). | ||
22 | * | ||
23 | * And since most things don't implement kiocb cancellation and we'd really like | ||
24 | * kiocb completion to be lockless when possible, we use ki_cancel to | ||
25 | * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED | ||
26 | * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). | ||
33 | */ | 27 | */ |
34 | /* #define KIF_LOCKED 0 */ | 28 | #define KIOCB_CANCELLED ((void *) (~0ULL)) |
35 | #define KIF_KICKED 1 | ||
36 | #define KIF_CANCELLED 2 | ||
37 | |||
38 | #define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) | ||
39 | #define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
40 | 29 | ||
41 | #define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) | 30 | typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); |
42 | #define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
43 | #define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) | ||
44 | 31 | ||
45 | #define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) | ||
46 | #define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
47 | #define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) | ||
48 | |||
49 | #define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) | ||
50 | #define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
51 | #define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) | ||
52 | |||
53 | /* is there a better place to document function pointer methods? */ | ||
54 | /** | ||
55 | * ki_retry - iocb forward progress callback | ||
56 | * @kiocb: The kiocb struct to advance by performing an operation. | ||
57 | * | ||
58 | * This callback is called when the AIO core wants a given AIO operation | ||
59 | * to make forward progress. The kiocb argument describes the operation | ||
60 | * that is to be performed. As the operation proceeds, perhaps partially, | ||
61 | * ki_retry is expected to update the kiocb with progress made. Typically | ||
62 | * ki_retry is set in the AIO core and it itself calls file_operations | ||
63 | * helpers. | ||
64 | * | ||
65 | * ki_retry's return value determines when the AIO operation is completed | ||
66 | * and an event is generated in the AIO event ring. Except the special | ||
67 | * return values described below, the value that is returned from ki_retry | ||
68 | * is transferred directly into the completion ring as the operation's | ||
69 | * resulting status. Once this has happened ki_retry *MUST NOT* reference | ||
70 | * the kiocb pointer again. | ||
71 | * | ||
72 | * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete() | ||
73 | * will be called on the kiocb pointer in the future. The AIO core will | ||
74 | * not ask the method again -- ki_retry must ensure forward progress. | ||
75 | * aio_complete() must be called once and only once in the future, multiple | ||
76 | * calls may result in undefined behaviour. | ||
77 | * | ||
78 | * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb() | ||
79 | * will be called on the kiocb pointer in the future. This may happen | ||
80 | * through generic helpers that associate kiocb->ki_wait with a wait | ||
81 | * queue head that ki_retry uses via current->io_wait. It can also happen | ||
82 | * with custom tracking and manual calls to kick_iocb(), though that is | ||
83 | * discouraged. In either case, kick_iocb() must be called once and only | ||
84 | * once. ki_retry must ensure forward progress, the AIO core will wait | ||
85 | * indefinitely for kick_iocb() to be called. | ||
86 | */ | ||
87 | struct kiocb { | 32 | struct kiocb { |
88 | struct list_head ki_run_list; | 33 | atomic_t ki_users; |
89 | unsigned long ki_flags; | ||
90 | int ki_users; | ||
91 | unsigned ki_key; /* id of this request */ | ||
92 | 34 | ||
93 | struct file *ki_filp; | 35 | struct file *ki_filp; |
94 | struct kioctx *ki_ctx; /* may be NULL for sync ops */ | 36 | struct kioctx *ki_ctx; /* NULL for sync ops */ |
95 | int (*ki_cancel)(struct kiocb *, struct io_event *); | 37 | kiocb_cancel_fn *ki_cancel; |
96 | ssize_t (*ki_retry)(struct kiocb *); | ||
97 | void (*ki_dtor)(struct kiocb *); | 38 | void (*ki_dtor)(struct kiocb *); |
98 | 39 | ||
99 | union { | 40 | union { |
@@ -117,7 +58,6 @@ struct kiocb { | |||
117 | 58 | ||
118 | struct list_head ki_list; /* the aio core uses this | 59 | struct list_head ki_list; /* the aio core uses this |
119 | * for cancellation */ | 60 | * for cancellation */ |
120 | struct list_head ki_batch; /* batch allocation */ | ||
121 | 61 | ||
122 | /* | 62 | /* |
123 | * If the aio_resfd field of the userspace iocb is not zero, | 63 | * If the aio_resfd field of the userspace iocb is not zero, |
@@ -128,106 +68,40 @@ struct kiocb { | |||
128 | 68 | ||
129 | static inline bool is_sync_kiocb(struct kiocb *kiocb) | 69 | static inline bool is_sync_kiocb(struct kiocb *kiocb) |
130 | { | 70 | { |
131 | return kiocb->ki_key == KIOCB_SYNC_KEY; | 71 | return kiocb->ki_ctx == NULL; |
132 | } | 72 | } |
133 | 73 | ||
134 | static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) | 74 | static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) |
135 | { | 75 | { |
136 | *kiocb = (struct kiocb) { | 76 | *kiocb = (struct kiocb) { |
137 | .ki_users = 1, | 77 | .ki_users = ATOMIC_INIT(1), |
138 | .ki_key = KIOCB_SYNC_KEY, | 78 | .ki_ctx = NULL, |
139 | .ki_filp = filp, | 79 | .ki_filp = filp, |
140 | .ki_obj.tsk = current, | 80 | .ki_obj.tsk = current, |
141 | }; | 81 | }; |
142 | } | 82 | } |
143 | 83 | ||
144 | #define AIO_RING_MAGIC 0xa10a10a1 | ||
145 | #define AIO_RING_COMPAT_FEATURES 1 | ||
146 | #define AIO_RING_INCOMPAT_FEATURES 0 | ||
147 | struct aio_ring { | ||
148 | unsigned id; /* kernel internal index number */ | ||
149 | unsigned nr; /* number of io_events */ | ||
150 | unsigned head; | ||
151 | unsigned tail; | ||
152 | |||
153 | unsigned magic; | ||
154 | unsigned compat_features; | ||
155 | unsigned incompat_features; | ||
156 | unsigned header_length; /* size of aio_ring */ | ||
157 | |||
158 | |||
159 | struct io_event io_events[0]; | ||
160 | }; /* 128 bytes + ring size */ | ||
161 | |||
162 | #define AIO_RING_PAGES 8 | ||
163 | struct aio_ring_info { | ||
164 | unsigned long mmap_base; | ||
165 | unsigned long mmap_size; | ||
166 | |||
167 | struct page **ring_pages; | ||
168 | spinlock_t ring_lock; | ||
169 | long nr_pages; | ||
170 | |||
171 | unsigned nr, tail; | ||
172 | |||
173 | struct page *internal_pages[AIO_RING_PAGES]; | ||
174 | }; | ||
175 | |||
176 | static inline unsigned aio_ring_avail(struct aio_ring_info *info, | ||
177 | struct aio_ring *ring) | ||
178 | { | ||
179 | return (ring->head + info->nr - 1 - ring->tail) % info->nr; | ||
180 | } | ||
181 | |||
182 | struct kioctx { | ||
183 | atomic_t users; | ||
184 | int dead; | ||
185 | struct mm_struct *mm; | ||
186 | |||
187 | /* This needs improving */ | ||
188 | unsigned long user_id; | ||
189 | struct hlist_node list; | ||
190 | |||
191 | wait_queue_head_t wait; | ||
192 | |||
193 | spinlock_t ctx_lock; | ||
194 | |||
195 | int reqs_active; | ||
196 | struct list_head active_reqs; /* used for cancellation */ | ||
197 | struct list_head run_list; /* used for kicked reqs */ | ||
198 | |||
199 | /* sys_io_setup currently limits this to an unsigned int */ | ||
200 | unsigned max_reqs; | ||
201 | |||
202 | struct aio_ring_info ring_info; | ||
203 | |||
204 | struct delayed_work wq; | ||
205 | |||
206 | struct rcu_head rcu_head; | ||
207 | }; | ||
208 | |||
209 | /* prototypes */ | 84 | /* prototypes */ |
210 | extern unsigned aio_max_size; | ||
211 | |||
212 | #ifdef CONFIG_AIO | 85 | #ifdef CONFIG_AIO |
213 | extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); | 86 | extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); |
214 | extern int aio_put_req(struct kiocb *iocb); | 87 | extern void aio_put_req(struct kiocb *iocb); |
215 | extern void kick_iocb(struct kiocb *iocb); | 88 | extern void aio_complete(struct kiocb *iocb, long res, long res2); |
216 | extern int aio_complete(struct kiocb *iocb, long res, long res2); | ||
217 | struct mm_struct; | 89 | struct mm_struct; |
218 | extern void exit_aio(struct mm_struct *mm); | 90 | extern void exit_aio(struct mm_struct *mm); |
219 | extern long do_io_submit(aio_context_t ctx_id, long nr, | 91 | extern long do_io_submit(aio_context_t ctx_id, long nr, |
220 | struct iocb __user *__user *iocbpp, bool compat); | 92 | struct iocb __user *__user *iocbpp, bool compat); |
93 | void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); | ||
221 | #else | 94 | #else |
222 | static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } | 95 | static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } |
223 | static inline int aio_put_req(struct kiocb *iocb) { return 0; } | 96 | static inline void aio_put_req(struct kiocb *iocb) { } |
224 | static inline void kick_iocb(struct kiocb *iocb) { } | 97 | static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } |
225 | static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; } | ||
226 | struct mm_struct; | 98 | struct mm_struct; |
227 | static inline void exit_aio(struct mm_struct *mm) { } | 99 | static inline void exit_aio(struct mm_struct *mm) { } |
228 | static inline long do_io_submit(aio_context_t ctx_id, long nr, | 100 | static inline long do_io_submit(aio_context_t ctx_id, long nr, |
229 | struct iocb __user * __user *iocbpp, | 101 | struct iocb __user * __user *iocbpp, |
230 | bool compat) { return 0; } | 102 | bool compat) { return 0; } |
103 | static inline void kiocb_set_cancel_fn(struct kiocb *req, | ||
104 | kiocb_cancel_fn *cancel) { } | ||
231 | #endif /* CONFIG_AIO */ | 105 | #endif /* CONFIG_AIO */ |
232 | 106 | ||
233 | static inline struct kiocb *list_kiocb(struct list_head *h) | 107 | static inline struct kiocb *list_kiocb(struct list_head *h) |
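A sketch of how a driver might use the new kiocb_cancel_fn typedef and kiocb_set_cancel_fn() declared above. The example_* helpers are hypothetical, and the exact result/refcount contract of a cancel method is defined by kiocb_cancel() in fs/aio.c rather than spelled out here.

	static int example_cancel(struct kiocb *iocb, struct io_event *event)
	{
		/* assumed contract: stop the in-flight request, report its outcome */
		event->res  = example_abort_request(iocb->private);
		event->res2 = 0;
		return 0;			/* 0: cancelled, event goes to userspace */
	}

	static ssize_t example_cancellable_aio_read(struct kiocb *iocb,
						    const struct iovec *iov,
						    unsigned long nr_segs, loff_t pos)
	{
		kiocb_set_cancel_fn(iocb, example_cancel);
		example_start_request(iocb, iov, nr_segs, pos);
		return -EIOCBQUEUED;
	}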
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3bff9ce09cf7..5047355b9a0f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -28,6 +28,7 @@ struct cgroup_subsys; | |||
28 | struct inode; | 28 | struct inode; |
29 | struct cgroup; | 29 | struct cgroup; |
30 | struct css_id; | 30 | struct css_id; |
31 | struct eventfd_ctx; | ||
31 | 32 | ||
32 | extern int cgroup_init_early(void); | 33 | extern int cgroup_init_early(void); |
33 | extern int cgroup_init(void); | 34 | extern int cgroup_init(void); |
diff --git a/include/linux/errno.h b/include/linux/errno.h index f6bf082d4d4f..89627b9187f9 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h | |||
@@ -28,6 +28,5 @@ | |||
28 | #define EBADTYPE 527 /* Type not supported by server */ | 28 | #define EBADTYPE 527 /* Type not supported by server */ |
29 | #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ | 29 | #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ |
30 | #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ | 30 | #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ |
31 | #define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ | ||
32 | 31 | ||
33 | #endif | 32 | #endif |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a62df310f2e..6b4890fa57e7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -189,8 +189,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) | |||
189 | 189 | ||
190 | extern const struct file_operations hugetlbfs_file_operations; | 190 | extern const struct file_operations hugetlbfs_file_operations; |
191 | extern const struct vm_operations_struct hugetlb_vm_ops; | 191 | extern const struct vm_operations_struct hugetlb_vm_ops; |
192 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 192 | struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, |
193 | size_t size, vm_flags_t acct, | ||
194 | struct user_struct **user, int creat_flags, | 193 | struct user_struct **user, int creat_flags, |
195 | int page_size_log); | 194 | int page_size_log); |
196 | 195 | ||
@@ -209,8 +208,8 @@ static inline int is_file_hugepages(struct file *file) | |||
209 | 208 | ||
210 | #define is_file_hugepages(file) 0 | 209 | #define is_file_hugepages(file) 0 |
211 | static inline struct file * | 210 | static inline struct file * |
212 | hugetlb_file_setup(const char *name, unsigned long addr, size_t size, | 211 | hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, |
213 | vm_flags_t acctflag, struct user_struct **user, int creat_flags, | 212 | struct user_struct **user, int creat_flags, |
214 | int page_size_log) | 213 | int page_size_log) |
215 | { | 214 | { |
216 | return ERR_PTR(-ENOSYS); | 215 | return ERR_PTR(-ENOSYS); |
@@ -288,6 +287,13 @@ static inline struct hstate *hstate_file(struct file *f) | |||
288 | return hstate_inode(file_inode(f)); | 287 | return hstate_inode(file_inode(f)); |
289 | } | 288 | } |
290 | 289 | ||
290 | static inline struct hstate *hstate_sizelog(int page_size_log) | ||
291 | { | ||
292 | if (!page_size_log) | ||
293 | return &default_hstate; | ||
294 | return size_to_hstate(1 << page_size_log); | ||
295 | } | ||
296 | |||
291 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) | 297 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) |
292 | { | 298 | { |
293 | return hstate_file(vma->vm_file); | 299 | return hstate_file(vma->vm_file); |
@@ -352,11 +358,12 @@ static inline int hstate_index(struct hstate *h) | |||
352 | return h - hstates; | 358 | return h - hstates; |
353 | } | 359 | } |
354 | 360 | ||
355 | #else | 361 | #else /* CONFIG_HUGETLB_PAGE */ |
356 | struct hstate {}; | 362 | struct hstate {}; |
357 | #define alloc_huge_page_node(h, nid) NULL | 363 | #define alloc_huge_page_node(h, nid) NULL |
358 | #define alloc_bootmem_huge_page(h) NULL | 364 | #define alloc_bootmem_huge_page(h) NULL |
359 | #define hstate_file(f) NULL | 365 | #define hstate_file(f) NULL |
366 | #define hstate_sizelog(s) NULL | ||
360 | #define hstate_vma(v) NULL | 367 | #define hstate_vma(v) NULL |
361 | #define hstate_inode(i) NULL | 368 | #define hstate_inode(i) NULL |
362 | #define huge_page_size(h) PAGE_SIZE | 369 | #define huge_page_size(h) PAGE_SIZE |
@@ -371,6 +378,6 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) | |||
371 | } | 378 | } |
372 | #define hstate_index_to_shift(index) 0 | 379 | #define hstate_index_to_shift(index) 0 |
373 | #define hstate_index(h) 0 | 380 | #define hstate_index(h) 0 |
374 | #endif | 381 | #endif /* CONFIG_HUGETLB_PAGE */ |
375 | 382 | ||
376 | #endif /* _LINUX_HUGETLB_H */ | 383 | #endif /* _LINUX_HUGETLB_H */ |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a7f19e7f1a0..e0c8528a41a4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -951,13 +951,19 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | |||
951 | * (see walk_page_range for more details) | 951 | * (see walk_page_range for more details) |
952 | */ | 952 | */ |
953 | struct mm_walk { | 953 | struct mm_walk { |
954 | int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); | 954 | int (*pgd_entry)(pgd_t *pgd, unsigned long addr, |
955 | int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); | 955 | unsigned long next, struct mm_walk *walk); |
956 | int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); | 956 | int (*pud_entry)(pud_t *pud, unsigned long addr, |
957 | int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); | 957 | unsigned long next, struct mm_walk *walk); |
958 | int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); | 958 | int (*pmd_entry)(pmd_t *pmd, unsigned long addr, |
959 | int (*hugetlb_entry)(pte_t *, unsigned long, | 959 | unsigned long next, struct mm_walk *walk); |
960 | unsigned long, unsigned long, struct mm_walk *); | 960 | int (*pte_entry)(pte_t *pte, unsigned long addr, |
961 | unsigned long next, struct mm_walk *walk); | ||
962 | int (*pte_hole)(unsigned long addr, unsigned long next, | ||
963 | struct mm_walk *walk); | ||
964 | int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, | ||
965 | unsigned long addr, unsigned long next, | ||
966 | struct mm_walk *walk); | ||
961 | struct mm_struct *mm; | 967 | struct mm_struct *mm; |
962 | void *private; | 968 | void *private; |
963 | }; | 969 | }; |
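The prototypes above only gain parameter names, but for completeness, a minimal walker showing how the callbacks are wired; the example_* names are illustrative.

	static int example_pte(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
	{
		unsigned long *count = walk->private;

		if (pte_present(*pte))
			(*count)++;
		return 0;
	}

	static unsigned long example_count_present(struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end)
	{
		unsigned long count = 0;
		struct mm_walk walk = {
			.pte_entry	= example_pte,
			.mm		= mm,
			.private	= &count,
		};

		down_read(&mm->mmap_sem);	/* walk_page_range() expects mmap_sem held */
		walk_page_range(start, end, &walk);
		up_read(&mm->mmap_sem);
		return count;
	}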
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 731e4ecee3bd..e2772666f004 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/bug.h> | 5 | #include <linux/bug.h> |
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/workqueue.h> | ||
7 | #include <linux/threads.h> | 8 | #include <linux/threads.h> |
8 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
9 | #include <linux/kref.h> | 10 | #include <linux/kref.h> |
diff --git a/include/linux/random.h b/include/linux/random.h index 347ce553a306..3b9377d6b7a5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h | |||
@@ -29,13 +29,6 @@ u32 prandom_u32(void); | |||
29 | void prandom_bytes(void *buf, int nbytes); | 29 | void prandom_bytes(void *buf, int nbytes); |
30 | void prandom_seed(u32 seed); | 30 | void prandom_seed(u32 seed); |
31 | 31 | ||
32 | /* | ||
33 | * These macros are preserved for backward compatibility and should be | ||
34 | * removed as soon as a transition is finished. | ||
35 | */ | ||
36 | #define random32() prandom_u32() | ||
37 | #define srandom32(seed) prandom_seed(seed) | ||
38 | |||
39 | u32 prandom_u32_state(struct rnd_state *); | 32 | u32 prandom_u32_state(struct rnd_state *); |
40 | void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); | 33 | void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); |
41 | 34 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 4800e9d1864c..022c085ac3c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -313,8 +313,6 @@ extern void schedule_preempt_disabled(void); | |||
313 | struct nsproxy; | 313 | struct nsproxy; |
314 | struct user_namespace; | 314 | struct user_namespace; |
315 | 315 | ||
316 | #include <linux/aio.h> | ||
317 | |||
318 | #ifdef CONFIG_MMU | 316 | #ifdef CONFIG_MMU |
319 | extern void arch_pick_mmap_layout(struct mm_struct *mm); | 317 | extern void arch_pick_mmap_layout(struct mm_struct *mm); |
320 | extern unsigned long | 318 | extern unsigned long |
diff --git a/include/linux/wait.h b/include/linux/wait.h index 7cb64d4b499d..ac38be2692d8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
@@ -330,6 +330,92 @@ do { \ | |||
330 | __ret; \ | 330 | __ret; \ |
331 | }) | 331 | }) |
332 | 332 | ||
333 | #define __wait_event_hrtimeout(wq, condition, timeout, state) \ | ||
334 | ({ \ | ||
335 | int __ret = 0; \ | ||
336 | DEFINE_WAIT(__wait); \ | ||
337 | struct hrtimer_sleeper __t; \ | ||
338 | \ | ||
339 | hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ | ||
340 | HRTIMER_MODE_REL); \ | ||
341 | hrtimer_init_sleeper(&__t, current); \ | ||
342 | if ((timeout).tv64 != KTIME_MAX) \ | ||
343 | hrtimer_start_range_ns(&__t.timer, timeout, \ | ||
344 | current->timer_slack_ns, \ | ||
345 | HRTIMER_MODE_REL); \ | ||
346 | \ | ||
347 | for (;;) { \ | ||
348 | prepare_to_wait(&wq, &__wait, state); \ | ||
349 | if (condition) \ | ||
350 | break; \ | ||
351 | if (state == TASK_INTERRUPTIBLE && \ | ||
352 | signal_pending(current)) { \ | ||
353 | __ret = -ERESTARTSYS; \ | ||
354 | break; \ | ||
355 | } \ | ||
356 | if (!__t.task) { \ | ||
357 | __ret = -ETIME; \ | ||
358 | break; \ | ||
359 | } \ | ||
360 | schedule(); \ | ||
361 | } \ | ||
362 | \ | ||
363 | hrtimer_cancel(&__t.timer); \ | ||
364 | destroy_hrtimer_on_stack(&__t.timer); \ | ||
365 | finish_wait(&wq, &__wait); \ | ||
366 | __ret; \ | ||
367 | }) | ||
368 | |||
369 | /** | ||
370 | * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses | ||
371 | * @wq: the waitqueue to wait on | ||
372 | * @condition: a C expression for the event to wait for | ||
373 | * @timeout: timeout, as a ktime_t | ||
374 | * | ||
375 | * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the | ||
376 | * @condition evaluates to true or the timeout elapses. | ||
377 | * The @condition is checked each time the waitqueue @wq is woken up. | ||
378 | * | ||
379 | * wake_up() has to be called after changing any variable that could | ||
380 | * change the result of the wait condition. | ||
381 | * | ||
382 | * The function returns 0 if @condition became true, or -ETIME if the timeout | ||
383 | * elapsed. | ||
384 | */ | ||
385 | #define wait_event_hrtimeout(wq, condition, timeout) \ | ||
386 | ({ \ | ||
387 | int __ret = 0; \ | ||
388 | if (!(condition)) \ | ||
389 | __ret = __wait_event_hrtimeout(wq, condition, timeout, \ | ||
390 | TASK_UNINTERRUPTIBLE); \ | ||
391 | __ret; \ | ||
392 | }) | ||
393 | |||
394 | /** | ||
395 | * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses | ||
396 | * @wq: the waitqueue to wait on | ||
397 | * @condition: a C expression for the event to wait for | ||
398 | * @timeout: timeout, as a ktime_t | ||
399 | * | ||
400 | * The process is put to sleep (TASK_INTERRUPTIBLE) until the | ||
401 | * @condition evaluates to true or a signal is received. | ||
402 | * The @condition is checked each time the waitqueue @wq is woken up. | ||
403 | * | ||
404 | * wake_up() has to be called after changing any variable that could | ||
405 | * change the result of the wait condition. | ||
406 | * | ||
407 | * The function returns 0 if @condition became true, -ERESTARTSYS if it was | ||
408 | * interrupted by a signal, or -ETIME if the timeout elapsed. | ||
409 | */ | ||
410 | #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ | ||
411 | ({ \ | ||
412 | long __ret = 0; \ | ||
413 | if (!(condition)) \ | ||
414 | __ret = __wait_event_hrtimeout(wq, condition, timeout, \ | ||
415 | TASK_INTERRUPTIBLE); \ | ||
416 | __ret; \ | ||
417 | }) | ||
418 | |||
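For illustration only, a minimal, hypothetical caller of the new helpers; struct my_dev, its wait queue and its data_ready flag are invented for this sketch and are not part of the patch. Judging from the sleeper setup in __wait_event_hrtimeout above, the hrtimer wakes the task and clears __t.task when it fires, which is what the -ETIME check tests.

    /* Sketch only: my_dev and data_ready are hypothetical. */
    struct my_dev {
        wait_queue_head_t wq;       /* init_waitqueue_head() at probe time */
        bool data_ready;
    };

    static int my_dev_wait_for_data(struct my_dev *dev)
    {
        ktime_t timeout = ktime_set(0, 100 * NSEC_PER_MSEC);   /* 100 ms */
        long ret;

        ret = wait_event_interruptible_hrtimeout(dev->wq, dev->data_ready,
                                                 timeout);
        if (ret == -ETIME)
            return -ETIMEDOUT;      /* timer fired before the flag was set */
        if (ret == -ERESTARTSYS)
            return ret;             /* interrupted by a signal */
        return 0;                   /* condition became true */
    }

    /* Producer side: set the flag before calling wake_up(), as documented. */
    static void my_dev_data_arrived(struct my_dev *dev)
    {
        dev->data_ready = true;
        wake_up(&dev->wq);
    }

wait_event_hrtimeout() is called the same way but sleeps in TASK_UNINTERRUPTIBLE, so only 0 or -ETIME can come back.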
333 | #define __wait_event_interruptible_exclusive(wq, condition, ret) \ | 419 | #define __wait_event_interruptible_exclusive(wq, condition, ret) \ |
334 | do { \ | 420 | do { \ |
335 | DEFINE_WAIT(__wait); \ | 421 | DEFINE_WAIT(__wait); \ |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9a9367c0c076..579a5007c696 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #define WRITEBACK_H | 5 | #define WRITEBACK_H |
6 | 6 | ||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <linux/workqueue.h> | ||
8 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
9 | 10 | ||
10 | DECLARE_PER_CPU(int, dirty_throttle_leaks); | 11 | DECLARE_PER_CPU(int, dirty_throttle_leaks); |
@@ -491,10 +491,14 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) | |||
491 | 491 | ||
492 | sprintf (name, "SYSV%08x", key); | 492 | sprintf (name, "SYSV%08x", key); |
493 | if (shmflg & SHM_HUGETLB) { | 493 | if (shmflg & SHM_HUGETLB) { |
494 | struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) | ||
495 | & SHM_HUGE_MASK); | ||
496 | size_t hugesize = ALIGN(size, huge_page_size(hs)); | ||
497 | |||
494 | /* hugetlb_file_setup applies strict accounting */ | 498 | /* hugetlb_file_setup applies strict accounting */ |
495 | if (shmflg & SHM_NORESERVE) | 499 | if (shmflg & SHM_NORESERVE) |
496 | acctflag = VM_NORESERVE; | 500 | acctflag = VM_NORESERVE; |
497 | file = hugetlb_file_setup(name, 0, size, acctflag, | 501 | file = hugetlb_file_setup(name, hugesize, acctflag, |
498 | &shp->mlock_user, HUGETLB_SHMFS_INODE, | 502 | &shp->mlock_user, HUGETLB_SHMFS_INODE, |
499 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); | 503 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); |
500 | } else { | 504 | } else { |
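For illustration only, a hypothetical userspace caller showing what the new rounding in newseg() buys: the requested segment size no longer has to be a multiple of the selected huge page size. The fallback defines below only keep the sketch self-contained and mirror the uapi values of SHM_HUGETLB and SHM_HUGE_SHIFT.

    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>

    #ifndef SHM_HUGETLB
    #define SHM_HUGETLB     04000   /* mirrors the uapi value */
    #endif
    #ifndef SHM_HUGE_SHIFT
    #define SHM_HUGE_SHIFT  26      /* mirrors the uapi value */
    #endif

    int main(void)
    {
        /* Ask for ~3 MB backed by 2 MB huge pages; 21 == log2(2 MB). */
        int id = shmget(IPC_PRIVATE, 3 * 1024 * 1024,
                        SHM_HUGETLB | (21 << SHM_HUGE_SHIFT) |
                        IPC_CREAT | 0600);

        if (id < 0)
            perror("shmget");
        /* With this hunk, newseg() rounds the 3 MB request up to the 4 MB
         * boundary of the selected page size before setting up the file. */
        return 0;
    }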
diff --git a/kernel/fork.c b/kernel/fork.c index 7d40687b1434..c509cc4a0d53 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -70,6 +70,7 @@ | |||
70 | #include <linux/khugepaged.h> | 70 | #include <linux/khugepaged.h> |
71 | #include <linux/signalfd.h> | 71 | #include <linux/signalfd.h> |
72 | #include <linux/uprobes.h> | 72 | #include <linux/uprobes.h> |
73 | #include <linux/aio.h> | ||
73 | 74 | ||
74 | #include <asm/pgtable.h> | 75 | #include <asm/pgtable.h> |
75 | #include <asm/pgalloc.h> | 76 | #include <asm/pgalloc.h> |
diff --git a/kernel/printk.c b/kernel/printk.c index 96dcfcd9a2d4..fa36e1494420 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> |
35 | #include <linux/aio.h> | ||
35 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
36 | #include <linux/kexec.h> | 37 | #include <linux/kexec.h> |
37 | #include <linux/kdb.h> | 38 | #include <linux/kdb.h> |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 17ae54da0ec2..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/ptrace.h> | 17 | #include <linux/ptrace.h> |
18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
19 | #include <linux/signal.h> | 19 | #include <linux/signal.h> |
20 | #include <linux/uio.h> | ||
20 | #include <linux/audit.h> | 21 | #include <linux/audit.h> |
21 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
22 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f1d92163f30..cb1c9dedf9b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -92,16 +92,18 @@ enum mem_cgroup_stat_index { | |||
92 | /* | 92 | /* |
93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
94 | */ | 94 | */ |
95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
97 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 97 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ |
98 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | 98 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
99 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
99 | MEM_CGROUP_STAT_NSTATS, | 100 | MEM_CGROUP_STAT_NSTATS, |
100 | }; | 101 | }; |
101 | 102 | ||
102 | static const char * const mem_cgroup_stat_names[] = { | 103 | static const char * const mem_cgroup_stat_names[] = { |
103 | "cache", | 104 | "cache", |
104 | "rss", | 105 | "rss", |
106 | "rss_huge", | ||
105 | "mapped_file", | 107 | "mapped_file", |
106 | "swap", | 108 | "swap", |
107 | }; | 109 | }; |
@@ -917,6 +919,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
917 | } | 919 | } |
918 | 920 | ||
919 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 921 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
922 | struct page *page, | ||
920 | bool anon, int nr_pages) | 923 | bool anon, int nr_pages) |
921 | { | 924 | { |
922 | preempt_disable(); | 925 | preempt_disable(); |
@@ -932,6 +935,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
932 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 935 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
933 | nr_pages); | 936 | nr_pages); |
934 | 937 | ||
938 | if (PageTransHuge(page)) | ||
939 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | ||
940 | nr_pages); | ||
941 | |||
935 | /* pagein of a big page is an event. So, ignore page size */ | 942 | /* pagein of a big page is an event. So, ignore page size */ |
936 | if (nr_pages > 0) | 943 | if (nr_pages > 0) |
937 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 944 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
@@ -2914,7 +2921,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2914 | else | 2921 | else |
2915 | anon = false; | 2922 | anon = false; |
2916 | 2923 | ||
2917 | mem_cgroup_charge_statistics(memcg, anon, nr_pages); | 2924 | mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); |
2918 | unlock_page_cgroup(pc); | 2925 | unlock_page_cgroup(pc); |
2919 | 2926 | ||
2920 | /* | 2927 | /* |
@@ -3708,16 +3715,21 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3708 | { | 3715 | { |
3709 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 3716 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
3710 | struct page_cgroup *pc; | 3717 | struct page_cgroup *pc; |
3718 | struct mem_cgroup *memcg; | ||
3711 | int i; | 3719 | int i; |
3712 | 3720 | ||
3713 | if (mem_cgroup_disabled()) | 3721 | if (mem_cgroup_disabled()) |
3714 | return; | 3722 | return; |
3723 | |||
3724 | memcg = head_pc->mem_cgroup; | ||
3715 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 3725 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
3716 | pc = head_pc + i; | 3726 | pc = head_pc + i; |
3717 | pc->mem_cgroup = head_pc->mem_cgroup; | 3727 | pc->mem_cgroup = memcg; |
3718 | smp_wmb();/* see __commit_charge() */ | 3728 | smp_wmb();/* see __commit_charge() */ |
3719 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 3729 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
3720 | } | 3730 | } |
3731 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | ||
3732 | HPAGE_PMD_NR); | ||
3721 | } | 3733 | } |
3722 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3734 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
3723 | 3735 | ||
@@ -3773,11 +3785,11 @@ static int mem_cgroup_move_account(struct page *page, | |||
3773 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3785 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
3774 | preempt_enable(); | 3786 | preempt_enable(); |
3775 | } | 3787 | } |
3776 | mem_cgroup_charge_statistics(from, anon, -nr_pages); | 3788 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
3777 | 3789 | ||
3778 | /* caller should have done css_get */ | 3790 | /* caller should have done css_get */ |
3779 | pc->mem_cgroup = to; | 3791 | pc->mem_cgroup = to; |
3780 | mem_cgroup_charge_statistics(to, anon, nr_pages); | 3792 | mem_cgroup_charge_statistics(to, page, anon, nr_pages); |
3781 | move_unlock_mem_cgroup(from, &flags); | 3793 | move_unlock_mem_cgroup(from, &flags); |
3782 | ret = 0; | 3794 | ret = 0; |
3783 | unlock: | 3795 | unlock: |
@@ -4152,7 +4164,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, | |||
4152 | break; | 4164 | break; |
4153 | } | 4165 | } |
4154 | 4166 | ||
4155 | mem_cgroup_charge_statistics(memcg, anon, -nr_pages); | 4167 | mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); |
4156 | 4168 | ||
4157 | ClearPageCgroupUsed(pc); | 4169 | ClearPageCgroupUsed(pc); |
4158 | /* | 4170 | /* |
@@ -4502,7 +4514,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
4502 | lock_page_cgroup(pc); | 4514 | lock_page_cgroup(pc); |
4503 | if (PageCgroupUsed(pc)) { | 4515 | if (PageCgroupUsed(pc)) { |
4504 | memcg = pc->mem_cgroup; | 4516 | memcg = pc->mem_cgroup; |
4505 | mem_cgroup_charge_statistics(memcg, false, -1); | 4517 | mem_cgroup_charge_statistics(memcg, oldpage, false, -1); |
4506 | ClearPageCgroupUsed(pc); | 4518 | ClearPageCgroupUsed(pc); |
4507 | } | 4519 | } |
4508 | unlock_page_cgroup(pc); | 4520 | unlock_page_cgroup(pc); |
@@ -5030,6 +5042,10 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
5030 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | 5042 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); |
5031 | } | 5043 | } |
5032 | 5044 | ||
5045 | /* | ||
5046 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
5047 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
5048 | */ | ||
5033 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | 5049 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); |
5034 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 5050 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
5035 | 5051 | ||
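To make the double-accounting note concrete, with invented figures: a cgroup holding three 2 MiB transparent huge pages, 1 MiB of ordinary anonymous memory and 2 MiB of page cache reports cache = 2 MiB, rss = 7 MiB and rss_huge = 6 MiB. The usage computed here is cache + rss = 9 MiB; rss_huge is not added again because those pages are already counted inside rss. When one of the huge pages is split (mem_cgroup_split_huge_fixup above), rss_huge drops by 2 MiB while rss and usage stay the same.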
@@ -1363,15 +1363,20 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1363 | file = fget(fd); | 1363 | file = fget(fd); |
1364 | if (!file) | 1364 | if (!file) |
1365 | goto out; | 1365 | goto out; |
1366 | if (is_file_hugepages(file)) | ||
1367 | len = ALIGN(len, huge_page_size(hstate_file(file))); | ||
1366 | } else if (flags & MAP_HUGETLB) { | 1368 | } else if (flags & MAP_HUGETLB) { |
1367 | struct user_struct *user = NULL; | 1369 | struct user_struct *user = NULL; |
1370 | |||
1371 | len = ALIGN(len, huge_page_size(hstate_sizelog( | ||
1372 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))); | ||
1368 | /* | 1373 | /* |
1369 | * VM_NORESERVE is used because the reservations will be | 1374 | * VM_NORESERVE is used because the reservations will be |
1370 | * taken when vm_ops->mmap() is called | 1375 | * taken when vm_ops->mmap() is called |
1371 | * A dummy user value is used because we are not locking | 1376 | * A dummy user value is used because we are not locking |
1372 | * memory so no accounting is necessary | 1377 | * memory so no accounting is necessary |
1373 | */ | 1378 | */ |
1374 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, | 1379 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, |
1375 | VM_NORESERVE, | 1380 | VM_NORESERVE, |
1376 | &user, HUGETLB_ANONHUGE_INODE, | 1381 | &user, HUGETLB_ANONHUGE_INODE, |
1377 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | 1382 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); |
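For illustration only, a hypothetical anonymous MAP_HUGETLB caller; the fallback defines mirror the x86 uapi values and exist only to keep the sketch self-contained.

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB     0x40000 /* mirrors the x86 uapi value */
    #endif
    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT  26      /* mirrors the uapi value */
    #endif

    int main(void)
    {
        /* ~3 MB on 2 MB huge pages (21 == log2(2 MB)); with this hunk
         * mmap_pgoff() ALIGNs the length up to 4 MB before the mapping
         * and its reservation are set up. */
        void *p = mmap(NULL, 3 * 1024 * 1024, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                       (21 << MAP_HUGE_SHIFT), -1, 0);

        if (p == MAP_FAILED)
            perror("mmap");
        return 0;
    }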
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..8a8cd0265e52 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -14,9 +14,6 @@ | |||
14 | * use_mm | 14 | * use_mm |
15 | * Makes the calling kernel thread take on the specified | 15 | * Makes the calling kernel thread take on the specified |
16 | * mm context. | 16 | * mm context. |
17 | * Called by the retry thread execute retries within the | ||
18 | * iocb issuer's mm context, so that copy_from/to_user | ||
19 | * operations work seamlessly for aio. | ||
20 | * (Note: this routine is intended to be called only | 17 | * (Note: this routine is intended to be called only |
21 | * from a kernel thread context) | 18 | * from a kernel thread context) |
22 | */ | 19 | */ |
diff --git a/mm/page_io.c b/mm/page_io.c index bb5d75274686..06a8842a6ec6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
23 | #include <linux/aio.h> | ||
23 | #include <asm/pgtable.h> | 24 | #include <asm/pgtable.h> |
24 | 25 | ||
25 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 26 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
diff --git a/mm/shmem.c b/mm/shmem.c index 39b2a0b86fe8..5e6a8422658b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
32 | #include <linux/export.h> | 32 | #include <linux/export.h> |
33 | #include <linux/swap.h> | 33 | #include <linux/swap.h> |
34 | #include <linux/aio.h> | ||
34 | 35 | ||
35 | static struct vfsmount *shm_mnt; | 36 | static struct vfsmount *shm_mnt; |
36 | 37 | ||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/backing-dev.h> | 30 | #include <linux/backing-dev.h> |
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | ||
33 | 34 | ||
34 | #include "internal.h" | 35 | #include "internal.h" |
35 | 36 | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b12fd8612604..d365724feb05 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1522,6 +1522,8 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1522 | * Must not be called in NMI context (strictly speaking, only if we don't | 1522 | * Must not be called in NMI context (strictly speaking, only if we don't |
1523 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling | 1523 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
1524 | * conventions for vfree() arch-dependent would be a really bad idea) | 1524 | * conventions for vfree() arch-dependent would be a really bad idea) |
1525 | * | ||
1526 | * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) | ||
1525 | * | 1527 | * |
1526 | */ | 1528 | */ |
1527 | void vfree(const void *addr) | 1529 | void vfree(const void *addr) |
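A hedged sketch of where that size requirement comes from, reconstructed from the deferred-free scheme this series introduces (names and layout are approximate, not copied from the patch): when vfree() runs in interrupt context it cannot tear the mapping down immediately, so the block being freed is itself linked into a per-cpu llist and released later from a workqueue, which is only safe if the allocation can hold a struct llist_node.

    /* Approximation of the mechanism, not the literal upstream code. */
    struct vfree_deferred {
        struct llist_head list;
        struct work_struct wq;
    };
    static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

    void vfree(const void *addr)
    {
        if (!addr)
            return;
        if (unlikely(in_interrupt())) {
            struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);

            /* The freed object doubles as the list node -- hence the
             * "size >= sizeof(llist_node)" note above. */
            if (llist_add((struct llist_node *)addr, &p->list))
                schedule_work(&p->wq);
        } else {
            __vunmap(addr, 1);
        }
    }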
diff --git a/security/keys/internal.h b/security/keys/internal.h index 8bbefc3b55d4..d4f1468b9b50 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/key-type.h> | 16 | #include <linux/key-type.h> |
17 | #include <linux/task_work.h> | 17 | #include <linux/task_work.h> |
18 | 18 | ||
19 | struct iovec; | ||
20 | |||
19 | #ifdef __KDEBUG | 21 | #ifdef __KDEBUG |
20 | #define kenter(FMT, ...) \ | 22 | #define kenter(FMT, ...) \ |
21 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | 23 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) |
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 4b5c948eb414..33cfd27b4de2 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/err.h> | 22 | #include <linux/err.h> |
23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
24 | #include <linux/security.h> | 24 | #include <linux/security.h> |
25 | #include <linux/uio.h> | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | #include "internal.h" | 27 | #include "internal.h" |
27 | 28 | ||
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 23e3c46cd0a4..ccfa383f1fda 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/time.h> | 26 | #include <linux/time.h> |
27 | #include <linux/pm_qos.h> | 27 | #include <linux/pm_qos.h> |
28 | #include <linux/uio.h> | 28 | #include <linux/aio.h> |
29 | #include <linux/dma-mapping.h> | 29 | #include <linux/dma-mapping.h> |
30 | #include <sound/core.h> | 30 | #include <sound/core.h> |
31 | #include <sound/control.h> | 31 | #include <sound/control.h> |