89 files changed, 900 insertions, 1364 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 09027a9fece5..ddf4f93967a9 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -480,7 +480,9 @@ memory.stat file includes following statistics
 
 # per-memory cgroup local status
 cache		- # of bytes of page cache memory.
-rss		- # of bytes of anonymous and swap cache memory.
+rss		- # of bytes of anonymous and swap cache memory (includes
+		  transparent hugepages).
+rss_huge	- # of bytes of anonymous transparent hugepages.
 mapped_file	- # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin		- # of charging events to the memory cgroup. The charging
 	 event happens each time a page is accounted as either mapped
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 5f7d7ba2874c..7a539f4f5e30 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/aio.h>
 #include <asm/ebcdic.h>
 #include "hypfs.h"
 
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index 9b40c9c12a0c..6cfc1b09ec25 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -253,24 +253,15 @@ void __init leon_smp_done(void)
 
 	/* Free unneeded trap tables */
 	if (!cpu_present(1)) {
-		ClearPageReserved(virt_to_page(&trapbase_cpu1));
-		init_page_count(virt_to_page(&trapbase_cpu1));
-		free_page((unsigned long)&trapbase_cpu1);
-		totalram_pages++;
+		free_reserved_page(virt_to_page(&trapbase_cpu1));
 		num_physpages++;
 	}
 	if (!cpu_present(2)) {
-		ClearPageReserved(virt_to_page(&trapbase_cpu2));
-		init_page_count(virt_to_page(&trapbase_cpu2));
-		free_page((unsigned long)&trapbase_cpu2);
-		totalram_pages++;
+		free_reserved_page(virt_to_page(&trapbase_cpu2));
 		num_physpages++;
 	}
 	if (!cpu_present(3)) {
-		ClearPageReserved(virt_to_page(&trapbase_cpu3));
-		init_page_count(virt_to_page(&trapbase_cpu3));
-		free_page((unsigned long)&trapbase_cpu3);
-		totalram_pages++;
+		free_reserved_page(virt_to_page(&trapbase_cpu3));
 		num_physpages++;
 	}
 	/* Ok, they are spinning and ready to go. */
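Each of the three open-coded four-line sequences above collapses into a single free_reserved_page() call. A sketch of what that helper folds together, inferred from the removed lines (the canonical definition lives in include/linux/mm.h in this series):

```c
/* Sketch, inferred from the removed lines above; see include/linux/mm.h
 * for the authoritative helper introduced by this series. */
static inline void free_reserved_page(struct page *page)
{
	ClearPageReserved(page);	/* page no longer pinned as reserved */
	init_page_count(page);		/* reset refcount so it can be freed */
	__free_page(page);		/* hand it back to the buddy allocator */
	totalram_pages++;		/* one more page of usable RAM */
}
```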
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 4490c397bb5b..af472cf7c69a 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -366,45 +366,14 @@ void __init mem_init(void)
 
 void free_initmem (void)
 {
-	unsigned long addr;
-	unsigned long freed;
-
-	addr = (unsigned long)(&__init_begin);
-	freed = (unsigned long)(&__init_end) - addr;
-	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
-		struct page *p;
-
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		p = virt_to_page(addr);
-
-		ClearPageReserved(p);
-		init_page_count(p);
-		__free_page(p);
-		totalram_pages++;
-		num_physpages++;
-	}
-	printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
-		freed >> 10);
+	num_physpages += free_initmem_default(POISON_FREE_INITMEM);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	if (start < end)
-		printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
-			(end - start) >> 10);
-	for (; start < end; start += PAGE_SIZE) {
-		struct page *p;
-
-		memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
-		p = virt_to_page(start);
-
-		ClearPageReserved(p);
-		init_page_count(p);
-		__free_page(p);
-		totalram_pages++;
-		num_physpages++;
-	}
+	num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+					    "initrd");
 }
 #endif
 
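free_initmem_default() and free_reserved_area() replace these per-arch loops; both return the number of pages freed, which is why the callers can fold the result into num_physpages. A sketch of free_reserved_area()'s shape, assuming only the semantics visible in these call sites (poison value, page-by-page free, banner message):

```c
/* Sketch assuming only what the call sites above imply; the real helper
 * lives in mm/page_alloc.c. Returns the number of pages freed. */
unsigned long free_reserved_area(unsigned long start, unsigned long end,
				 int poison, char *s)
{
	unsigned long pages = 0;

	for (; start + PAGE_SIZE <= end; start += PAGE_SIZE, pages++) {
		if (poison >= 0)
			memset((void *)start, poison, PAGE_SIZE);
		free_reserved_page(virt_to_page(start));
	}

	if (pages && s)
		pr_info("Freeing %s memory: %ldK freed\n",
			s, pages << (PAGE_SHIFT - 10));

	return pages;
}
```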
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index cf72a8a5b3aa..a7171997adfd 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2059,8 +2059,7 @@ void __init mem_init(void)
 	/* We subtract one to account for the mem_map_zero page
 	 * allocated below.
 	 */
-	totalram_pages -= 1;
-	num_physpages = totalram_pages;
+	num_physpages = totalram_pages - 1;
 
 	/*
 	 * Set up the zero page, mark it reserved, so that page count
@@ -2071,7 +2070,7 @@ void __init mem_init(void)
 		prom_printf("paging_init: Cannot alloc zero page.\n");
 		prom_halt();
 	}
-	SetPageReserved(mem_map_zero);
+	mark_page_reserved(mem_map_zero);
 
 	codepages = (((unsigned long) _etext) - ((unsigned long) _start));
 	codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
@@ -2111,37 +2110,22 @@ void free_initmem(void)
 	initend = (unsigned long)(__init_end) & PAGE_MASK;
 	for (; addr < initend; addr += PAGE_SIZE) {
 		unsigned long page;
-		struct page *p;
 
 		page = (addr +
 			((unsigned long) __va(kern_base)) -
 			((unsigned long) KERNBASE));
 		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 
-		if (do_free) {
-			p = virt_to_page(page);
-
-			ClearPageReserved(p);
-			init_page_count(p);
-			__free_page(p);
-			totalram_pages++;
-		}
+		if (do_free)
+			free_reserved_page(virt_to_page(page));
 	}
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	if (start < end)
-		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-	for (; start < end; start += PAGE_SIZE) {
-		struct page *p = virt_to_page(start);
-
-		ClearPageReserved(p);
-		init_page_count(p);
-		__free_page(p);
-		totalram_pages++;
-	}
+	num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+					    "initrd");
 }
 #endif
 
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a87daa6f4fb..a5ffcc988f0b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -27,6 +27,7 @@
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/times.h>
+#include <linux/uio.h>
 #include <asm/uaccess.h>
 
 #include <scsi/scsi.h>
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 2c644afbcdd4..1ccbe9482faa 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -28,6 +28,7 @@
 #include <linux/pfn.h>
 #include <linux/export.h>
 #include <linux/io.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 
@@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf,
 	return count;
 }
 
+static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t pos)
+{
+	return 0;
+}
+
+static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t pos)
+{
+	return iov_length(iov, nr_segs);
+}
+
 static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf,
 			struct splice_desc *sd)
 {
@@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf,
 	return written ? written : -EFAULT;
 }
 
+static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t pos)
+{
+	size_t written = 0;
+	unsigned long i;
+	ssize_t ret;
+
+	for (i = 0; i < nr_segs; i++) {
+		ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len,
+				&pos);
+		if (ret < 0)
+			break;
+		written += ret;
+	}
+
+	return written ? written : -EFAULT;
+}
+
 static int mmap_zero(struct file *file, struct vm_area_struct *vma)
 {
 #ifndef CONFIG_MMU
@@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp)
 #define full_lseek	null_lseek
 #define write_zero	write_null
 #define read_full	read_zero
+#define aio_write_zero	aio_write_null
 #define open_mem	open_port
 #define open_kmem	open_mem
 #define open_oldmem	open_mem
@@ -766,6 +798,8 @@ static const struct file_operations null_fops = {
 	.llseek		= null_lseek,
 	.read		= read_null,
 	.write		= write_null,
+	.aio_read	= aio_read_null,
+	.aio_write	= aio_write_null,
 	.splice_write	= splice_write_null,
 };
 
@@ -782,6 +816,8 @@ static const struct file_operations zero_fops = {
 	.llseek		= zero_lseek,
 	.read		= read_zero,
 	.write		= write_zero,
+	.aio_read	= aio_read_zero,
+	.aio_write	= aio_write_zero,
 	.mmap		= mmap_zero,
 };
 
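Wiring .aio_read/.aio_write into null_fops and zero_fops is what lets io_submit() work on these devices (without them it fails with -EINVAL). A minimal userspace sketch of that path, assuming libaio is installed (link with -laio); the error handling is elided:

```c
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	char buf[16] = "hello";
	int fd = open("/dev/null", O_WRONLY);

	io_setup(1, &ctx);			/* create an aio context */
	io_prep_pwrite(&cb, fd, buf, sizeof(buf), 0);
	io_submit(ctx, 1, cbs);			/* lands in aio_write_null() */
	io_getevents(ctx, 1, 1, &ev, NULL);	/* reap the completion */
	printf("res=%ld\n", (long)ev.res);	/* iov_length() of the request */
	io_destroy(ctx);
	return 0;
}
```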
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c
index 31f9201b2980..c40088ecf9f3 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_resource.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c
@@ -62,13 +62,13 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo,
 		kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
 	if (random) {
 		j = 0;
-		random_bytes = random32();
+		random_bytes = prandom_u32();
 		for (i = 0; i < RANDOM_SIZE; i++)
 			rarray[i] = i + skip_low;
 		for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
 			if (j >= RANDOM_SIZE) {
 				j = 0;
-				random_bytes = random32();
+				random_bytes = prandom_u32();
 			}
 			idx = (random_bytes >> (j * 2)) & 0xF;
 			kfifo_in(fifo,
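This and the conversions below are a pure rename: random32() and prandom_u32() name the same non-cryptographic PRNG from <linux/random.h>, with the prandom_ prefix making the "pseudo-random, not crypto-grade" nature explicit. During the rename window the old name survives as a trivial wrapper, roughly of this shape (a sketch, not necessarily verbatim from this tree):

```c
/* Sketch of the transitional shim in <linux/random.h>: the old name
 * forwards to the renamed generator until all callers are converted. */
static inline u32 random32(void)
{
	return prandom_u32();
}
```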
diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c
index f95e5df30db2..0161ae6ad629 100644
--- a/drivers/infiniband/hw/cxgb4/id_table.c
+++ b/drivers/infiniband/hw/cxgb4/id_table.c
@@ -54,7 +54,7 @@ u32 c4iw_id_alloc(struct c4iw_id_table *alloc)
 
 	if (obj < alloc->max) {
 		if (alloc->flags & C4IW_ID_TABLE_F_RANDOM)
-			alloc->last += random32() % RANDOM_SKIP;
+			alloc->last += prandom_u32() % RANDOM_SKIP;
 		else
 			alloc->last = obj + 1;
 		if (alloc->last >= alloc->max)
@@ -88,7 +88,7 @@ int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
 	alloc->start = start;
 	alloc->flags = flags;
 	if (flags & C4IW_ID_TABLE_F_RANDOM)
-		alloc->last = random32() % RANDOM_SKIP;
+		alloc->last = prandom_u32() % RANDOM_SKIP;
 	else
 		alloc->last = 0;
 	alloc->max = num;
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index aed8afee56da..6d7f453b4d05 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
+#include <linux/aio.h>
 #include <linux/jiffies.h>
 #include <linux/cpu.h>
 #include <asm/pgtable.h>
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 934792c477bc..4d599cedbb0b 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -93,7 +93,7 @@ static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
 __be64 mlx4_ib_gen_node_guid(void)
 {
 #define NODE_GUID_HI	((u64) (((u64)IB_OPENIB_OUI) << 40))
-	return cpu_to_be64(NODE_GUID_HI | random32());
+	return cpu_to_be64(NODE_GUID_HI | prandom_u32());
 }
 
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 4f7aa301b3b1..b56c9428f3c5 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -39,7 +39,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
 #include <linux/jiffies.h>
 #include <asm/pgtable.h>
 #include <linux/delay.h>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 1ef880de3a41..3eceb61e3532 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -460,7 +460,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 		goto err_qp;
 	}
 
-	psn = random32() & 0xffffff;
+	psn = prandom_u32() & 0xffffff;
 	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
 	if (ret)
 		goto err_modify;
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index 40649a8bf390..6b0dc131b20e 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -4085,7 +4085,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev)
 	if (!cp->csk_tbl)
 		return -ENOMEM;
 
-	port_id = random32();
+	port_id = prandom_u32();
 	port_id %= CNIC_LOCAL_PORT_RANGE;
 	if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE,
 			     CNIC_LOCAL_PORT_MIN, port_id)) {
@@ -4145,7 +4145,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev)
 {
 	u32 seed;
 
-	seed = random32();
+	seed = prandom_u32();
 	cnic_ctx_wr(dev, 45, 0, seed);
 	return 0;
 }
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 49b8b58fc5c6..484f77ec2ce1 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -449,7 +449,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat)
 			if ((--bc->hdlctx.slotcnt) > 0)
 				return 0;
 			bc->hdlctx.slotcnt = bc->ch_params.slottime;
-			if ((random32() % 256) > bc->ch_params.ppersist)
+			if ((prandom_u32() % 256) > bc->ch_params.ppersist)
 				return 0;
 		}
 	}
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c
index a4a3516b6bbf..3169252613fa 100644
--- a/drivers/net/hamradio/hdlcdrv.c
+++ b/drivers/net/hamradio/hdlcdrv.c
@@ -389,7 +389,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s)
 	if ((--s->hdlctx.slotcnt) > 0)
 		return;
 	s->hdlctx.slotcnt = s->ch_params.slottime;
-	if ((random32() % 256) > s->ch_params.ppersist)
+	if ((prandom_u32() % 256) > s->ch_params.ppersist)
 		return;
 	start_tx(dev, s);
 }
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index b2d863f2ea42..0721e72f9299 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -638,7 +638,7 @@ static void yam_arbitrate(struct net_device *dev)
 	yp->slotcnt = yp->slot / 10;
 
 	/* is random > persist ? */
-	if ((random32() % 256) > yp->pers)
+	if ((prandom_u32() % 256) > yp->pers)
 		return;
 
 	yam_start_tx(dev, yp);
diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c
index 9eabfaa22f3e..5ca14d463ba7 100644
--- a/drivers/net/team/team_mode_random.c
+++ b/drivers/net/team/team_mode_random.c
@@ -18,7 +18,7 @@
 
 static u32 random_N(unsigned int N)
 {
-	return reciprocal_divide(random32(), N);
+	return reciprocal_divide(prandom_u32(), N);
 }
 
 static bool rnd_transmit(struct team *team, struct sk_buff *skb)
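With the reciprocal_divide() of this kernel generation (include/linux/reciprocal_div.h), random_N() scales a uniform 32-bit value into [0, N) using a multiply and shift instead of a modulo, which is cheaper and avoids most of the modulo bias. What the call evaluates to, as a self-contained sketch:

```c
/* Equivalent of random_N() above under the pre-3.18 reciprocal_divide()
 * definition: (u32)(((u64)A * R) >> 32), here with R = N directly. */
static inline u32 random_N_equiv(u32 rnd, unsigned int N)
{
	return (u32)(((u64)rnd * N) >> 32);	/* in [0, N) for N > 0 */
}
```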
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
index 2b90da0d85f3..e7a1a4770996 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
@@ -1117,7 +1117,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work)
 	if (afx_hdl->is_listen && afx_hdl->my_listen_chan)
 		/* 100ms ~ 300ms */
 		err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan,
-						100 * (1 + (random32() % 3)));
+						100 * (1 + prandom_u32() % 3));
 	else
 		err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan);
 
diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c
index a0cb0770d319..d3c8ece980d8 100644
--- a/drivers/net/wireless/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/mwifiex/cfg80211.c
@@ -216,7 +216,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	mwifiex_form_mgmt_frame(skb, buf, len);
 	mwifiex_queue_tx_pkt(priv, skb);
 
-	*cookie = random32() | 1;
+	*cookie = prandom_u32() | 1;
 	cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC);
 
 	wiphy_dbg(wiphy, "info: management frame transmitted\n");
@@ -271,7 +271,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy,
 						 duration);
 
 	if (!ret) {
-		*cookie = random32() | 1;
+		*cookie = prandom_u32() | 1;
 		priv->roc_cfg.cookie = *cookie;
 		priv->roc_cfg.chan = *chan;
 
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index 224d634322b4..ccf54f06396b 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -68,6 +68,7 @@
 enum rtc_type {
 	rtc_undef = 0,
 	rtc_r2025sd,
+	rtc_r2221tl,
 	rtc_rs5c372a,
 	rtc_rs5c372b,
 	rtc_rv5c386,
@@ -76,6 +77,7 @@ enum rtc_type {
 
 static const struct i2c_device_id rs5c372_id[] = {
 	{ "r2025sd", rtc_r2025sd },
+	{ "r2221tl", rtc_r2221tl },
 	{ "rs5c372a", rtc_rs5c372a },
 	{ "rs5c372b", rtc_rs5c372b },
 	{ "rv5c386", rtc_rv5c386 },
@@ -529,6 +531,7 @@ static int rs5c_oscillator_setup(struct rs5c372 *rs5c372)
 		rs5c372->time24 = 1;
 		break;
 	case rtc_r2025sd:
+	case rtc_r2221tl:
 	case rtc_rv5c386:
 	case rtc_rv5c387a:
 		buf[0] |= RV5C387_CTRL1_24;
@@ -609,6 +612,7 @@ static int rs5c372_probe(struct i2c_client *client,
 		rs5c372->time24 = 1;
 		break;
 	case rtc_r2025sd:
+	case rtc_r2221tl:
 	case rtc_rv5c386:
 	case rtc_rv5c387a:
 		if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24)
@@ -640,6 +644,7 @@ static int rs5c372_probe(struct i2c_client *client,
 	dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n",
 			({ char *s; switch (rs5c372->type) {
 			case rtc_r2025sd:	s = "r2025sd"; break;
+			case rtc_r2221tl:	s = "r2221tl"; break;
 			case rtc_rs5c372a:	s = "rs5c372a"; break;
 			case rtc_rs5c372b:	s = "rs5c372b"; break;
 			case rtc_rv5c386:	s = "rv5c386"; break;
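Adding "r2221tl" to rs5c372_id[] is all the i2c core needs to match the new Ricoh R2221T/L variant to this driver. A hypothetical board-file registration showing how the id gets matched; the bus number and slave address below are made up for illustration:

```c
#include <linux/i2c.h>
#include <linux/init.h>

/* Hypothetical platform setup: the "r2221tl" string is matched against
 * rs5c372_id[] above when the device on bus 1 is instantiated. */
static struct i2c_board_info board_rtc __initdata = {
	I2C_BOARD_INFO("r2221tl", 0x32),	/* 0x32: illustrative address */
};

static int __init board_rtc_register(void)
{
	return i2c_register_board_info(1, &board_rtc, 1);
}
```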
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9f0c46547459..df5e961484e1 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -35,6 +35,7 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/aio.h>
 #include <linux/errno.h>
 #include <linux/mtio.h>
 #include <linux/ioctl.h>
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
index b14a55742559..b040200a5a55 100644
--- a/drivers/staging/android/logger.c
+++ b/drivers/staging/android/logger.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
+#include <linux/aio.h>
 #include "logger.h"
 
 #include <asm/ioctls.h>
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index dda0dc4a5567..570c005062ab 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -24,6 +24,8 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
+#include <linux/mmu_context.h>
+#include <linux/aio.h>
 
 #include <linux/device.h>
 #include <linux/moduleparam.h>
@@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value)
 struct kiocb_priv {
 	struct usb_request	*req;
 	struct ep_data		*epdata;
+	struct kiocb		*iocb;
+	struct mm_struct	*mm;
+	struct work_struct	work;
 	void			*buf;
 	const struct iovec	*iv;
 	unsigned long		nr_segs;
@@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
 	local_irq_disable();
 	epdata = priv->epdata;
 	// spin_lock(&epdata->dev->lock);
-	kiocbSetCancelled(iocb);
 	if (likely(epdata && epdata->ep && priv->req))
 		value = usb_ep_dequeue (epdata->ep, priv->req);
 	else
@@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
 	return value;
 }
 
-static ssize_t ep_aio_read_retry(struct kiocb *iocb)
+static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
 {
-	struct kiocb_priv	*priv = iocb->private;
 	ssize_t			len, total;
 	void			*to_copy;
 	int			i;
 
-	/* we "retry" to get the right mm context for this: */
-
 	/* copy stuff into user buffers */
 	total = priv->actual;
 	len = 0;
@@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb)
 		if (total == 0)
 			break;
 	}
+
+	return len;
+}
+
+static void ep_user_copy_worker(struct work_struct *work)
+{
+	struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
+	struct mm_struct *mm = priv->mm;
+	struct kiocb *iocb = priv->iocb;
+	size_t ret;
+
+	use_mm(mm);
+	ret = ep_copy_to_user(priv);
+	unuse_mm(mm);
+
+	/* completing the iocb can drop the ctx and mm, don't touch mm after */
+	aio_complete(iocb, ret, ret);
+
 	kfree(priv->buf);
 	kfree(priv);
-	return len;
 }
 
 static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
@@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 		aio_complete(iocb, req->actual ? req->actual : req->status,
 				req->status);
 	} else {
-		/* retry() won't report both; so we hide some faults */
+		/* ep_copy_to_user() won't report both; we hide some faults */
 		if (unlikely(0 != req->status))
 			DBG(epdata->dev, "%s fault %d len %d\n",
 				ep->name, req->status, req->actual);
 
 		priv->buf = req->buf;
 		priv->actual = req->actual;
-		kick_iocb(iocb);
+		schedule_work(&priv->work);
 	}
 	spin_unlock(&epdata->dev->lock);
 
@@ -633,8 +651,10 @@ fail:
 		return value;
 	}
 	iocb->private = priv;
+	priv->iocb = iocb;
 	priv->iv = iv;
 	priv->nr_segs = nr_segs;
+	INIT_WORK(&priv->work, ep_user_copy_worker);
 
 	value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
 	if (unlikely(value < 0)) {
@@ -642,10 +662,11 @@ fail:
 		goto fail;
 	}
 
-	iocb->ki_cancel = ep_aio_cancel;
+	kiocb_set_cancel_fn(iocb, ep_aio_cancel);
 	get_ep(epdata);
 	priv->epdata = epdata;
 	priv->actual = 0;
+	priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */
 
 	/* each kiocb is coupled to one usb_request, but we can't
 	 * allocate or submit those if the host disconnected.
@@ -674,7 +695,7 @@ fail:
 		kfree(priv);
 		put_ep(epdata);
 	} else
-		value = (iv ? -EIOCBRETRY : -EIOCBQUEUED);
+		value = -EIOCBQUEUED;
 	return value;
 }
 
@@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	if (unlikely(!buf))
 		return -ENOMEM;
 
-	iocb->ki_retry = ep_aio_read_retry;
 	return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
 }
 
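With retry-based aio gone, the gadget driver can no longer rely on the aio core re-entering ep_aio_read_retry() in the submitter's context; instead it defers the copy-out to a workqueue item that temporarily adopts the submitter's address space with use_mm(). The skeleton of that pattern, distilled from the hunk above with generic names (not from the driver):

```c
#include <linux/mmu_context.h>	/* use_mm()/unuse_mm() */
#include <linux/workqueue.h>
#include <linux/aio.h>
#include <linux/slab.h>

struct deferred_copy {
	struct work_struct	work;
	struct mm_struct	*mm;	/* saved at submit time: current->mm */
	struct kiocb		*iocb;
};

static void deferred_copy_worker(struct work_struct *work)
{
	struct deferred_copy *dc = container_of(work, struct deferred_copy, work);
	ssize_t ret;

	use_mm(dc->mm);		/* adopt the submitter's address space */
	ret = 0;		/* ... copy_to_user() into the submitter's iovecs ... */
	unuse_mm(dc->mm);

	aio_complete(dc->iocb, ret, ret);	/* may drop ctx and mm; last use */
	kfree(dc);
}
```

Saving current->mm without an extra reference is safe here only because, as the comment in the hunk notes, mm teardown waits for outstanding iocbs in exit_aio().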
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 0ad61c6a65a5..055562c580b4 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
 #include <linux/pagemap.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 7e03eadb40c0..a890db4b9898 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
+#include <linux/aio.h>
 #include "internal.h"
 
 static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
  *
  *	See ../COPYING for licensing terms.
  */
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/errno.h>
@@ -18,8 +20,6 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 
-#define DEBUG	0
-
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -39,11 +39,76 @@
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
-#if DEBUG > 1
-#define dprintk		printk
-#else
-#define dprintk(x...)	do { ; } while (0)
-#endif
+#define AIO_RING_MAGIC			0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES	1
+#define AIO_RING_INCOMPAT_FEATURES	0
+struct aio_ring {
+	unsigned	id;	/* kernel internal index number */
+	unsigned	nr;	/* number of io_events */
+	unsigned	head;
+	unsigned	tail;
+
+	unsigned	magic;
+	unsigned	compat_features;
+	unsigned	incompat_features;
+	unsigned	header_length;	/* size of aio_ring */
+
+
+	struct io_event		io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES	8
+
+struct kioctx {
+	atomic_t		users;
+	atomic_t		dead;
+
+	/* This needs improving */
+	unsigned long		user_id;
+	struct hlist_node	list;
+
+	/*
+	 * This is what userspace passed to io_setup(), it's not used for
+	 * anything but counting against the global max_reqs quota.
+	 *
+	 * The real limit is nr_events - 1, which will be larger (see
+	 * aio_setup_ring())
+	 */
+	unsigned		max_reqs;
+
+	/* Size of ringbuffer, in units of struct io_event */
+	unsigned		nr_events;
+
+	unsigned long		mmap_base;
+	unsigned long		mmap_size;
+
+	struct page		**ring_pages;
+	long			nr_pages;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	rcu_work;
+
+	struct {
+		atomic_t	reqs_active;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t	ctx_lock;
+		struct list_head active_reqs;	/* used for cancellation */
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		struct mutex	ring_lock;
+		wait_queue_head_t wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		unsigned	tail;
+		spinlock_t	completion_lock;
+	} ____cacheline_aligned_in_smp;
+
+	struct page		*internal_pages[AIO_RING_PAGES];
+};
 
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
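Note the four anonymous structs at the bottom of the new struct kioctx: each ____cacheline_aligned_in_smp group isolates one lock or hot counter on its own cache line, so the submission, cancellation, wait, and completion paths do not bounce a shared line between CPUs. A hypothetical illustration of the idiom (not from this patch):

```c
#include <linux/cache.h>
#include <linux/atomic.h>

/* Hypothetical example: without the annotations, enq and deq would share
 * one cache line, and producers/consumers on different CPUs would keep
 * invalidating it for each other (false sharing). */
struct hot_counters {
	struct {
		atomic_t enq;		/* written on the submit path */
	} ____cacheline_aligned_in_smp;

	struct {
		atomic_t deq;		/* written on the completion path */
	} ____cacheline_aligned_in_smp;
};
```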
@@ -54,11 +119,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
 /* aio_setup
  *	Creates the slab caches used by the aio routines, panic on
  *	failure as this is done early during the boot sequence.
@@ -68,10 +128,7 @@ static int __init aio_setup(void)
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
-	aio_wq = alloc_workqueue("aio", 0, 1);	/* used to limit concurrency */
-	BUG_ON(!aio_wq);
-
-	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
 
 	return 0;
 }
@@ -79,28 +136,23 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-	struct aio_ring_info *info = &ctx->ring_info;
 	long i;
 
-	for (i=0; i<info->nr_pages; i++)
-		put_page(info->ring_pages[i]);
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
 
-	if (info->mmap_size) {
-		BUG_ON(ctx->mm != current->mm);
-		vm_munmap(info->mmap_base, info->mmap_size);
-	}
+	if (ctx->mmap_size)
+		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-	if (info->ring_pages && info->ring_pages != info->internal_pages)
-		kfree(info->ring_pages);
-	info->ring_pages = NULL;
-	info->nr = 0;
+	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+		kfree(ctx->ring_pages);
 }
 
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
-	struct aio_ring_info *info = &ctx->ring_info;
 	unsigned nr_events = ctx->max_reqs;
+	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
 
@@ -116,46 +168,44 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
 
-	info->nr = 0;
-	info->ring_pages = info->internal_pages;
+	ctx->nr_events = 0;
+	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
-		info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-		if (!info->ring_pages)
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages)
 			return -ENOMEM;
 	}
 
-	info->mmap_size = nr_pages * PAGE_SIZE;
-	dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
-	down_write(&ctx->mm->mmap_sem);
-	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
-					PROT_READ|PROT_WRITE,
-					MAP_ANONYMOUS|MAP_PRIVATE, 0,
-					&populate);
-	if (IS_ERR((void *)info->mmap_base)) {
-		up_write(&ctx->mm->mmap_sem);
-		info->mmap_size = 0;
+	ctx->mmap_size = nr_pages * PAGE_SIZE;
+	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+	down_write(&mm->mmap_sem);
+	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+				       PROT_READ|PROT_WRITE,
+				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	if (IS_ERR((void *)ctx->mmap_base)) {
+		up_write(&mm->mmap_sem);
+		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
 
-	dprintk("mmap address: 0x%08lx\n", info->mmap_base);
-	info->nr_pages = get_user_pages(current, ctx->mm,
-					info->mmap_base, nr_pages,
-					1, 0, info->ring_pages, NULL);
-	up_write(&ctx->mm->mmap_sem);
+	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+				       1, 0, ctx->ring_pages, NULL);
+	up_write(&mm->mmap_sem);
 
-	if (unlikely(info->nr_pages != nr_pages)) {
+	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
 	if (populate)
-		mm_populate(info->mmap_base, populate);
+		mm_populate(ctx->mmap_base, populate);
 
-	ctx->user_id = info->mmap_base;
+	ctx->user_id = ctx->mmap_base;
+	ctx->nr_events = nr_events;	/* trusted copy */
 
-	info->nr = nr_events;		/* trusted copy */
-
-	ring = kmap_atomic(info->ring_pages[0]);
+	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
 	ring->id = ctx->user_id;
 	ring->head = ring->tail = 0;
| @@ -164,72 +214,133 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
| 164 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; | 214 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; |
| 165 | ring->header_length = sizeof(struct aio_ring); | 215 | ring->header_length = sizeof(struct aio_ring); |
| 166 | kunmap_atomic(ring); | 216 | kunmap_atomic(ring); |
| 217 | flush_dcache_page(ctx->ring_pages[0]); | ||
| 167 | 218 | ||
| 168 | return 0; | 219 | return 0; |
| 169 | } | 220 | } |
| 170 | 221 | ||
| 171 | |||
| 172 | /* aio_ring_event: returns a pointer to the event at the given index from | ||
| 173 | * kmap_atomic(). Release the pointer with put_aio_ring_event(); | ||
| 174 | */ | ||
| 175 | #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) | 222 | #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) |
| 176 | #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) | 223 | #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) |
| 177 | #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) | 224 | #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) |
| 178 | 225 | ||
| 179 | #define aio_ring_event(info, nr) ({ \ | 226 | void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) |
| 180 | unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ | 227 | { |
| 181 | struct io_event *__event; \ | 228 | struct kioctx *ctx = req->ki_ctx; |
| 182 | __event = kmap_atomic( \ | 229 | unsigned long flags; |
| 183 | (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ | 230 | |
| 184 | __event += pos % AIO_EVENTS_PER_PAGE; \ | 231 | spin_lock_irqsave(&ctx->ctx_lock, flags); |
| 185 | __event; \ | 232 | |
| 186 | }) | 233 | if (!req->ki_list.next) |
| 187 | 234 | list_add(&req->ki_list, &ctx->active_reqs); | |
| 188 | #define put_aio_ring_event(event) do { \ | 235 | |
| 189 | struct io_event *__event = (event); \ | 236 | req->ki_cancel = cancel; |
| 190 | (void)__event; \ | 237 | |
| 191 | kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ | 238 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); |
| 192 | } while(0) | 239 | } |
| 193 | 240 | EXPORT_SYMBOL(kiocb_set_cancel_fn); | |
| 194 | static void ctx_rcu_free(struct rcu_head *head) | 241 | |
| 242 | static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, | ||
| 243 | struct io_event *res) | ||
| 244 | { | ||
| 245 | kiocb_cancel_fn *old, *cancel; | ||
| 246 | int ret = -EINVAL; | ||
| 247 | |||
| 248 | /* | ||
| 249 | * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it | ||
| 250 | * actually has a cancel function, hence the cmpxchg() | ||
| 251 | */ | ||
| 252 | |||
| 253 | cancel = ACCESS_ONCE(kiocb->ki_cancel); | ||
| 254 | do { | ||
| 255 | if (!cancel || cancel == KIOCB_CANCELLED) | ||
| 256 | return ret; | ||
| 257 | |||
| 258 | old = cancel; | ||
| 259 | cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); | ||
| 260 | } while (cancel != old); | ||
| 261 | |||
| 262 | atomic_inc(&kiocb->ki_users); | ||
| 263 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 264 | |||
| 265 | memset(res, 0, sizeof(*res)); | ||
| 266 | res->obj = (u64)(unsigned long)kiocb->ki_obj.user; | ||
| 267 | res->data = kiocb->ki_user_data; | ||
| 268 | ret = cancel(kiocb, res); | ||
| 269 | |||
| 270 | spin_lock_irq(&ctx->ctx_lock); | ||
| 271 | |||
| 272 | return ret; | ||
| 273 | } | ||
| 274 | |||
| 275 | static void free_ioctx_rcu(struct rcu_head *head) | ||
| 195 | { | 276 | { |
| 196 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | 277 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); |
| 197 | kmem_cache_free(kioctx_cachep, ctx); | 278 | kmem_cache_free(kioctx_cachep, ctx); |
| 198 | } | 279 | } |
| 199 | 280 | ||
| 200 | /* __put_ioctx | 281 | /* |
| 201 | * Called when the last user of an aio context has gone away, | 282 | * When this function runs, the kioctx has been removed from the "hash table" |
| 202 | * and the struct needs to be freed. | 283 | * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - |
| 284 | * now it's safe to cancel any that need to be. | ||
| 203 | */ | 285 | */ |
| 204 | static void __put_ioctx(struct kioctx *ctx) | 286 | static void free_ioctx(struct kioctx *ctx) |
| 205 | { | 287 | { |
| 206 | unsigned nr_events = ctx->max_reqs; | 288 | struct aio_ring *ring; |
| 207 | BUG_ON(ctx->reqs_active); | 289 | struct io_event res; |
| 290 | struct kiocb *req; | ||
| 291 | unsigned head, avail; | ||
| 208 | 292 | ||
| 209 | cancel_delayed_work_sync(&ctx->wq); | 293 | spin_lock_irq(&ctx->ctx_lock); |
| 210 | aio_free_ring(ctx); | 294 | |
| 211 | mmdrop(ctx->mm); | 295 | while (!list_empty(&ctx->active_reqs)) { |
| 212 | ctx->mm = NULL; | 296 | req = list_first_entry(&ctx->active_reqs, |
| 213 | if (nr_events) { | 297 | struct kiocb, ki_list); |
| 214 | spin_lock(&aio_nr_lock); | 298 | |
| 215 | BUG_ON(aio_nr - nr_events > aio_nr); | 299 | list_del_init(&req->ki_list); |
| 216 | aio_nr -= nr_events; | 300 | kiocb_cancel(ctx, req, &res); |
| 217 | spin_unlock(&aio_nr_lock); | ||
| 218 | } | 301 | } |
| 219 | pr_debug("__put_ioctx: freeing %p\n", ctx); | ||
| 220 | call_rcu(&ctx->rcu_head, ctx_rcu_free); | ||
| 221 | } | ||
| 222 | 302 | ||
| 223 | static inline int try_get_ioctx(struct kioctx *kioctx) | 303 | spin_unlock_irq(&ctx->ctx_lock); |
| 224 | { | 304 | |
| 225 | return atomic_inc_not_zero(&kioctx->users); | 305 | ring = kmap_atomic(ctx->ring_pages[0]); |
| 306 | head = ring->head; | ||
| 307 | kunmap_atomic(ring); | ||
| 308 | |||
| 309 | while (atomic_read(&ctx->reqs_active) > 0) { | ||
| 310 | wait_event(ctx->wait, head != ctx->tail); | ||
| 311 | |||
| 312 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; | ||
| 313 | |||
| 314 | atomic_sub(avail, &ctx->reqs_active); | ||
| 315 | head += avail; | ||
| 316 | head %= ctx->nr_events; | ||
| 317 | } | ||
| 318 | |||
| 319 | WARN_ON(atomic_read(&ctx->reqs_active) < 0); | ||
| 320 | |||
| 321 | aio_free_ring(ctx); | ||
| 322 | |||
| 323 | spin_lock(&aio_nr_lock); | ||
| 324 | BUG_ON(aio_nr - ctx->max_reqs > aio_nr); | ||
| 325 | aio_nr -= ctx->max_reqs; | ||
| 326 | spin_unlock(&aio_nr_lock); | ||
| 327 | |||
| 328 | pr_debug("freeing %p\n", ctx); | ||
| 329 | |||
| 330 | /* | ||
| 331 | * Here the call_rcu() is between the wait_event() for reqs_active to | ||
| 332 | * hit 0, and freeing the ioctx. | ||
| 333 | * | ||
| 334 | * aio_complete() decrements reqs_active, but it has to touch the ioctx | ||
| 335 | * after to issue a wakeup so we use rcu. | ||
| 336 | */ | ||
| 337 | call_rcu(&ctx->rcu_head, free_ioctx_rcu); | ||
| 226 | } | 338 | } |
| 227 | 339 | ||
| 228 | static inline void put_ioctx(struct kioctx *kioctx) | 340 | static void put_ioctx(struct kioctx *ctx) |
| 229 | { | 341 | { |
| 230 | BUG_ON(atomic_read(&kioctx->users) <= 0); | 342 | if (unlikely(atomic_dec_and_test(&ctx->users))) |
| 231 | if (unlikely(atomic_dec_and_test(&kioctx->users))) | 343 | free_ioctx(ctx); |
| 232 | __put_ioctx(kioctx); | ||
| 233 | } | 344 | } |
| 234 | 345 | ||
| 235 | /* ioctx_alloc | 346 | /* ioctx_alloc |
| @@ -237,7 +348,7 @@ static inline void put_ioctx(struct kioctx *kioctx) | |||
| 237 | */ | 348 | */ |
| 238 | static struct kioctx *ioctx_alloc(unsigned nr_events) | 349 | static struct kioctx *ioctx_alloc(unsigned nr_events) |
| 239 | { | 350 | { |
| 240 | struct mm_struct *mm; | 351 | struct mm_struct *mm = current->mm; |
| 241 | struct kioctx *ctx; | 352 | struct kioctx *ctx; |
| 242 | int err = -ENOMEM; | 353 | int err = -ENOMEM; |
| 243 | 354 | ||
| @@ -256,17 +367,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
| 256 | return ERR_PTR(-ENOMEM); | 367 | return ERR_PTR(-ENOMEM); |
| 257 | 368 | ||
| 258 | ctx->max_reqs = nr_events; | 369 | ctx->max_reqs = nr_events; |
| 259 | mm = ctx->mm = current->mm; | ||
| 260 | atomic_inc(&mm->mm_count); | ||
| 261 | 370 | ||
| 262 | atomic_set(&ctx->users, 2); | 371 | atomic_set(&ctx->users, 2); |
| 372 | atomic_set(&ctx->dead, 0); | ||
| 263 | spin_lock_init(&ctx->ctx_lock); | 373 | spin_lock_init(&ctx->ctx_lock); |
| 264 | spin_lock_init(&ctx->ring_info.ring_lock); | 374 | spin_lock_init(&ctx->completion_lock); |
| 375 | mutex_init(&ctx->ring_lock); | ||
| 265 | init_waitqueue_head(&ctx->wait); | 376 | init_waitqueue_head(&ctx->wait); |
| 266 | 377 | ||
| 267 | INIT_LIST_HEAD(&ctx->active_reqs); | 378 | INIT_LIST_HEAD(&ctx->active_reqs); |
| 268 | INIT_LIST_HEAD(&ctx->run_list); | ||
| 269 | INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); | ||
| 270 | 379 | ||
| 271 | if (aio_setup_ring(ctx) < 0) | 380 | if (aio_setup_ring(ctx) < 0) |
| 272 | goto out_freectx; | 381 | goto out_freectx; |
| @@ -286,64 +395,56 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
| 286 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); | 395 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); |
| 287 | spin_unlock(&mm->ioctx_lock); | 396 | spin_unlock(&mm->ioctx_lock); |
| 288 | 397 | ||
| 289 | dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", | 398 | pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", |
| 290 | ctx, ctx->user_id, current->mm, ctx->ring_info.nr); | 399 | ctx, ctx->user_id, mm, ctx->nr_events); |
| 291 | return ctx; | 400 | return ctx; |
| 292 | 401 | ||
| 293 | out_cleanup: | 402 | out_cleanup: |
| 294 | err = -EAGAIN; | 403 | err = -EAGAIN; |
| 295 | aio_free_ring(ctx); | 404 | aio_free_ring(ctx); |
| 296 | out_freectx: | 405 | out_freectx: |
| 297 | mmdrop(mm); | ||
| 298 | kmem_cache_free(kioctx_cachep, ctx); | 406 | kmem_cache_free(kioctx_cachep, ctx); |
| 299 | dprintk("aio: error allocating ioctx %d\n", err); | 407 | pr_debug("error allocating ioctx %d\n", err); |
| 300 | return ERR_PTR(err); | 408 | return ERR_PTR(err); |
| 301 | } | 409 | } |
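
Worth noting in ioctx_alloc() above: atomic_set(&ctx->users, 2) births the context with two references, one owned by the mm's ioctx_list and one handed back to the caller, so either side can drop out independently. A hedged userspace sketch of that handoff (names invented for illustration):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
        atomic_int users;
    };

    static struct obj *obj_alloc(void)
    {
        struct obj *o = calloc(1, sizeof(*o));

        if (!o)
            return NULL;
        atomic_init(&o->users, 2);  /* one ref for the list, one for the caller */
        /* ... publish o on a global list here ... */
        return o;                   /* caller drops its ref when done */
    }
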
| 302 | 410 | ||
| 303 | /* kill_ctx | 411 | static void kill_ioctx_work(struct work_struct *work) |
| 304 | * Cancels all outstanding aio requests on an aio context. Used | ||
| 305 | * when the processes owning a context have all exited to encourage | ||
| 306 | * the rapid destruction of the kioctx. | ||
| 307 | */ | ||
| 308 | static void kill_ctx(struct kioctx *ctx) | ||
| 309 | { | 412 | { |
| 310 | int (*cancel)(struct kiocb *, struct io_event *); | 413 | struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); |
| 311 | struct task_struct *tsk = current; | ||
| 312 | DECLARE_WAITQUEUE(wait, tsk); | ||
| 313 | struct io_event res; | ||
| 314 | 414 | ||
| 315 | spin_lock_irq(&ctx->ctx_lock); | 415 | wake_up_all(&ctx->wait); |
| 316 | ctx->dead = 1; | 416 | put_ioctx(ctx); |
| 317 | while (!list_empty(&ctx->active_reqs)) { | 417 | } |
| 318 | struct list_head *pos = ctx->active_reqs.next; | ||
| 319 | struct kiocb *iocb = list_kiocb(pos); | ||
| 320 | list_del_init(&iocb->ki_list); | ||
| 321 | cancel = iocb->ki_cancel; | ||
| 322 | kiocbSetCancelled(iocb); | ||
| 323 | if (cancel) { | ||
| 324 | iocb->ki_users++; | ||
| 325 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 326 | cancel(iocb, &res); | ||
| 327 | spin_lock_irq(&ctx->ctx_lock); | ||
| 328 | } | ||
| 329 | } | ||
| 330 | 418 | ||
| 331 | if (!ctx->reqs_active) | 419 | static void kill_ioctx_rcu(struct rcu_head *head) |
| 332 | goto out; | 420 | { |
| 421 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | ||
| 333 | 422 | ||
| 334 | add_wait_queue(&ctx->wait, &wait); | 423 | INIT_WORK(&ctx->rcu_work, kill_ioctx_work); |
| 335 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 424 | schedule_work(&ctx->rcu_work); |
| 336 | while (ctx->reqs_active) { | 425 | } |
| 337 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 338 | io_schedule(); | ||
| 339 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
| 340 | spin_lock_irq(&ctx->ctx_lock); | ||
| 341 | } | ||
| 342 | __set_task_state(tsk, TASK_RUNNING); | ||
| 343 | remove_wait_queue(&ctx->wait, &wait); | ||
| 344 | 426 | ||
| 345 | out: | 427 | /* kill_ioctx |
| 346 | spin_unlock_irq(&ctx->ctx_lock); | 428 | * Cancels all outstanding aio requests on an aio context. Used |
| 429 | * when the processes owning a context have all exited to encourage | ||
| 430 | * the rapid destruction of the kioctx. | ||
| 431 | */ | ||
| 432 | static void kill_ioctx(struct kioctx *ctx) | ||
| 433 | { | ||
| 434 | if (!atomic_xchg(&ctx->dead, 1)) { | ||
| 435 | hlist_del_rcu(&ctx->list); | ||
| 436 | /* Between hlist_del_rcu() and dropping the initial ref */ | ||
| 437 | synchronize_rcu(); | ||
| 438 | |||
| 439 | /* | ||
| 440 | * We can't punt to workqueue here because put_ioctx() -> | ||
| 441 | * free_ioctx() will unmap the ringbuffer, and that has to be | ||
| 442 | * done in the original process's context. kill_ioctx_rcu/work() | ||
| 443 | * exist for exit_aio(), as in that path free_ioctx() won't do | ||
| 444 | * the unmap. | ||
| 445 | */ | ||
| 446 | kill_ioctx_work(&ctx->rcu_work); | ||
| 447 | } | ||
| 347 | } | 448 | } |
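
kill_ioctx() relies on atomic_xchg(&ctx->dead, 1) as a run-once guard, so a racing io_destroy() and exit_aio() cannot both perform the teardown. The same idiom in userspace C11 (names invented for illustration):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool dead;

    static void kill_once(void)
    {
        /* atomic_exchange() returns the previous value: exactly one
         * caller sees 'false' and wins the right to tear down */
        if (!atomic_exchange(&dead, true)) {
            /* unlink, wait out readers, drop the initial reference */
        }
    }
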
| 348 | 449 | ||
| 349 | /* wait_on_sync_kiocb: | 450 | /* wait_on_sync_kiocb: |
| @@ -351,9 +452,9 @@ out: | |||
| 351 | */ | 452 | */ |
| 352 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | 453 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) |
| 353 | { | 454 | { |
| 354 | while (iocb->ki_users) { | 455 | while (atomic_read(&iocb->ki_users)) { |
| 355 | set_current_state(TASK_UNINTERRUPTIBLE); | 456 | set_current_state(TASK_UNINTERRUPTIBLE); |
| 356 | if (!iocb->ki_users) | 457 | if (!atomic_read(&iocb->ki_users)) |
| 357 | break; | 458 | break; |
| 358 | io_schedule(); | 459 | io_schedule(); |
| 359 | } | 460 | } |
| @@ -362,28 +463,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | |||
| 362 | } | 463 | } |
| 363 | EXPORT_SYMBOL(wait_on_sync_kiocb); | 464 | EXPORT_SYMBOL(wait_on_sync_kiocb); |
| 364 | 465 | ||
| 365 | /* exit_aio: called when the last user of mm goes away. At this point, | 466 | /* |
| 366 | * there is no way for any new requests to be submitted or any of the | 467 | * exit_aio: called when the last user of mm goes away. At this point, there is |
| 367 | * io_* syscalls to be called on the context. However, there may be | 468 | * no way for any new requests to be submitted or any of the io_* syscalls to be |
| 368 | * outstanding requests which hold references to the context; as they | 469 | * called on the context. |
| 369 | * go away, they will call put_ioctx and release any pinned memory | 470 | * |
| 370 | * associated with the request (held via struct page * references). | 471 | * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on |
| 472 | * them. | ||
| 371 | */ | 473 | */ |
| 372 | void exit_aio(struct mm_struct *mm) | 474 | void exit_aio(struct mm_struct *mm) |
| 373 | { | 475 | { |
| 374 | struct kioctx *ctx; | 476 | struct kioctx *ctx; |
| 477 | struct hlist_node *n; | ||
| 375 | 478 | ||
| 376 | while (!hlist_empty(&mm->ioctx_list)) { | 479 | hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { |
| 377 | ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); | ||
| 378 | hlist_del_rcu(&ctx->list); | ||
| 379 | |||
| 380 | kill_ctx(ctx); | ||
| 381 | |||
| 382 | if (1 != atomic_read(&ctx->users)) | 480 | if (1 != atomic_read(&ctx->users)) |
| 383 | printk(KERN_DEBUG | 481 | printk(KERN_DEBUG |
| 384 | "exit_aio:ioctx still alive: %d %d %d\n", | 482 | "exit_aio:ioctx still alive: %d %d %d\n", |
| 385 | atomic_read(&ctx->users), ctx->dead, | 483 | atomic_read(&ctx->users), |
| 386 | ctx->reqs_active); | 484 | atomic_read(&ctx->dead), |
| 485 | atomic_read(&ctx->reqs_active)); | ||
| 387 | /* | 486 | /* |
| 388 | * We don't need to bother with munmap() here - | 487 | * We don't need to bother with munmap() here - |
| 389 | * exit_mmap(mm) is coming and it'll unmap everything. | 488 | * exit_mmap(mm) is coming and it'll unmap everything. |
| @@ -391,150 +490,53 @@ void exit_aio(struct mm_struct *mm) | |||
| 391 | * as indicator that it needs to unmap the area, | 490 | * as indicator that it needs to unmap the area, |
| 392 | * just set it to 0; aio_free_ring() is the only | 491 | * just set it to 0; aio_free_ring() is the only |
| 393 | * place that uses ->mmap_size, so it's safe. | 492 | * place that uses ->mmap_size, so it's safe. |
| 394 | * That way we get all munmap done to current->mm - | ||
| 395 | * all other callers have ctx->mm == current->mm. | ||
| 396 | */ | 493 | */ |
| 397 | ctx->ring_info.mmap_size = 0; | 494 | ctx->mmap_size = 0; |
| 398 | put_ioctx(ctx); | 495 | |
| 496 | if (!atomic_xchg(&ctx->dead, 1)) { | ||
| 497 | hlist_del_rcu(&ctx->list); | ||
| 498 | call_rcu(&ctx->rcu_head, kill_ioctx_rcu); | ||
| 499 | } | ||
| 399 | } | 500 | } |
| 400 | } | 501 | } |
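
exit_aio() also switches from a pop-the-head loop to hlist_for_each_entry_safe(), which caches the successor before the current entry can go away. The shape of that idiom on a plain singly linked list (userspace sketch, not the kernel hlist API):

    #include <stdlib.h>

    struct node {
        struct node *next;
    };

    static void teardown_all(struct node **head)
    {
        struct node *n, *next;

        for (n = *head; n; n = next) {
            next = n->next;   /* grab the successor before n is freed */
            free(n);
        }
        *head = NULL;
    }
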
| 401 | 502 | ||
| 402 | /* aio_get_req | 503 | /* aio_get_req |
| 403 | * Allocate a slot for an aio request. Increments the users count | 504 | * Allocate a slot for an aio request. Increments the ki_users count |
| 404 | * of the kioctx so that the kioctx stays around until all requests are | 505 | * of the kioctx so that the kioctx stays around until all requests are |
| 405 | * complete. Returns NULL if no requests are free. | 506 | * complete. Returns NULL if no requests are free. |
| 406 | * | 507 | * |
| 407 | * Returns with kiocb->users set to 2. The io submit code path holds | 508 | * Returns with kiocb->ki_users set to 2. The io submit code path holds |
| 408 | * an extra reference while submitting the i/o. | 509 | * an extra reference while submitting the i/o. |
| 409 | * This prevents races between the aio code path referencing the | 510 | * This prevents races between the aio code path referencing the |
| 410 | * req (after submitting it) and aio_complete() freeing the req. | 511 | * req (after submitting it) and aio_complete() freeing the req. |
| 411 | */ | 512 | */ |
| 412 | static struct kiocb *__aio_get_req(struct kioctx *ctx) | 513 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) |
| 413 | { | 514 | { |
| 414 | struct kiocb *req = NULL; | 515 | struct kiocb *req; |
| 415 | 516 | ||
| 416 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); | 517 | if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) |
| 417 | if (unlikely(!req)) | ||
| 418 | return NULL; | 518 | return NULL; |
| 419 | 519 | ||
| 420 | req->ki_flags = 0; | 520 | if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) |
| 421 | req->ki_users = 2; | 521 | goto out_put; |
| 422 | req->ki_key = 0; | ||
| 423 | req->ki_ctx = ctx; | ||
| 424 | req->ki_cancel = NULL; | ||
| 425 | req->ki_retry = NULL; | ||
| 426 | req->ki_dtor = NULL; | ||
| 427 | req->private = NULL; | ||
| 428 | req->ki_iovec = NULL; | ||
| 429 | INIT_LIST_HEAD(&req->ki_run_list); | ||
| 430 | req->ki_eventfd = NULL; | ||
| 431 | |||
| 432 | return req; | ||
| 433 | } | ||
| 434 | |||
| 435 | /* | ||
| 436 | * struct kiocb's are allocated in batches to reduce the number of | ||
| 437 | * times the ctx lock is acquired and released. | ||
| 438 | */ | ||
| 439 | #define KIOCB_BATCH_SIZE 32L | ||
| 440 | struct kiocb_batch { | ||
| 441 | struct list_head head; | ||
| 442 | long count; /* number of requests left to allocate */ | ||
| 443 | }; | ||
| 444 | |||
| 445 | static void kiocb_batch_init(struct kiocb_batch *batch, long total) | ||
| 446 | { | ||
| 447 | INIT_LIST_HEAD(&batch->head); | ||
| 448 | batch->count = total; | ||
| 449 | } | ||
| 450 | |||
| 451 | static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) | ||
| 452 | { | ||
| 453 | struct kiocb *req, *n; | ||
| 454 | |||
| 455 | if (list_empty(&batch->head)) | ||
| 456 | return; | ||
| 457 | |||
| 458 | spin_lock_irq(&ctx->ctx_lock); | ||
| 459 | list_for_each_entry_safe(req, n, &batch->head, ki_batch) { | ||
| 460 | list_del(&req->ki_batch); | ||
| 461 | list_del(&req->ki_list); | ||
| 462 | kmem_cache_free(kiocb_cachep, req); | ||
| 463 | ctx->reqs_active--; | ||
| 464 | } | ||
| 465 | if (unlikely(!ctx->reqs_active && ctx->dead)) | ||
| 466 | wake_up_all(&ctx->wait); | ||
| 467 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 468 | } | ||
| 469 | |||
| 470 | /* | ||
| 471 | * Allocate a batch of kiocbs. This avoids taking and dropping the | ||
| 472 | * context lock a lot during setup. | ||
| 473 | */ | ||
| 474 | static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) | ||
| 475 | { | ||
| 476 | unsigned short allocated, to_alloc; | ||
| 477 | long avail; | ||
| 478 | struct kiocb *req, *n; | ||
| 479 | struct aio_ring *ring; | ||
| 480 | |||
| 481 | to_alloc = min(batch->count, KIOCB_BATCH_SIZE); | ||
| 482 | for (allocated = 0; allocated < to_alloc; allocated++) { | ||
| 483 | req = __aio_get_req(ctx); | ||
| 484 | if (!req) | ||
| 485 | /* allocation failed, go with what we've got */ | ||
| 486 | break; | ||
| 487 | list_add(&req->ki_batch, &batch->head); | ||
| 488 | } | ||
| 489 | |||
| 490 | if (allocated == 0) | ||
| 491 | goto out; | ||
| 492 | |||
| 493 | spin_lock_irq(&ctx->ctx_lock); | ||
| 494 | ring = kmap_atomic(ctx->ring_info.ring_pages[0]); | ||
| 495 | |||
| 496 | avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; | ||
| 497 | BUG_ON(avail < 0); | ||
| 498 | if (avail < allocated) { | ||
| 499 | /* Trim back the number of requests. */ | ||
| 500 | list_for_each_entry_safe(req, n, &batch->head, ki_batch) { | ||
| 501 | list_del(&req->ki_batch); | ||
| 502 | kmem_cache_free(kiocb_cachep, req); | ||
| 503 | if (--allocated <= avail) | ||
| 504 | break; | ||
| 505 | } | ||
| 506 | } | ||
| 507 | |||
| 508 | batch->count -= allocated; | ||
| 509 | list_for_each_entry(req, &batch->head, ki_batch) { | ||
| 510 | list_add(&req->ki_list, &ctx->active_reqs); | ||
| 511 | ctx->reqs_active++; | ||
| 512 | } | ||
| 513 | 522 | ||
| 514 | kunmap_atomic(ring); | 523 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); |
| 515 | spin_unlock_irq(&ctx->ctx_lock); | 524 | if (unlikely(!req)) |
| 516 | 525 | goto out_put; | |
| 517 | out: | ||
| 518 | return allocated; | ||
| 519 | } | ||
| 520 | 526 | ||
| 521 | static inline struct kiocb *aio_get_req(struct kioctx *ctx, | 527 | atomic_set(&req->ki_users, 2); |
| 522 | struct kiocb_batch *batch) | 528 | req->ki_ctx = ctx; |
| 523 | { | ||
| 524 | struct kiocb *req; | ||
| 525 | 529 | ||
| 526 | if (list_empty(&batch->head)) | ||
| 527 | if (kiocb_batch_refill(ctx, batch) == 0) | ||
| 528 | return NULL; | ||
| 529 | req = list_first_entry(&batch->head, struct kiocb, ki_batch); | ||
| 530 | list_del(&req->ki_batch); | ||
| 531 | return req; | 530 | return req; |
| 531 | out_put: | ||
| 532 | atomic_dec(&ctx->reqs_active); | ||
| 533 | return NULL; | ||
| 532 | } | 534 | } |
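
aio_get_req() now reserves a completion slot optimistically: bump reqs_active first, then roll the bump back on overshoot, instead of batching allocations under ctx_lock. A small sketch of that accounting in C11 atomics (NR_SLOTS and the one-slot margin mirror the nr_events - 1 check above; names are illustrative):

    #include <stdatomic.h>

    #define NR_SLOTS 128

    static atomic_long active;

    static int reserve_slot(void)
    {
        /* fetch_add() returns the old count, so add 1 for the new one;
         * keep one slot of headroom like the nr_events - 1 test */
        if (atomic_fetch_add(&active, 1) + 1 > NR_SLOTS - 1) {
            atomic_fetch_sub(&active, 1);   /* roll back the optimistic bump */
            return -1;
        }
        return 0;
    }
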
| 533 | 535 | ||
| 534 | static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | 536 | static void kiocb_free(struct kiocb *req) |
| 535 | { | 537 | { |
| 536 | assert_spin_locked(&ctx->ctx_lock); | 538 | if (req->ki_filp) |
| 537 | 539 | fput(req->ki_filp); | |
| 538 | if (req->ki_eventfd != NULL) | 540 | if (req->ki_eventfd != NULL) |
| 539 | eventfd_ctx_put(req->ki_eventfd); | 541 | eventfd_ctx_put(req->ki_eventfd); |
| 540 | if (req->ki_dtor) | 542 | if (req->ki_dtor) |
| @@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | |||
| 542 | if (req->ki_iovec != &req->ki_inline_vec) | 544 | if (req->ki_iovec != &req->ki_inline_vec) |
| 543 | kfree(req->ki_iovec); | 545 | kfree(req->ki_iovec); |
| 544 | kmem_cache_free(kiocb_cachep, req); | 546 | kmem_cache_free(kiocb_cachep, req); |
| 545 | ctx->reqs_active--; | ||
| 546 | |||
| 547 | if (unlikely(!ctx->reqs_active && ctx->dead)) | ||
| 548 | wake_up_all(&ctx->wait); | ||
| 549 | } | 547 | } |
| 550 | 548 | ||
| 551 | /* __aio_put_req | 549 | void aio_put_req(struct kiocb *req) |
| 552 | * Returns true if this put was the last user of the request. | ||
| 553 | */ | ||
| 554 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) | ||
| 555 | { | 550 | { |
| 556 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", | 551 | if (atomic_dec_and_test(&req->ki_users)) |
| 557 | req, atomic_long_read(&req->ki_filp->f_count)); | 552 | kiocb_free(req); |
| 558 | |||
| 559 | assert_spin_locked(&ctx->ctx_lock); | ||
| 560 | |||
| 561 | req->ki_users--; | ||
| 562 | BUG_ON(req->ki_users < 0); | ||
| 563 | if (likely(req->ki_users)) | ||
| 564 | return 0; | ||
| 565 | list_del(&req->ki_list); /* remove from active_reqs */ | ||
| 566 | req->ki_cancel = NULL; | ||
| 567 | req->ki_retry = NULL; | ||
| 568 | |||
| 569 | fput(req->ki_filp); | ||
| 570 | req->ki_filp = NULL; | ||
| 571 | really_put_req(ctx, req); | ||
| 572 | return 1; | ||
| 573 | } | ||
| 574 | |||
| 575 | /* aio_put_req | ||
| 576 | * Returns true if this put was the last user of the kiocb, | ||
| 577 | * false if the request is still in use. | ||
| 578 | */ | ||
| 579 | int aio_put_req(struct kiocb *req) | ||
| 580 | { | ||
| 581 | struct kioctx *ctx = req->ki_ctx; | ||
| 582 | int ret; | ||
| 583 | spin_lock_irq(&ctx->ctx_lock); | ||
| 584 | ret = __aio_put_req(ctx, req); | ||
| 585 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 586 | return ret; | ||
| 587 | } | 553 | } |
| 588 | EXPORT_SYMBOL(aio_put_req); | 554 | EXPORT_SYMBOL(aio_put_req); |
| 589 | 555 | ||
| @@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) | |||
| 595 | rcu_read_lock(); | 561 | rcu_read_lock(); |
| 596 | 562 | ||
| 597 | hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { | 563 | hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { |
| 598 | /* | 564 | if (ctx->user_id == ctx_id) { |
| 599 | * RCU protects us against accessing freed memory but | 565 | atomic_inc(&ctx->users); |
| 600 | * we have to be careful not to get a reference when the | ||
| 601 | * reference count already dropped to 0 (ctx->dead test | ||
| 602 | * is unreliable because of races). | ||
| 603 | */ | ||
| 604 | if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ | ||
| 605 | ret = ctx; | 566 | ret = ctx; |
| 606 | break; | 567 | break; |
| 607 | } | 568 | } |
| @@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) | |||
| 611 | return ret; | 572 | return ret; |
| 612 | } | 573 | } |
| 613 | 574 | ||
| 614 | /* | ||
| 615 | * Queue up a kiocb to be retried. Assumes that the kiocb | ||
| 616 | * has already been marked as kicked, and places it on | ||
| 617 | * the retry run list for the corresponding ioctx, if it | ||
| 618 | * isn't already queued. Returns 1 if it actually queued | ||
| 619 | * the kiocb (to tell the caller to activate the work | ||
| 620 | * queue to process it), or 0, if it found that it was | ||
| 621 | * already queued. | ||
| 622 | */ | ||
| 623 | static inline int __queue_kicked_iocb(struct kiocb *iocb) | ||
| 624 | { | ||
| 625 | struct kioctx *ctx = iocb->ki_ctx; | ||
| 626 | |||
| 627 | assert_spin_locked(&ctx->ctx_lock); | ||
| 628 | |||
| 629 | if (list_empty(&iocb->ki_run_list)) { | ||
| 630 | list_add_tail(&iocb->ki_run_list, | ||
| 631 | &ctx->run_list); | ||
| 632 | return 1; | ||
| 633 | } | ||
| 634 | return 0; | ||
| 635 | } | ||
| 636 | |||
| 637 | /* aio_run_iocb | ||
| 638 | * This is the core aio execution routine. It is | ||
| 639 | * invoked both for initial i/o submission and | ||
| 640 | * subsequent retries via the aio_kick_handler. | ||
| 641 | * Expects to be invoked with iocb->ki_ctx->lock | ||
| 642 | * already held. The lock is released and reacquired | ||
| 643 | * as needed during processing. | ||
| 644 | * | ||
| 645 | * Calls the iocb retry method (already setup for the | ||
| 646 | * iocb on initial submission) for operation specific | ||
| 647 | * handling, but takes care of most of common retry | ||
| 648 | * execution details for a given iocb. The retry method | ||
| 649 | * needs to be non-blocking as far as possible, to avoid | ||
| 650 | * holding up other iocbs waiting to be serviced by the | ||
| 651 | * retry kernel thread. | ||
| 652 | * | ||
| 653 | * The trickier parts in this code have to do with | ||
| 654 | * ensuring that only one retry instance is in progress | ||
| 655 | * for a given iocb at any time. Providing that guarantee | ||
| 656 | * simplifies the coding of individual aio operations as | ||
| 657 | * it avoids various potential races. | ||
| 658 | */ | ||
| 659 | static ssize_t aio_run_iocb(struct kiocb *iocb) | ||
| 660 | { | ||
| 661 | struct kioctx *ctx = iocb->ki_ctx; | ||
| 662 | ssize_t (*retry)(struct kiocb *); | ||
| 663 | ssize_t ret; | ||
| 664 | |||
| 665 | if (!(retry = iocb->ki_retry)) { | ||
| 666 | printk("aio_run_iocb: iocb->ki_retry = NULL\n"); | ||
| 667 | return 0; | ||
| 668 | } | ||
| 669 | |||
| 670 | /* | ||
| 671 | * We don't want the next retry iteration for this | ||
| 672 | * operation to start until this one has returned and | ||
| 673 | * updated the iocb state. However, wait_queue functions | ||
| 674 | * can trigger a kick_iocb from interrupt context in the | ||
| 675 | * meantime, indicating that data is available for the next | ||
| 676 | * iteration. We want to remember that and enable the | ||
| 677 | * next retry iteration _after_ we are through with | ||
| 678 | * this one. | ||
| 679 | * | ||
| 680 | * So, in order to be able to register a "kick", but | ||
| 681 | * prevent it from being queued now, we clear the kick | ||
| 682 | * flag, but make the kick code *think* that the iocb is | ||
| 683 | * still on the run list until we are actually done. | ||
| 684 | * When we are done with this iteration, we check if | ||
| 685 | * the iocb was kicked in the meantime and if so, queue | ||
| 686 | * it up afresh. | ||
| 687 | */ | ||
| 688 | |||
| 689 | kiocbClearKicked(iocb); | ||
| 690 | |||
| 691 | /* | ||
| 692 | * This is so that aio_complete knows it doesn't need to | ||
| 693 | * pull the iocb off the run list (We can't just call | ||
| 694 | * INIT_LIST_HEAD because we don't want a kick_iocb to | ||
| 695 | * queue this on the run list yet) | ||
| 696 | */ | ||
| 697 | iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; | ||
| 698 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 699 | |||
| 700 | /* Quit retrying if the i/o has been cancelled */ | ||
| 701 | if (kiocbIsCancelled(iocb)) { | ||
| 702 | ret = -EINTR; | ||
| 703 | aio_complete(iocb, ret, 0); | ||
| 704 | /* must not access the iocb after this */ | ||
| 705 | goto out; | ||
| 706 | } | ||
| 707 | |||
| 708 | /* | ||
| 709 | * Now we are all set to call the retry method in async | ||
| 710 | * context. | ||
| 711 | */ | ||
| 712 | ret = retry(iocb); | ||
| 713 | |||
| 714 | if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { | ||
| 715 | /* | ||
| 716 | * There's no easy way to restart the syscall since other AIO's | ||
| 717 | * may be already running. Just fail this IO with EINTR. | ||
| 718 | */ | ||
| 719 | if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || | ||
| 720 | ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) | ||
| 721 | ret = -EINTR; | ||
| 722 | aio_complete(iocb, ret, 0); | ||
| 723 | } | ||
| 724 | out: | ||
| 725 | spin_lock_irq(&ctx->ctx_lock); | ||
| 726 | |||
| 727 | if (-EIOCBRETRY == ret) { | ||
| 728 | /* | ||
| 729 | * OK, now that we are done with this iteration | ||
| 730 | * and know that there is more left to go, | ||
| 731 | * this is where we let go so that a subsequent | ||
| 732 | * "kick" can start the next iteration | ||
| 733 | */ | ||
| 734 | |||
| 735 | /* will make __queue_kicked_iocb succeed from here on */ | ||
| 736 | INIT_LIST_HEAD(&iocb->ki_run_list); | ||
| 737 | /* we must queue the next iteration ourselves, if it | ||
| 738 | * has already been kicked */ | ||
| 739 | if (kiocbIsKicked(iocb)) { | ||
| 740 | __queue_kicked_iocb(iocb); | ||
| 741 | |||
| 742 | /* | ||
| 743 | * __queue_kicked_iocb will always return 1 here, because | ||
| 744 | * iocb->ki_run_list is empty at this point so it should | ||
| 745 | * be safe to unconditionally queue the context into the | ||
| 746 | * work queue. | ||
| 747 | */ | ||
| 748 | aio_queue_work(ctx); | ||
| 749 | } | ||
| 750 | } | ||
| 751 | return ret; | ||
| 752 | } | ||
| 753 | |||
| 754 | /* | ||
| 755 | * __aio_run_iocbs: | ||
| 756 | * Process all pending retries queued on the ioctx | ||
| 757 | * run list. | ||
| 758 | * Assumes it is operating within the aio issuer's mm | ||
| 759 | * context. | ||
| 760 | */ | ||
| 761 | static int __aio_run_iocbs(struct kioctx *ctx) | ||
| 762 | { | ||
| 763 | struct kiocb *iocb; | ||
| 764 | struct list_head run_list; | ||
| 765 | |||
| 766 | assert_spin_locked(&ctx->ctx_lock); | ||
| 767 | |||
| 768 | list_replace_init(&ctx->run_list, &run_list); | ||
| 769 | while (!list_empty(&run_list)) { | ||
| 770 | iocb = list_entry(run_list.next, struct kiocb, | ||
| 771 | ki_run_list); | ||
| 772 | list_del(&iocb->ki_run_list); | ||
| 773 | /* | ||
| 774 | * Hold an extra reference while retrying i/o. | ||
| 775 | */ | ||
| 776 | iocb->ki_users++; /* grab extra reference */ | ||
| 777 | aio_run_iocb(iocb); | ||
| 778 | __aio_put_req(ctx, iocb); | ||
| 779 | } | ||
| 780 | if (!list_empty(&ctx->run_list)) | ||
| 781 | return 1; | ||
| 782 | return 0; | ||
| 783 | } | ||
| 784 | |||
| 785 | static void aio_queue_work(struct kioctx * ctx) | ||
| 786 | { | ||
| 787 | unsigned long timeout; | ||
| 788 | /* | ||
| 789 | * if someone is waiting, get the work started right | ||
| 790 | * away, otherwise, use a longer delay | ||
| 791 | */ | ||
| 792 | smp_mb(); | ||
| 793 | if (waitqueue_active(&ctx->wait)) | ||
| 794 | timeout = 1; | ||
| 795 | else | ||
| 796 | timeout = HZ/10; | ||
| 797 | queue_delayed_work(aio_wq, &ctx->wq, timeout); | ||
| 798 | } | ||
| 799 | |||
| 800 | /* | ||
| 801 | * aio_run_all_iocbs: | ||
| 802 | * Process all pending retries queued on the ioctx | ||
| 803 | * run list, and keep running them until the list | ||
| 804 | * stays empty. | ||
| 805 | * Assumes it is operating within the aio issuer's mm context. | ||
| 806 | */ | ||
| 807 | static inline void aio_run_all_iocbs(struct kioctx *ctx) | ||
| 808 | { | ||
| 809 | spin_lock_irq(&ctx->ctx_lock); | ||
| 810 | while (__aio_run_iocbs(ctx)) | ||
| 811 | ; | ||
| 812 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 813 | } | ||
| 814 | |||
| 815 | /* | ||
| 816 | * aio_kick_handler: | ||
| 817 | * Work queue handler triggered to process pending | ||
| 818 | * retries on an ioctx. Takes on the aio issuer's | ||
| 819 | * mm context before running the iocbs, so that | ||
| 820 | * copy_xxx_user operates on the issuer's address | ||
| 821 | * space. | ||
| 822 | * Run on aiod's context. | ||
| 823 | */ | ||
| 824 | static void aio_kick_handler(struct work_struct *work) | ||
| 825 | { | ||
| 826 | struct kioctx *ctx = container_of(work, struct kioctx, wq.work); | ||
| 827 | mm_segment_t oldfs = get_fs(); | ||
| 828 | struct mm_struct *mm; | ||
| 829 | int requeue; | ||
| 830 | |||
| 831 | set_fs(USER_DS); | ||
| 832 | use_mm(ctx->mm); | ||
| 833 | spin_lock_irq(&ctx->ctx_lock); | ||
| 834 | requeue =__aio_run_iocbs(ctx); | ||
| 835 | mm = ctx->mm; | ||
| 836 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 837 | unuse_mm(mm); | ||
| 838 | set_fs(oldfs); | ||
| 839 | /* | ||
| 840 | * we're in a worker thread already; no point using non-zero delay | ||
| 841 | */ | ||
| 842 | if (requeue) | ||
| 843 | queue_delayed_work(aio_wq, &ctx->wq, 0); | ||
| 844 | } | ||
| 845 | |||
| 846 | |||
| 847 | /* | ||
| 848 | * Called by kick_iocb to queue the kiocb for retry | ||
| 849 | * and if required activate the aio work queue to process | ||
| 850 | * it | ||
| 851 | */ | ||
| 852 | static void try_queue_kicked_iocb(struct kiocb *iocb) | ||
| 853 | { | ||
| 854 | struct kioctx *ctx = iocb->ki_ctx; | ||
| 855 | unsigned long flags; | ||
| 856 | int run = 0; | ||
| 857 | |||
| 858 | spin_lock_irqsave(&ctx->ctx_lock, flags); | ||
| 859 | /* set this inside the lock so that we can't race with aio_run_iocb() | ||
| 860 | * testing it and putting the iocb on the run list under the lock */ | ||
| 861 | if (!kiocbTryKick(iocb)) | ||
| 862 | run = __queue_kicked_iocb(iocb); | ||
| 863 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | ||
| 864 | if (run) | ||
| 865 | aio_queue_work(ctx); | ||
| 866 | } | ||
| 867 | |||
| 868 | /* | ||
| 869 | * kick_iocb: | ||
| 870 | * Called typically from a wait queue callback context | ||
| 871 | * to trigger a retry of the iocb. | ||
| 872 | * The retry is usually executed by aio workqueue | ||
| 873 | * threads (See aio_kick_handler). | ||
| 874 | */ | ||
| 875 | void kick_iocb(struct kiocb *iocb) | ||
| 876 | { | ||
| 877 | /* sync iocbs are easy: they can only ever be executing from a | ||
| 878 | * single context. */ | ||
| 879 | if (is_sync_kiocb(iocb)) { | ||
| 880 | kiocbSetKicked(iocb); | ||
| 881 | wake_up_process(iocb->ki_obj.tsk); | ||
| 882 | return; | ||
| 883 | } | ||
| 884 | |||
| 885 | try_queue_kicked_iocb(iocb); | ||
| 886 | } | ||
| 887 | EXPORT_SYMBOL(kick_iocb); | ||
| 888 | |||
| 889 | /* aio_complete | 575 | /* aio_complete |
| 890 | * Called when the io request on the given iocb is complete. | 576 | * Called when the io request on the given iocb is complete. |
| 891 | * Returns true if this is the last user of the request. The | ||
| 892 | * only other user of the request can be the cancellation code. | ||
| 893 | */ | 577 | */ |
| 894 | int aio_complete(struct kiocb *iocb, long res, long res2) | 578 | void aio_complete(struct kiocb *iocb, long res, long res2) |
| 895 | { | 579 | { |
| 896 | struct kioctx *ctx = iocb->ki_ctx; | 580 | struct kioctx *ctx = iocb->ki_ctx; |
| 897 | struct aio_ring_info *info; | ||
| 898 | struct aio_ring *ring; | 581 | struct aio_ring *ring; |
| 899 | struct io_event *event; | 582 | struct io_event *ev_page, *event; |
| 900 | unsigned long flags; | 583 | unsigned long flags; |
| 901 | unsigned long tail; | 584 | unsigned tail, pos; |
| 902 | int ret; | ||
| 903 | 585 | ||
| 904 | /* | 586 | /* |
| 905 | * Special case handling for sync iocbs: | 587 | * Special case handling for sync iocbs: |
| @@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2) | |||
| 909 | * - the sync task helpfully left a reference to itself in the iocb | 591 | * - the sync task helpfully left a reference to itself in the iocb |
| 910 | */ | 592 | */ |
| 911 | if (is_sync_kiocb(iocb)) { | 593 | if (is_sync_kiocb(iocb)) { |
| 912 | BUG_ON(iocb->ki_users != 1); | 594 | BUG_ON(atomic_read(&iocb->ki_users) != 1); |
| 913 | iocb->ki_user_data = res; | 595 | iocb->ki_user_data = res; |
| 914 | iocb->ki_users = 0; | 596 | atomic_set(&iocb->ki_users, 0); |
| 915 | wake_up_process(iocb->ki_obj.tsk); | 597 | wake_up_process(iocb->ki_obj.tsk); |
| 916 | return 1; | 598 | return; |
| 917 | } | 599 | } |
| 918 | 600 | ||
| 919 | info = &ctx->ring_info; | 601 | /* |
| 920 | 602 | * Take rcu_read_lock() in case the kioctx is being destroyed, as we | |
| 921 | /* add a completion event to the ring buffer. | 603 | * need to issue a wakeup after decrementing reqs_active. |
| 922 | * must be done holding ctx->ctx_lock to prevent | ||
| 923 | * other code from messing with the tail | ||
| 924 | * pointer since we might be called from irq | ||
| 925 | * context. | ||
| 926 | */ | 604 | */ |
| 927 | spin_lock_irqsave(&ctx->ctx_lock, flags); | 605 | rcu_read_lock(); |
| 928 | 606 | ||
| 929 | if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) | 607 | if (iocb->ki_list.next) { |
| 930 | list_del_init(&iocb->ki_run_list); | 608 | unsigned long flags; |
| 609 | |||
| 610 | spin_lock_irqsave(&ctx->ctx_lock, flags); | ||
| 611 | list_del(&iocb->ki_list); | ||
| 612 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | ||
| 613 | } | ||
| 931 | 614 | ||
| 932 | /* | 615 | /* |
| 933 | * cancelled requests don't get events, userland was given one | 616 | * cancelled requests don't get events, userland was given one |
| 934 | * when the event got cancelled. | 617 | * when the event got cancelled. |
| 935 | */ | 618 | */ |
| 936 | if (kiocbIsCancelled(iocb)) | 619 | if (unlikely(xchg(&iocb->ki_cancel, |
| 620 | KIOCB_CANCELLED) == KIOCB_CANCELLED)) { | ||
| 621 | atomic_dec(&ctx->reqs_active); | ||
| 622 | /* Still need the wake_up in case free_ioctx is waiting */ | ||
| 937 | goto put_rq; | 623 | goto put_rq; |
| 624 | } | ||
| 938 | 625 | ||
| 939 | ring = kmap_atomic(info->ring_pages[0]); | 626 | /* |
| 627 | * Add a completion event to the ring buffer. Must be done holding | ||
| 628 | * ctx->ctx_lock to prevent other code from messing with the tail | ||
| 629 | * pointer since we might be called from irq context. | ||
| 630 | */ | ||
| 631 | spin_lock_irqsave(&ctx->completion_lock, flags); | ||
| 940 | 632 | ||
| 941 | tail = info->tail; | 633 | tail = ctx->tail; |
| 942 | event = aio_ring_event(info, tail); | 634 | pos = tail + AIO_EVENTS_OFFSET; |
| 943 | if (++tail >= info->nr) | 635 | |
| 636 | if (++tail >= ctx->nr_events) | ||
| 944 | tail = 0; | 637 | tail = 0; |
| 945 | 638 | ||
| 639 | ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); | ||
| 640 | event = ev_page + pos % AIO_EVENTS_PER_PAGE; | ||
| 641 | |||
| 946 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; | 642 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; |
| 947 | event->data = iocb->ki_user_data; | 643 | event->data = iocb->ki_user_data; |
| 948 | event->res = res; | 644 | event->res = res; |
| 949 | event->res2 = res2; | 645 | event->res2 = res2; |
| 950 | 646 | ||
| 951 | dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", | 647 | kunmap_atomic(ev_page); |
| 952 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, | 648 | flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); |
| 953 | res, res2); | 649 | |
| 650 | pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", | ||
| 651 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, | ||
| 652 | res, res2); | ||
| 954 | 653 | ||
| 955 | /* after flagging the request as done, we | 654 | /* after flagging the request as done, we |
| 956 | * must never even look at it again | 655 | * must never even look at it again |
| 957 | */ | 656 | */ |
| 958 | smp_wmb(); /* make event visible before updating tail */ | 657 | smp_wmb(); /* make event visible before updating tail */ |
| 959 | 658 | ||
| 960 | info->tail = tail; | 659 | ctx->tail = tail; |
| 961 | ring->tail = tail; | ||
| 962 | 660 | ||
| 963 | put_aio_ring_event(event); | 661 | ring = kmap_atomic(ctx->ring_pages[0]); |
| 662 | ring->tail = tail; | ||
| 964 | kunmap_atomic(ring); | 663 | kunmap_atomic(ring); |
| 664 | flush_dcache_page(ctx->ring_pages[0]); | ||
| 665 | |||
| 666 | spin_unlock_irqrestore(&ctx->completion_lock, flags); | ||
| 965 | 667 | ||
| 966 | pr_debug("added to ring %p at [%lu]\n", iocb, tail); | 668 | pr_debug("added to ring %p at [%u]\n", iocb, tail); |
| 967 | 669 | ||
| 968 | /* | 670 | /* |
| 969 | * Check if the user asked us to deliver the result through an | 671 | * Check if the user asked us to deliver the result through an |
| @@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) | |||
| 975 | 677 | ||
| 976 | put_rq: | 678 | put_rq: |
| 977 | /* everything turned out well, dispose of the aiocb. */ | 679 | /* everything turned out well, dispose of the aiocb. */ |
| 978 | ret = __aio_put_req(ctx, iocb); | 680 | aio_put_req(iocb); |
| 979 | 681 | ||
| 980 | /* | 682 | /* |
| 981 | * We have to order our ring_info tail store above and test | 683 | * We have to order our ring_info tail store above and test |
| @@ -988,233 +690,133 @@ put_rq: | |||
| 988 | if (waitqueue_active(&ctx->wait)) | 690 | if (waitqueue_active(&ctx->wait)) |
| 989 | wake_up(&ctx->wait); | 691 | wake_up(&ctx->wait); |
| 990 | 692 | ||
| 991 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | 693 | rcu_read_unlock(); |
| 992 | return ret; | ||
| 993 | } | 694 | } |
| 994 | EXPORT_SYMBOL(aio_complete); | 695 | EXPORT_SYMBOL(aio_complete); |
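
The completion path fills the event slot, issues smp_wmb(), and only then publishes the new tail, so any reader that observes the tail also observes the event. A single-producer userspace sketch of that ordering, with a release store standing in for smp_wmb() (struct layout is illustrative):

    #include <stdatomic.h>

    struct ring {
        _Atomic unsigned tail;
        unsigned nr;
        struct { long res; long res2; } ev[128];
    };

    static void ring_push(struct ring *r, long res, long res2)
    {
        unsigned tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

        r->ev[tail].res = res;    /* fill the slot first */
        r->ev[tail].res2 = res2;
        if (++tail >= r->nr)
            tail = 0;
        /* the release store makes the event contents visible no later
         * than the new tail, mirroring the smp_wmb() above */
        atomic_store_explicit(&r->tail, tail, memory_order_release);
    }
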
| 995 | 696 | ||
| 996 | /* aio_read_evt | 697 | /* aio_read_events |
| 997 | * Pull an event off of the ioctx's event ring. Returns the number of | 698 | * Pull an event off of the ioctx's event ring. Returns the number of |
| 998 | * events fetched (0 or 1 ;-) | 699 | * events fetched |
| 999 | * FIXME: make this use cmpxchg. | ||
| 1000 | * TODO: make the ringbuffer user mmap()able (requires FIXME). | ||
| 1001 | */ | 700 | */ |
| 1002 | static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) | 701 | static long aio_read_events_ring(struct kioctx *ctx, |
| 702 | struct io_event __user *event, long nr) | ||
| 1003 | { | 703 | { |
| 1004 | struct aio_ring_info *info = &ioctx->ring_info; | ||
| 1005 | struct aio_ring *ring; | 704 | struct aio_ring *ring; |
| 1006 | unsigned long head; | 705 | unsigned head, pos; |
| 1007 | int ret = 0; | 706 | long ret = 0; |
| 1008 | 707 | int copy_ret; | |
| 1009 | ring = kmap_atomic(info->ring_pages[0]); | ||
| 1010 | dprintk("in aio_read_evt h%lu t%lu m%lu\n", | ||
| 1011 | (unsigned long)ring->head, (unsigned long)ring->tail, | ||
| 1012 | (unsigned long)ring->nr); | ||
| 1013 | |||
| 1014 | if (ring->head == ring->tail) | ||
| 1015 | goto out; | ||
| 1016 | 708 | ||
| 1017 | spin_lock(&info->ring_lock); | 709 | mutex_lock(&ctx->ring_lock); |
| 1018 | |||
| 1019 | head = ring->head % info->nr; | ||
| 1020 | if (head != ring->tail) { | ||
| 1021 | struct io_event *evp = aio_ring_event(info, head); | ||
| 1022 | *ent = *evp; | ||
| 1023 | head = (head + 1) % info->nr; | ||
| 1024 | smp_mb(); /* finish reading the event before updatng the head */ | ||
| 1025 | ring->head = head; | ||
| 1026 | ret = 1; | ||
| 1027 | put_aio_ring_event(evp); | ||
| 1028 | } | ||
| 1029 | spin_unlock(&info->ring_lock); | ||
| 1030 | 710 | ||
| 1031 | out: | 711 | ring = kmap_atomic(ctx->ring_pages[0]); |
| 1032 | dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, | 712 | head = ring->head; |
| 1033 | (unsigned long)ring->head, (unsigned long)ring->tail); | ||
| 1034 | kunmap_atomic(ring); | 713 | kunmap_atomic(ring); |
| 1035 | return ret; | ||
| 1036 | } | ||
| 1037 | 714 | ||
| 1038 | struct aio_timeout { | 715 | pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); |
| 1039 | struct timer_list timer; | ||
| 1040 | int timed_out; | ||
| 1041 | struct task_struct *p; | ||
| 1042 | }; | ||
| 1043 | 716 | ||
| 1044 | static void timeout_func(unsigned long data) | 717 | if (head == ctx->tail) |
| 1045 | { | 718 | goto out; |
| 1046 | struct aio_timeout *to = (struct aio_timeout *)data; | ||
| 1047 | 719 | ||
| 1048 | to->timed_out = 1; | 720 | while (ret < nr) { |
| 1049 | wake_up_process(to->p); | 721 | long avail; |
| 1050 | } | 722 | struct io_event *ev; |
| 723 | struct page *page; | ||
| 1051 | 724 | ||
| 1052 | static inline void init_timeout(struct aio_timeout *to) | 725 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; |
| 1053 | { | 726 | if (head == ctx->tail) |
| 1054 | setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); | 727 | break; |
| 1055 | to->timed_out = 0; | ||
| 1056 | to->p = current; | ||
| 1057 | } | ||
| 1058 | 728 | ||
| 1059 | static inline void set_timeout(long start_jiffies, struct aio_timeout *to, | 729 | avail = min(avail, nr - ret); |
| 1060 | const struct timespec *ts) | 730 | avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - |
| 1061 | { | 731 | ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); |
| 1062 | to->timer.expires = start_jiffies + timespec_to_jiffies(ts); | ||
| 1063 | if (time_after(to->timer.expires, jiffies)) | ||
| 1064 | add_timer(&to->timer); | ||
| 1065 | else | ||
| 1066 | to->timed_out = 1; | ||
| 1067 | } | ||
| 1068 | 732 | ||
| 1069 | static inline void clear_timeout(struct aio_timeout *to) | 733 | pos = head + AIO_EVENTS_OFFSET; |
| 1070 | { | 734 | page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; |
| 1071 | del_singleshot_timer_sync(&to->timer); | 735 | pos %= AIO_EVENTS_PER_PAGE; |
| 1072 | } | ||
| 1073 | 736 | ||
| 1074 | static int read_events(struct kioctx *ctx, | 737 | ev = kmap(page); |
| 1075 | long min_nr, long nr, | 738 | copy_ret = copy_to_user(event + ret, ev + pos, |
| 1076 | struct io_event __user *event, | 739 | sizeof(*ev) * avail); |
| 1077 | struct timespec __user *timeout) | 740 | kunmap(page); |
| 1078 | { | ||
| 1079 | long start_jiffies = jiffies; | ||
| 1080 | struct task_struct *tsk = current; | ||
| 1081 | DECLARE_WAITQUEUE(wait, tsk); | ||
| 1082 | int ret; | ||
| 1083 | int i = 0; | ||
| 1084 | struct io_event ent; | ||
| 1085 | struct aio_timeout to; | ||
| 1086 | int retry = 0; | ||
| 1087 | |||
| 1088 | /* needed to zero any padding within an entry (there shouldn't be | ||
| 1089 | * any, but C is fun!) | ||
| 1090 | */ | ||
| 1091 | memset(&ent, 0, sizeof(ent)); | ||
| 1092 | retry: | ||
| 1093 | ret = 0; | ||
| 1094 | while (likely(i < nr)) { | ||
| 1095 | ret = aio_read_evt(ctx, &ent); | ||
| 1096 | if (unlikely(ret <= 0)) | ||
| 1097 | break; | ||
| 1098 | |||
| 1099 | dprintk("read event: %Lx %Lx %Lx %Lx\n", | ||
| 1100 | ent.data, ent.obj, ent.res, ent.res2); | ||
| 1101 | 741 | ||
| 1102 | /* Could we split the check in two? */ | 742 | if (unlikely(copy_ret)) { |
| 1103 | ret = -EFAULT; | 743 | ret = -EFAULT; |
| 1104 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { | 744 | goto out; |
| 1105 | dprintk("aio: lost an event due to EFAULT.\n"); | ||
| 1106 | break; | ||
| 1107 | } | 745 | } |
| 1108 | ret = 0; | ||
| 1109 | 746 | ||
| 1110 | /* Good, event copied to userland, update counts. */ | 747 | ret += avail; |
| 1111 | event ++; | 748 | head += avail; |
| 1112 | i ++; | 749 | head %= ctx->nr_events; |
| 1113 | } | 750 | } |
| 1114 | 751 | ||
| 1115 | if (min_nr <= i) | 752 | ring = kmap_atomic(ctx->ring_pages[0]); |
| 1116 | return i; | 753 | ring->head = head; |
| 1117 | if (ret) | 754 | kunmap_atomic(ring); |
| 1118 | return ret; | 755 | flush_dcache_page(ctx->ring_pages[0]); |
| 1119 | 756 | ||
| 1120 | /* End fast path */ | 757 | pr_debug("%li h%u t%u\n", ret, head, ctx->tail); |
| 1121 | 758 | ||
| 1122 | /* racy check, but it gets redone */ | 759 | atomic_sub(ret, &ctx->reqs_active); |
| 1123 | if (!retry && unlikely(!list_empty(&ctx->run_list))) { | 760 | out: |
| 1124 | retry = 1; | 761 | mutex_unlock(&ctx->ring_lock); |
| 1125 | aio_run_all_iocbs(ctx); | ||
| 1126 | goto retry; | ||
| 1127 | } | ||
| 1128 | 762 | ||
| 1129 | init_timeout(&to); | 763 | return ret; |
| 1130 | if (timeout) { | 764 | } |
| 1131 | struct timespec ts; | ||
| 1132 | ret = -EFAULT; | ||
| 1133 | if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) | ||
| 1134 | goto out; | ||
| 1135 | 765 | ||
| 1136 | set_timeout(start_jiffies, &to, &ts); | 766 | static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, |
| 1137 | } | 767 | struct io_event __user *event, long *i) |
| 768 | { | ||
| 769 | long ret = aio_read_events_ring(ctx, event + *i, nr - *i); | ||
| 1138 | 770 | ||
| 1139 | while (likely(i < nr)) { | 771 | if (ret > 0) |
| 1140 | add_wait_queue_exclusive(&ctx->wait, &wait); | 772 | *i += ret; |
| 1141 | do { | ||
| 1142 | set_task_state(tsk, TASK_INTERRUPTIBLE); | ||
| 1143 | ret = aio_read_evt(ctx, &ent); | ||
| 1144 | if (ret) | ||
| 1145 | break; | ||
| 1146 | if (min_nr <= i) | ||
| 1147 | break; | ||
| 1148 | if (unlikely(ctx->dead)) { | ||
| 1149 | ret = -EINVAL; | ||
| 1150 | break; | ||
| 1151 | } | ||
| 1152 | if (to.timed_out) /* Only check after read evt */ | ||
| 1153 | break; | ||
| 1154 | /* Try to only show up in io wait if there are ops | ||
| 1155 | * in flight */ | ||
| 1156 | if (ctx->reqs_active) | ||
| 1157 | io_schedule(); | ||
| 1158 | else | ||
| 1159 | schedule(); | ||
| 1160 | if (signal_pending(tsk)) { | ||
| 1161 | ret = -EINTR; | ||
| 1162 | break; | ||
| 1163 | } | ||
| 1164 | /*ret = aio_read_evt(ctx, &ent);*/ | ||
| 1165 | } while (1) ; | ||
| 1166 | |||
| 1167 | set_task_state(tsk, TASK_RUNNING); | ||
| 1168 | remove_wait_queue(&ctx->wait, &wait); | ||
| 1169 | |||
| 1170 | if (unlikely(ret <= 0)) | ||
| 1171 | break; | ||
| 1172 | 773 | ||
| 1173 | ret = -EFAULT; | 774 | if (unlikely(atomic_read(&ctx->dead))) |
| 1174 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { | 775 | ret = -EINVAL; |
| 1175 | dprintk("aio: lost an event due to EFAULT.\n"); | ||
| 1176 | break; | ||
| 1177 | } | ||
| 1178 | 776 | ||
| 1179 | /* Good, event copied to userland, update counts. */ | 777 | if (!*i) |
| 1180 | event ++; | 778 | *i = ret; |
| 1181 | i ++; | ||
| 1182 | } | ||
| 1183 | 779 | ||
| 1184 | if (timeout) | 780 | return ret < 0 || *i >= min_nr; |
| 1185 | clear_timeout(&to); | ||
| 1186 | out: | ||
| 1187 | destroy_timer_on_stack(&to.timer); | ||
| 1188 | return i ? i : ret; | ||
| 1189 | } | 781 | } |
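
aio_read_events_ring() is the consumer half: load the tail, copy out everything between head and tail, then publish the new head. Reusing the illustrative struct ring from the producer sketch above:

    static unsigned ring_pop(struct ring *r, _Atomic unsigned *head_p,
                             long *out, unsigned max)
    {
        unsigned head = atomic_load_explicit(head_p, memory_order_relaxed);
        /* acquire pairs with the producer's release store of the tail */
        unsigned tail = atomic_load_explicit(&r->tail, memory_order_acquire);
        unsigned n = 0;

        while (head != tail && n < max) {
            out[n++] = r->ev[head].res;
            if (++head >= r->nr)
                head = 0;
        }
        atomic_store_explicit(head_p, head, memory_order_release);
        return n;
    }
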
| 1190 | 782 | ||
| 1191 | /* Take an ioctx and remove it from the list of ioctx's. Protects | 783 | static long read_events(struct kioctx *ctx, long min_nr, long nr, |
| 1192 | * against races with itself via ->dead. | 784 | struct io_event __user *event, |
| 1193 | */ | 785 | struct timespec __user *timeout) |
| 1194 | static void io_destroy(struct kioctx *ioctx) | ||
| 1195 | { | 786 | { |
| 1196 | struct mm_struct *mm = current->mm; | 787 | ktime_t until = { .tv64 = KTIME_MAX }; |
| 1197 | int was_dead; | 788 | long ret = 0; |
| 1198 | 789 | ||
| 1199 | /* delete the entry from the list if someone else hasn't already */ | ||
| 1200 | spin_lock(&mm->ioctx_lock); | 791 | struct timespec ts; |
| 1201 | was_dead = ioctx->dead; | ||
| 1202 | ioctx->dead = 1; | ||
| 1203 | hlist_del_rcu(&ioctx->list); | ||
| 1204 | spin_unlock(&mm->ioctx_lock); | ||
| 1205 | 792 | ||
| 1206 | dprintk("aio_release(%p)\n", ioctx); | 793 | if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) |
| 1207 | if (likely(!was_dead)) | 794 | return -EFAULT; |
| 1208 | put_ioctx(ioctx); /* twice for the list */ | ||
| 1209 | 795 | ||
| 1210 | kill_ctx(ioctx); | 796 | until = timespec_to_ktime(ts); |
| 797 | } | ||
| 1211 | 798 | ||
| 1212 | /* | 799 | /* |
| 1213 | * Wake up any waiters. The setting of ctx->dead must be seen | 800 | * Note that aio_read_events() is being called as the conditional - i.e. |
| 1214 | * by other CPUs at this point. Right now, we rely on the | 801 | * we're calling it after prepare_to_wait() has set task state to |
| 1215 | * locking done by the above calls to ensure this consistency. | 802 | * TASK_INTERRUPTIBLE. |
| 803 | * | ||
| 804 | * But aio_read_events() can block, and if it blocks it's going to flip | ||
| 805 | * the task state back to TASK_RUNNING. | ||
| 806 | * | ||
| 807 | * This should be ok, provided it doesn't flip the state back to | ||
| 808 | * TASK_RUNNING and return 0 too much - that causes us to spin. That | ||
| 809 | * will only happen if the mutex_lock() call blocks, and we then find | ||
| 810 | * the ringbuffer empty. So in practice we should be ok, but it's | ||
| 811 | * something to be aware of when touching this code. | ||
| 1216 | */ | 812 | */ |
| 1217 | wake_up_all(&ioctx->wait); | 813 | wait_event_interruptible_hrtimeout(ctx->wait, |
| 814 | aio_read_events(ctx, min_nr, nr, event, &ret), until); | ||
| 815 | |||
| 816 | if (!ret && signal_pending(current)) | ||
| 817 | ret = -EINTR; | ||
| 818 | |||
| 819 | return ret; | ||
| 1218 | } | 820 | } |
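
read_events() leans on wait_event_interruptible_hrtimeout(), re-evaluating aio_read_events() as the wake condition, with the caveat in the comment that the condition itself may block. A rough userspace analogue of re-check-under-lock waiting against an absolute deadline (pthreads; have_events is a placeholder callback):

    #include <pthread.h>
    #include <stdbool.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

    static int wait_for_events(bool (*have_events)(void),
                               const struct timespec *deadline)
    {
        int err = 0;

        pthread_mutex_lock(&lock);
        /* re-check the condition on every wakeup, as the wait_event()
         * family does, instead of trusting a single signal */
        while (!have_events() && err == 0)
            err = pthread_cond_timedwait(&cond, &lock, deadline);
        pthread_mutex_unlock(&lock);
        return err;   /* 0 on success, ETIMEDOUT past the deadline */
    }
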
| 1219 | 821 | ||
| 1220 | /* sys_io_setup: | 822 | /* sys_io_setup: |
| @@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) | |||
| 1252 | if (!IS_ERR(ioctx)) { | 854 | if (!IS_ERR(ioctx)) { |
| 1253 | ret = put_user(ioctx->user_id, ctxp); | 855 | ret = put_user(ioctx->user_id, ctxp); |
| 1254 | if (ret) | 856 | if (ret) |
| 1255 | io_destroy(ioctx); | 857 | kill_ioctx(ioctx); |
| 1256 | put_ioctx(ioctx); | 858 | put_ioctx(ioctx); |
| 1257 | } | 859 | } |
| 1258 | 860 | ||
| @@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) | |||
| 1270 | { | 872 | { |
| 1271 | struct kioctx *ioctx = lookup_ioctx(ctx); | 873 | struct kioctx *ioctx = lookup_ioctx(ctx); |
| 1272 | if (likely(NULL != ioctx)) { | 874 | if (likely(NULL != ioctx)) { |
| 1273 | io_destroy(ioctx); | 875 | kill_ioctx(ioctx); |
| 1274 | put_ioctx(ioctx); | 876 | put_ioctx(ioctx); |
| 1275 | return 0; | 877 | return 0; |
| 1276 | } | 878 | } |
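
io_setup and io_destroy above, together with io_submit and io_getevents further down, form the whole user-visible surface of this file. A minimal hedged driver invoking them through the raw syscall numbers (error handling elided; the path and buffer size are arbitrary):

    #include <fcntl.h>
    #include <linux/aio_abi.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        aio_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        char buf[4096];
        int fd = open("/etc/hostname", O_RDONLY);

        syscall(__NR_io_setup, 128, &ctx);            /* ioctx_alloc() path */

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_buf = (unsigned long)buf;
        cb.aio_nbytes = sizeof(buf);

        syscall(__NR_io_submit, ctx, 1, cbs);         /* io_submit_one() path */
        syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL); /* read_events() path */
        syscall(__NR_io_destroy, ctx);                /* kill_ioctx() path */
        close(fd);
        return 0;
    }
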
| @@ -1301,30 +903,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) | |||
| 1301 | BUG_ON(ret > 0 && iocb->ki_left == 0); | 903 | BUG_ON(ret > 0 && iocb->ki_left == 0); |
| 1302 | } | 904 | } |
| 1303 | 905 | ||
| 1304 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | 906 | typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, |
| 907 | unsigned long, loff_t); | ||
| 908 | |||
| 909 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) | ||
| 1305 | { | 910 | { |
| 1306 | struct file *file = iocb->ki_filp; | 911 | struct file *file = iocb->ki_filp; |
| 1307 | struct address_space *mapping = file->f_mapping; | 912 | struct address_space *mapping = file->f_mapping; |
| 1308 | struct inode *inode = mapping->host; | 913 | struct inode *inode = mapping->host; |
| 1309 | ssize_t (*rw_op)(struct kiocb *, const struct iovec *, | ||
| 1310 | unsigned long, loff_t); | ||
| 1311 | ssize_t ret = 0; | 914 | ssize_t ret = 0; |
| 1312 | unsigned short opcode; | ||
| 1313 | |||
| 1314 | if ((iocb->ki_opcode == IOCB_CMD_PREADV) || | ||
| 1315 | (iocb->ki_opcode == IOCB_CMD_PREAD)) { | ||
| 1316 | rw_op = file->f_op->aio_read; | ||
| 1317 | opcode = IOCB_CMD_PREADV; | ||
| 1318 | } else { | ||
| 1319 | rw_op = file->f_op->aio_write; | ||
| 1320 | opcode = IOCB_CMD_PWRITEV; | ||
| 1321 | } | ||
| 1322 | 915 | ||
| 1323 | /* This matches the pread()/pwrite() logic */ | 916 | /* This matches the pread()/pwrite() logic */ |
| 1324 | if (iocb->ki_pos < 0) | 917 | if (iocb->ki_pos < 0) |
| 1325 | return -EINVAL; | 918 | return -EINVAL; |
| 1326 | 919 | ||
| 1327 | if (opcode == IOCB_CMD_PWRITEV) | 920 | if (rw == WRITE) |
| 1328 | file_start_write(file); | 921 | file_start_write(file); |
| 1329 | do { | 922 | do { |
| 1330 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], | 923 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], |
| @@ -1336,9 +929,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | |||
| 1336 | /* retry all partial writes. retry partial reads as long as its a | 929 | /* retry all partial writes. retry partial reads as long as its a |
| 1337 | * regular file. */ | 930 | * regular file. */ |
| 1338 | } while (ret > 0 && iocb->ki_left > 0 && | 931 | } while (ret > 0 && iocb->ki_left > 0 && |
| 1339 | (opcode == IOCB_CMD_PWRITEV || | 932 | (rw == WRITE || |
| 1340 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); | 933 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); |
| 1341 | if (opcode == IOCB_CMD_PWRITEV) | 934 | if (rw == WRITE) |
| 1342 | file_end_write(file); | 935 | file_end_write(file); |
| 1343 | 936 | ||
| 1344 | /* This means we must have transferred all that we could */ | 937 | /* This means we must have transferred all that we could */ |
| @@ -1348,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | |||
| 1348 | 941 | ||
| 1349 | /* If we managed to write some out we return that, rather than | 942 | /* If we managed to write some out we return that, rather than |
| 1350 | * the eventual error. */ | 943 | * the eventual error. */ |
| 1351 | if (opcode == IOCB_CMD_PWRITEV | 944 | if (rw == WRITE |
| 1352 | && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY | 945 | && ret < 0 && ret != -EIOCBQUEUED |
| 1353 | && iocb->ki_nbytes - iocb->ki_left) | 946 | && iocb->ki_nbytes - iocb->ki_left) |
| 1354 | ret = iocb->ki_nbytes - iocb->ki_left; | 947 | ret = iocb->ki_nbytes - iocb->ki_left; |
| 1355 | 948 | ||
| 1356 | return ret; | 949 | return ret; |
| 1357 | } | 950 | } |
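
aio_rw_vect_retry() keeps the long-standing policy: retry all partial writes, retry partial reads only on regular files, and prefer reporting partial progress over the eventual error. The bare retry shape as a plain userspace helper (illustrative, not the kernel loop itself):

    #include <errno.h>
    #include <unistd.h>

    static ssize_t write_all(int fd, const char *buf, size_t len)
    {
        size_t done = 0;

        while (done < len) {
            ssize_t ret = write(fd, buf + done, len - done);

            if (ret < 0) {
                if (errno == EINTR)
                    continue;       /* retry interrupted writes */
                /* report progress made rather than the late error */
                return done ? (ssize_t)done : -1;
            }
            done += ret;
        }
        return done;
    }
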
| 1358 | 951 | ||
| 1359 | static ssize_t aio_fdsync(struct kiocb *iocb) | 952 | static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) |
| 1360 | { | ||
| 1361 | struct file *file = iocb->ki_filp; | ||
| 1362 | ssize_t ret = -EINVAL; | ||
| 1363 | |||
| 1364 | if (file->f_op->aio_fsync) | ||
| 1365 | ret = file->f_op->aio_fsync(iocb, 1); | ||
| 1366 | return ret; | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | static ssize_t aio_fsync(struct kiocb *iocb) | ||
| 1370 | { | ||
| 1371 | struct file *file = iocb->ki_filp; | ||
| 1372 | ssize_t ret = -EINVAL; | ||
| 1373 | |||
| 1374 | if (file->f_op->aio_fsync) | ||
| 1375 | ret = file->f_op->aio_fsync(iocb, 0); | ||
| 1376 | return ret; | ||
| 1377 | } | ||
| 1378 | |||
| 1379 | static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) | ||
| 1380 | { | 953 | { |
| 1381 | ssize_t ret; | 954 | ssize_t ret; |
| 1382 | 955 | ||
| 956 | kiocb->ki_nr_segs = kiocb->ki_nbytes; | ||
| 957 | |||
| 1383 | #ifdef CONFIG_COMPAT | 958 | #ifdef CONFIG_COMPAT |
| 1384 | if (compat) | 959 | if (compat) |
| 1385 | ret = compat_rw_copy_check_uvector(type, | 960 | ret = compat_rw_copy_check_uvector(rw, |
| 1386 | (struct compat_iovec __user *)kiocb->ki_buf, | 961 | (struct compat_iovec __user *)kiocb->ki_buf, |
| 1387 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, | 962 | kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, |
| 1388 | &kiocb->ki_iovec); | 963 | &kiocb->ki_iovec); |
| 1389 | else | 964 | else |
| 1390 | #endif | 965 | #endif |
| 1391 | ret = rw_copy_check_uvector(type, | 966 | ret = rw_copy_check_uvector(rw, |
| 1392 | (struct iovec __user *)kiocb->ki_buf, | 967 | (struct iovec __user *)kiocb->ki_buf, |
| 1393 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, | 968 | kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, |
| 1394 | &kiocb->ki_iovec); | 969 | &kiocb->ki_iovec); |
| 1395 | if (ret < 0) | 970 | if (ret < 0) |
| 1396 | goto out; | 971 | return ret; |
| 1397 | |||
| 1398 | ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); | ||
| 1399 | if (ret < 0) | ||
| 1400 | goto out; | ||
| 1401 | 972 | ||
| 1402 | kiocb->ki_nr_segs = kiocb->ki_nbytes; | 973 | /* ki_nbytes now reflects bytes instead of segs */
| 1403 | kiocb->ki_cur_seg = 0; | ||
| 1404 | /* ki_nbytes/left now reflect bytes instead of segs */ | ||
| 1405 | kiocb->ki_nbytes = ret; | 974 | kiocb->ki_nbytes = ret; |
| 1406 | kiocb->ki_left = ret; | 975 | return 0; |
| 1407 | |||
| 1408 | ret = 0; | ||
| 1409 | out: | ||
| 1410 | return ret; | ||
| 1411 | } | 976 | } |
| 1412 | 977 | ||
| 1413 | static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) | 978 | static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) |
| 1414 | { | 979 | { |
| 1415 | int bytes; | 980 | if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) |
| 1416 | 981 | return -EFAULT; | |
| 1417 | bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); | ||
| 1418 | if (bytes < 0) | ||
| 1419 | return bytes; | ||
| 1420 | 982 | ||
| 1421 | kiocb->ki_iovec = &kiocb->ki_inline_vec; | 983 | kiocb->ki_iovec = &kiocb->ki_inline_vec; |
| 1422 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; | 984 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; |
| 1423 | kiocb->ki_iovec->iov_len = bytes; | 985 | kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; |
| 1424 | kiocb->ki_nr_segs = 1; | 986 | kiocb->ki_nr_segs = 1; |
| 1425 | kiocb->ki_cur_seg = 0; | ||
| 1426 | return 0; | 987 | return 0; |
| 1427 | } | 988 | } |
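
aio_setup_single_vector() simply wraps the flat buffer in the one-entry inline iovec, so everything downstream handles only the vectored shape. The same normalization in userspace terms (a sketch; preadv() stands in for the file op):

    #define _GNU_SOURCE
    #include <sys/uio.h>
    #include <unistd.h>

    /* a single buffer becomes a one-entry iovec, letting one code
     * path serve both the PREAD and PREADV shapes */
    static ssize_t read_one(int fd, void *buf, size_t len, off_t off)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = len };

        return preadv(fd, &iov, 1, off);
    }
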
| 1428 | 989 | ||
| @@ -1431,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc | |||
| 1431 | * Performs the initial checks and aio retry method | 992 | * Performs the initial checks and aio retry method |
| 1432 | * setup for the kiocb at the time of io submission. | 993 | * setup for the kiocb at the time of io submission. |
| 1433 | */ | 994 | */ |
| 1434 | static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) | 995 | static ssize_t aio_run_iocb(struct kiocb *req, bool compat) |
| 1435 | { | 996 | { |
| 1436 | struct file *file = kiocb->ki_filp; | 997 | struct file *file = req->ki_filp; |
| 1437 | ssize_t ret = 0; | 998 | ssize_t ret; |
| 999 | int rw; | ||
| 1000 | fmode_t mode; | ||
| 1001 | aio_rw_op *rw_op; | ||
| 1438 | 1002 | ||
| 1439 | switch (kiocb->ki_opcode) { | 1003 | switch (req->ki_opcode) { |
| 1440 | case IOCB_CMD_PREAD: | 1004 | case IOCB_CMD_PREAD: |
| 1441 | ret = -EBADF; | ||
| 1442 | if (unlikely(!(file->f_mode & FMODE_READ))) | ||
| 1443 | break; | ||
| 1444 | ret = -EFAULT; | ||
| 1445 | if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, | ||
| 1446 | kiocb->ki_left))) | ||
| 1447 | break; | ||
| 1448 | ret = aio_setup_single_vector(READ, file, kiocb); | ||
| 1449 | if (ret) | ||
| 1450 | break; | ||
| 1451 | ret = -EINVAL; | ||
| 1452 | if (file->f_op->aio_read) | ||
| 1453 | kiocb->ki_retry = aio_rw_vect_retry; | ||
| 1454 | break; | ||
| 1455 | case IOCB_CMD_PWRITE: | ||
| 1456 | ret = -EBADF; | ||
| 1457 | if (unlikely(!(file->f_mode & FMODE_WRITE))) | ||
| 1458 | break; | ||
| 1459 | ret = -EFAULT; | ||
| 1460 | if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, | ||
| 1461 | kiocb->ki_left))) | ||
| 1462 | break; | ||
| 1463 | ret = aio_setup_single_vector(WRITE, file, kiocb); | ||
| 1464 | if (ret) | ||
| 1465 | break; | ||
| 1466 | ret = -EINVAL; | ||
| 1467 | if (file->f_op->aio_write) | ||
| 1468 | kiocb->ki_retry = aio_rw_vect_retry; | ||
| 1469 | break; | ||
| 1470 | case IOCB_CMD_PREADV: | 1005 | case IOCB_CMD_PREADV: |
| 1471 | ret = -EBADF; | 1006 | mode = FMODE_READ; |
| 1472 | if (unlikely(!(file->f_mode & FMODE_READ))) | 1007 | rw = READ; |
| 1473 | break; | 1008 | rw_op = file->f_op->aio_read; |
| 1474 | ret = aio_setup_vectored_rw(READ, kiocb, compat); | 1009 | goto rw_common; |
| 1475 | if (ret) | 1010 | |
| 1476 | break; | 1011 | case IOCB_CMD_PWRITE: |
| 1477 | ret = -EINVAL; | ||
| 1478 | if (file->f_op->aio_read) | ||
| 1479 | kiocb->ki_retry = aio_rw_vect_retry; | ||
| 1480 | break; | ||
| 1481 | case IOCB_CMD_PWRITEV: | 1012 | case IOCB_CMD_PWRITEV: |
| 1482 | ret = -EBADF; | 1013 | mode = FMODE_WRITE; |
| 1483 | if (unlikely(!(file->f_mode & FMODE_WRITE))) | 1014 | rw = WRITE; |
| 1484 | break; | 1015 | rw_op = file->f_op->aio_write; |
| 1485 | ret = aio_setup_vectored_rw(WRITE, kiocb, compat); | 1016 | goto rw_common; |
| 1017 | rw_common: | ||
| 1018 | if (unlikely(!(file->f_mode & mode))) | ||
| 1019 | return -EBADF; | ||
| 1020 | |||
| 1021 | if (!rw_op) | ||
| 1022 | return -EINVAL; | ||
| 1023 | |||
| 1024 | ret = (req->ki_opcode == IOCB_CMD_PREADV || | ||
| 1025 | req->ki_opcode == IOCB_CMD_PWRITEV) | ||
| 1026 | ? aio_setup_vectored_rw(rw, req, compat) | ||
| 1027 | : aio_setup_single_vector(rw, req); | ||
| 1486 | if (ret) | 1028 | if (ret) |
| 1487 | break; | 1029 | return ret; |
| 1488 | ret = -EINVAL; | 1030 | |
| 1489 | if (file->f_op->aio_write) | 1031 | ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); |
| 1490 | kiocb->ki_retry = aio_rw_vect_retry; | 1032 | if (ret < 0) |
| 1033 | return ret; | ||
| 1034 | |||
| 1035 | req->ki_nbytes = ret; | ||
| 1036 | req->ki_left = ret; | ||
| 1037 | |||
| 1038 | ret = aio_rw_vect_retry(req, rw, rw_op); | ||
| 1491 | break; | 1039 | break; |
| 1040 | |||
| 1492 | case IOCB_CMD_FDSYNC: | 1041 | case IOCB_CMD_FDSYNC: |
| 1493 | ret = -EINVAL; | 1042 | if (!file->f_op->aio_fsync) |
| 1494 | if (file->f_op->aio_fsync) | 1043 | return -EINVAL; |
| 1495 | kiocb->ki_retry = aio_fdsync; | 1044 | |
| 1045 | ret = file->f_op->aio_fsync(req, 1); | ||
| 1496 | break; | 1046 | break; |
| 1047 | |||
| 1497 | case IOCB_CMD_FSYNC: | 1048 | case IOCB_CMD_FSYNC: |
| 1498 | ret = -EINVAL; | 1049 | if (!file->f_op->aio_fsync) |
| 1499 | if (file->f_op->aio_fsync) | 1050 | return -EINVAL; |
| 1500 | kiocb->ki_retry = aio_fsync; | 1051 | |
| 1052 | ret = file->f_op->aio_fsync(req, 0); | ||
| 1501 | break; | 1053 | break; |
| 1054 | |||
| 1502 | default: | 1055 | default: |
| 1503 | dprintk("EINVAL: io_submit: no operation provided\n"); | 1056 | pr_debug("EINVAL: no operation provided\n"); |
| 1504 | ret = -EINVAL; | 1057 | return -EINVAL; |
| 1505 | } | 1058 | } |
| 1506 | 1059 | ||
| 1507 | if (!kiocb->ki_retry) | 1060 | if (ret != -EIOCBQUEUED) { |
| 1508 | return ret; | 1061 | /* |
| 1062 | * There's no easy way to restart the syscall since other AIO's | ||
| 1063 | * may be already running. Just fail this IO with EINTR. | ||
| 1064 | */ | ||
| 1065 | if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || | ||
| 1066 | ret == -ERESTARTNOHAND || | ||
| 1067 | ret == -ERESTART_RESTARTBLOCK)) | ||
| 1068 | ret = -EINTR; | ||
| 1069 | aio_complete(req, ret, 0); | ||
| 1070 | } | ||
| 1509 | 1071 | ||
| 1510 | return 0; | 1072 | return 0; |
| 1511 | } | 1073 | } |
| 1512 | 1074 | ||
| 1513 | static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | 1075 | static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, |
| 1514 | struct iocb *iocb, struct kiocb_batch *batch, | 1076 | struct iocb *iocb, bool compat) |
| 1515 | bool compat) | ||
| 1516 | { | 1077 | { |
| 1517 | struct kiocb *req; | 1078 | struct kiocb *req; |
| 1518 | struct file *file; | ||
| 1519 | ssize_t ret; | 1079 | ssize_t ret; |
| 1520 | 1080 | ||
| 1521 | /* enforce forwards compatibility on users */ | 1081 | /* enforce forwards compatibility on users */ |
| 1522 | if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { | 1082 | if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { |
| 1523 | pr_debug("EINVAL: io_submit: reserve field set\n"); | 1083 | pr_debug("EINVAL: reserve field set\n"); |
| 1524 | return -EINVAL; | 1084 | return -EINVAL; |
| 1525 | } | 1085 | } |
| 1526 | 1086 | ||
| @@ -1534,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
| 1534 | return -EINVAL; | 1094 | return -EINVAL; |
| 1535 | } | 1095 | } |
| 1536 | 1096 | ||
| 1537 | file = fget(iocb->aio_fildes); | 1097 | req = aio_get_req(ctx); |
| 1538 | if (unlikely(!file)) | 1098 | if (unlikely(!req)) |
| 1539 | return -EBADF; | ||
| 1540 | |||
| 1541 | req = aio_get_req(ctx, batch); /* returns with 2 references to req */ | ||
| 1542 | if (unlikely(!req)) { | ||
| 1543 | fput(file); | ||
| 1544 | return -EAGAIN; | 1099 | return -EAGAIN; |
| 1100 | |||
| 1101 | req->ki_filp = fget(iocb->aio_fildes); | ||
| 1102 | if (unlikely(!req->ki_filp)) { | ||
| 1103 | ret = -EBADF; | ||
| 1104 | goto out_put_req; | ||
| 1545 | } | 1105 | } |
| 1546 | req->ki_filp = file; | 1106 | |
| 1547 | if (iocb->aio_flags & IOCB_FLAG_RESFD) { | 1107 | if (iocb->aio_flags & IOCB_FLAG_RESFD) { |
| 1548 | /* | 1108 | /* |
| 1549 | * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an | 1109 | * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an |
| @@ -1559,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
| 1559 | } | 1119 | } |
| 1560 | } | 1120 | } |
| 1561 | 1121 | ||
| 1562 | ret = put_user(req->ki_key, &user_iocb->aio_key); | 1122 | ret = put_user(KIOCB_KEY, &user_iocb->aio_key); |
| 1563 | if (unlikely(ret)) { | 1123 | if (unlikely(ret)) { |
| 1564 | dprintk("EFAULT: aio_key\n"); | 1124 | pr_debug("EFAULT: aio_key\n"); |
| 1565 | goto out_put_req; | 1125 | goto out_put_req; |
| 1566 | } | 1126 | } |
| 1567 | 1127 | ||
| @@ -1573,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
| 1573 | req->ki_left = req->ki_nbytes = iocb->aio_nbytes; | 1133 | req->ki_left = req->ki_nbytes = iocb->aio_nbytes; |
| 1574 | req->ki_opcode = iocb->aio_lio_opcode; | 1134 | req->ki_opcode = iocb->aio_lio_opcode; |
| 1575 | 1135 | ||
| 1576 | ret = aio_setup_iocb(req, compat); | 1136 | ret = aio_run_iocb(req, compat); |
| 1577 | |||
| 1578 | if (ret) | 1137 | if (ret) |
| 1579 | goto out_put_req; | 1138 | goto out_put_req; |
| 1580 | 1139 | ||
| 1581 | spin_lock_irq(&ctx->ctx_lock); | ||
| 1582 | /* | ||
| 1583 | * We could have raced with io_destroy() and are currently holding a | ||
| 1584 | * reference to ctx which should be destroyed. We cannot submit IO | ||
| 1585 | * since ctx gets freed as soon as io_submit() puts its reference. The | ||
| 1586 | * check here is reliable: io_destroy() sets ctx->dead before waiting | ||
| 1587 | * for outstanding IO and the barrier between these two is realized by | ||
| 1588 | * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we | ||
| 1589 | * increment ctx->reqs_active before checking for ctx->dead and the | ||
| 1590 | * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we | ||
| 1591 | * don't see ctx->dead set here, io_destroy() waits for our IO to | ||
| 1592 | * finish. | ||
| 1593 | */ | ||
| 1594 | if (ctx->dead) { | ||
| 1595 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 1596 | ret = -EINVAL; | ||
| 1597 | goto out_put_req; | ||
| 1598 | } | ||
| 1599 | aio_run_iocb(req); | ||
| 1600 | if (!list_empty(&ctx->run_list)) { | ||
| 1601 | /* drain the run list */ | ||
| 1602 | while (__aio_run_iocbs(ctx)) | ||
| 1603 | ; | ||
| 1604 | } | ||
| 1605 | spin_unlock_irq(&ctx->ctx_lock); | ||
| 1606 | |||
| 1607 | aio_put_req(req); /* drop extra ref to req */ | 1140 | aio_put_req(req); /* drop extra ref to req */ |
| 1608 | return 0; | 1141 | return 0; |
| 1609 | |||
| 1610 | out_put_req: | 1142 | out_put_req: |
| 1143 | atomic_dec(&ctx->reqs_active); | ||
| 1611 | aio_put_req(req); /* drop extra ref to req */ | 1144 | aio_put_req(req); /* drop extra ref to req */ |
| 1612 | aio_put_req(req); /* drop i/o ref to req */ | 1145 | aio_put_req(req); /* drop i/o ref to req */ |
| 1613 | return ret; | 1146 | return ret; |
| @@ -1620,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
| 1620 | long ret = 0; | 1153 | long ret = 0; |
| 1621 | int i = 0; | 1154 | int i = 0; |
| 1622 | struct blk_plug plug; | 1155 | struct blk_plug plug; |
| 1623 | struct kiocb_batch batch; | ||
| 1624 | 1156 | ||
| 1625 | if (unlikely(nr < 0)) | 1157 | if (unlikely(nr < 0)) |
| 1626 | return -EINVAL; | 1158 | return -EINVAL; |
| @@ -1633,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
| 1633 | 1165 | ||
| 1634 | ctx = lookup_ioctx(ctx_id); | 1166 | ctx = lookup_ioctx(ctx_id); |
| 1635 | if (unlikely(!ctx)) { | 1167 | if (unlikely(!ctx)) { |
| 1636 | pr_debug("EINVAL: io_submit: invalid context id\n"); | 1168 | pr_debug("EINVAL: invalid context id\n"); |
| 1637 | return -EINVAL; | 1169 | return -EINVAL; |
| 1638 | } | 1170 | } |
| 1639 | 1171 | ||
| 1640 | kiocb_batch_init(&batch, nr); | ||
| 1641 | |||
| 1642 | blk_start_plug(&plug); | 1172 | blk_start_plug(&plug); |
| 1643 | 1173 | ||
| 1644 | /* | 1174 | /* |
| @@ -1659,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
| 1659 | break; | 1189 | break; |
| 1660 | } | 1190 | } |
| 1661 | 1191 | ||
| 1662 | ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); | 1192 | ret = io_submit_one(ctx, user_iocb, &tmp, compat); |
| 1663 | if (ret) | 1193 | if (ret) |
| 1664 | break; | 1194 | break; |
| 1665 | } | 1195 | } |
| 1666 | blk_finish_plug(&plug); | 1196 | blk_finish_plug(&plug); |
| 1667 | 1197 | ||
| 1668 | kiocb_batch_free(ctx, &batch); | ||
| 1669 | put_ioctx(ctx); | 1198 | put_ioctx(ctx); |
| 1670 | return i ? i : ret; | 1199 | return i ? i : ret; |
| 1671 | } | 1200 | } |
| @@ -1698,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | |||
| 1698 | 1227 | ||
| 1699 | assert_spin_locked(&ctx->ctx_lock); | 1228 | assert_spin_locked(&ctx->ctx_lock); |
| 1700 | 1229 | ||
| 1230 | if (key != KIOCB_KEY) | ||
| 1231 | return NULL; | ||
| 1232 | |||
| 1701 | /* TODO: use a hash or array, this sucks. */ | 1233 | /* TODO: use a hash or array, this sucks. */ |
| 1702 | list_for_each(pos, &ctx->active_reqs) { | 1234 | list_for_each(pos, &ctx->active_reqs) { |
| 1703 | struct kiocb *kiocb = list_kiocb(pos); | 1235 | struct kiocb *kiocb = list_kiocb(pos); |
| 1704 | if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) | 1236 | if (kiocb->ki_obj.user == iocb) |
| 1705 | return kiocb; | 1237 | return kiocb; |
| 1706 | } | 1238 | } |
| 1707 | return NULL; | 1239 | return NULL; |
| @@ -1720,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | |||
| 1720 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | 1252 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, |
| 1721 | struct io_event __user *, result) | 1253 | struct io_event __user *, result) |
| 1722 | { | 1254 | { |
| 1723 | int (*cancel)(struct kiocb *iocb, struct io_event *res); | 1255 | struct io_event res; |
| 1724 | struct kioctx *ctx; | 1256 | struct kioctx *ctx; |
| 1725 | struct kiocb *kiocb; | 1257 | struct kiocb *kiocb; |
| 1726 | u32 key; | 1258 | u32 key; |
| @@ -1735,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | |||
| 1735 | return -EINVAL; | 1267 | return -EINVAL; |
| 1736 | 1268 | ||
| 1737 | spin_lock_irq(&ctx->ctx_lock); | 1269 | spin_lock_irq(&ctx->ctx_lock); |
| 1738 | ret = -EAGAIN; | 1270 | |
| 1739 | kiocb = lookup_kiocb(ctx, iocb, key); | 1271 | kiocb = lookup_kiocb(ctx, iocb, key); |
| 1740 | if (kiocb && kiocb->ki_cancel) { | 1272 | if (kiocb) |
| 1741 | cancel = kiocb->ki_cancel; | 1273 | ret = kiocb_cancel(ctx, kiocb, &res); |
| 1742 | kiocb->ki_users ++; | 1274 | else |
| 1743 | kiocbSetCancelled(kiocb); | 1275 | ret = -EINVAL; |
| 1744 | } else | 1276 | |
| 1745 | cancel = NULL; | ||
| 1746 | spin_unlock_irq(&ctx->ctx_lock); | 1277 | spin_unlock_irq(&ctx->ctx_lock); |
| 1747 | 1278 | ||
| 1748 | if (NULL != cancel) { | 1279 | if (!ret) { |
| 1749 | struct io_event tmp; | 1280 | /* Cancellation succeeded -- copy the result |
| 1750 | pr_debug("calling cancel\n"); | 1281 | * into the user's buffer. |
| 1751 | memset(&tmp, 0, sizeof(tmp)); | 1282 | */ |
| 1752 | tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; | 1283 | if (copy_to_user(result, &res, sizeof(res))) |
| 1753 | tmp.data = kiocb->ki_user_data; | 1284 | ret = -EFAULT; |
| 1754 | ret = cancel(kiocb, &tmp); | 1285 | } |
| 1755 | if (!ret) { | ||
| 1756 | /* Cancellation succeeded -- copy the result | ||
| 1757 | * into the user's buffer. | ||
| 1758 | */ | ||
| 1759 | if (copy_to_user(result, &tmp, sizeof(tmp))) | ||
| 1760 | ret = -EFAULT; | ||
| 1761 | } | ||
| 1762 | } else | ||
| 1763 | ret = -EINVAL; | ||
| 1764 | 1286 | ||
| 1765 | put_ioctx(ctx); | 1287 | put_ioctx(ctx); |
| 1766 | 1288 | ||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
| 20 | #include <linux/bio.h> | 20 | #include <linux/bio.h> |
| 21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
| 22 | #include <linux/uio.h> | ||
| 22 | #include <linux/iocontext.h> | 23 | #include <linux/iocontext.h> |
| 23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 24 | #include <linux/init.h> | 25 | #include <linux/init.h> |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 3823d3ffb760..d9871c1f0894 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/namei.h> | 27 | #include <linux/namei.h> |
| 28 | #include <linux/log2.h> | 28 | #include <linux/log2.h> |
| 29 | #include <linux/cleancache.h> | 29 | #include <linux/cleancache.h> |
| 30 | #include <linux/aio.h> | ||
| 30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
| 31 | #include "internal.h" | 32 | #include "internal.h" |
| 32 | 33 | ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb8b7a0e28a6..bc4d54c465a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/string.h> | 24 | #include <linux/string.h> |
| 25 | #include <linux/backing-dev.h> | 25 | #include <linux/backing-dev.h> |
| 26 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
| 27 | #include <linux/aio.h> | ||
| 27 | #include <linux/falloc.h> | 28 | #include <linux/falloc.h> |
| 28 | #include <linux/swap.h> | 29 | #include <linux/swap.h> |
| 29 | #include <linux/writeback.h> | 30 | #include <linux/writeback.h> |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09c58a35b429..898da0a01e04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/writeback.h> | 32 | #include <linux/writeback.h> |
| 33 | #include <linux/statfs.h> | 33 | #include <linux/statfs.h> |
| 34 | #include <linux/compat.h> | 34 | #include <linux/compat.h> |
| 35 | #include <linux/aio.h> | ||
| 35 | #include <linux/bit_spinlock.h> | 36 | #include <linux/bit_spinlock.h> |
| 36 | #include <linux/xattr.h> | 37 | #include <linux/xattr.h> |
| 37 | #include <linux/posix_acl.h> | 38 | #include <linux/posix_acl.h> |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d70830c66833..656e16907430 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | #include <linux/mount.h> | 7 | #include <linux/mount.h> |
| 8 | #include <linux/namei.h> | 8 | #include <linux/namei.h> |
| 9 | #include <linux/writeback.h> | 9 | #include <linux/writeback.h> |
| 10 | #include <linux/aio.h> | ||
| 10 | 11 | ||
| 11 | #include "super.h" | 12 | #include "super.h" |
| 12 | #include "mds_client.h" | 13 | #include "mds_client.h" |
diff --git a/fs/compat.c b/fs/compat.c index 93f7d021b716..fc3b55dce184 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
| @@ -47,6 +47,7 @@ | |||
| 47 | #include <linux/fs_struct.h> | 47 | #include <linux/fs_struct.h> |
| 48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
| 49 | #include <linux/pagemap.h> | 49 | #include <linux/pagemap.h> |
| 50 | #include <linux/aio.h> | ||
| 50 | 51 | ||
| 51 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
| 52 | #include <asm/mmu_context.h> | 53 | #include <asm/mmu_context.h> |
diff --git a/fs/direct-io.c b/fs/direct-io.c index cfb816dc6d9f..51d16e067d68 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/uio.h> | 37 | #include <linux/uio.h> |
| 38 | #include <linux/atomic.h> | 38 | #include <linux/atomic.h> |
| 39 | #include <linux/prefetch.h> | 39 | #include <linux/prefetch.h> |
| 40 | #include <linux/aio.h> | ||
| 40 | 41 | ||
| 41 | /* | 42 | /* |
| 42 | * How many user pages to map in one call to get_user_pages(). This determines | 43 | * How many user pages to map in one call to get_user_pages(). This determines |
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
| 32 | #include <linux/compat.h> | 32 | #include <linux/compat.h> |
| 33 | #include <linux/fs_stack.h> | 33 | #include <linux/fs_stack.h> |
| 34 | #include <linux/aio.h> | ||
| 34 | #include "ecryptfs_kernel.h" | 35 | #include "ecryptfs_kernel.h" |
| 35 | 36 | ||
| 36 | /** | 37 | /** |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/mpage.h> | 31 | #include <linux/mpage.h> |
| 32 | #include <linux/fiemap.h> | 32 | #include <linux/fiemap.h> |
| 33 | #include <linux/namei.h> | 33 | #include <linux/namei.h> |
| 34 | #include <linux/aio.h> | ||
| 34 | #include "ext2.h" | 35 | #include "ext2.h" |
| 35 | #include "acl.h" | 36 | #include "acl.h" |
| 36 | #include "xip.h" | 37 | #include "xip.h" |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d706dbfa6220..23c712825640 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
| 28 | #include <linux/mpage.h> | 28 | #include <linux/mpage.h> |
| 29 | #include <linux/namei.h> | 29 | #include <linux/namei.h> |
| 30 | #include <linux/aio.h> | ||
| 30 | #include "ext3.h" | 31 | #include "ext3.h" |
| 31 | #include "xattr.h" | 32 | #include "xattr.h" |
| 32 | #include "acl.h" | 33 | #include "acl.h" |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..4959e29573b6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/jbd2.h> | 23 | #include <linux/jbd2.h> |
| 24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
| 25 | #include <linux/path.h> | 25 | #include <linux/path.h> |
| 26 | #include <linux/aio.h> | ||
| 26 | #include <linux/quotaops.h> | 27 | #include <linux/quotaops.h> |
| 27 | #include <linux/pagevec.h> | 28 | #include <linux/pagevec.h> |
| 28 | #include "ext4.h" | 29 | #include "ext4.h" |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 98be6f697463..b8d5d351e24f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | * (sct@redhat.com), 1993, 1998 | 20 | * (sct@redhat.com), 1993, 1998 |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | #include <linux/aio.h> | ||
| 23 | #include "ext4_jbd2.h" | 24 | #include "ext4_jbd2.h" |
| 24 | #include "truncate.h" | 25 | #include "truncate.h" |
| 25 | #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ | 26 | #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 793d44b84d7f..0723774bdfb5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/printk.h> | 37 | #include <linux/printk.h> |
| 38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
| 39 | #include <linux/ratelimit.h> | 39 | #include <linux/ratelimit.h> |
| 40 | #include <linux/aio.h> | ||
| 40 | 41 | ||
| 41 | #include "ext4_jbd2.h" | 42 | #include "ext4_jbd2.h" |
| 42 | #include "xattr.h" | 43 | #include "xattr.h" |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5929cd0baa20..19599bded62a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
| 19 | #include <linux/mpage.h> | 19 | #include <linux/mpage.h> |
| 20 | #include <linux/namei.h> | 20 | #include <linux/namei.h> |
| 21 | #include <linux/aio.h> | ||
| 21 | #include <linux/uio.h> | 22 | #include <linux/uio.h> |
| 22 | #include <linux/bio.h> | 23 | #include <linux/bio.h> |
| 23 | #include <linux/workqueue.h> | 24 | #include <linux/workqueue.h> |
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a201125..d0ed4ba4b61b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/f2fs_fs.h> | 12 | #include <linux/f2fs_fs.h> |
| 13 | #include <linux/buffer_head.h> | 13 | #include <linux/buffer_head.h> |
| 14 | #include <linux/mpage.h> | 14 | #include <linux/mpage.h> |
| 15 | #include <linux/aio.h> | ||
| 15 | #include <linux/writeback.h> | 16 | #include <linux/writeback.h> |
| 16 | #include <linux/backing-dev.h> | 17 | #include <linux/backing-dev.h> |
| 17 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4ff901632b26..dfce656ddb33 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/mpage.h> | 19 | #include <linux/mpage.h> |
| 20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
| 21 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
| 22 | #include <linux/aio.h> | ||
| 22 | #include <linux/vfs.h> | 23 | #include <linux/vfs.h> |
| 23 | #include <linux/parser.h> | 24 | #include <linux/parser.h> |
| 24 | #include <linux/uio.h> | 25 | #include <linux/uio.h> |
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b3aaf7b3578b..aef34b1e635e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/device.h> | 38 | #include <linux/device.h> |
| 39 | #include <linux/file.h> | 39 | #include <linux/file.h> |
| 40 | #include <linux/fs.h> | 40 | #include <linux/fs.h> |
| 41 | #include <linux/aio.h> | ||
| 41 | #include <linux/kdev_t.h> | 42 | #include <linux/kdev_t.h> |
| 42 | #include <linux/kthread.h> | 43 | #include <linux/kthread.h> |
| 43 | #include <linux/list.h> | 44 | #include <linux/list.h> |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a6c1664e330b..1d55f9465400 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/pipe_fs_i.h> | 19 | #include <linux/pipe_fs_i.h> |
| 20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
| 21 | #include <linux/splice.h> | 21 | #include <linux/splice.h> |
| 22 | #include <linux/aio.h> | ||
| 22 | 23 | ||
| 23 | MODULE_ALIAS_MISCDEV(FUSE_MINOR); | 24 | MODULE_ALIAS_MISCDEV(FUSE_MINOR); |
| 24 | MODULE_ALIAS("devname:fuse"); | 25 | MODULE_ALIAS("devname:fuse"); |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4655e59d545b..d1c9b85b3f58 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/compat.h> | 16 | #include <linux/compat.h> |
| 17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
| 18 | #include <linux/aio.h> | ||
| 18 | 19 | ||
| 19 | static const struct file_operations fuse_direct_io_file_operations; | 20 | static const struct file_operations fuse_direct_io_file_operations; |
| 20 | 21 | ||
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9883694f1e7c..0bad69ed6336 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
| 21 | #include <linux/gfs2_ondisk.h> | 21 | #include <linux/gfs2_ondisk.h> |
| 22 | #include <linux/backing-dev.h> | 22 | #include <linux/backing-dev.h> |
| 23 | #include <linux/aio.h> | ||
| 23 | 24 | ||
| 24 | #include "gfs2.h" | 25 | #include "gfs2.h" |
| 25 | #include "incore.h" | 26 | #include "incore.h" |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
| 26 | #include <linux/dlm.h> | 26 | #include <linux/dlm.h> |
| 27 | #include <linux/dlm_plock.h> | 27 | #include <linux/dlm_plock.h> |
| 28 | #include <linux/aio.h> | ||
| 28 | 29 | ||
| 29 | #include "gfs2.h" | 30 | #include "gfs2.h" |
| 30 | #include "incore.h" | 31 | #include "incore.h" |
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 716e1aafb2e2..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
| 15 | #include <linux/mpage.h> | 15 | #include <linux/mpage.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/aio.h> | ||
| 17 | 18 | ||
| 18 | #include "hfs_fs.h" | 19 | #include "hfs_fs.h" |
| 19 | #include "btree.h" | 20 | #include "btree.h" |
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7faaa964968e..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
| 15 | #include <linux/mpage.h> | 15 | #include <linux/mpage.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/aio.h> | ||
| 17 | 18 | ||
| 18 | #include "hfsplus_fs.h" | 19 | #include "hfsplus_fs.h" |
| 19 | #include "hfsplus_raw.h" | 20 | #include "hfsplus_raw.h" |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 523464e62849..a3f868ae3fd4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
| @@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void) | |||
| 909 | 909 | ||
| 910 | static int get_hstate_idx(int page_size_log) | 910 | static int get_hstate_idx(int page_size_log) |
| 911 | { | 911 | { |
| 912 | struct hstate *h; | 912 | struct hstate *h = hstate_sizelog(page_size_log); |
| 913 | 913 | ||
| 914 | if (!page_size_log) | ||
| 915 | return default_hstate_idx; | ||
| 916 | h = size_to_hstate(1 << page_size_log); | ||
| 917 | if (!h) | 914 | if (!h) |
| 918 | return -1; | 915 | return -1; |
| 919 | return h - hstates; | 916 | return h - hstates; |
| @@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = { | |||
| 929 | .d_dname = hugetlb_dname | 926 | .d_dname = hugetlb_dname |
| 930 | }; | 927 | }; |
| 931 | 928 | ||
| 932 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 929 | /* |
| 933 | size_t size, vm_flags_t acctflag, | 930 | * Note that size should be aligned to proper hugepage size in caller side, |
| 934 | struct user_struct **user, | 931 | * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. |
| 932 | */ | ||
| 933 | struct file *hugetlb_file_setup(const char *name, size_t size, | ||
| 934 | vm_flags_t acctflag, struct user_struct **user, | ||
| 935 | int creat_flags, int page_size_log) | 935 | int creat_flags, int page_size_log) |
| 936 | { | 936 | { |
| 937 | struct file *file = ERR_PTR(-ENOMEM); | 937 | struct file *file = ERR_PTR(-ENOMEM); |
| @@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
| 939 | struct path path; | 939 | struct path path; |
| 940 | struct super_block *sb; | 940 | struct super_block *sb; |
| 941 | struct qstr quick_string; | 941 | struct qstr quick_string; |
| 942 | struct hstate *hstate; | ||
| 943 | unsigned long num_pages; | ||
| 944 | int hstate_idx; | 942 | int hstate_idx; |
| 945 | 943 | ||
| 946 | hstate_idx = get_hstate_idx(page_size_log); | 944 | hstate_idx = get_hstate_idx(page_size_log); |
| @@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
| 980 | if (!inode) | 978 | if (!inode) |
| 981 | goto out_dentry; | 979 | goto out_dentry; |
| 982 | 980 | ||
| 983 | hstate = hstate_inode(inode); | ||
| 984 | size += addr & ~huge_page_mask(hstate); | ||
| 985 | num_pages = ALIGN(size, huge_page_size(hstate)) >> | ||
| 986 | huge_page_shift(hstate); | ||
| 987 | file = ERR_PTR(-ENOMEM); | 981 | file = ERR_PTR(-ENOMEM); |
| 988 | if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) | 982 | if (hugetlb_reserve_pages(inode, 0, |
| 983 | size >> huge_page_shift(hstate_inode(inode)), NULL, | ||
| 984 | acctflag)) | ||
| 989 | goto out_inode; | 985 | goto out_inode; |
| 990 | 986 | ||
| 991 | d_instantiate(path.dentry, inode); | 987 | d_instantiate(path.dentry, inode); |
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77554b61d124..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
| 24 | #include <linux/quotaops.h> | 24 | #include <linux/quotaops.h> |
| 25 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
| 26 | #include <linux/aio.h> | ||
| 26 | #include "jfs_incore.h" | 27 | #include "jfs_incore.h" |
| 27 | #include "jfs_inode.h" | 28 | #include "jfs_inode.h" |
| 28 | #include "jfs_filsys.h" | 29 | #include "jfs_filsys.h" |
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf02f5530713..689fb608648e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
| 26 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
| 27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
| 28 | #include <linux/uio.h> | 28 | #include <linux/aio.h> |
| 29 | #include "nilfs.h" | 29 | #include "nilfs.h" |
| 30 | #include "btnode.h" | 30 | #include "btnode.h" |
| 31 | #include "segment.h" | 31 | #include "segment.h" |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da4b81e6f76..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/swap.h> | 27 | #include <linux/swap.h> |
| 28 | #include <linux/uio.h> | 28 | #include <linux/uio.h> |
| 29 | #include <linux/writeback.h> | 29 | #include <linux/writeback.h> |
| 30 | #include <linux/aio.h> | ||
| 30 | 31 | ||
| 31 | #include <asm/page.h> | 32 | #include <asm/page.h> |
| 32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/quotaops.h> | 28 | #include <linux/quotaops.h> |
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include <linux/log2.h> | 30 | #include <linux/log2.h> |
| 31 | #include <linux/aio.h> | ||
| 31 | 32 | ||
| 32 | #include "aops.h" | 33 | #include "aops.h" |
| 33 | #include "attrib.h" | 34 | #include "attrib.h" |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | #ifndef OCFS2_AOPS_H | 22 | #ifndef OCFS2_AOPS_H |
| 23 | #define OCFS2_AOPS_H | 23 | #define OCFS2_AOPS_H |
| 24 | 24 | ||
| 25 | #include <linux/aio.h> | ||
| 26 | |||
| 25 | handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | 27 | handle_t *ocfs2_start_walk_page_trans(struct inode *inode, |
| 26 | struct page *page, | 28 | struct page *page, |
| 27 | unsigned from, | 29 | unsigned from, |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
| @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, | |||
| 2322 | status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, | 2322 | status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, |
| 2323 | arg_flags, subclass, _RET_IP_); | 2323 | arg_flags, subclass, _RET_IP_); |
| 2324 | if (status < 0) { | 2324 | if (status < 0) { |
| 2325 | if (status != -EAGAIN && status != -EIOCBRETRY) | 2325 | if (status != -EAGAIN) |
| 2326 | mlog_errno(status); | 2326 | mlog_errno(status); |
| 2327 | goto bail; | 2327 | goto bail; |
| 2328 | } | 2328 | } |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..621fc73bf23d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
| @@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode, | |||
| 147 | int ocfs2_mark_inode_dirty(handle_t *handle, | 147 | int ocfs2_mark_inode_dirty(handle_t *handle, |
| 148 | struct inode *inode, | 148 | struct inode *inode, |
| 149 | struct buffer_head *bh); | 149 | struct buffer_head *bh); |
| 150 | int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); | ||
| 151 | int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); | ||
| 152 | struct buffer_head *ocfs2_bread(struct inode *inode, | 150 | struct buffer_head *ocfs2_bread(struct inode *inode, |
| 153 | int block, int *err, int reada); | 151 | int block, int *err, int reada); |
| 154 | 152 | ||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/audit.h> | 21 | #include <linux/audit.h> |
| 22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
| 23 | #include <linux/fcntl.h> | 23 | #include <linux/fcntl.h> |
| 24 | #include <linux/aio.h> | ||
| 24 | 25 | ||
| 25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
| 26 | #include <asm/ioctls.h> | 27 | #include <asm/ioctls.h> |
diff --git a/fs/read_write.c b/fs/read_write.c index 90ba3b350e50..03430008704e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/fcntl.h> | 9 | #include <linux/fcntl.h> |
| 10 | #include <linux/file.h> | 10 | #include <linux/file.h> |
| 11 | #include <linux/uio.h> | 11 | #include <linux/uio.h> |
| 12 | #include <linux/aio.h> | ||
| 12 | #include <linux/fsnotify.h> | 13 | #include <linux/fsnotify.h> |
| 13 | #include <linux/security.h> | 14 | #include <linux/security.h> |
| 14 | #include <linux/export.h> | 15 | #include <linux/export.h> |
| @@ -329,16 +330,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count | |||
| 329 | return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; | 330 | return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; |
| 330 | } | 331 | } |
| 331 | 332 | ||
| 332 | static void wait_on_retry_sync_kiocb(struct kiocb *iocb) | ||
| 333 | { | ||
| 334 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 335 | if (!kiocbIsKicked(iocb)) | ||
| 336 | schedule(); | ||
| 337 | else | ||
| 338 | kiocbClearKicked(iocb); | ||
| 339 | __set_current_state(TASK_RUNNING); | ||
| 340 | } | ||
| 341 | |||
| 342 | ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | 333 | ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) |
| 343 | { | 334 | { |
| 344 | struct iovec iov = { .iov_base = buf, .iov_len = len }; | 335 | struct iovec iov = { .iov_base = buf, .iov_len = len }; |
| @@ -350,13 +341,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp | |||
| 350 | kiocb.ki_left = len; | 341 | kiocb.ki_left = len; |
| 351 | kiocb.ki_nbytes = len; | 342 | kiocb.ki_nbytes = len; |
| 352 | 343 | ||
| 353 | for (;;) { | 344 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); |
| 354 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); | ||
| 355 | if (ret != -EIOCBRETRY) | ||
| 356 | break; | ||
| 357 | wait_on_retry_sync_kiocb(&kiocb); | ||
| 358 | } | ||
| 359 | |||
| 360 | if (-EIOCBQUEUED == ret) | 345 | if (-EIOCBQUEUED == ret) |
| 361 | ret = wait_on_sync_kiocb(&kiocb); | 346 | ret = wait_on_sync_kiocb(&kiocb); |
| 362 | *ppos = kiocb.ki_pos; | 347 | *ppos = kiocb.ki_pos; |
| @@ -406,13 +391,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof | |||
| 406 | kiocb.ki_left = len; | 391 | kiocb.ki_left = len; |
| 407 | kiocb.ki_nbytes = len; | 392 | kiocb.ki_nbytes = len; |
| 408 | 393 | ||
| 409 | for (;;) { | 394 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); |
| 410 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); | ||
| 411 | if (ret != -EIOCBRETRY) | ||
| 412 | break; | ||
| 413 | wait_on_retry_sync_kiocb(&kiocb); | ||
| 414 | } | ||
| 415 | |||
| 416 | if (-EIOCBQUEUED == ret) | 395 | if (-EIOCBQUEUED == ret) |
| 417 | ret = wait_on_sync_kiocb(&kiocb); | 396 | ret = wait_on_sync_kiocb(&kiocb); |
| 418 | *ppos = kiocb.ki_pos; | 397 | *ppos = kiocb.ki_pos; |
| @@ -592,13 +571,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, | |||
| 592 | kiocb.ki_left = len; | 571 | kiocb.ki_left = len; |
| 593 | kiocb.ki_nbytes = len; | 572 | kiocb.ki_nbytes = len; |
| 594 | 573 | ||
| 595 | for (;;) { | 574 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); |
| 596 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); | ||
| 597 | if (ret != -EIOCBRETRY) | ||
| 598 | break; | ||
| 599 | wait_on_retry_sync_kiocb(&kiocb); | ||
| 600 | } | ||
| 601 | |||
| 602 | if (ret == -EIOCBQUEUED) | 575 | if (ret == -EIOCBQUEUED) |
| 603 | ret = wait_on_sync_kiocb(&kiocb); | 576 | ret = wait_on_sync_kiocb(&kiocb); |
| 604 | *ppos = kiocb.ki_pos; | 577 | *ppos = kiocb.ki_pos; |
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
| 19 | #include <linux/quotaops.h> | 19 | #include <linux/quotaops.h> |
| 20 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
| 21 | #include <linux/aio.h> | ||
| 21 | 22 | ||
| 22 | int reiserfs_commit_write(struct file *f, struct page *page, | 23 | int reiserfs_commit_write(struct file *f, struct page *page, |
| 23 | unsigned from, unsigned to); | 24 | unsigned from, unsigned to); |
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
| @@ -50,6 +50,7 @@ | |||
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #include "ubifs.h" | 52 | #include "ubifs.h" |
| 53 | #include <linux/aio.h> | ||
| 53 | #include <linux/mount.h> | 54 | #include <linux/mount.h> |
| 54 | #include <linux/namei.h> | 55 | #include <linux/namei.h> |
| 55 | #include <linux/slab.h> | 56 | #include <linux/slab.h> |
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
| 39 | #include <linux/crc-itu-t.h> | 39 | #include <linux/crc-itu-t.h> |
| 40 | #include <linux/mpage.h> | 40 | #include <linux/mpage.h> |
| 41 | #include <linux/aio.h> | ||
| 41 | 42 | ||
| 42 | #include "udf_i.h" | 43 | #include "udf_i.h" |
| 43 | #include "udf_sb.h" | 44 | #include "udf_sb.h" |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3244c988d379..2b2691b73428 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include "xfs_vnodeops.h" | 31 | #include "xfs_vnodeops.h" |
| 32 | #include "xfs_trace.h" | 32 | #include "xfs_trace.h" |
| 33 | #include "xfs_bmap.h" | 33 | #include "xfs_bmap.h" |
| 34 | #include <linux/aio.h> | ||
| 34 | #include <linux/gfp.h> | 35 | #include <linux/gfp.h> |
| 35 | #include <linux/mpage.h> | 36 | #include <linux/mpage.h> |
| 36 | #include <linux/pagevec.h> | 37 | #include <linux/pagevec.h> |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 054d60c0ac57..a5f2042aec8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | #include "xfs_ioctl.h" | 36 | #include "xfs_ioctl.h" |
| 37 | #include "xfs_trace.h" | 37 | #include "xfs_trace.h" |
| 38 | 38 | ||
| 39 | #include <linux/aio.h> | ||
| 39 | #include <linux/dcache.h> | 40 | #include <linux/dcache.h> |
| 40 | #include <linux/falloc.h> | 41 | #include <linux/falloc.h> |
| 41 | #include <linux/pagevec.h> | 42 | #include <linux/pagevec.h> |
diff --git a/include/linux/aio.h b/include/linux/aio.h index 31ff6dba4872..1bdf965339f9 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h | |||
| @@ -9,91 +9,32 @@ | |||
| 9 | 9 | ||
| 10 | #include <linux/atomic.h> | 10 | #include <linux/atomic.h> |
| 11 | 11 | ||
| 12 | #define AIO_MAXSEGS 4 | ||
| 13 | #define AIO_KIOGRP_NR_ATOMIC 8 | ||
| 14 | |||
| 15 | struct kioctx; | 12 | struct kioctx; |
| 13 | struct kiocb; | ||
| 16 | 14 | ||
| 17 | /* Notes on cancelling a kiocb: | 15 | #define KIOCB_KEY 0 |
| 18 | * If a kiocb is cancelled, aio_complete may return 0 to indicate | ||
| 19 | * that cancel has not yet disposed of the kiocb. All cancel | ||
| 20 | * operations *must* call aio_put_req to dispose of the kiocb | ||
| 21 | * to guard against races with the completion code. | ||
| 22 | */ | ||
| 23 | #define KIOCB_C_CANCELLED 0x01 | ||
| 24 | #define KIOCB_C_COMPLETE 0x02 | ||
| 25 | |||
| 26 | #define KIOCB_SYNC_KEY (~0U) | ||
| 27 | 16 | ||
| 28 | /* ki_flags bits */ | ||
| 29 | /* | 17 | /* |
| 30 | * This may be used for cancel/retry serialization in the future, but | 18 | * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either |
| 31 | * for now it's unused and we probably don't want modules to even | 19 | * cancelled or completed (this makes a certain amount of sense because |
| 32 | * think they can use it. | 20 | * successful cancellation - io_cancel() - does deliver the completion to |
| 21 | * userspace). | ||
| 22 | * | ||
| 23 | * And since most things don't implement kiocb cancellation and we'd really like | ||
| 24 | * kiocb completion to be lockless when possible, we use ki_cancel to | ||
| 25 | * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED | ||
| 26 | * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). | ||
| 33 | */ | 27 | */ |
| 34 | /* #define KIF_LOCKED 0 */ | 28 | #define KIOCB_CANCELLED ((void *) (~0ULL)) |
| 35 | #define KIF_KICKED 1 | ||
| 36 | #define KIF_CANCELLED 2 | ||
| 37 | |||
| 38 | #define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) | ||
| 39 | #define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
| 40 | 29 | ||
| 41 | #define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) | 30 | typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); |
| 42 | #define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
| 43 | #define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) | ||
| 44 | 31 | ||
| 45 | #define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) | ||
| 46 | #define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
| 47 | #define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) | ||
| 48 | |||
| 49 | #define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) | ||
| 50 | #define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) | ||
| 51 | #define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) | ||
| 52 | |||
| 53 | /* is there a better place to document function pointer methods? */ | ||
| 54 | /** | ||
| 55 | * ki_retry - iocb forward progress callback | ||
| 56 | * @kiocb: The kiocb struct to advance by performing an operation. | ||
| 57 | * | ||
| 58 | * This callback is called when the AIO core wants a given AIO operation | ||
| 59 | * to make forward progress. The kiocb argument describes the operation | ||
| 60 | * that is to be performed. As the operation proceeds, perhaps partially, | ||
| 61 | * ki_retry is expected to update the kiocb with progress made. Typically | ||
| 62 | * ki_retry is set in the AIO core and it itself calls file_operations | ||
| 63 | * helpers. | ||
| 64 | * | ||
| 65 | * ki_retry's return value determines when the AIO operation is completed | ||
| 66 | * and an event is generated in the AIO event ring. Except the special | ||
| 67 | * return values described below, the value that is returned from ki_retry | ||
| 68 | * is transferred directly into the completion ring as the operation's | ||
| 69 | * resulting status. Once this has happened ki_retry *MUST NOT* reference | ||
| 70 | * the kiocb pointer again. | ||
| 71 | * | ||
| 72 | * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete() | ||
| 73 | * will be called on the kiocb pointer in the future. The AIO core will | ||
| 74 | * not ask the method again -- ki_retry must ensure forward progress. | ||
| 75 | * aio_complete() must be called once and only once in the future, multiple | ||
| 76 | * calls may result in undefined behaviour. | ||
| 77 | * | ||
| 78 | * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb() | ||
| 79 | * will be called on the kiocb pointer in the future. This may happen | ||
| 80 | * through generic helpers that associate kiocb->ki_wait with a wait | ||
| 81 | * queue head that ki_retry uses via current->io_wait. It can also happen | ||
| 82 | * with custom tracking and manual calls to kick_iocb(), though that is | ||
| 83 | * discouraged. In either case, kick_iocb() must be called once and only | ||
| 84 | * once. ki_retry must ensure forward progress, the AIO core will wait | ||
| 85 | * indefinitely for kick_iocb() to be called. | ||
| 86 | */ | ||
| 87 | struct kiocb { | 32 | struct kiocb { |
| 88 | struct list_head ki_run_list; | 33 | atomic_t ki_users; |
| 89 | unsigned long ki_flags; | ||
| 90 | int ki_users; | ||
| 91 | unsigned ki_key; /* id of this request */ | ||
| 92 | 34 | ||
| 93 | struct file *ki_filp; | 35 | struct file *ki_filp; |
| 94 | struct kioctx *ki_ctx; /* may be NULL for sync ops */ | 36 | struct kioctx *ki_ctx; /* NULL for sync ops */ |
| 95 | int (*ki_cancel)(struct kiocb *, struct io_event *); | 37 | kiocb_cancel_fn *ki_cancel; |
| 96 | ssize_t (*ki_retry)(struct kiocb *); | ||
| 97 | void (*ki_dtor)(struct kiocb *); | 38 | void (*ki_dtor)(struct kiocb *); |
| 98 | 39 | ||
| 99 | union { | 40 | union { |
| @@ -117,7 +58,6 @@ struct kiocb { | |||
| 117 | 58 | ||
| 118 | struct list_head ki_list; /* the aio core uses this | 59 | struct list_head ki_list; /* the aio core uses this |
| 119 | * for cancellation */ | 60 | * for cancellation */ |
| 120 | struct list_head ki_batch; /* batch allocation */ | ||
| 121 | 61 | ||
| 122 | /* | 62 | /* |
| 123 | * If the aio_resfd field of the userspace iocb is not zero, | 63 | * If the aio_resfd field of the userspace iocb is not zero, |
| @@ -128,106 +68,40 @@ struct kiocb { | |||
| 128 | 68 | ||
| 129 | static inline bool is_sync_kiocb(struct kiocb *kiocb) | 69 | static inline bool is_sync_kiocb(struct kiocb *kiocb) |
| 130 | { | 70 | { |
| 131 | return kiocb->ki_key == KIOCB_SYNC_KEY; | 71 | return kiocb->ki_ctx == NULL; |
| 132 | } | 72 | } |
| 133 | 73 | ||
| 134 | static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) | 74 | static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) |
| 135 | { | 75 | { |
| 136 | *kiocb = (struct kiocb) { | 76 | *kiocb = (struct kiocb) { |
| 137 | .ki_users = 1, | 77 | .ki_users = ATOMIC_INIT(1), |
| 138 | .ki_key = KIOCB_SYNC_KEY, | 78 | .ki_ctx = NULL, |
| 139 | .ki_filp = filp, | 79 | .ki_filp = filp, |
| 140 | .ki_obj.tsk = current, | 80 | .ki_obj.tsk = current, |
| 141 | }; | 81 | }; |
| 142 | } | 82 | } |
| 143 | 83 | ||
| 144 | #define AIO_RING_MAGIC 0xa10a10a1 | ||
| 145 | #define AIO_RING_COMPAT_FEATURES 1 | ||
| 146 | #define AIO_RING_INCOMPAT_FEATURES 0 | ||
| 147 | struct aio_ring { | ||
| 148 | unsigned id; /* kernel internal index number */ | ||
| 149 | unsigned nr; /* number of io_events */ | ||
| 150 | unsigned head; | ||
| 151 | unsigned tail; | ||
| 152 | |||
| 153 | unsigned magic; | ||
| 154 | unsigned compat_features; | ||
| 155 | unsigned incompat_features; | ||
| 156 | unsigned header_length; /* size of aio_ring */ | ||
| 157 | |||
| 158 | |||
| 159 | struct io_event io_events[0]; | ||
| 160 | }; /* 128 bytes + ring size */ | ||
| 161 | |||
| 162 | #define AIO_RING_PAGES 8 | ||
| 163 | struct aio_ring_info { | ||
| 164 | unsigned long mmap_base; | ||
| 165 | unsigned long mmap_size; | ||
| 166 | |||
| 167 | struct page **ring_pages; | ||
| 168 | spinlock_t ring_lock; | ||
| 169 | long nr_pages; | ||
| 170 | |||
| 171 | unsigned nr, tail; | ||
| 172 | |||
| 173 | struct page *internal_pages[AIO_RING_PAGES]; | ||
| 174 | }; | ||
| 175 | |||
| 176 | static inline unsigned aio_ring_avail(struct aio_ring_info *info, | ||
| 177 | struct aio_ring *ring) | ||
| 178 | { | ||
| 179 | return (ring->head + info->nr - 1 - ring->tail) % info->nr; | ||
| 180 | } | ||
| 181 | |||
| 182 | struct kioctx { | ||
| 183 | atomic_t users; | ||
| 184 | int dead; | ||
| 185 | struct mm_struct *mm; | ||
| 186 | |||
| 187 | /* This needs improving */ | ||
| 188 | unsigned long user_id; | ||
| 189 | struct hlist_node list; | ||
| 190 | |||
| 191 | wait_queue_head_t wait; | ||
| 192 | |||
| 193 | spinlock_t ctx_lock; | ||
| 194 | |||
| 195 | int reqs_active; | ||
| 196 | struct list_head active_reqs; /* used for cancellation */ | ||
| 197 | struct list_head run_list; /* used for kicked reqs */ | ||
| 198 | |||
| 199 | /* sys_io_setup currently limits this to an unsigned int */ | ||
| 200 | unsigned max_reqs; | ||
| 201 | |||
| 202 | struct aio_ring_info ring_info; | ||
| 203 | |||
| 204 | struct delayed_work wq; | ||
| 205 | |||
| 206 | struct rcu_head rcu_head; | ||
| 207 | }; | ||
| 208 | |||
| 209 | /* prototypes */ | 84 | /* prototypes */ |
| 210 | extern unsigned aio_max_size; | ||
| 211 | |||
| 212 | #ifdef CONFIG_AIO | 85 | #ifdef CONFIG_AIO |
| 213 | extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); | 86 | extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); |
| 214 | extern int aio_put_req(struct kiocb *iocb); | 87 | extern void aio_put_req(struct kiocb *iocb); |
| 215 | extern void kick_iocb(struct kiocb *iocb); | 88 | extern void aio_complete(struct kiocb *iocb, long res, long res2); |
| 216 | extern int aio_complete(struct kiocb *iocb, long res, long res2); | ||
| 217 | struct mm_struct; | 89 | struct mm_struct; |
| 218 | extern void exit_aio(struct mm_struct *mm); | 90 | extern void exit_aio(struct mm_struct *mm); |
| 219 | extern long do_io_submit(aio_context_t ctx_id, long nr, | 91 | extern long do_io_submit(aio_context_t ctx_id, long nr, |
| 220 | struct iocb __user *__user *iocbpp, bool compat); | 92 | struct iocb __user *__user *iocbpp, bool compat); |
| 93 | void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); | ||
| 221 | #else | 94 | #else |
| 222 | static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } | 95 | static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } |
| 223 | static inline int aio_put_req(struct kiocb *iocb) { return 0; } | 96 | static inline void aio_put_req(struct kiocb *iocb) { } |
| 224 | static inline void kick_iocb(struct kiocb *iocb) { } | 97 | static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } |
| 225 | static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; } | ||
| 226 | struct mm_struct; | 98 | struct mm_struct; |
| 227 | static inline void exit_aio(struct mm_struct *mm) { } | 99 | static inline void exit_aio(struct mm_struct *mm) { } |
| 228 | static inline long do_io_submit(aio_context_t ctx_id, long nr, | 100 | static inline long do_io_submit(aio_context_t ctx_id, long nr, |
| 229 | struct iocb __user * __user *iocbpp, | 101 | struct iocb __user * __user *iocbpp, |
| 230 | bool compat) { return 0; } | 102 | bool compat) { return 0; } |
| 103 | static inline void kiocb_set_cancel_fn(struct kiocb *req, | ||
| 104 | kiocb_cancel_fn *cancel) { } | ||
| 231 | #endif /* CONFIG_AIO */ | 105 | #endif /* CONFIG_AIO */ |
| 232 | 106 | ||
| 233 | static inline struct kiocb *list_kiocb(struct list_head *h) | 107 | static inline struct kiocb *list_kiocb(struct list_head *h) |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3bff9ce09cf7..5047355b9a0f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
| @@ -28,6 +28,7 @@ struct cgroup_subsys; | |||
| 28 | struct inode; | 28 | struct inode; |
| 29 | struct cgroup; | 29 | struct cgroup; |
| 30 | struct css_id; | 30 | struct css_id; |
| 31 | struct eventfd_ctx; | ||
| 31 | 32 | ||
| 32 | extern int cgroup_init_early(void); | 33 | extern int cgroup_init_early(void); |
| 33 | extern int cgroup_init(void); | 34 | extern int cgroup_init(void); |
diff --git a/include/linux/errno.h b/include/linux/errno.h index f6bf082d4d4f..89627b9187f9 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h | |||
| @@ -28,6 +28,5 @@ | |||
| 28 | #define EBADTYPE 527 /* Type not supported by server */ | 28 | #define EBADTYPE 527 /* Type not supported by server */ |
| 29 | #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ | 29 | #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ |
| 30 | #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ | 30 | #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ |
| 31 | #define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ | ||
| 32 | 31 | ||
| 33 | #endif | 32 | #endif |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a62df310f2e..6b4890fa57e7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
| @@ -189,8 +189,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) | |||
| 189 | 189 | ||
| 190 | extern const struct file_operations hugetlbfs_file_operations; | 190 | extern const struct file_operations hugetlbfs_file_operations; |
| 191 | extern const struct vm_operations_struct hugetlb_vm_ops; | 191 | extern const struct vm_operations_struct hugetlb_vm_ops; |
| 192 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 192 | struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, |
| 193 | size_t size, vm_flags_t acct, | ||
| 194 | struct user_struct **user, int creat_flags, | 193 | struct user_struct **user, int creat_flags, |
| 195 | int page_size_log); | 194 | int page_size_log); |
| 196 | 195 | ||
| @@ -209,8 +208,8 @@ static inline int is_file_hugepages(struct file *file) | |||
| 209 | 208 | ||
| 210 | #define is_file_hugepages(file) 0 | 209 | #define is_file_hugepages(file) 0 |
| 211 | static inline struct file * | 210 | static inline struct file * |
| 212 | hugetlb_file_setup(const char *name, unsigned long addr, size_t size, | 211 | hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, |
| 213 | vm_flags_t acctflag, struct user_struct **user, int creat_flags, | 212 | struct user_struct **user, int creat_flags, |
| 214 | int page_size_log) | 213 | int page_size_log) |
| 215 | { | 214 | { |
| 216 | return ERR_PTR(-ENOSYS); | 215 | return ERR_PTR(-ENOSYS); |
| @@ -288,6 +287,13 @@ static inline struct hstate *hstate_file(struct file *f) | |||
| 288 | return hstate_inode(file_inode(f)); | 287 | return hstate_inode(file_inode(f)); |
| 289 | } | 288 | } |
| 290 | 289 | ||
| 290 | static inline struct hstate *hstate_sizelog(int page_size_log) | ||
| 291 | { | ||
| 292 | if (!page_size_log) | ||
| 293 | return &default_hstate; | ||
| 294 | return size_to_hstate(1 << page_size_log); | ||
| 295 | } | ||
| 296 | |||
| 291 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) | 297 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) |
| 292 | { | 298 | { |
| 293 | return hstate_file(vma->vm_file); | 299 | return hstate_file(vma->vm_file); |
| @@ -352,11 +358,12 @@ static inline int hstate_index(struct hstate *h) | |||
| 352 | return h - hstates; | 358 | return h - hstates; |
| 353 | } | 359 | } |
| 354 | 360 | ||
| 355 | #else | 361 | #else /* CONFIG_HUGETLB_PAGE */ |
| 356 | struct hstate {}; | 362 | struct hstate {}; |
| 357 | #define alloc_huge_page_node(h, nid) NULL | 363 | #define alloc_huge_page_node(h, nid) NULL |
| 358 | #define alloc_bootmem_huge_page(h) NULL | 364 | #define alloc_bootmem_huge_page(h) NULL |
| 359 | #define hstate_file(f) NULL | 365 | #define hstate_file(f) NULL |
| 366 | #define hstate_sizelog(s) NULL | ||
| 360 | #define hstate_vma(v) NULL | 367 | #define hstate_vma(v) NULL |
| 361 | #define hstate_inode(i) NULL | 368 | #define hstate_inode(i) NULL |
| 362 | #define huge_page_size(h) PAGE_SIZE | 369 | #define huge_page_size(h) PAGE_SIZE |
| @@ -371,6 +378,6 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) | |||
| 371 | } | 378 | } |
| 372 | #define hstate_index_to_shift(index) 0 | 379 | #define hstate_index_to_shift(index) 0 |
| 373 | #define hstate_index(h) 0 | 380 | #define hstate_index(h) 0 |
| 374 | #endif | 381 | #endif /* CONFIG_HUGETLB_PAGE */ |
| 375 | 382 | ||
| 376 | #endif /* _LINUX_HUGETLB_H */ | 383 | #endif /* _LINUX_HUGETLB_H */ |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a7f19e7f1a0..e0c8528a41a4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -951,13 +951,19 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | |||
| 951 | * (see walk_page_range for more details) | 951 | * (see walk_page_range for more details) |
| 952 | */ | 952 | */ |
| 953 | struct mm_walk { | 953 | struct mm_walk { |
| 954 | int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); | 954 | int (*pgd_entry)(pgd_t *pgd, unsigned long addr, |
| 955 | int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); | 955 | unsigned long next, struct mm_walk *walk); |
| 956 | int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); | 956 | int (*pud_entry)(pud_t *pud, unsigned long addr, |
| 957 | int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); | 957 | unsigned long next, struct mm_walk *walk); |
| 958 | int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); | 958 | int (*pmd_entry)(pmd_t *pmd, unsigned long addr, |
| 959 | int (*hugetlb_entry)(pte_t *, unsigned long, | 959 | unsigned long next, struct mm_walk *walk); |
| 960 | unsigned long, unsigned long, struct mm_walk *); | 960 | int (*pte_entry)(pte_t *pte, unsigned long addr, |
| 961 | unsigned long next, struct mm_walk *walk); | ||
| 962 | int (*pte_hole)(unsigned long addr, unsigned long next, | ||
| 963 | struct mm_walk *walk); | ||
| 964 | int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, | ||
| 965 | unsigned long addr, unsigned long next, | ||
| 966 | struct mm_walk *walk); | ||
| 961 | struct mm_struct *mm; | 967 | struct mm_struct *mm; |
| 962 | void *private; | 968 | void *private; |
| 963 | }; | 969 | }; |
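
With the mm_walk callback parameters now named, a walker can be written straight off the header. A hypothetical user that counts present PTEs over a range (everything below except the mm_walk/walk_page_range API itself is invented for illustration):

	static int count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
	{
		unsigned long *count = walk->private;

		if (pte_present(*pte))
			(*count)++;
		return 0;
	}

	static unsigned long count_present(struct mm_struct *mm,
					   unsigned long start, unsigned long end)
	{
		unsigned long count = 0;
		struct mm_walk walk = {
			.pte_entry	= count_pte,
			.mm		= mm,
			.private	= &count,
		};

		walk_page_range(start, end, &walk);
		return count;
	}
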
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 731e4ecee3bd..e2772666f004 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
| 5 | #include <linux/bug.h> | 5 | #include <linux/bug.h> |
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | #include <linux/workqueue.h> | ||
| 7 | #include <linux/threads.h> | 8 | #include <linux/threads.h> |
| 8 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
| 9 | #include <linux/kref.h> | 10 | #include <linux/kref.h> |
diff --git a/include/linux/random.h b/include/linux/random.h index 347ce553a306..3b9377d6b7a5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h | |||
| @@ -29,13 +29,6 @@ u32 prandom_u32(void); | |||
| 29 | void prandom_bytes(void *buf, int nbytes); | 29 | void prandom_bytes(void *buf, int nbytes); |
| 30 | void prandom_seed(u32 seed); | 30 | void prandom_seed(u32 seed); |
| 31 | 31 | ||
| 32 | /* | ||
| 33 | * These macros are preserved for backward compatibility and should be | ||
| 34 | * removed as soon as a transition is finished. | ||
| 35 | */ | ||
| 36 | #define random32() prandom_u32() | ||
| 37 | #define srandom32(seed) prandom_seed(seed) | ||
| 38 | |||
| 39 | u32 prandom_u32_state(struct rnd_state *); | 32 | u32 prandom_u32_state(struct rnd_state *); |
| 40 | void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); | 33 | void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); |
| 41 | 34 | ||
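
The removed random32()/srandom32() wrappers existed only to ease the rename to the prandom_* API; any straggling caller converts mechanically, as in this sketch:

	static void example_reseed_and_draw(u32 seed, u32 *out)
	{
		prandom_seed(seed);	/* was: srandom32(seed) */
		*out = prandom_u32();	/* was: random32() */
	}
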
diff --git a/include/linux/sched.h b/include/linux/sched.h index 4800e9d1864c..022c085ac3c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -313,8 +313,6 @@ extern void schedule_preempt_disabled(void); | |||
| 313 | struct nsproxy; | 313 | struct nsproxy; |
| 314 | struct user_namespace; | 314 | struct user_namespace; |
| 315 | 315 | ||
| 316 | #include <linux/aio.h> | ||
| 317 | |||
| 318 | #ifdef CONFIG_MMU | 316 | #ifdef CONFIG_MMU |
| 319 | extern void arch_pick_mmap_layout(struct mm_struct *mm); | 317 | extern void arch_pick_mmap_layout(struct mm_struct *mm); |
| 320 | extern unsigned long | 318 | extern unsigned long |
diff --git a/include/linux/wait.h b/include/linux/wait.h index 7cb64d4b499d..ac38be2692d8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
| @@ -330,6 +330,92 @@ do { \ | |||
| 330 | __ret; \ | 330 | __ret; \ |
| 331 | }) | 331 | }) |
| 332 | 332 | ||
| 333 | #define __wait_event_hrtimeout(wq, condition, timeout, state) \ | ||
| 334 | ({ \ | ||
| 335 | int __ret = 0; \ | ||
| 336 | DEFINE_WAIT(__wait); \ | ||
| 337 | struct hrtimer_sleeper __t; \ | ||
| 338 | \ | ||
| 339 | hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ | ||
| 340 | HRTIMER_MODE_REL); \ | ||
| 341 | hrtimer_init_sleeper(&__t, current); \ | ||
| 342 | if ((timeout).tv64 != KTIME_MAX) \ | ||
| 343 | hrtimer_start_range_ns(&__t.timer, timeout, \ | ||
| 344 | current->timer_slack_ns, \ | ||
| 345 | HRTIMER_MODE_REL); \ | ||
| 346 | \ | ||
| 347 | for (;;) { \ | ||
| 348 | prepare_to_wait(&wq, &__wait, state); \ | ||
| 349 | if (condition) \ | ||
| 350 | break; \ | ||
| 351 | if (state == TASK_INTERRUPTIBLE && \ | ||
| 352 | signal_pending(current)) { \ | ||
| 353 | __ret = -ERESTARTSYS; \ | ||
| 354 | break; \ | ||
| 355 | } \ | ||
| 356 | if (!__t.task) { \ | ||
| 357 | __ret = -ETIME; \ | ||
| 358 | break; \ | ||
| 359 | } \ | ||
| 360 | schedule(); \ | ||
| 361 | } \ | ||
| 362 | \ | ||
| 363 | hrtimer_cancel(&__t.timer); \ | ||
| 364 | destroy_hrtimer_on_stack(&__t.timer); \ | ||
| 365 | finish_wait(&wq, &__wait); \ | ||
| 366 | __ret; \ | ||
| 367 | }) | ||
| 368 | |||
| 369 | /** | ||
| 370 | * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses | ||
| 371 | * @wq: the waitqueue to wait on | ||
| 372 | * @condition: a C expression for the event to wait for | ||
| 373 | * @timeout: timeout, as a ktime_t | ||
| 374 | * | ||
| 375 | * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the | ||
| 376 | * @condition evaluates to true or a timeout elapses. | ||
| 377 | * The @condition is checked each time the waitqueue @wq is woken up. | ||
| 378 | * | ||
| 379 | * wake_up() has to be called after changing any variable that could | ||
| 380 | * change the result of the wait condition. | ||
| 381 | * | ||
| 382 | * The function returns 0 if @condition became true, or -ETIME if the timeout | ||
| 383 | * elapsed. | ||
| 384 | */ | ||
| 385 | #define wait_event_hrtimeout(wq, condition, timeout) \ | ||
| 386 | ({ \ | ||
| 387 | int __ret = 0; \ | ||
| 388 | if (!(condition)) \ | ||
| 389 | __ret = __wait_event_hrtimeout(wq, condition, timeout, \ | ||
| 390 | TASK_UNINTERRUPTIBLE); \ | ||
| 391 | __ret; \ | ||
| 392 | }) | ||
| 393 | |||
| 394 | /** | ||
| 395 | * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses | ||
| 396 | * @wq: the waitqueue to wait on | ||
| 397 | * @condition: a C expression for the event to wait for | ||
| 398 | * @timeout: timeout, as a ktime_t | ||
| 399 | * | ||
| 400 | * The process is put to sleep (TASK_INTERRUPTIBLE) until the | ||
| 401 | * @condition evaluates to true or a signal is received. | ||
| 402 | * The @condition is checked each time the waitqueue @wq is woken up. | ||
| 403 | * | ||
| 404 | * wake_up() has to be called after changing any variable that could | ||
| 405 | * change the result of the wait condition. | ||
| 406 | * | ||
| 407 | * The function returns 0 if @condition became true, -ERESTARTSYS if it was | ||
| 408 | * interrupted by a signal, or -ETIME if the timeout elapsed. | ||
| 409 | */ | ||
| 410 | #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ | ||
| 411 | ({ \ | ||
| 412 | long __ret = 0; \ | ||
| 413 | if (!(condition)) \ | ||
| 414 | __ret = __wait_event_hrtimeout(wq, condition, timeout, \ | ||
| 415 | TASK_INTERRUPTIBLE); \ | ||
| 416 | __ret; \ | ||
| 417 | }) | ||
| 418 | |||
| 333 | #define __wait_event_interruptible_exclusive(wq, condition, ret) \ | 419 | #define __wait_event_interruptible_exclusive(wq, condition, ret) \ |
| 334 | do { \ | 420 | do { \ |
| 335 | DEFINE_WAIT(__wait); \ | 421 | DEFINE_WAIT(__wait); \ |
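
The new hrtimeout variants mirror wait_event_timeout()/wait_event_interruptible_timeout() but take a ktime_t, so sub-jiffy waits become possible. A hypothetical driver fragment (struct my_dev and its fields are invented):

	struct my_dev {
		wait_queue_head_t	wq;
		bool			ready;
	};

	static int example_wait_ready(struct my_dev *dev)
	{
		long ret = wait_event_interruptible_hrtimeout(dev->wq, dev->ready,
					ktime_set(0, 1500 * NSEC_PER_USEC));

		if (ret == -ETIME)
			return -EAGAIN;		/* 1.5 ms elapsed */
		return ret;			/* 0, or -ERESTARTSYS on a signal */
	}
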
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9a9367c0c076..579a5007c696 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | #define WRITEBACK_H | 5 | #define WRITEBACK_H |
| 6 | 6 | ||
| 7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
| 8 | #include <linux/workqueue.h> | ||
| 8 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
| 9 | 10 | ||
| 10 | DECLARE_PER_CPU(int, dirty_throttle_leaks); | 11 | DECLARE_PER_CPU(int, dirty_throttle_leaks); |
diff --git a/ipc/shm.c b/ipc/shm.c --- a/ipc/shm.c +++ b/ipc/shm.c | |||
| @@ -491,10 +491,14 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) | |||
| 491 | 491 | ||
| 492 | sprintf (name, "SYSV%08x", key); | 492 | sprintf (name, "SYSV%08x", key); |
| 493 | if (shmflg & SHM_HUGETLB) { | 493 | if (shmflg & SHM_HUGETLB) { |
| 494 | struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) | ||
| 495 | & SHM_HUGE_MASK); | ||
| 496 | size_t hugesize = ALIGN(size, huge_page_size(hs)); | ||
| 497 | |||
| 494 | /* hugetlb_file_setup applies strict accounting */ | 498 | /* hugetlb_file_setup applies strict accounting */ |
| 495 | if (shmflg & SHM_NORESERVE) | 499 | if (shmflg & SHM_NORESERVE) |
| 496 | acctflag = VM_NORESERVE; | 500 | acctflag = VM_NORESERVE; |
| 497 | file = hugetlb_file_setup(name, 0, size, acctflag, | 501 | file = hugetlb_file_setup(name, hugesize, acctflag, |
| 498 | &shp->mlock_user, HUGETLB_SHMFS_INODE, | 502 | &shp->mlock_user, HUGETLB_SHMFS_INODE, |
| 499 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); | 503 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); |
| 500 | } else { | 504 | } else { |
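
Because newseg() now rounds the requested size up to the selected huge page size itself, an unaligned request no longer has to fail later in hugetlb_file_setup(). A userspace sketch of the flag encoding; SHM_HUGE_2MB is constructed here from SHM_HUGE_SHIFT (21 being log2 of 2 MB), and the fallback #define mirrors <linux/shm.h>:

	#include <sys/ipc.h>
	#include <sys/shm.h>

	#ifndef SHM_HUGETLB
	#define SHM_HUGETLB	04000			/* from <linux/shm.h> */
	#endif
	#define SHM_HUGE_SHIFT	26
	#define SHM_HUGE_2MB	(21 << SHM_HUGE_SHIFT)

	int main(void)
	{
		/* 1000000 bytes is deliberately not a 2 MB multiple. */
		int id = shmget(IPC_PRIVATE, 1000000,
				IPC_CREAT | 0600 | SHM_HUGETLB | SHM_HUGE_2MB);
		return id < 0;
	}
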
diff --git a/kernel/fork.c b/kernel/fork.c index 7d40687b1434..c509cc4a0d53 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -70,6 +70,7 @@ | |||
| 70 | #include <linux/khugepaged.h> | 70 | #include <linux/khugepaged.h> |
| 71 | #include <linux/signalfd.h> | 71 | #include <linux/signalfd.h> |
| 72 | #include <linux/uprobes.h> | 72 | #include <linux/uprobes.h> |
| 73 | #include <linux/aio.h> | ||
| 73 | 74 | ||
| 74 | #include <asm/pgtable.h> | 75 | #include <asm/pgtable.h> |
| 75 | #include <asm/pgalloc.h> | 76 | #include <asm/pgalloc.h> |
diff --git a/kernel/printk.c b/kernel/printk.c index 96dcfcd9a2d4..fa36e1494420 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
| 34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> |
| 35 | #include <linux/aio.h> | ||
| 35 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
| 36 | #include <linux/kexec.h> | 37 | #include <linux/kexec.h> |
| 37 | #include <linux/kdb.h> | 38 | #include <linux/kdb.h> |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 17ae54da0ec2..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/ptrace.h> | 17 | #include <linux/ptrace.h> |
| 18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
| 19 | #include <linux/signal.h> | 19 | #include <linux/signal.h> |
| 20 | #include <linux/uio.h> | ||
| 20 | #include <linux/audit.h> | 21 | #include <linux/audit.h> |
| 21 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
| 22 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f1d92163f30..cb1c9dedf9b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -92,16 +92,18 @@ enum mem_cgroup_stat_index { | |||
| 92 | /* | 92 | /* |
| 93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
| 94 | */ | 94 | */ |
| 95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
| 96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
| 97 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 97 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ |
| 98 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | 98 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
| 99 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
| 99 | MEM_CGROUP_STAT_NSTATS, | 100 | MEM_CGROUP_STAT_NSTATS, |
| 100 | }; | 101 | }; |
| 101 | 102 | ||
| 102 | static const char * const mem_cgroup_stat_names[] = { | 103 | static const char * const mem_cgroup_stat_names[] = { |
| 103 | "cache", | 104 | "cache", |
| 104 | "rss", | 105 | "rss", |
| 106 | "rss_huge", | ||
| 105 | "mapped_file", | 107 | "mapped_file", |
| 106 | "swap", | 108 | "swap", |
| 107 | }; | 109 | }; |
| @@ -917,6 +919,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
| 917 | } | 919 | } |
| 918 | 920 | ||
| 919 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 921 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
| 922 | struct page *page, | ||
| 920 | bool anon, int nr_pages) | 923 | bool anon, int nr_pages) |
| 921 | { | 924 | { |
| 922 | preempt_disable(); | 925 | preempt_disable(); |
| @@ -932,6 +935,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
| 932 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 935 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
| 933 | nr_pages); | 936 | nr_pages); |
| 934 | 937 | ||
| 938 | if (PageTransHuge(page)) | ||
| 939 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | ||
| 940 | nr_pages); | ||
| 941 | |||
| 935 | /* pagein of a big page is an event. So, ignore page size */ | 942 | /* pagein of a big page is an event. So, ignore page size */ |
| 936 | if (nr_pages > 0) | 943 | if (nr_pages > 0) |
| 937 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 944 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
| @@ -2914,7 +2921,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2914 | else | 2921 | else |
| 2915 | anon = false; | 2922 | anon = false; |
| 2916 | 2923 | ||
| 2917 | mem_cgroup_charge_statistics(memcg, anon, nr_pages); | 2924 | mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); |
| 2918 | unlock_page_cgroup(pc); | 2925 | unlock_page_cgroup(pc); |
| 2919 | 2926 | ||
| 2920 | /* | 2927 | /* |
| @@ -3708,16 +3715,21 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
| 3708 | { | 3715 | { |
| 3709 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 3716 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
| 3710 | struct page_cgroup *pc; | 3717 | struct page_cgroup *pc; |
| 3718 | struct mem_cgroup *memcg; | ||
| 3711 | int i; | 3719 | int i; |
| 3712 | 3720 | ||
| 3713 | if (mem_cgroup_disabled()) | 3721 | if (mem_cgroup_disabled()) |
| 3714 | return; | 3722 | return; |
| 3723 | |||
| 3724 | memcg = head_pc->mem_cgroup; | ||
| 3715 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 3725 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
| 3716 | pc = head_pc + i; | 3726 | pc = head_pc + i; |
| 3717 | pc->mem_cgroup = head_pc->mem_cgroup; | 3727 | pc->mem_cgroup = memcg; |
| 3718 | smp_wmb();/* see __commit_charge() */ | 3728 | smp_wmb();/* see __commit_charge() */ |
| 3719 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 3729 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
| 3720 | } | 3730 | } |
| 3731 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | ||
| 3732 | HPAGE_PMD_NR); | ||
| 3721 | } | 3733 | } |
| 3722 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3734 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
| 3723 | 3735 | ||
| @@ -3773,11 +3785,11 @@ static int mem_cgroup_move_account(struct page *page, | |||
| 3773 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3785 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
| 3774 | preempt_enable(); | 3786 | preempt_enable(); |
| 3775 | } | 3787 | } |
| 3776 | mem_cgroup_charge_statistics(from, anon, -nr_pages); | 3788 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
| 3777 | 3789 | ||
| 3778 | /* caller should have done css_get */ | 3790 | /* caller should have done css_get */ |
| 3779 | pc->mem_cgroup = to; | 3791 | pc->mem_cgroup = to; |
| 3780 | mem_cgroup_charge_statistics(to, anon, nr_pages); | 3792 | mem_cgroup_charge_statistics(to, page, anon, nr_pages); |
| 3781 | move_unlock_mem_cgroup(from, &flags); | 3793 | move_unlock_mem_cgroup(from, &flags); |
| 3782 | ret = 0; | 3794 | ret = 0; |
| 3783 | unlock: | 3795 | unlock: |
| @@ -4152,7 +4164,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, | |||
| 4152 | break; | 4164 | break; |
| 4153 | } | 4165 | } |
| 4154 | 4166 | ||
| 4155 | mem_cgroup_charge_statistics(memcg, anon, -nr_pages); | 4167 | mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); |
| 4156 | 4168 | ||
| 4157 | ClearPageCgroupUsed(pc); | 4169 | ClearPageCgroupUsed(pc); |
| 4158 | /* | 4170 | /* |
| @@ -4502,7 +4514,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
| 4502 | lock_page_cgroup(pc); | 4514 | lock_page_cgroup(pc); |
| 4503 | if (PageCgroupUsed(pc)) { | 4515 | if (PageCgroupUsed(pc)) { |
| 4504 | memcg = pc->mem_cgroup; | 4516 | memcg = pc->mem_cgroup; |
| 4505 | mem_cgroup_charge_statistics(memcg, false, -1); | 4517 | mem_cgroup_charge_statistics(memcg, oldpage, false, -1); |
| 4506 | ClearPageCgroupUsed(pc); | 4518 | ClearPageCgroupUsed(pc); |
| 4507 | } | 4519 | } |
| 4508 | unlock_page_cgroup(pc); | 4520 | unlock_page_cgroup(pc); |
| @@ -5030,6 +5042,10 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
| 5030 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | 5042 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); |
| 5031 | } | 5043 | } |
| 5032 | 5044 | ||
| 5045 | /* | ||
| 5046 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
| 5047 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
| 5048 | */ | ||
| 5033 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | 5049 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); |
| 5034 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 5050 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
| 5035 | 5051 | ||
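
Since rss deliberately continues to include transparent hugepages (see the comment added above), userspace can derive the non-THP anonymous footprint as rss - rss_huge. A small reader sketch; the cgroup path is illustrative:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char key[64];
		unsigned long long val, rss = 0, rss_huge = 0;
		FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.stat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %llu", key, &val) == 2) {
			if (!strcmp(key, "rss"))
				rss = val;
			else if (!strcmp(key, "rss_huge"))
				rss_huge = val;
		}
		fclose(f);
		printf("non-huge anon: %llu bytes\n", rss - rss_huge);
		return 0;
	}
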
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -1363,15 +1363,20 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
| 1363 | file = fget(fd); | 1363 | file = fget(fd); |
| 1364 | if (!file) | 1364 | if (!file) |
| 1365 | goto out; | 1365 | goto out; |
| 1366 | if (is_file_hugepages(file)) | ||
| 1367 | len = ALIGN(len, huge_page_size(hstate_file(file))); | ||
| 1366 | } else if (flags & MAP_HUGETLB) { | 1368 | } else if (flags & MAP_HUGETLB) { |
| 1367 | struct user_struct *user = NULL; | 1369 | struct user_struct *user = NULL; |
| 1370 | |||
| 1371 | len = ALIGN(len, huge_page_size(hstate_sizelog( | ||
| 1372 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))); | ||
| 1368 | /* | 1373 | /* |
| 1369 | * VM_NORESERVE is used because the reservations will be | 1374 | * VM_NORESERVE is used because the reservations will be |
| 1370 | * taken when vm_ops->mmap() is called | 1375 | * taken when vm_ops->mmap() is called |
| 1371 | * A dummy user value is used because we are not locking | 1376 | * A dummy user value is used because we are not locking |
| 1372 | * memory so no accounting is necessary | 1377 | * memory so no accounting is necessary |
| 1373 | */ | 1378 | */ |
| 1374 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, | 1379 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, |
| 1375 | VM_NORESERVE, | 1380 | VM_NORESERVE, |
| 1376 | &user, HUGETLB_ANONHUGE_INODE, | 1381 | &user, HUGETLB_ANONHUGE_INODE, |
| 1377 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | 1382 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); |
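
mmap() now performs the same rounding for MAP_HUGETLB requests (and for fd-backed hugetlbfs mappings), so an unaligned length is rounded up rather than rejected. A userspace sketch; the fallback #define mirrors the x86 <linux/mman.h> value of this era:

	#define _GNU_SOURCE
	#include <sys/mman.h>

	#ifndef MAP_HUGETLB
	#define MAP_HUGETLB	0x40000
	#endif
	#define MAP_HUGE_SHIFT	26
	#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)

	int main(void)
	{
		void *p = mmap(NULL, 3 * 1024 * 1024 + 1,	/* not 2 MB aligned */
			       PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
			       -1, 0);
		return p == MAP_FAILED;
	}
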
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..8a8cd0265e52 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
| @@ -14,9 +14,6 @@ | |||
| 14 | * use_mm | 14 | * use_mm |
| 15 | * Makes the calling kernel thread take on the specified | 15 | * Makes the calling kernel thread take on the specified |
| 16 | * mm context. | 16 | * mm context. |
| 17 | * Called by the retry thread execute retries within the | ||
| 18 | * iocb issuer's mm context, so that copy_from/to_user | ||
| 19 | * operations work seamlessly for aio. | ||
| 20 | * (Note: this routine is intended to be called only | 17 | * (Note: this routine is intended to be called only |
| 21 | * from a kernel thread context) | 18 | * from a kernel thread context) |
| 22 | */ | 19 | */ |
diff --git a/mm/page_io.c b/mm/page_io.c index bb5d75274686..06a8842a6ec6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
| 21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
| 22 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
| 23 | #include <linux/aio.h> | ||
| 23 | #include <asm/pgtable.h> | 24 | #include <asm/pgtable.h> |
| 24 | 25 | ||
| 25 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 26 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
diff --git a/mm/shmem.c b/mm/shmem.c index 39b2a0b86fe8..5e6a8422658b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
| 32 | #include <linux/export.h> | 32 | #include <linux/export.h> |
| 33 | #include <linux/swap.h> | 33 | #include <linux/swap.h> |
| 34 | #include <linux/aio.h> | ||
| 34 | 35 | ||
| 35 | static struct vfsmount *shm_mnt; | 36 | static struct vfsmount *shm_mnt; |
| 36 | 37 | ||
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/backing-dev.h> | 30 | #include <linux/backing-dev.h> |
| 31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
| 32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
| 33 | #include <linux/uio.h> | ||
| 33 | 34 | ||
| 34 | #include "internal.h" | 35 | #include "internal.h" |
| 35 | 36 | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b12fd8612604..d365724feb05 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -1522,6 +1522,8 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
| 1522 | * Must not be called in NMI context (strictly speaking, only if we don't | 1522 | * Must not be called in NMI context (strictly speaking, only if we don't |
| 1523 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling | 1523 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
| 1524 | * conventions for vfree() arch-dependent would be a really bad idea) | 1524 | * conventions for vfree() arch-dependent would be a really bad idea) |
| 1525 | * | ||
| 1526 | * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) | ||
| 1525 | * | 1527 | * |
| 1526 | */ | 1528 | */ |
| 1527 | void vfree(const void *addr) | 1529 | void vfree(const void *addr) |
diff --git a/security/keys/internal.h b/security/keys/internal.h index 8bbefc3b55d4..d4f1468b9b50 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/key-type.h> | 16 | #include <linux/key-type.h> |
| 17 | #include <linux/task_work.h> | 17 | #include <linux/task_work.h> |
| 18 | 18 | ||
| 19 | struct iovec; | ||
| 20 | |||
| 19 | #ifdef __KDEBUG | 21 | #ifdef __KDEBUG |
| 20 | #define kenter(FMT, ...) \ | 22 | #define kenter(FMT, ...) \ |
| 21 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | 23 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) |
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 4b5c948eb414..33cfd27b4de2 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/err.h> | 22 | #include <linux/err.h> |
| 23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
| 24 | #include <linux/security.h> | 24 | #include <linux/security.h> |
| 25 | #include <linux/uio.h> | ||
| 25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
| 26 | #include "internal.h" | 27 | #include "internal.h" |
| 27 | 28 | ||
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 23e3c46cd0a4..ccfa383f1fda 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 26 | #include <linux/time.h> | 26 | #include <linux/time.h> |
| 27 | #include <linux/pm_qos.h> | 27 | #include <linux/pm_qos.h> |
| 28 | #include <linux/uio.h> | 28 | #include <linux/aio.h> |
| 29 | #include <linux/dma-mapping.h> | 29 | #include <linux/dma-mapping.h> |
| 30 | #include <sound/core.h> | 30 | #include <sound/core.h> |
| 31 | #include <sound/control.h> | 31 | #include <sound/control.h> |
