-rw-r--r--Documentation/auxdisplay/cfag12864b-example.c1
-rw-r--r--Documentation/cgroups/cgroups.txt32
-rw-r--r--Documentation/cgroups/memory.txt41
-rw-r--r--Documentation/crypto/async-tx-api.txt75
-rw-r--r--Documentation/filesystems/sharedsubtree.txt220
-rw-r--r--Documentation/filesystems/vfs.txt7
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--Documentation/sysctl/fs.txt17
-rw-r--r--Documentation/sysctl/kernel.txt22
-rw-r--r--Documentation/sysctl/vm.txt41
-rw-r--r--Documentation/vm/.gitignore1
-rw-r--r--Documentation/vm/page-types.c200
-rw-r--r--MAINTAINERS4
-rw-r--r--arch/alpha/include/asm/fcntl.h2
-rw-r--r--arch/alpha/kernel/core_marvel.c2
-rw-r--r--arch/alpha/kernel/core_titan.c2
-rw-r--r--arch/alpha/kernel/pci_impl.h2
-rw-r--r--arch/alpha/kernel/pci_iommu.c4
-rw-r--r--arch/arm/include/asm/hardware/iop3xx-adma.h81
-rw-r--r--arch/arm/include/asm/hardware/iop_adma.h3
-rw-r--r--arch/arm/mach-iop13xx/include/mach/adma.h119
-rw-r--r--arch/arm/mach-iop13xx/setup.c17
-rw-r--r--arch/arm/plat-iop/adma.c4
-rw-r--r--arch/frv/kernel/pm.c14
-rw-r--r--arch/mips/lasat/sysctl.c18
-rw-r--r--arch/parisc/include/asm/fcntl.h2
-rw-r--r--arch/powerpc/include/asm/fsldma.h136
-rw-r--r--arch/s390/appldata/appldata_base.c9
-rw-r--r--arch/s390/kernel/debug.c4
-rw-r--r--arch/s390/mm/cmm.c4
-rw-r--r--arch/sh/drivers/dma/Kconfig12
-rw-r--r--arch/sh/drivers/dma/Makefile3
-rw-r--r--arch/sh/include/asm/dma-sh.h13
-rw-r--r--arch/x86/include/asm/nmi.h3
-rw-r--r--arch/x86/kernel/apic/nmi.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c10
-rw-r--r--arch/x86/mm/fault.c19
-rw-r--r--crypto/async_tx/Kconfig9
-rw-r--r--crypto/async_tx/Makefile3
-rw-r--r--crypto/async_tx/async_memcpy.c44
-rw-r--r--crypto/async_tx/async_memset.c43
-rw-r--r--crypto/async_tx/async_pq.c395
-rw-r--r--crypto/async_tx/async_raid6_recov.c468
-rw-r--r--crypto/async_tx/async_tx.c87
-rw-r--r--crypto/async_tx/async_xor.c207
-rw-r--r--crypto/async_tx/raid6test.c240
-rw-r--r--drivers/cdrom/cdrom.c8
-rw-r--r--drivers/char/Kconfig8
-rw-r--r--drivers/char/Makefile1
-rw-r--r--drivers/char/bfin-otp.c173
-rw-r--r--drivers/char/hpet.c21
-rw-r--r--drivers/char/mem.c2
-rw-r--r--drivers/char/mwave/mwavedd.c22
-rw-r--r--drivers/char/random.c4
-rw-r--r--drivers/char/rio/rioctrl.c2
-rw-r--r--drivers/char/uv_mmtimer.c216
-rw-r--r--drivers/dca/dca-core.c124
-rw-r--r--drivers/dma/Kconfig14
-rw-r--r--drivers/dma/Makefile4
-rw-r--r--drivers/dma/at_hdmac.c60
-rw-r--r--drivers/dma/at_hdmac_regs.h1
-rw-r--r--drivers/dma/dmaengine.c94
-rw-r--r--drivers/dma/dmatest.c40
-rw-r--r--drivers/dma/dw_dmac.c50
-rw-r--r--drivers/dma/dw_dmac_regs.h1
-rw-r--r--drivers/dma/fsldma.c288
-rw-r--r--drivers/dma/fsldma.h4
-rw-r--r--drivers/dma/ioat.c202
-rw-r--r--drivers/dma/ioat/Makefile2
-rw-r--r--drivers/dma/ioat/dca.c (renamed from drivers/dma/ioat_dca.c)13
-rw-r--r--drivers/dma/ioat/dma.c1238
-rw-r--r--drivers/dma/ioat/dma.h337
-rw-r--r--drivers/dma/ioat/dma_v2.c871
-rw-r--r--drivers/dma/ioat/dma_v2.h190
-rw-r--r--drivers/dma/ioat/dma_v3.c1223
-rw-r--r--drivers/dma/ioat/hw.h215
-rw-r--r--drivers/dma/ioat/pci.c210
-rw-r--r--drivers/dma/ioat/registers.h (renamed from drivers/dma/ioatdma_registers.h)54
-rw-r--r--drivers/dma/ioat_dma.c1741
-rw-r--r--drivers/dma/ioatdma.h165
-rw-r--r--drivers/dma/ioatdma_hw.h70
-rw-r--r--drivers/dma/iop-adma.c491
-rw-r--r--drivers/dma/iovlock.c10
-rw-r--r--drivers/dma/mv_xor.c7
-rw-r--r--drivers/dma/mv_xor.h4
-rw-r--r--drivers/dma/shdma.c786
-rw-r--r--drivers/dma/shdma.h64
-rw-r--r--drivers/dma/txx9dmac.c24
-rw-r--r--drivers/dma/txx9dmac.h1
-rw-r--r--drivers/edac/Kconfig13
-rw-r--r--drivers/edac/Makefile2
-rw-r--r--drivers/edac/cpc925_edac.c6
-rw-r--r--drivers/edac/edac_device.c5
-rw-r--r--drivers/edac/edac_mc.c4
-rw-r--r--drivers/edac/edac_pci.c4
-rw-r--r--drivers/edac/i3200_edac.c527
-rw-r--r--drivers/edac/mpc85xx_edac.c30
-rw-r--r--drivers/edac/mv64x60_edac.c22
-rw-r--r--drivers/idle/i7300_idle.c20
-rw-r--r--drivers/input/misc/Kconfig1
-rw-r--r--drivers/md/Kconfig26
-rw-r--r--drivers/md/bitmap.c5
-rw-r--r--drivers/md/linear.c3
-rw-r--r--drivers/md/md.c25
-rw-r--r--drivers/md/md.h1
-rw-r--r--drivers/md/multipath.c6
-rw-r--r--drivers/md/raid0.c8
-rw-r--r--drivers/md/raid1.c15
-rw-r--r--drivers/md/raid10.c12
-rw-r--r--drivers/md/raid5.c1493
-rw-r--r--drivers/md/raid5.h28
-rw-r--r--drivers/media/dvb/dvb-core/dvbdev.h5
-rw-r--r--drivers/media/dvb/dvb-usb/Kconfig2
-rw-r--r--drivers/media/video/saa7164/saa7164-api.c8
-rw-r--r--drivers/media/video/saa7164/saa7164-cmd.c2
-rw-r--r--drivers/media/video/saa7164/saa7164-core.c6
-rw-r--r--drivers/media/video/saa7164/saa7164.h4
-rw-r--r--drivers/memstick/core/memstick.c2
-rw-r--r--drivers/misc/sgi-gru/grukservices.c2
-rw-r--r--drivers/misc/sgi-gru/gruprocfs.c3
-rw-r--r--drivers/mmc/host/atmel-mci.c9
-rw-r--r--drivers/net/wireless/arlan-proc.c28
-rw-r--r--drivers/parport/procfs.c12
-rw-r--r--drivers/staging/go7007/Makefile5
-rw-r--r--drivers/usb/serial/sierra.c5
-rw-r--r--drivers/vlynq/vlynq.c2
-rw-r--r--fs/adfs/inode.c7
-rw-r--r--fs/binfmt_elf.c52
-rw-r--r--fs/binfmt_elf_fdpic.c17
-rw-r--r--fs/binfmt_flat.c22
-rw-r--r--fs/btrfs/inode.c1
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/coda/coda_int.h1
-rw-r--r--fs/drop_caches.c4
-rw-r--r--fs/exec.c114
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/fcntl.c108
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/gfs2/aops.c3
-rw-r--r--fs/hugetlbfs/inode.c12
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/proc/meminfo.c9
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c3
-rw-r--r--include/asm-generic/fcntl.h13
-rw-r--r--include/asm-generic/mman-common.h1
-rw-r--r--include/asm-generic/siginfo.h8
-rw-r--r--include/linux/async_tx.h129
-rw-r--r--include/linux/binfmts.h2
-rw-r--r--include/linux/cgroup.h53
-rw-r--r--include/linux/configfs.h4
-rw-r--r--include/linux/dca.h11
-rw-r--r--include/linux/debugfs.h2
-rw-r--r--include/linux/dmaengine.h179
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/ftrace.h4
-rw-r--r--include/linux/futex.h10
-rw-r--r--include/linux/hugetlb.h6
-rw-r--r--include/linux/memcontrol.h10
-rw-r--r--include/linux/mm.h17
-rw-r--r--include/linux/mm_types.h7
-rw-r--r--include/linux/mmzone.h13
-rw-r--r--include/linux/page-flags.h17
-rw-r--r--include/linux/page_cgroup.h13
-rw-r--r--include/linux/pci_ids.h10
-rw-r--r--include/linux/prctl.h2
-rw-r--r--include/linux/relay.h2
-rw-r--r--include/linux/res_counter.h64
-rw-r--r--include/linux/rmap.h21
-rw-r--r--include/linux/sched.h17
-rw-r--r--include/linux/security.h2
-rw-r--r--include/linux/signal.h2
-rw-r--r--include/linux/swap.h41
-rw-r--r--include/linux/swapops.h38
-rw-r--r--include/linux/sysctl.h19
-rw-r--r--include/linux/time.h28
-rw-r--r--include/linux/tracehook.h34
-rw-r--r--include/linux/tracepoint.h2
-rw-r--r--include/linux/unaligned/be_byteshift.h2
-rw-r--r--include/linux/unaligned/le_byteshift.h2
-rw-r--r--include/linux/writeback.h11
-rw-r--r--include/net/ip.h2
-rw-r--r--include/net/ndisc.h2
-rw-r--r--ipc/ipc_sysctl.c16
-rw-r--r--ipc/mq_sysctl.c8
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup.c1107
-rw-r--r--kernel/cgroup_debug.c105
-rw-r--r--kernel/cgroup_freezer.c15
-rw-r--r--kernel/cpuset.c66
-rw-r--r--kernel/exit.c146
-rw-r--r--kernel/fork.c34
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/ns_cgroup.c16
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/ptrace.c11
-rw-r--r--kernel/res_counter.c21
-rw-r--r--kernel/sched.c39
-rw-r--r--kernel/sched_fair.c4
-rw-r--r--kernel/signal.c168
-rw-r--r--kernel/slow-work.c12
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/sys.c22
-rw-r--r--kernel/sysctl.c112
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/utsname_sysctl.c4
-rw-r--r--lib/decompress_inflate.c8
-rw-r--r--lib/decompress_unlzma.c10
-rw-r--r--mm/Kconfig14
-rw-r--r--mm/Makefile2
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/hwpoison-inject.c41
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/madvise.c30
-rw-r--r--mm/memcontrol.c737
-rw-r--r--mm/memory-failure.c832
-rw-r--r--mm/memory.c24
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/page-writeback.c27
-rw-r--r--mm/page_alloc.c44
-rw-r--r--mm/rmap.c60
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/truncate.c72
-rw-r--r--mm/vmscan.c51
-rw-r--r--net/bridge/br_netfilter.c4
-rw-r--r--net/decnet/dn_dev.c5
-rw-r--r--net/decnet/sysctl_net_decnet.c2
-rw-r--r--net/ipv4/devinet.c12
-rw-r--r--net/ipv4/route.c7
-rw-r--r--net/ipv4/sysctl_net_ipv4.c16
-rw-r--r--net/ipv6/addrconf.c8
-rw-r--r--net/ipv6/ndisc.c8
-rw-r--r--net/ipv6/route.c4
-rw-r--r--net/irda/irsysctl.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c8
-rw-r--r--net/netfilter/nf_log.c4
-rw-r--r--net/phonet/sysctl.c4
-rw-r--r--net/sunrpc/sysctl.c4
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c2
-rw-r--r--security/device_cgroup.c3
-rw-r--r--security/min_addr.c4
-rw-r--r--security/selinux/hooks.c2
253 files changed, 14531 insertions, 4855 deletions
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c
index 1d2c010bae12..e7823ffb1ca0 100644
--- a/Documentation/auxdisplay/cfag12864b-example.c
+++ b/Documentation/auxdisplay/cfag12864b-example.c
@@ -194,7 +194,6 @@ static void cfag12864b_blit(void)
194 */ 194 */
195 195
196#include <stdio.h> 196#include <stdio.h>
197#include <string.h>
198 197
199#define EXAMPLES 6 198#define EXAMPLES 6
200 199
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97e88ce..455d4e6d346d 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:
408 408
409# echo 0 > tasks 409# echo 0 > tasks
410 410
4112.3 Mounting hierarchies by name
412--------------------------------
413
414Passing the name=<x> option when mounting a cgroups hierarchy
415associates the given name with the hierarchy. This can be used when
416mounting a pre-existing hierarchy, in order to refer to it by name
417rather than by its set of active subsystems. Each hierarchy is either
418nameless, or has a unique name.
419
420The name should match [\w.-]+
421
422When passing a name=<x> option for a new hierarchy, you need to
423specify subsystems manually; the legacy behaviour of mounting all
424subsystems when none are explicitly specified is not supported when
425you give a subsystem a name.
426
427The name of the subsystem appears as part of the hierarchy description
428in /proc/mounts and /proc/<pid>/cgroups.
429
430
4113. Kernel API 4313. Kernel API
412============= 432=============
413 433
@@ -501,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
501called multiple times against a cgroup. 521called multiple times against a cgroup.
502 522
503int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 523int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
504 struct task_struct *task) 524 struct task_struct *task, bool threadgroup)
505(cgroup_mutex held by caller) 525(cgroup_mutex held by caller)
506 526
507Called prior to moving a task into a cgroup; if the subsystem 527Called prior to moving a task into a cgroup; if the subsystem
@@ -509,14 +529,20 @@ returns an error, this will abort the attach operation. If a NULL
509task is passed, then a successful result indicates that *any* 529task is passed, then a successful result indicates that *any*
510unspecified task can be moved into the cgroup. Note that this isn't 530unspecified task can be moved into the cgroup. Note that this isn't
511called on a fork. If this method returns 0 (success) then this should 531called on a fork. If this method returns 0 (success) then this should
512remain valid while the caller holds cgroup_mutex. 532remain valid while the caller holds cgroup_mutex. If threadgroup is
533true, then a successful result indicates that all threads in the given
534thread's threadgroup can be moved together.
513 535
514void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 536void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
515 struct cgroup *old_cgrp, struct task_struct *task) 537 struct cgroup *old_cgrp, struct task_struct *task,
538 bool threadgroup)
516(cgroup_mutex held by caller) 539(cgroup_mutex held by caller)
517 540
518Called after the task has been attached to the cgroup, to allow any 541Called after the task has been attached to the cgroup, to allow any
519post-attachment activity that requires memory allocations or blocking. 542post-attachment activity that requires memory allocations or blocking.
543If threadgroup is true, the subsystem should take care of all threads
544in the specified thread's threadgroup. Currently does not support any
545subsystem that might need the old_cgrp for every thread in the group.
520 546
521void fork(struct cgroup_subsys *ss, struct task_struct *task) 547void fork(struct cgroup_subsys *ss, struct task_struct *task)
522 548
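
A minimal sketch of a subsystem callback honouring the new threadgroup argument described above. Only the can_attach() signature and the thread_group iteration come from the API; mysubsys_allow() is a made-up per-task policy check.

static int mysubsys_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			       struct task_struct *task, bool threadgroup)
{
	struct task_struct *t;

	if (!mysubsys_allow(cgrp, task))
		return -EPERM;

	if (threadgroup) {
		/* all threads in task's threadgroup will be moved together */
		rcu_read_lock();
		list_for_each_entry_rcu(t, &task->thread_group, thread_group) {
			if (!mysubsys_allow(cgrp, t)) {
				rcu_read_unlock();
				return -EPERM;
			}
		}
		rcu_read_unlock();
	}
	return 0;
}
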
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 23d1262c0775..b871f2552b45 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
179pages that are selected for reclaiming come from the per cgroup LRU 179pages that are selected for reclaiming come from the per cgroup LRU
180list. 180list.
181 181
182NOTE: Reclaim does not work for the root cgroup, since we cannot set any
183limits on the root cgroup.
184
1822. Locking 1852. Locking
183 186
184The memory controller uses the following hierarchy 187The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
210NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 213NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
211mega or gigabytes. 214mega or gigabytes.
212NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). 215NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
216NOTE: We cannot set limits on the root cgroup any more.
213 217
214# cat /cgroups/0/memory.limit_in_bytes 218# cat /cgroups/0/memory.limit_in_bytes
2154194304 2194194304
@@ -375,7 +379,42 @@ cgroups created below it.
375 379
376NOTE2: This feature can be enabled/disabled per subtree. 380NOTE2: This feature can be enabled/disabled per subtree.
377 381
3787. TODO 3827. Soft limits
383
384Soft limits allow for greater sharing of memory. The idea behind soft limits
385is to allow control groups to use as much of the memory as needed, provided
386
387a. There is no memory contention
388b. They do not exceed their hard limit
389
390When the system detects memory contention or low memory, control groups
391are pushed back to their soft limits. If the soft limit of each control
392group is very high, they are pushed back as much as possible to make
393sure that one control group does not starve the others of memory.
394
395Please note that soft limits are a best-effort feature; they come with
396no guarantees, but they do their best to make sure that when memory is
397heavily contended for, memory is allocated based on the soft limit
398hints/setup. Currently soft limit based reclaim is set up such that
399it gets invoked from balance_pgdat (kswapd).
400
4017.1 Interface
402
403Soft limits can be setup by using the following commands (in this example we
404assume a soft limit of 256 megabytes)
405
406# echo 256M > memory.soft_limit_in_bytes
407
408If we want to change this to 1G, we can at any time use
409
410# echo 1G > memory.soft_limit_in_bytes
411
412NOTE1: Soft limits take effect over a long period of time, since they involve
413 reclaiming memory for balancing between memory cgroups
414NOTE2: It is recommended always to set the soft limit below the hard limit,
415 otherwise the hard limit will take precedence.
416
4178. TODO
379 418
3801. Add support for accounting huge pages (as a separate controller) 4191. Add support for accounting huge pages (as a separate controller)
3812. Make per-cgroup scanner reclaim not-shared pages first 4202. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 9f59fcbf5d82..ba046b8fa92f 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -54,20 +54,23 @@ features surfaced as a result:
54 54
553.1 General format of the API: 553.1 General format of the API:
56struct dma_async_tx_descriptor * 56struct dma_async_tx_descriptor *
57async_<operation>(<op specific parameters>, 57async_<operation>(<op specific parameters>, struct async_submit_ctl *submit)
58 enum async_tx_flags flags,
59 struct dma_async_tx_descriptor *dependency,
60 dma_async_tx_callback callback_routine,
61 void *callback_parameter);
62 58
633.2 Supported operations: 593.2 Supported operations:
64memcpy - memory copy between a source and a destination buffer 60memcpy - memory copy between a source and a destination buffer
65memset - fill a destination buffer with a byte value 61memset - fill a destination buffer with a byte value
66xor - xor a series of source buffers and write the result to a 62xor - xor a series of source buffers and write the result to a
67 destination buffer 63 destination buffer
68xor_zero_sum - xor a series of source buffers and set a flag if the 64xor_val - xor a series of source buffers and set a flag if the
69 result is zero. The implementation attempts to prevent 65 result is zero. The implementation attempts to prevent
70 writes to memory 66 writes to memory
67pq - generate the p+q (raid6 syndrome) from a series of source buffers
68pq_val - validate that a p and/or q buffer are in sync with a given series of
69 sources
70datap - (raid6_datap_recov) recover a raid6 data block and the p block
71 from the given sources
722data - (raid6_2data_recov) recover 2 raid6 data blocks from the given
73 sources
71 74
723.3 Descriptor management: 753.3 Descriptor management:
73The return value is non-NULL and points to a 'descriptor' when the operation 76The return value is non-NULL and points to a 'descriptor' when the operation
@@ -80,8 +83,8 @@ acknowledged by the application before the offload engine driver is allowed to
80recycle (or free) the descriptor. A descriptor can be acked by one of the 83recycle (or free) the descriptor. A descriptor can be acked by one of the
81following methods: 84following methods:
821/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted 851/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
832/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent 862/ submitting an unacknowledged descriptor as a dependency to another
84 descriptor of a new operation. 87 async_tx call will implicitly set the acknowledged state.
853/ calling async_tx_ack() on the descriptor. 883/ calling async_tx_ack() on the descriptor.
86 89
873.4 When does the operation execute? 903.4 When does the operation execute?
@@ -119,30 +122,42 @@ of an operation.
119Perform a xor->copy->xor operation where each operation depends on the 122Perform a xor->copy->xor operation where each operation depends on the
120result from the previous operation: 123result from the previous operation:
121 124
122void complete_xor_copy_xor(void *param) 125void callback(void *param)
123{ 126{
124 printk("complete\n"); 127 struct completion *cmp = param;
128
129 complete(cmp);
125} 130}
126 131
127int run_xor_copy_xor(struct page **xor_srcs, 132void run_xor_copy_xor(struct page **xor_srcs,
128 int xor_src_cnt, 133 int xor_src_cnt,
129 struct page *xor_dest, 134 struct page *xor_dest,
130 size_t xor_len, 135 size_t xor_len,
131 struct page *copy_src, 136 struct page *copy_src,
132 struct page *copy_dest, 137 struct page *copy_dest,
133 size_t copy_len) 138 size_t copy_len)
134{ 139{
135 struct dma_async_tx_descriptor *tx; 140 struct dma_async_tx_descriptor *tx;
141 addr_conv_t addr_conv[xor_src_cnt];
142 struct async_submit_ctl submit;
143 addr_conv_t addr_conv[NDISKS];
144 struct completion cmp;
145
146 init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
147 addr_conv);
148 tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit)
136 149
137 tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, 150 submit->depend_tx = tx;
138 ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL); 151 tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);
139 tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, 152
140 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 153 init_completion(&cmp);
141 tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, 154 init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
142 ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, 155 callback, &cmp, addr_conv);
143 tx, complete_xor_copy_xor, NULL); 156 tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
144 157
145 async_tx_issue_pending_all(); 158 async_tx_issue_pending_all();
159
160 wait_for_completion(&cmp);
146} 161}
147 162
148See include/linux/async_tx.h for more information on the flags. See the 163See include/linux/async_tx.h for more information on the flags. See the
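
The example above carries over a few slips: addr_conv[] is declared twice, the first async_xor() call is missing its semicolon, and 'submit' is a structure rather than a pointer, so 'submit->depend_tx' should read 'submit.depend_tx'. A cleaned-up sketch of the same xor->copy->xor chain, reusing the callback() shown above and assuming NDISKS is an upper bound on xor_src_cnt:

void run_xor_copy_xor(struct page **xor_srcs, int xor_src_cnt,
		      struct page *xor_dest, size_t xor_len,
		      struct page *copy_src, struct page *copy_dest,
		      size_t copy_len)
{
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	addr_conv_t addr_conv[NDISKS];	/* scribble region for dma address conversion */
	struct completion cmp;

	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
			  addr_conv);
	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);

	/* chain the copy on the first xor */
	submit.depend_tx = tx;
	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);

	/* final xor: ack the chain and signal completion via callback() */
	init_completion(&cmp);
	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
			  callback, &cmp, addr_conv);
	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);

	async_tx_issue_pending_all();

	wait_for_completion(&cmp);
}
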
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 736540045dc7..23a181074f94 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -4,7 +4,7 @@ Shared Subtrees
4Contents: 4Contents:
5 1) Overview 5 1) Overview
6 2) Features 6 2) Features
7 3) smount command 7 3) Setting mount states
8 4) Use-case 8 4) Use-case
9 5) Detailed semantics 9 5) Detailed semantics
10 6) Quiz 10 6) Quiz
@@ -41,14 +41,14 @@ replicas continue to be exactly same.
41 41
42 Here is an example: 42 Here is an example:
43 43
44 Lets say /mnt has a mount that is shared. 44 Let's say /mnt has a mount that is shared.
45 mount --make-shared /mnt 45 mount --make-shared /mnt
46 46
47 note: mount command does not yet support the --make-shared flag. 47 Note: mount(8) command now supports the --make-shared flag,
48 I have included a small C program which does the same by executing 48 so the sample 'smount' program is no longer needed and has been
49 'smount /mnt shared' 49 removed.
50 50
51 #mount --bind /mnt /tmp 51 # mount --bind /mnt /tmp
52 The above command replicates the mount at /mnt to the mountpoint /tmp 52 The above command replicates the mount at /mnt to the mountpoint /tmp
53 and the contents of both the mounts remain identical. 53 and the contents of both the mounts remain identical.
54 54
@@ -58,8 +58,8 @@ replicas continue to be exactly same.
58 #ls /tmp 58 #ls /tmp
59 a b c 59 a b c
60 60
61 Now lets say we mount a device at /tmp/a 61 Now let's say we mount a device at /tmp/a
62 #mount /dev/sd0 /tmp/a 62 # mount /dev/sd0 /tmp/a
63 63
64 #ls /tmp/a 64 #ls /tmp/a
65 t1 t2 t2 65 t1 t2 t2
@@ -80,21 +80,20 @@ replicas continue to be exactly same.
80 80
81 Here is an example: 81 Here is an example:
82 82
83 Lets say /mnt has a mount which is shared. 83 Let's say /mnt has a mount which is shared.
84 #mount --make-shared /mnt 84 # mount --make-shared /mnt
85 85
86 Lets bind mount /mnt to /tmp 86 Let's bind mount /mnt to /tmp
87 #mount --bind /mnt /tmp 87 # mount --bind /mnt /tmp
88 88
89 the new mount at /tmp becomes a shared mount and it is a replica of 89 the new mount at /tmp becomes a shared mount and it is a replica of
90 the mount at /mnt. 90 the mount at /mnt.
91 91
92 Now lets make the mount at /tmp; a slave of /mnt 92 Now let's make the mount at /tmp; a slave of /mnt
93 #mount --make-slave /tmp 93 # mount --make-slave /tmp
94 [or smount /tmp slave]
95 94
96 lets mount /dev/sd0 on /mnt/a 95 let's mount /dev/sd0 on /mnt/a
97 #mount /dev/sd0 /mnt/a 96 # mount /dev/sd0 /mnt/a
98 97
99 #ls /mnt/a 98 #ls /mnt/a
100 t1 t2 t3 99 t1 t2 t3
@@ -104,9 +103,9 @@ replicas continue to be exactly same.
104 103
105 Note the mount event has propagated to the mount at /tmp 104 Note the mount event has propagated to the mount at /tmp
106 105
107 However lets see what happens if we mount something on the mount at /tmp 106 However let's see what happens if we mount something on the mount at /tmp
108 107
109 #mount /dev/sd1 /tmp/b 108 # mount /dev/sd1 /tmp/b
110 109
111 #ls /tmp/b 110 #ls /tmp/b
112 s1 s2 s3 111 s1 s2 s3
@@ -124,12 +123,11 @@ replicas continue to be exactly same.
124 123
1252d) A unbindable mount is a unbindable private mount 1242d) A unbindable mount is a unbindable private mount
126 125
127 lets say we have a mount at /mnt and we make is unbindable 126 let's say we have a mount at /mnt and we make it unbindable
128 127
129 #mount --make-unbindable /mnt 128 # mount --make-unbindable /mnt
130 [ smount /mnt unbindable ]
131 129
132 Lets try to bind mount this mount somewhere else. 130 Let's try to bind mount this mount somewhere else.
133 # mount --bind /mnt /tmp 131 # mount --bind /mnt /tmp
134 mount: wrong fs type, bad option, bad superblock on /mnt, 132 mount: wrong fs type, bad option, bad superblock on /mnt,
135 or too many mounted file systems 133 or too many mounted file systems
@@ -137,149 +135,15 @@ replicas continue to be exactly same.
137 Binding a unbindable mount is a invalid operation. 135 Binding a unbindable mount is a invalid operation.
138 136
139 137
1403) smount command 1383) Setting mount states
141 139
142 Currently the mount command is not aware of shared subtree features. 140 The mount command (util-linux package) can be used to set mount
143 Work is in progress to add the support in mount ( util-linux package ). 141 states:
144 Till then use the following program.
145 142
146 ------------------------------------------------------------------------ 143 mount --make-shared mountpoint
147 // 144 mount --make-slave mountpoint
148 //this code was developed my Miklos Szeredi <miklos@szeredi.hu> 145 mount --make-private mountpoint
149 //and modified by Ram Pai <linuxram@us.ibm.com> 146 mount --make-unbindable mountpoint
150 // sample usage:
151 // smount /tmp shared
152 //
153 #include <stdio.h>
154 #include <stdlib.h>
155 #include <unistd.h>
156 #include <string.h>
157 #include <sys/mount.h>
158 #include <sys/fsuid.h>
159
160 #ifndef MS_REC
161 #define MS_REC 0x4000 /* 16384: Recursive loopback */
162 #endif
163
164 #ifndef MS_SHARED
165 #define MS_SHARED 1<<20 /* Shared */
166 #endif
167
168 #ifndef MS_PRIVATE
169 #define MS_PRIVATE 1<<18 /* Private */
170 #endif
171
172 #ifndef MS_SLAVE
173 #define MS_SLAVE 1<<19 /* Slave */
174 #endif
175
176 #ifndef MS_UNBINDABLE
177 #define MS_UNBINDABLE 1<<17 /* Unbindable */
178 #endif
179
180 int main(int argc, char *argv[])
181 {
182 int type;
183 if(argc != 3) {
184 fprintf(stderr, "usage: %s dir "
185 "<rshared|rslave|rprivate|runbindable|shared|slave"
186 "|private|unbindable>\n" , argv[0]);
187 return 1;
188 }
189
190 fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
191
192 if (strcmp(argv[2],"rshared")==0)
193 type=(MS_SHARED|MS_REC);
194 else if (strcmp(argv[2],"rslave")==0)
195 type=(MS_SLAVE|MS_REC);
196 else if (strcmp(argv[2],"rprivate")==0)
197 type=(MS_PRIVATE|MS_REC);
198 else if (strcmp(argv[2],"runbindable")==0)
199 type=(MS_UNBINDABLE|MS_REC);
200 else if (strcmp(argv[2],"shared")==0)
201 type=MS_SHARED;
202 else if (strcmp(argv[2],"slave")==0)
203 type=MS_SLAVE;
204 else if (strcmp(argv[2],"private")==0)
205 type=MS_PRIVATE;
206 else if (strcmp(argv[2],"unbindable")==0)
207 type=MS_UNBINDABLE;
208 else {
209 fprintf(stderr, "invalid operation: %s\n", argv[2]);
210 return 1;
211 }
212 setfsuid(getuid());
213
214 if(mount("", argv[1], "dontcare", type, "") == -1) {
215 perror("mount");
216 return 1;
217 }
218 return 0;
219 }
220 -----------------------------------------------------------------------
221
222 Copy the above code snippet into smount.c
223 gcc -o smount smount.c
224
225
226 (i) To mark all the mounts under /mnt as shared execute the following
227 command:
228
229 smount /mnt rshared
230 the corresponding syntax planned for mount command is
231 mount --make-rshared /mnt
232
233 just to mark a mount /mnt as shared, execute the following
234 command:
235 smount /mnt shared
236 the corresponding syntax planned for mount command is
237 mount --make-shared /mnt
238
239 (ii) To mark all the shared mounts under /mnt as slave execute the
240 following
241
242 command:
243 smount /mnt rslave
244 the corresponding syntax planned for mount command is
245 mount --make-rslave /mnt
246
247 just to mark a mount /mnt as slave, execute the following
248 command:
249 smount /mnt slave
250 the corresponding syntax planned for mount command is
251 mount --make-slave /mnt
252
253 (iii) To mark all the mounts under /mnt as private execute the
254 following command:
255
256 smount /mnt rprivate
257 the corresponding syntax planned for mount command is
258 mount --make-rprivate /mnt
259
260 just to mark a mount /mnt as private, execute the following
261 command:
262 smount /mnt private
263 the corresponding syntax planned for mount command is
264 mount --make-private /mnt
265
266 NOTE: by default all the mounts are created as private. But if
267 you want to change some shared/slave/unbindable mount as
268 private at a later point in time, this command can help.
269
270 (iv) To mark all the mounts under /mnt as unbindable execute the
271 following
272
273 command:
274 smount /mnt runbindable
275 the corresponding syntax planned for mount command is
276 mount --make-runbindable /mnt
277
278 just to mark a mount /mnt as unbindable, execute the following
279 command:
280 smount /mnt unbindable
281 the corresponding syntax planned for mount command is
282 mount --make-unbindable /mnt
283 147
284 148
2854) Use cases 1494) Use cases
@@ -350,7 +214,7 @@ replicas continue to be exactly same.
350 mount --rbind / /view/v3 214 mount --rbind / /view/v3
351 mount --rbind / /view/v4 215 mount --rbind / /view/v4
352 216
353 and if /usr has a versioning filesystem mounted, than that 217 and if /usr has a versioning filesystem mounted, then that
354 mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and 218 mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
355 /view/v4/usr too 219 /view/v4/usr too
356 220
@@ -390,7 +254,7 @@ replicas continue to be exactly same.
390 254
391 For example: 255 For example:
392 mount --make-shared /mnt 256 mount --make-shared /mnt
393 mount --bin /mnt /tmp 257 mount --bind /mnt /tmp
394 258
395 The mount at /mnt and that at /tmp are both shared and belong 259 The mount at /mnt and that at /tmp are both shared and belong
396 to the same peer group. Anything mounted or unmounted under 260 to the same peer group. Anything mounted or unmounted under
@@ -558,7 +422,7 @@ replicas continue to be exactly same.
558 then the subtree under the unbindable mount is pruned in the new 422 then the subtree under the unbindable mount is pruned in the new
559 location. 423 location.
560 424
561 eg: lets say we have the following mount tree. 425 eg: let's say we have the following mount tree.
562 426
563 A 427 A
564 / \ 428 / \
@@ -566,7 +430,7 @@ replicas continue to be exactly same.
566 / \ / \ 430 / \ / \
567 D E F G 431 D E F G
568 432
569 Lets say all the mount except the mount C in the tree are 433 Let's say all the mount except the mount C in the tree are
570 of a type other than unbindable. 434 of a type other than unbindable.
571 435
572 If this tree is rbound to say Z 436 If this tree is rbound to say Z
@@ -683,13 +547,13 @@ replicas continue to be exactly same.
683 'b' on mounts that receive propagation from mount 'B' and does not have 547 'b' on mounts that receive propagation from mount 'B' and does not have
684 sub-mounts within them are unmounted. 548 sub-mounts within them are unmounted.
685 549
686 Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to 550 Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
687 each other. 551 each other.
688 552
689 lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount 553 let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
690 'B1', 'B2' and 'B3' respectively. 554 'B1', 'B2' and 'B3' respectively.
691 555
692 lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on 556 let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
693 mount 'B1', 'B2' and 'B3' respectively. 557 mount 'B1', 'B2' and 'B3' respectively.
694 558
695 if 'C1' is unmounted, all the mounts that are most-recently-mounted on 559 if 'C1' is unmounted, all the mounts that are most-recently-mounted on
@@ -710,7 +574,7 @@ replicas continue to be exactly same.
710 A cloned namespace contains all the mounts as that of the parent 574 A cloned namespace contains all the mounts as that of the parent
711 namespace. 575 namespace.
712 576
713 Lets say 'A' and 'B' are the corresponding mounts in the parent and the 577 Let's say 'A' and 'B' are the corresponding mounts in the parent and the
714 child namespace. 578 child namespace.
715 579
716 If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to 580 If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
@@ -759,11 +623,11 @@ replicas continue to be exactly same.
759 mount --make-slave /mnt 623 mount --make-slave /mnt
760 624
761 At this point we have the first mount at /tmp and 625 At this point we have the first mount at /tmp and
762 its root dentry is 1. Lets call this mount 'A' 626 its root dentry is 1. Let's call this mount 'A'
763 And then we have a second mount at /tmp1 with root 627 And then we have a second mount at /tmp1 with root
764 dentry 2. Lets call this mount 'B' 628 dentry 2. Let's call this mount 'B'
765 Next we have a third mount at /mnt with root dentry 629 Next we have a third mount at /mnt with root dentry
766 mnt. Lets call this mount 'C' 630 mnt. Let's call this mount 'C'
767 631
768 'B' is the slave of 'A' and 'C' is a slave of 'B' 632 'B' is the slave of 'A' and 'C' is a slave of 'B'
769 A -> B -> C 633 A -> B -> C
@@ -794,7 +658,7 @@ replicas continue to be exactly same.
794 658
795 Q3 Why is unbindable mount needed? 659 Q3 Why is unbindable mount needed?
796 660
797 Lets say we want to replicate the mount tree at multiple 661 Let's say we want to replicate the mount tree at multiple
798 locations within the same subtree. 662 locations within the same subtree.
799 663
800 if one rbind mounts a tree within the same subtree 'n' times 664 if one rbind mounts a tree within the same subtree 'n' times
@@ -803,7 +667,7 @@ replicas continue to be exactly same.
803 mounts. Here is a example. 667 mounts. Here is a example.
804 668
805 step 1: 669 step 1:
806 lets say the root tree has just two directories with 670 let's say the root tree has just two directories with
807 one vfsmount. 671 one vfsmount.
808 root 672 root
809 / \ 673 / \
@@ -875,7 +739,7 @@ replicas continue to be exactly same.
875 Unclonable mounts come in handy here. 739 Unclonable mounts come in handy here.
876 740
877 step 1: 741 step 1:
878 lets say the root tree has just two directories with 742 let's say the root tree has just two directories with
879 one vfsmount. 743 one vfsmount.
880 root 744 root
881 / \ 745 / \
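
Programs that need to change propagation types directly (rather than via mount(8)) issue a plain mount(2) call, which is all the removed smount.c sample did. A minimal sketch; the MS_SHARED fallback value mirrors the one the sample used, and the source, fstype and data arguments are ignored for propagation changes:

#include <stdio.h>
#include <sys/mount.h>

#ifndef MS_SHARED
#define MS_SHARED	(1 << 20)	/* Shared */
#endif

int make_shared(const char *mountpoint)
{
	if (mount("none", mountpoint, NULL, MS_SHARED, NULL) == -1) {
		perror("mount");
		return -1;
	}
	return 0;
}
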
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f49eecf2e573..623f094c9d8d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -536,6 +536,7 @@ struct address_space_operations {
536 /* migrate the contents of a page to the specified target */ 536 /* migrate the contents of a page to the specified target */
537 int (*migratepage) (struct page *, struct page *); 537 int (*migratepage) (struct page *, struct page *);
538 int (*launder_page) (struct page *); 538 int (*launder_page) (struct page *);
539 int (*error_remove_page) (struct address_space *mapping, struct page *page);
539}; 540};
540 541
541 writepage: called by the VM to write a dirty page to backing store. 542 writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
694 prevent redirtying the page, it is kept locked during the whole 695 prevent redirtying the page, it is kept locked during the whole
695 operation. 696 operation.
696 697
698 error_remove_page: normally set to generic_error_remove_page if truncation
699 is ok for this address space. Used for memory failure handling.
700 Setting this implies you deal with pages going away under you,
701 unless you have them locked or reference counts increased.
702
703
697The File Object 704The File Object
698=============== 705===============
699 706
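
A minimal sketch of how a filesystem wires this up; 'myfs' is a placeholder name and the remaining methods are omitted. Filesystems for which simply truncating a poisoned page is acceptable point the new method at generic_error_remove_page():

static const struct address_space_operations myfs_aops = {
	/* readpage, writepage, ... omitted */
	.error_remove_page	= generic_error_remove_page,
};
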
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index aafca0a8f66a..947374977ca5 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -135,6 +135,7 @@ Code Seq# Include File Comments
135 <http://mikonos.dia.unisa.it/tcfs> 135 <http://mikonos.dia.unisa.it/tcfs>
136'l' 40-7F linux/udf_fs_i.h in development: 136'l' 40-7F linux/udf_fs_i.h in development:
137 <http://sourceforge.net/projects/linux-udf/> 137 <http://sourceforge.net/projects/linux-udf/>
138'm' 00-09 linux/mmtimer.h
138'm' all linux/mtio.h conflict! 139'm' all linux/mtio.h conflict!
139'm' all linux/soundcard.h conflict! 140'm' all linux/soundcard.h conflict!
140'm' all linux/synclink.h conflict! 141'm' all linux/synclink.h conflict!
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 1458448436cc..62682500878a 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -96,13 +96,16 @@ handles that the Linux kernel will allocate. When you get lots
96of error messages about running out of file handles, you might 96of error messages about running out of file handles, you might
97want to increase this limit. 97want to increase this limit.
98 98
99The three values in file-nr denote the number of allocated 99Historically, the three values in file-nr denoted the number of
100file handles, the number of unused file handles and the maximum 100allocated file handles, the number of allocated but unused file
101number of file handles. When the allocated file handles come 101handles, and the maximum number of file handles. Linux 2.6 always
102close to the maximum, but the number of unused file handles is 102reports 0 as the number of free file handles -- this is not an
103significantly greater than 0, you've encountered a peak in your 103error, it just means that the number of allocated file handles
104usage of file handles and you don't need to increase the maximum. 104exactly matches the number of used file handles.
105 105
106Attempts to allocate more file descriptors than file-max are
107reported with printk, look for "VFS: file-max limit <number>
108reached".
106============================================================== 109==============================================================
107 110
108nr_open: 111nr_open:
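
A small sketch that reads the three file-nr fields described above (the printed labels are informal; on 2.6 the middle value will read 0):

#include <stdio.h>

int main(void)
{
	unsigned long allocated, unused, max;
	FILE *f = fopen("/proc/sys/fs/file-nr", "r");

	if (!f || fscanf(f, "%lu %lu %lu", &allocated, &unused, &max) != 3) {
		perror("/proc/sys/fs/file-nr");
		return 1;
	}
	fclose(f);
	printf("allocated=%lu unused=%lu max=%lu\n", allocated, unused, max);
	return 0;
}
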
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index b3d8b4922740..a028b92001ed 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -22,6 +22,7 @@ show up in /proc/sys/kernel:
22- callhome [ S390 only ] 22- callhome [ S390 only ]
23- auto_msgmni 23- auto_msgmni
24- core_pattern 24- core_pattern
25- core_pipe_limit
25- core_uses_pid 26- core_uses_pid
26- ctrl-alt-del 27- ctrl-alt-del
27- dentry-state 28- dentry-state
@@ -135,6 +136,27 @@ core_pattern is used to specify a core dumpfile pattern name.
135 136
136============================================================== 137==============================================================
137 138
139core_pipe_limit:
140
141This sysctl is only applicable when core_pattern is configured to pipe core
142files to a user space helper (when the first character of core_pattern is a '|',
143see above). When collecting cores via a pipe to an application, it is
144occasionally useful for the collecting application to gather data about the
145crashing process from its /proc/pid directory. In order to do this safely, the
146kernel must wait for the collecting process to exit, so as not to remove the
147crashing process's proc files prematurely. This in turn creates the possibility
148that a misbehaving userspace collecting process can block the reaping of a
149crashed process simply by never exiting. This sysctl defends against that. It
150defines how many concurrent crashing processes may be piped to user space
151applications in parallel. If this value is exceeded, then those crashing
152processes above that value are noted via the kernel log and their cores are
153skipped. 0 is a special value, indicating that unlimited processes may be
154captured in parallel, but that no waiting will take place (i.e. the collecting
155process is not guaranteed access to /proc/<crashing pid>/). This value defaults
156to 0.
157
158==============================================================
159
138core_uses_pid: 160core_uses_pid:
139 161
140The default coredump filename is "core". By setting 162The default coredump filename is "core". By setting
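
A toy sketch of a pipe-mode core collector, assuming a core_pattern such as "|/usr/local/bin/corecatch %p" (the helper path and the output file below are placeholders). With core_pipe_limit non-zero the kernel waits for the helper to exit, so /proc/<pid>/ of the crashing process can still be read here:

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], buf[8192];
	FILE *cmdline, *core;
	size_t n;

	if (argc < 2)
		return 1;

	/* argv[1] is the crashing pid passed via the %p specifier */
	snprintf(path, sizeof(path), "/proc/%s/cmdline", argv[1]);
	cmdline = fopen(path, "r");
	if (cmdline) {
		if (fgets(buf, sizeof(buf), cmdline))
			fprintf(stderr, "core from: %s\n", buf);
		fclose(cmdline);
	}

	/* the core image itself arrives on stdin */
	core = fopen("/tmp/core.dump", "w");
	if (!core)
		return 1;
	while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)
		fwrite(buf, 1, n, core);
	fclose(core);
	return 0;
}
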
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e6fb1ec2744b..a6e360d2055c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
32- legacy_va_layout 32- legacy_va_layout
33- lowmem_reserve_ratio 33- lowmem_reserve_ratio
34- max_map_count 34- max_map_count
35- memory_failure_early_kill
36- memory_failure_recovery
35- min_free_kbytes 37- min_free_kbytes
36- min_slab_ratio 38- min_slab_ratio
37- min_unmapped_ratio 39- min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
53- vfs_cache_pressure 55- vfs_cache_pressure
54- zone_reclaim_mode 56- zone_reclaim_mode
55 57
56
57============================================================== 58==============================================================
58 59
59block_dump 60block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
275 276
276The default value is 65536. 277The default value is 65536.
277 278
279=============================================================
280
281memory_failure_early_kill:
282
283Control how to kill processes when an uncorrected memory error (typically
284a 2-bit error in a memory module) that cannot be handled by the kernel is
285detected in the background by hardware. In some cases (like the page
286still having a valid copy on disk) the kernel will handle the failure
287transparently without affecting any applications. But if there is
288no other up-to-date copy of the data, it will kill the affected processes to
289prevent any data corruption from propagating.
290
2911: Kill all processes that have the corrupted and not reloadable page mapped
292as soon as the corruption is detected. Note this is not supported
293for a few types of pages, like kernel internally allocated data or
294the swap cache, but works for the majority of user pages.
295
2960: Only unmap the corrupted page from all processes and only kill a process
297that tries to access it.
298
299The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
300handle this if they want to.
301
302This is only active on architectures/platforms with advanced machine
303check handling and depends on the hardware capabilities.
304
305Applications can override this setting individually with the PR_MCE_KILL prctl.
306
307==============================================================
308
309memory_failure_recovery
310
311Enable memory failure recovery (when supported by the platform)
312
3131: Attempt recovery.
314
3150: Always panic on a memory failure.
316
278============================================================== 317==============================================================
279 318
280min_free_kbytes: 319min_free_kbytes:
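
A minimal sketch of the per-process override mentioned above. The raw prctl arguments follow the prctl(2) description of PR_MCE_KILL; symbolic PR_MCE_KILL_* helper names may not exist in this tree, so numeric values are used:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL	33
#endif

int main(void)
{
	/* arg2 = 1: set the policy, arg3 = 1: early kill for this process */
	if (prctl(PR_MCE_KILL, 1, 1, 0, 0) < 0) {
		perror("prctl(PR_MCE_KILL)");
		return 1;
	}
	return 0;
}
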
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
index 33e8a023df02..09b164a5700f 100644
--- a/Documentation/vm/.gitignore
+++ b/Documentation/vm/.gitignore
@@ -1 +1,2 @@
1page-types
1slabinfo 2slabinfo
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 3eda8ea00852..fa1a30d9e9d5 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> 5 * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
6 */ 6 */
7 7
8#define _LARGEFILE64_SOURCE
8#include <stdio.h> 9#include <stdio.h>
9#include <stdlib.h> 10#include <stdlib.h>
10#include <unistd.h> 11#include <unistd.h>
@@ -13,12 +14,33 @@
13#include <string.h> 14#include <string.h>
14#include <getopt.h> 15#include <getopt.h>
15#include <limits.h> 16#include <limits.h>
17#include <assert.h>
16#include <sys/types.h> 18#include <sys/types.h>
17#include <sys/errno.h> 19#include <sys/errno.h>
18#include <sys/fcntl.h> 20#include <sys/fcntl.h>
19 21
20 22
21/* 23/*
24 * pagemap kernel ABI bits
25 */
26
27#define PM_ENTRY_BYTES sizeof(uint64_t)
28#define PM_STATUS_BITS 3
29#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
30#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
31#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
32#define PM_PSHIFT_BITS 6
33#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
34#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
35#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
36#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
37#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
38
39#define PM_PRESENT PM_STATUS(4LL)
40#define PM_SWAP PM_STATUS(2LL)
41
42
43/*
22 * kernel page flags 44 * kernel page flags
23 */ 45 */
24 46
@@ -126,6 +148,14 @@ static int nr_addr_ranges;
126static unsigned long opt_offset[MAX_ADDR_RANGES]; 148static unsigned long opt_offset[MAX_ADDR_RANGES];
127static unsigned long opt_size[MAX_ADDR_RANGES]; 149static unsigned long opt_size[MAX_ADDR_RANGES];
128 150
151#define MAX_VMAS 10240
152static int nr_vmas;
153static unsigned long pg_start[MAX_VMAS];
154static unsigned long pg_end[MAX_VMAS];
155static unsigned long voffset;
156
157static int pagemap_fd;
158
129#define MAX_BIT_FILTERS 64 159#define MAX_BIT_FILTERS 64
130static int nr_bit_filters; 160static int nr_bit_filters;
131static uint64_t opt_mask[MAX_BIT_FILTERS]; 161static uint64_t opt_mask[MAX_BIT_FILTERS];
@@ -135,7 +165,6 @@ static int page_size;
135 165
136#define PAGES_BATCH (64 << 10) /* 64k pages */ 166#define PAGES_BATCH (64 << 10) /* 64k pages */
137static int kpageflags_fd; 167static int kpageflags_fd;
138static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
139 168
140#define HASH_SHIFT 13 169#define HASH_SHIFT 13
141#define HASH_SIZE (1 << HASH_SHIFT) 170#define HASH_SIZE (1 << HASH_SHIFT)
@@ -158,6 +187,11 @@ static uint64_t page_flags[HASH_SIZE];
158 type __min2 = (y); \ 187 type __min2 = (y); \
159 __min1 < __min2 ? __min1 : __min2; }) 188 __min1 < __min2 ? __min1 : __min2; })
160 189
190#define max_t(type, x, y) ({ \
191 type __max1 = (x); \
192 type __max2 = (y); \
193 __max1 > __max2 ? __max1 : __max2; })
194
161static unsigned long pages2mb(unsigned long pages) 195static unsigned long pages2mb(unsigned long pages)
162{ 196{
163 return (pages * page_size) >> 20; 197 return (pages * page_size) >> 20;
@@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags)
224static void show_page_range(unsigned long offset, uint64_t flags) 258static void show_page_range(unsigned long offset, uint64_t flags)
225{ 259{
226 static uint64_t flags0; 260 static uint64_t flags0;
261 static unsigned long voff;
227 static unsigned long index; 262 static unsigned long index;
228 static unsigned long count; 263 static unsigned long count;
229 264
230 if (flags == flags0 && offset == index + count) { 265 if (flags == flags0 && offset == index + count &&
266 (!opt_pid || voffset == voff + count)) {
231 count++; 267 count++;
232 return; 268 return;
233 } 269 }
234 270
235 if (count) 271 if (count) {
236 printf("%lu\t%lu\t%s\n", 272 if (opt_pid)
273 printf("%lx\t", voff);
274 printf("%lx\t%lx\t%s\n",
237 index, count, page_flag_name(flags0)); 275 index, count, page_flag_name(flags0));
276 }
238 277
239 flags0 = flags; 278 flags0 = flags;
240 index = offset; 279 index = offset;
280 voff = voffset;
241 count = 1; 281 count = 1;
242} 282}
243 283
244static void show_page(unsigned long offset, uint64_t flags) 284static void show_page(unsigned long offset, uint64_t flags)
245{ 285{
246 printf("%lu\t%s\n", offset, page_flag_name(flags)); 286 if (opt_pid)
287 printf("%lx\t", voffset);
288 printf("%lx\t%s\n", offset, page_flag_name(flags));
247} 289}
248 290
249static void show_summary(void) 291static void show_summary(void)
@@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count)
383 lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); 425 lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
384 426
385 while (count) { 427 while (count) {
428 uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
429
386 batch = min_t(unsigned long, count, PAGES_BATCH); 430 batch = min_t(unsigned long, count, PAGES_BATCH);
387 n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); 431 n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
388 if (n == 0) 432 if (n == 0)
@@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count)
404 } 448 }
405} 449}
406 450
451
452#define PAGEMAP_BATCH 4096
453static unsigned long task_pfn(unsigned long pgoff)
454{
455 static uint64_t buf[PAGEMAP_BATCH];
456 static unsigned long start;
457 static long count;
458 uint64_t pfn;
459
460 if (pgoff < start || pgoff >= start + count) {
461 if (lseek64(pagemap_fd,
462 (uint64_t)pgoff * PM_ENTRY_BYTES,
463 SEEK_SET) < 0) {
464 perror("pagemap seek");
465 exit(EXIT_FAILURE);
466 }
467 count = read(pagemap_fd, buf, sizeof(buf));
468 if (count == 0)
469 return 0;
470 if (count < 0) {
471 perror("pagemap read");
472 exit(EXIT_FAILURE);
473 }
474 if (count % PM_ENTRY_BYTES) {
475 fatal("pagemap read not aligned.\n");
476 exit(EXIT_FAILURE);
477 }
478 count /= PM_ENTRY_BYTES;
479 start = pgoff;
480 }
481
482 pfn = buf[pgoff - start];
483 if (pfn & PM_PRESENT)
484 pfn = PM_PFRAME(pfn);
485 else
486 pfn = 0;
487
488 return pfn;
489}
490
491static void walk_task(unsigned long index, unsigned long count)
492{
493 int i = 0;
494 const unsigned long end = index + count;
495
496 while (index < end) {
497
498 while (pg_end[i] <= index)
499 if (++i >= nr_vmas)
500 return;
501 if (pg_start[i] >= end)
502 return;
503
504 voffset = max_t(unsigned long, pg_start[i], index);
505 index = min_t(unsigned long, pg_end[i], end);
506
507 assert(voffset < index);
508 for (; voffset < index; voffset++) {
509 unsigned long pfn = task_pfn(voffset);
510 if (pfn)
511 walk_pfn(pfn, 1);
512 }
513 }
514}
515
516static void add_addr_range(unsigned long offset, unsigned long size)
517{
518 if (nr_addr_ranges >= MAX_ADDR_RANGES)
519 fatal("too many addr ranges\n");
520
521 opt_offset[nr_addr_ranges] = offset;
522 opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
523 nr_addr_ranges++;
524}
525
407static void walk_addr_ranges(void) 526static void walk_addr_ranges(void)
408{ 527{
409 int i; 528 int i;
@@ -415,10 +534,13 @@ static void walk_addr_ranges(void)
415 } 534 }
416 535
417 if (!nr_addr_ranges) 536 if (!nr_addr_ranges)
418 walk_pfn(0, ULONG_MAX); 537 add_addr_range(0, ULONG_MAX);
419 538
420 for (i = 0; i < nr_addr_ranges; i++) 539 for (i = 0; i < nr_addr_ranges; i++)
421 walk_pfn(opt_offset[i], opt_size[i]); 540 if (!opt_pid)
541 walk_pfn(opt_offset[i], opt_size[i]);
542 else
543 walk_task(opt_offset[i], opt_size[i]);
422 544
423 close(kpageflags_fd); 545 close(kpageflags_fd);
424} 546}
@@ -446,8 +568,8 @@ static void usage(void)
446" -r|--raw Raw mode, for kernel developers\n" 568" -r|--raw Raw mode, for kernel developers\n"
447" -a|--addr addr-spec Walk a range of pages\n" 569" -a|--addr addr-spec Walk a range of pages\n"
448" -b|--bits bits-spec Walk pages with specified bits\n" 570" -b|--bits bits-spec Walk pages with specified bits\n"
449#if 0 /* planned features */
450" -p|--pid pid Walk process address space\n" 571" -p|--pid pid Walk process address space\n"
572#if 0 /* planned features */
451" -f|--file filename Walk file address space\n" 573" -f|--file filename Walk file address space\n"
452#endif 574#endif
453" -l|--list Show page details in ranges\n" 575" -l|--list Show page details in ranges\n"
@@ -459,7 +581,7 @@ static void usage(void)
459" N+M pages range from N to N+M-1\n" 581" N+M pages range from N to N+M-1\n"
460" N,M pages range from N to M-1\n" 582" N,M pages range from N to M-1\n"
461" N, pages range from N to end\n" 583" N, pages range from N to end\n"
462" ,M pages range from 0 to M\n" 584" ,M pages range from 0 to M-1\n"
463"bits-spec:\n" 585"bits-spec:\n"
464" bit1,bit2 (flags & (bit1|bit2)) != 0\n" 586" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
465" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" 587" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
@@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str)
496 618
497static void parse_pid(const char *str) 619static void parse_pid(const char *str)
498{ 620{
621 FILE *file;
622 char buf[5000];
623
499 opt_pid = parse_number(str); 624 opt_pid = parse_number(str);
500}
501 625
502static void parse_file(const char *name) 626 sprintf(buf, "/proc/%d/pagemap", opt_pid);
503{ 627 pagemap_fd = open(buf, O_RDONLY);
628 if (pagemap_fd < 0) {
629 perror(buf);
630 exit(EXIT_FAILURE);
631 }
632
633 sprintf(buf, "/proc/%d/maps", opt_pid);
634 file = fopen(buf, "r");
635 if (!file) {
636 perror(buf);
637 exit(EXIT_FAILURE);
638 }
639
640 while (fgets(buf, sizeof(buf), file) != NULL) {
641 unsigned long vm_start;
642 unsigned long vm_end;
643 unsigned long long pgoff;
644 int major, minor;
645 char r, w, x, s;
646 unsigned long ino;
647 int n;
648
649 n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
650 &vm_start,
651 &vm_end,
652 &r, &w, &x, &s,
653 &pgoff,
654 &major, &minor,
655 &ino);
656 if (n < 10) {
657 fprintf(stderr, "unexpected line: %s\n", buf);
658 continue;
659 }
660 pg_start[nr_vmas] = vm_start / page_size;
661 pg_end[nr_vmas] = vm_end / page_size;
662 if (++nr_vmas >= MAX_VMAS) {
663 fprintf(stderr, "too many VMAs\n");
664 break;
665 }
666 }
667 fclose(file);
504} 668}
505 669
506static void add_addr_range(unsigned long offset, unsigned long size) 670static void parse_file(const char *name)
507{ 671{
508 if (nr_addr_ranges >= MAX_ADDR_RANGES)
509 fatal("too much addr ranges\n");
510
511 opt_offset[nr_addr_ranges] = offset;
512 opt_size[nr_addr_ranges] = size;
513 nr_addr_ranges++;
514} 672}
515 673
516static void parse_addr_range(const char *optarg) 674static void parse_addr_range(const char *optarg)
@@ -676,8 +834,10 @@ int main(int argc, char *argv[])
676 } 834 }
677 } 835 }
678 836
837 if (opt_list && opt_pid)
838 printf("voffset\t");
679 if (opt_list == 1) 839 if (opt_list == 1)
680 printf("offset\tcount\tflags\n"); 840 printf("offset\tlen\tflags\n");
681 if (opt_list == 2) 841 if (opt_list == 2)
682 printf("offset\tflags\n"); 842 printf("offset\tflags\n");
683 843
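
A stand-alone sketch of the pagemap decoding the new page-types code performs: look up one virtual address of a process and print the PFN if the page is present. The command-line handling is illustrative only:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

#define PM_ENTRY_BYTES   sizeof(uint64_t)
#define PM_STATUS_OFFSET (64 - 3)
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - 6)
#define PM_PFRAME_MASK   ((1ULL << PM_PSHIFT_OFFSET) - 1)
#define PM_PRESENT       (4ULL << PM_STATUS_OFFSET)

int main(int argc, char **argv)
{
	uint64_t entry;
	unsigned long vaddr;
	long psize = sysconf(_SC_PAGESIZE);
	char path[64];
	int fd;

	if (argc < 3)
		return 1;
	vaddr = strtoul(argv[2], NULL, 0);

	snprintf(path, sizeof(path), "/proc/%s/pagemap", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0 || pread(fd, &entry, PM_ENTRY_BYTES,
			    (off_t)(vaddr / psize) * PM_ENTRY_BYTES) != PM_ENTRY_BYTES) {
		perror(path);
		return 1;
	}
	close(fd);

	if (entry & PM_PRESENT)
		printf("0x%lx -> pfn 0x%llx\n", vaddr,
		       (unsigned long long)(entry & PM_PFRAME_MASK));
	else
		printf("0x%lx not present\n", vaddr);
	return 0;
}
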
diff --git a/MAINTAINERS b/MAINTAINERS
index 7c1c0b05b298..0c138ba86526 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2331,7 +2331,9 @@ S: Orphan
2331F: drivers/hwmon/ 2331F: drivers/hwmon/
2332 2332
2333HARDWARE RANDOM NUMBER GENERATOR CORE 2333HARDWARE RANDOM NUMBER GENERATOR CORE
2334S: Orphan 2334M: Matt Mackall <mpm@selenic.com>
2335M: Herbert Xu <herbert@gondor.apana.org.au>
2336S: Odd fixes
2335F: Documentation/hw_random.txt 2337F: Documentation/hw_random.txt
2336F: drivers/char/hw_random/ 2338F: drivers/char/hw_random/
2337F: include/linux/hw_random.h 2339F: include/linux/hw_random.h
diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h
index 25da0017ec87..e42823e954aa 100644
--- a/arch/alpha/include/asm/fcntl.h
+++ b/arch/alpha/include/asm/fcntl.h
@@ -26,6 +26,8 @@
26#define F_GETOWN 6 /* for sockets. */ 26#define F_GETOWN 6 /* for sockets. */
27#define F_SETSIG 10 /* for sockets. */ 27#define F_SETSIG 10 /* for sockets. */
28#define F_GETSIG 11 /* for sockets. */ 28#define F_GETSIG 11 /* for sockets. */
29#define F_SETOWN_EX 12
30#define F_GETOWN_EX 13
29 31
30/* for posix fcntl() and lockf() */ 32/* for posix fcntl() and lockf() */
31#define F_RDLCK 1 33#define F_RDLCK 1
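
A minimal sketch of what the new F_SETOWN_EX command enables: directing SIGIO to a single thread instead of the whole process. It assumes a libc that exposes struct f_owner_ex and F_OWNER_TID (the alpha command values are the 12/13 added above; asm-generic uses different numbers):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

static int own_fd_as_this_thread(int fd)
{
	struct f_owner_ex owner = {
		.type	= F_OWNER_TID,
		.pid	= syscall(SYS_gettid),	/* this thread, not the process */
	};

	return fcntl(fd, F_SETOWN_EX, &owner);
}
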
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index e302daecbe56..8e059e58b0ac 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -1016,7 +1016,7 @@ marvel_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *m
1016{ 1016{
1017 struct marvel_agp_aperture *aper = agp->aperture.sysdata; 1017 struct marvel_agp_aperture *aper = agp->aperture.sysdata;
1018 return iommu_bind(aper->arena, aper->pg_start + pg_start, 1018 return iommu_bind(aper->arena, aper->pg_start + pg_start,
1019 mem->page_count, mem->memory); 1019 mem->page_count, mem->pages);
1020} 1020}
1021 1021
1022static int 1022static int
diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c
index 319fcb74611e..76686497b1e2 100644
--- a/arch/alpha/kernel/core_titan.c
+++ b/arch/alpha/kernel/core_titan.c
@@ -680,7 +680,7 @@ titan_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *me
680{ 680{
681 struct titan_agp_aperture *aper = agp->aperture.sysdata; 681 struct titan_agp_aperture *aper = agp->aperture.sysdata;
682 return iommu_bind(aper->arena, aper->pg_start + pg_start, 682 return iommu_bind(aper->arena, aper->pg_start + pg_start,
683 mem->page_count, mem->memory); 683 mem->page_count, mem->pages);
684} 684}
685 685
686static int 686static int
diff --git a/arch/alpha/kernel/pci_impl.h b/arch/alpha/kernel/pci_impl.h
index 00edd04b585e..85457b2d4516 100644
--- a/arch/alpha/kernel/pci_impl.h
+++ b/arch/alpha/kernel/pci_impl.h
@@ -198,7 +198,7 @@ extern unsigned long size_for_memory(unsigned long max);
198 198
199extern int iommu_reserve(struct pci_iommu_arena *, long, long); 199extern int iommu_reserve(struct pci_iommu_arena *, long, long);
200extern int iommu_release(struct pci_iommu_arena *, long, long); 200extern int iommu_release(struct pci_iommu_arena *, long, long);
201extern int iommu_bind(struct pci_iommu_arena *, long, long, unsigned long *); 201extern int iommu_bind(struct pci_iommu_arena *, long, long, struct page **);
202extern int iommu_unbind(struct pci_iommu_arena *, long, long); 202extern int iommu_unbind(struct pci_iommu_arena *, long, long);
203 203
204 204
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index d15aedfe6066..8449504f5e0b 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -876,7 +876,7 @@ iommu_release(struct pci_iommu_arena *arena, long pg_start, long pg_count)
876 876
877int 877int
878iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count, 878iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count,
879 unsigned long *physaddrs) 879 struct page **pages)
880{ 880{
881 unsigned long flags; 881 unsigned long flags;
882 unsigned long *ptes; 882 unsigned long *ptes;
@@ -896,7 +896,7 @@ iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count,
896 } 896 }
897 897
898 for(i = 0, j = pg_start; i < pg_count; i++, j++) 898 for(i = 0, j = pg_start; i < pg_count; i++, j++)
899 ptes[j] = mk_iommu_pte(physaddrs[i]); 899 ptes[j] = mk_iommu_pte(page_to_phys(pages[i]));
900 900
901 spin_unlock_irqrestore(&arena->lock, flags); 901 spin_unlock_irqrestore(&arena->lock, flags);
902 902
diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h
index 83e6ba338e2c..1a8c7279a28b 100644
--- a/arch/arm/include/asm/hardware/iop3xx-adma.h
+++ b/arch/arm/include/asm/hardware/iop3xx-adma.h
@@ -187,11 +187,74 @@ union iop3xx_desc {
187 void *ptr; 187 void *ptr;
188}; 188};
189 189
190/* No support for p+q operations */
191static inline int
192iop_chan_pq_slot_count(size_t len, int src_cnt, int *slots_per_op)
193{
194 BUG();
195 return 0;
196}
197
198static inline void
199iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
200 unsigned long flags)
201{
202 BUG();
203}
204
205static inline void
206iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
207{
208 BUG();
209}
210
211static inline void
212iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
213 dma_addr_t addr, unsigned char coef)
214{
215 BUG();
216}
217
218static inline int
219iop_chan_pq_zero_sum_slot_count(size_t len, int src_cnt, int *slots_per_op)
220{
221 BUG();
222 return 0;
223}
224
225static inline void
226iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
227 unsigned long flags)
228{
229 BUG();
230}
231
232static inline void
233iop_desc_set_pq_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
234{
235 BUG();
236}
237
238#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
239
240static inline void
241iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
242 dma_addr_t *src)
243{
244 BUG();
245}
246
190static inline int iop_adma_get_max_xor(void) 247static inline int iop_adma_get_max_xor(void)
191{ 248{
192 return 32; 249 return 32;
193} 250}
194 251
252static inline int iop_adma_get_max_pq(void)
253{
254 BUG();
255 return 0;
256}
257
195static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan) 258static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
196{ 259{
197 int id = chan->device->id; 260 int id = chan->device->id;
@@ -332,6 +395,11 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt,
332 return slot_cnt; 395 return slot_cnt;
333} 396}
334 397
398static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
399{
400 return 0;
401}
402
335static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc, 403static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
336 struct iop_adma_chan *chan) 404 struct iop_adma_chan *chan)
337{ 405{
@@ -349,6 +417,14 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
349 return 0; 417 return 0;
350} 418}
351 419
420
421static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
422 struct iop_adma_chan *chan)
423{
424 BUG();
425 return 0;
426}
427
352static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc, 428static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
353 struct iop_adma_chan *chan) 429 struct iop_adma_chan *chan)
354{ 430{
@@ -756,13 +832,14 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
756 hw_desc->src[0] = val; 832 hw_desc->src[0] = val;
757} 833}
758 834
759static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) 835static inline enum sum_check_flags
836iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
760{ 837{
761 struct iop3xx_desc_aau *hw_desc = desc->hw_desc; 838 struct iop3xx_desc_aau *hw_desc = desc->hw_desc;
762 struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field; 839 struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
763 840
764 iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en)); 841 iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en));
765 return desc_ctrl.zero_result_err; 842 return desc_ctrl.zero_result_err << SUM_CHECK_P;
766} 843}
767 844
768static inline void iop_chan_append(struct iop_adma_chan *chan) 845static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/arch/arm/include/asm/hardware/iop_adma.h b/arch/arm/include/asm/hardware/iop_adma.h
index 385c6e8cbbd2..59b8c3892f76 100644
--- a/arch/arm/include/asm/hardware/iop_adma.h
+++ b/arch/arm/include/asm/hardware/iop_adma.h
@@ -86,6 +86,7 @@ struct iop_adma_chan {
86 * @idx: pool index 86 * @idx: pool index
87 * @unmap_src_cnt: number of xor sources 87 * @unmap_src_cnt: number of xor sources
88 * @unmap_len: transaction bytecount 88 * @unmap_len: transaction bytecount
89 * @tx_list: list of descriptors that are associated with one operation
89 * @async_tx: support for the async_tx api 90 * @async_tx: support for the async_tx api
90 * @group_list: list of slots that make up a multi-descriptor transaction 91 * @group_list: list of slots that make up a multi-descriptor transaction
91 * for example transfer lengths larger than the supported hw max 92 * for example transfer lengths larger than the supported hw max
@@ -102,10 +103,12 @@ struct iop_adma_desc_slot {
102 u16 idx; 103 u16 idx;
103 u16 unmap_src_cnt; 104 u16 unmap_src_cnt;
104 size_t unmap_len; 105 size_t unmap_len;
106 struct list_head tx_list;
105 struct dma_async_tx_descriptor async_tx; 107 struct dma_async_tx_descriptor async_tx;
106 union { 108 union {
107 u32 *xor_check_result; 109 u32 *xor_check_result;
108 u32 *crc32_result; 110 u32 *crc32_result;
111 u32 *pq_check_result;
109 }; 112 };
110}; 113};
111 114
diff --git a/arch/arm/mach-iop13xx/include/mach/adma.h b/arch/arm/mach-iop13xx/include/mach/adma.h
index 5722e86f2174..6d3782d85a9f 100644
--- a/arch/arm/mach-iop13xx/include/mach/adma.h
+++ b/arch/arm/mach-iop13xx/include/mach/adma.h
@@ -150,6 +150,8 @@ static inline int iop_adma_get_max_xor(void)
150 return 16; 150 return 16;
151} 151}
152 152
153#define iop_adma_get_max_pq iop_adma_get_max_xor
154
153static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan) 155static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
154{ 156{
155 return __raw_readl(ADMA_ADAR(chan)); 157 return __raw_readl(ADMA_ADAR(chan));
@@ -211,7 +213,10 @@ iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op)
211#define IOP_ADMA_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT 213#define IOP_ADMA_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
212#define IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT 214#define IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
213#define IOP_ADMA_XOR_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT 215#define IOP_ADMA_XOR_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
216#define IOP_ADMA_PQ_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
214#define iop_chan_zero_sum_slot_count(l, s, o) iop_chan_xor_slot_count(l, s, o) 217#define iop_chan_zero_sum_slot_count(l, s, o) iop_chan_xor_slot_count(l, s, o)
218#define iop_chan_pq_slot_count iop_chan_xor_slot_count
219#define iop_chan_pq_zero_sum_slot_count iop_chan_xor_slot_count
215 220
216static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc, 221static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
217 struct iop_adma_chan *chan) 222 struct iop_adma_chan *chan)
@@ -220,6 +225,13 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
220 return hw_desc->dest_addr; 225 return hw_desc->dest_addr;
221} 226}
222 227
228static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
229 struct iop_adma_chan *chan)
230{
231 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
232 return hw_desc->q_dest_addr;
233}
234
223static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc, 235static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
224 struct iop_adma_chan *chan) 236 struct iop_adma_chan *chan)
225{ 237{
@@ -319,6 +331,58 @@ iop_desc_init_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
319 return 1; 331 return 1;
320} 332}
321 333
334static inline void
335iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
336 unsigned long flags)
337{
338 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
339 union {
340 u32 value;
341 struct iop13xx_adma_desc_ctrl field;
342 } u_desc_ctrl;
343
344 u_desc_ctrl.value = 0;
345 u_desc_ctrl.field.src_select = src_cnt - 1;
346 u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
347 u_desc_ctrl.field.pq_xfer_en = 1;
348 u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
349 u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
350 hw_desc->desc_ctrl = u_desc_ctrl.value;
351}
352
353static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
354{
355 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
356 union {
357 u32 value;
358 struct iop13xx_adma_desc_ctrl field;
359 } u_desc_ctrl;
360
361 u_desc_ctrl.value = hw_desc->desc_ctrl;
362 return u_desc_ctrl.field.pq_xfer_en;
363}
364
365static inline void
366iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
367 unsigned long flags)
368{
369 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
370 union {
371 u32 value;
372 struct iop13xx_adma_desc_ctrl field;
373 } u_desc_ctrl;
374
375 u_desc_ctrl.value = 0;
376 u_desc_ctrl.field.src_select = src_cnt - 1;
377 u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
378 u_desc_ctrl.field.zero_result = 1;
379 u_desc_ctrl.field.status_write_back_en = 1;
380 u_desc_ctrl.field.pq_xfer_en = 1;
381 u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
382 u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
383 hw_desc->desc_ctrl = u_desc_ctrl.value;
384}
385
322static inline void iop_desc_set_byte_count(struct iop_adma_desc_slot *desc, 386static inline void iop_desc_set_byte_count(struct iop_adma_desc_slot *desc,
323 struct iop_adma_chan *chan, 387 struct iop_adma_chan *chan,
324 u32 byte_count) 388 u32 byte_count)
@@ -351,6 +415,7 @@ iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
351 } 415 }
352} 416}
353 417
418#define iop_desc_set_pq_zero_sum_byte_count iop_desc_set_zero_sum_byte_count
354 419
355static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc, 420static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
356 struct iop_adma_chan *chan, 421 struct iop_adma_chan *chan,
@@ -361,6 +426,16 @@ static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
361 hw_desc->upper_dest_addr = 0; 426 hw_desc->upper_dest_addr = 0;
362} 427}
363 428
429static inline void
430iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
431{
432 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
433
434 hw_desc->dest_addr = addr[0];
435 hw_desc->q_dest_addr = addr[1];
436 hw_desc->upper_dest_addr = 0;
437}
438
364static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc, 439static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc,
365 dma_addr_t addr) 440 dma_addr_t addr)
366{ 441{
@@ -389,6 +464,29 @@ static inline void iop_desc_set_xor_src_addr(struct iop_adma_desc_slot *desc,
389} 464}
390 465
391static inline void 466static inline void
467iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
468 dma_addr_t addr, unsigned char coef)
469{
470 int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op;
471 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc, *iter;
472 struct iop13xx_adma_src *src;
473 int i = 0;
474
475 do {
476 iter = iop_hw_desc_slot_idx(hw_desc, i);
477 src = &iter->src[src_idx];
478 src->src_addr = addr;
479 src->pq_upper_src_addr = 0;
480 src->pq_dmlt = coef;
481 slot_cnt -= slots_per_op;
482 if (slot_cnt) {
483 i += slots_per_op;
484 addr += IOP_ADMA_PQ_MAX_BYTE_COUNT;
485 }
486 } while (slot_cnt);
487}
488
489static inline void
392iop_desc_init_interrupt(struct iop_adma_desc_slot *desc, 490iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
393 struct iop_adma_chan *chan) 491 struct iop_adma_chan *chan)
394{ 492{
@@ -399,6 +497,15 @@ iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
399} 497}
400 498
401#define iop_desc_set_zero_sum_src_addr iop_desc_set_xor_src_addr 499#define iop_desc_set_zero_sum_src_addr iop_desc_set_xor_src_addr
500#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
501
502static inline void
503iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
504 dma_addr_t *src)
505{
506 iop_desc_set_xor_src_addr(desc, pq_idx, src[pq_idx]);
507 iop_desc_set_xor_src_addr(desc, pq_idx+1, src[pq_idx+1]);
508}
402 509
403static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc, 510static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc,
404 u32 next_desc_addr) 511 u32 next_desc_addr)
@@ -428,18 +535,20 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
428 hw_desc->block_fill_data = val; 535 hw_desc->block_fill_data = val;
429} 536}
430 537
431static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) 538static inline enum sum_check_flags
539iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
432{ 540{
433 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; 541 struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
434 struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field; 542 struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
435 struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field; 543 struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field;
544 enum sum_check_flags flags;
436 545
437 BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result)); 546 BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result));
438 547
439 if (desc_ctrl.pq_xfer_en) 548 flags = byte_count.zero_result_err_q << SUM_CHECK_Q;
440 return byte_count.zero_result_err_q; 549 flags |= byte_count.zero_result_err << SUM_CHECK_P;
441 else 550
442 return byte_count.zero_result_err; 551 return flags;
443} 552}
444 553
445static inline void iop_chan_append(struct iop_adma_chan *chan) 554static inline void iop_chan_append(struct iop_adma_chan *chan)
diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index bee42c609df6..5c147fb66a01 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -477,10 +477,8 @@ void __init iop13xx_platform_init(void)
477 plat_data = &iop13xx_adma_0_data; 477 plat_data = &iop13xx_adma_0_data;
478 dma_cap_set(DMA_MEMCPY, plat_data->cap_mask); 478 dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
479 dma_cap_set(DMA_XOR, plat_data->cap_mask); 479 dma_cap_set(DMA_XOR, plat_data->cap_mask);
480 dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask); 480 dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
481 dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
482 dma_cap_set(DMA_MEMSET, plat_data->cap_mask); 481 dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
483 dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
484 dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); 482 dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
485 break; 483 break;
486 case IOP13XX_INIT_ADMA_1: 484 case IOP13XX_INIT_ADMA_1:
@@ -489,10 +487,8 @@ void __init iop13xx_platform_init(void)
489 plat_data = &iop13xx_adma_1_data; 487 plat_data = &iop13xx_adma_1_data;
490 dma_cap_set(DMA_MEMCPY, plat_data->cap_mask); 488 dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
491 dma_cap_set(DMA_XOR, plat_data->cap_mask); 489 dma_cap_set(DMA_XOR, plat_data->cap_mask);
492 dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask); 490 dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
493 dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
494 dma_cap_set(DMA_MEMSET, plat_data->cap_mask); 491 dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
495 dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
496 dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); 492 dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
497 break; 493 break;
498 case IOP13XX_INIT_ADMA_2: 494 case IOP13XX_INIT_ADMA_2:
@@ -501,14 +497,11 @@ void __init iop13xx_platform_init(void)
501 plat_data = &iop13xx_adma_2_data; 497 plat_data = &iop13xx_adma_2_data;
502 dma_cap_set(DMA_MEMCPY, plat_data->cap_mask); 498 dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
503 dma_cap_set(DMA_XOR, plat_data->cap_mask); 499 dma_cap_set(DMA_XOR, plat_data->cap_mask);
504 dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask); 500 dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
505 dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
506 dma_cap_set(DMA_MEMSET, plat_data->cap_mask); 501 dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
507 dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
508 dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); 502 dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
509 dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask); 503 dma_cap_set(DMA_PQ, plat_data->cap_mask);
510 dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask); 504 dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
511 dma_cap_set(DMA_PQ_ZERO_SUM, plat_data->cap_mask);
512 break; 505 break;
513 } 506 }
514 } 507 }
diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c
index 3c127aabe214..1ff6a37e893c 100644
--- a/arch/arm/plat-iop/adma.c
+++ b/arch/arm/plat-iop/adma.c
@@ -179,7 +179,6 @@ static int __init iop3xx_adma_cap_init(void)
179 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask); 179 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
180 #else 180 #else
181 dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask); 181 dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask);
182 dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_0_data.cap_mask);
183 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask); 182 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
184 #endif 183 #endif
185 184
@@ -188,7 +187,6 @@ static int __init iop3xx_adma_cap_init(void)
188 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask); 187 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
189 #else 188 #else
190 dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask); 189 dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask);
191 dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_1_data.cap_mask);
192 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask); 190 dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
193 #endif 191 #endif
194 192
@@ -198,7 +196,7 @@ static int __init iop3xx_adma_cap_init(void)
198 dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask); 196 dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
199 #else 197 #else
200 dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask); 198 dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask);
201 dma_cap_set(DMA_ZERO_SUM, iop3xx_aau_data.cap_mask); 199 dma_cap_set(DMA_XOR_VAL, iop3xx_aau_data.cap_mask);
202 dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask); 200 dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask);
203 dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask); 201 dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
204 #endif 202 #endif
diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c
index be722fc1acff..0d4d3e3a4cfc 100644
--- a/arch/frv/kernel/pm.c
+++ b/arch/frv/kernel/pm.c
@@ -150,7 +150,7 @@ static int user_atoi(char __user *ubuf, size_t len)
150/* 150/*
151 * Send us to sleep. 151 * Send us to sleep.
152 */ 152 */
153static int sysctl_pm_do_suspend(ctl_table *ctl, int write, struct file *filp, 153static int sysctl_pm_do_suspend(ctl_table *ctl, int write,
154 void __user *buffer, size_t *lenp, loff_t *fpos) 154 void __user *buffer, size_t *lenp, loff_t *fpos)
155{ 155{
156 int retval, mode; 156 int retval, mode;
@@ -198,13 +198,13 @@ static int try_set_cmode(int new_cmode)
198} 198}
199 199
200 200
201static int cmode_procctl(ctl_table *ctl, int write, struct file *filp, 201static int cmode_procctl(ctl_table *ctl, int write,
202 void __user *buffer, size_t *lenp, loff_t *fpos) 202 void __user *buffer, size_t *lenp, loff_t *fpos)
203{ 203{
204 int new_cmode; 204 int new_cmode;
205 205
206 if (!write) 206 if (!write)
207 return proc_dointvec(ctl, write, filp, buffer, lenp, fpos); 207 return proc_dointvec(ctl, write, buffer, lenp, fpos);
208 208
209 new_cmode = user_atoi(buffer, *lenp); 209 new_cmode = user_atoi(buffer, *lenp);
210 210
@@ -301,13 +301,13 @@ static int try_set_cm(int new_cm)
301 return 0; 301 return 0;
302} 302}
303 303
304static int p0_procctl(ctl_table *ctl, int write, struct file *filp, 304static int p0_procctl(ctl_table *ctl, int write,
305 void __user *buffer, size_t *lenp, loff_t *fpos) 305 void __user *buffer, size_t *lenp, loff_t *fpos)
306{ 306{
307 int new_p0; 307 int new_p0;
308 308
309 if (!write) 309 if (!write)
310 return proc_dointvec(ctl, write, filp, buffer, lenp, fpos); 310 return proc_dointvec(ctl, write, buffer, lenp, fpos);
311 311
312 new_p0 = user_atoi(buffer, *lenp); 312 new_p0 = user_atoi(buffer, *lenp);
313 313
@@ -345,13 +345,13 @@ static int p0_sysctl(ctl_table *table,
345 return 1; 345 return 1;
346} 346}
347 347
348static int cm_procctl(ctl_table *ctl, int write, struct file *filp, 348static int cm_procctl(ctl_table *ctl, int write,
349 void __user *buffer, size_t *lenp, loff_t *fpos) 349 void __user *buffer, size_t *lenp, loff_t *fpos)
350{ 350{
351 int new_cm; 351 int new_cm;
352 352
353 if (!write) 353 if (!write)
354 return proc_dointvec(ctl, write, filp, buffer, lenp, fpos); 354 return proc_dointvec(ctl, write, buffer, lenp, fpos);
355 355
356 new_cm = user_atoi(buffer, *lenp); 356 new_cm = user_atoi(buffer, *lenp);
357 357
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index 3f04d4c406b7..b3deed8db619 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -56,12 +56,12 @@ int sysctl_lasatstring(ctl_table *table,
56 56
57 57
58/* And the same for proc */ 58/* And the same for proc */
59int proc_dolasatstring(ctl_table *table, int write, struct file *filp, 59int proc_dolasatstring(ctl_table *table, int write,
60 void *buffer, size_t *lenp, loff_t *ppos) 60 void *buffer, size_t *lenp, loff_t *ppos)
61{ 61{
62 int r; 62 int r;
63 63
64 r = proc_dostring(table, write, filp, buffer, lenp, ppos); 64 r = proc_dostring(table, write, buffer, lenp, ppos);
65 if ((!write) || r) 65 if ((!write) || r)
66 return r; 66 return r;
67 67
@@ -71,12 +71,12 @@ int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
71} 71}
72 72
73/* proc function to write EEPROM after changing int entry */ 73/* proc function to write EEPROM after changing int entry */
74int proc_dolasatint(ctl_table *table, int write, struct file *filp, 74int proc_dolasatint(ctl_table *table, int write,
75 void *buffer, size_t *lenp, loff_t *ppos) 75 void *buffer, size_t *lenp, loff_t *ppos)
76{ 76{
77 int r; 77 int r;
78 78
79 r = proc_dointvec(table, write, filp, buffer, lenp, ppos); 79 r = proc_dointvec(table, write, buffer, lenp, ppos);
80 if ((!write) || r) 80 if ((!write) || r)
81 return r; 81 return r;
82 82
@@ -89,7 +89,7 @@ int proc_dolasatint(ctl_table *table, int write, struct file *filp,
89static int rtctmp; 89static int rtctmp;
90 90
91/* proc function to read/write RealTime Clock */ 91/* proc function to read/write RealTime Clock */
92int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, 92int proc_dolasatrtc(ctl_table *table, int write,
93 void *buffer, size_t *lenp, loff_t *ppos) 93 void *buffer, size_t *lenp, loff_t *ppos)
94{ 94{
95 struct timespec ts; 95 struct timespec ts;
@@ -102,7 +102,7 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
102 if (rtctmp < 0) 102 if (rtctmp < 0)
103 rtctmp = 0; 103 rtctmp = 0;
104 } 104 }
105 r = proc_dointvec(table, write, filp, buffer, lenp, ppos); 105 r = proc_dointvec(table, write, buffer, lenp, ppos);
106 if (r) 106 if (r)
107 return r; 107 return r;
108 108
@@ -154,7 +154,7 @@ int sysctl_lasat_rtc(ctl_table *table,
154#endif 154#endif
155 155
156#ifdef CONFIG_INET 156#ifdef CONFIG_INET
157int proc_lasat_ip(ctl_table *table, int write, struct file *filp, 157int proc_lasat_ip(ctl_table *table, int write,
158 void *buffer, size_t *lenp, loff_t *ppos) 158 void *buffer, size_t *lenp, loff_t *ppos)
159{ 159{
160 unsigned int ip; 160 unsigned int ip;
@@ -231,12 +231,12 @@ static int sysctl_lasat_prid(ctl_table *table,
231 return 0; 231 return 0;
232} 232}
233 233
234int proc_lasat_prid(ctl_table *table, int write, struct file *filp, 234int proc_lasat_prid(ctl_table *table, int write,
235 void *buffer, size_t *lenp, loff_t *ppos) 235 void *buffer, size_t *lenp, loff_t *ppos)
236{ 236{
237 int r; 237 int r;
238 238
239 r = proc_dointvec(table, write, filp, buffer, lenp, ppos); 239 r = proc_dointvec(table, write, buffer, lenp, ppos);
240 if (r < 0) 240 if (r < 0)
241 return r; 241 return r;
242 if (write) { 242 if (write) {
diff --git a/arch/parisc/include/asm/fcntl.h b/arch/parisc/include/asm/fcntl.h
index 1e1c824764ee..5f39d5597ced 100644
--- a/arch/parisc/include/asm/fcntl.h
+++ b/arch/parisc/include/asm/fcntl.h
@@ -28,6 +28,8 @@
28#define F_SETOWN 12 /* for sockets. */ 28#define F_SETOWN 12 /* for sockets. */
29#define F_SETSIG 13 /* for sockets. */ 29#define F_SETSIG 13 /* for sockets. */
30#define F_GETSIG 14 /* for sockets. */ 30#define F_GETSIG 14 /* for sockets. */
31#define F_GETOWN_EX 15
32#define F_SETOWN_EX 16
31 33
32/* for posix fcntl() and lockf() */ 34/* for posix fcntl() and lockf() */
33#define F_RDLCK 01 35#define F_RDLCK 01
diff --git a/arch/powerpc/include/asm/fsldma.h b/arch/powerpc/include/asm/fsldma.h
new file mode 100644
index 000000000000..a67aeed17d40
--- /dev/null
+++ b/arch/powerpc/include/asm/fsldma.h
@@ -0,0 +1,136 @@
1/*
2 * Freescale MPC83XX / MPC85XX DMA Controller
3 *
4 * Copyright (c) 2009 Ira W. Snyder <iws@ovro.caltech.edu>
5 *
6 * This file is licensed under the terms of the GNU General Public License
7 * version 2. This program is licensed "as is" without any warranty of any
8 * kind, whether express or implied.
9 */
10
11#ifndef __ARCH_POWERPC_ASM_FSLDMA_H__
12#define __ARCH_POWERPC_ASM_FSLDMA_H__
13
14#include <linux/dmaengine.h>
15
16/*
 17 * Definitions for the Freescale DMA controller's DMA_SLAVE implementation
18 *
19 * The Freescale DMA_SLAVE implementation was designed to handle many-to-many
20 * transfers. An example usage would be an accelerated copy between two
21 * scatterlists. Another example use would be an accelerated copy from
22 * multiple non-contiguous device buffers into a single scatterlist.
23 *
24 * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This
25 * structure contains a list of hardware addresses that should be copied
26 * to/from the scatterlist passed into device_prep_slave_sg(). The structure
27 * also has some fields to enable hardware-specific features.
28 */
29
30/**
31 * struct fsl_dma_hw_addr
32 * @entry: linked list entry
33 * @address: the hardware address
34 * @length: length to transfer
35 *
36 * Holds a single physical hardware address / length pair for use
37 * with the DMAEngine DMA_SLAVE API.
38 */
39struct fsl_dma_hw_addr {
40 struct list_head entry;
41
42 dma_addr_t address;
43 size_t length;
44};
45
46/**
47 * struct fsl_dma_slave
48 * @addresses: a linked list of struct fsl_dma_hw_addr structures
49 * @request_count: value for DMA request count
50 * @src_loop_size: setup and enable constant source-address DMA transfers
51 * @dst_loop_size: setup and enable constant destination address DMA transfers
52 * @external_start: enable externally started DMA transfers
53 * @external_pause: enable externally paused DMA transfers
54 *
55 * Holds a list of address / length pairs for use with the DMAEngine
56 * DMA_SLAVE API implementation for the Freescale DMA controller.
57 */
58struct fsl_dma_slave {
59
60 /* List of hardware address/length pairs */
61 struct list_head addresses;
62
63 /* Support for extra controller features */
64 unsigned int request_count;
65 unsigned int src_loop_size;
66 unsigned int dst_loop_size;
67 bool external_start;
68 bool external_pause;
69};
70
71/**
72 * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave
73 * @slave: the &struct fsl_dma_slave to add to
74 * @address: the hardware address to add
75 * @length: the length of bytes to transfer from @address
76 *
77 * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on
78 * success, -ERRNO otherwise.
79 */
80static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave,
81 dma_addr_t address, size_t length)
82{
83 struct fsl_dma_hw_addr *addr;
84
85 addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
86 if (!addr)
87 return -ENOMEM;
88
89 INIT_LIST_HEAD(&addr->entry);
90 addr->address = address;
91 addr->length = length;
92
93 list_add_tail(&addr->entry, &slave->addresses);
94 return 0;
95}
96
97/**
98 * fsl_dma_slave_free - free a struct fsl_dma_slave
99 * @slave: the struct fsl_dma_slave to free
100 *
101 * Free a struct fsl_dma_slave and all associated address/length pairs
102 */
103static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave)
104{
105 struct fsl_dma_hw_addr *addr, *tmp;
106
107 if (slave) {
108 list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) {
109 list_del(&addr->entry);
110 kfree(addr);
111 }
112
113 kfree(slave);
114 }
115}
116
117/**
118 * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave
119 * @gfp: the flags to pass to kmalloc when allocating this structure
120 *
121 * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new
122 * struct fsl_dma_slave on success, or NULL on failure.
123 */
124static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp)
125{
126 struct fsl_dma_slave *slave;
127
128 slave = kzalloc(sizeof(*slave), gfp);
129 if (!slave)
130 return NULL;
131
132 INIT_LIST_HEAD(&slave->addresses);
133 return slave;
134}
135
136#endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */
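
[Editor's note] As a usage illustration of the helpers defined in this new header, a sketch only: the device addresses below are made up, and how the filled-in structure reaches the driver (e.g. through the channel's private pointer) is an assumption rather than something this header defines.

#include <linux/slab.h>
#include <asm/fsldma.h>

/* sketch: build an address list for one slave transfer, then tear it down */
static int example_build_slave_list(void)
{
	struct fsl_dma_slave *slave;

	slave = fsl_dma_slave_alloc(GFP_KERNEL);
	if (!slave)
		return -ENOMEM;

	/* two hypothetical device FIFO windows, copied back to back */
	if (fsl_dma_slave_append(slave, 0xf0000000, 4096) ||
	    fsl_dma_slave_append(slave, 0xf0001000, 4096)) {
		fsl_dma_slave_free(slave);
		return -ENOMEM;
	}

	/* optional controller features; everything else stays zeroed */
	slave->request_count = 1;

	/*
	 * A real client would now hand 'slave' to the fsldma driver
	 * (e.g. via the channel's private pointer -- an assumption here)
	 * and call device_prep_slave_sg() with its scatterlist.
	 */

	fsl_dma_slave_free(slave);
	return 0;
}
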
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 264528e4f58d..b55fd7ed1c31 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -50,10 +50,9 @@ static struct platform_device *appldata_pdev;
50 * /proc entries (sysctl) 50 * /proc entries (sysctl)
51 */ 51 */
52static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; 52static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
53static int appldata_timer_handler(ctl_table *ctl, int write, struct file *filp, 53static int appldata_timer_handler(ctl_table *ctl, int write,
54 void __user *buffer, size_t *lenp, loff_t *ppos); 54 void __user *buffer, size_t *lenp, loff_t *ppos);
55static int appldata_interval_handler(ctl_table *ctl, int write, 55static int appldata_interval_handler(ctl_table *ctl, int write,
56 struct file *filp,
57 void __user *buffer, 56 void __user *buffer,
58 size_t *lenp, loff_t *ppos); 57 size_t *lenp, loff_t *ppos);
59 58
@@ -247,7 +246,7 @@ __appldata_vtimer_setup(int cmd)
247 * Start/Stop timer, show status of timer (0 = not active, 1 = active) 246 * Start/Stop timer, show status of timer (0 = not active, 1 = active)
248 */ 247 */
249static int 248static int
250appldata_timer_handler(ctl_table *ctl, int write, struct file *filp, 249appldata_timer_handler(ctl_table *ctl, int write,
251 void __user *buffer, size_t *lenp, loff_t *ppos) 250 void __user *buffer, size_t *lenp, loff_t *ppos)
252{ 251{
253 int len; 252 int len;
@@ -289,7 +288,7 @@ out:
289 * current timer interval. 288 * current timer interval.
290 */ 289 */
291static int 290static int
292appldata_interval_handler(ctl_table *ctl, int write, struct file *filp, 291appldata_interval_handler(ctl_table *ctl, int write,
293 void __user *buffer, size_t *lenp, loff_t *ppos) 292 void __user *buffer, size_t *lenp, loff_t *ppos)
294{ 293{
295 int len, interval; 294 int len, interval;
@@ -335,7 +334,7 @@ out:
335 * monitoring (0 = not in process, 1 = in process) 334 * monitoring (0 = not in process, 1 = in process)
336 */ 335 */
337static int 336static int
338appldata_generic_handler(ctl_table *ctl, int write, struct file *filp, 337appldata_generic_handler(ctl_table *ctl, int write,
339 void __user *buffer, size_t *lenp, loff_t *ppos) 338 void __user *buffer, size_t *lenp, loff_t *ppos)
340{ 339{
341 struct appldata_ops *ops = NULL, *tmp_ops; 340 struct appldata_ops *ops = NULL, *tmp_ops;
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 4c512561687d..20f282c911c2 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -881,11 +881,11 @@ static int debug_active=1;
881 * if debug_active is already off 881 * if debug_active is already off
882 */ 882 */
883static int 883static int
884s390dbf_procactive(ctl_table *table, int write, struct file *filp, 884s390dbf_procactive(ctl_table *table, int write,
885 void __user *buffer, size_t *lenp, loff_t *ppos) 885 void __user *buffer, size_t *lenp, loff_t *ppos)
886{ 886{
887 if (!write || debug_stoppable || !debug_active) 887 if (!write || debug_stoppable || !debug_active)
888 return proc_dointvec(table, write, filp, buffer, lenp, ppos); 888 return proc_dointvec(table, write, buffer, lenp, ppos);
889 else 889 else
890 return 0; 890 return 0;
891} 891}
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 413c240cbca7..b201135cc18c 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -262,7 +262,7 @@ cmm_skip_blanks(char *cp, char **endp)
262static struct ctl_table cmm_table[]; 262static struct ctl_table cmm_table[];
263 263
264static int 264static int
265cmm_pages_handler(ctl_table *ctl, int write, struct file *filp, 265cmm_pages_handler(ctl_table *ctl, int write,
266 void __user *buffer, size_t *lenp, loff_t *ppos) 266 void __user *buffer, size_t *lenp, loff_t *ppos)
267{ 267{
268 char buf[16], *p; 268 char buf[16], *p;
@@ -303,7 +303,7 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
303} 303}
304 304
305static int 305static int
306cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp, 306cmm_timeout_handler(ctl_table *ctl, int write,
307 void __user *buffer, size_t *lenp, loff_t *ppos) 307 void __user *buffer, size_t *lenp, loff_t *ppos)
308{ 308{
309 char buf[64], *p; 309 char buf[64], *p;
diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig
index b91fa8dbf047..4d58eb0973d4 100644
--- a/arch/sh/drivers/dma/Kconfig
+++ b/arch/sh/drivers/dma/Kconfig
@@ -1,12 +1,9 @@
1menu "DMA support" 1menu "DMA support"
2 2
3config SH_DMA_API
4 bool
5 3
6config SH_DMA 4config SH_DMA
7 bool "SuperH on-chip DMA controller (DMAC) support" 5 bool "SuperH on-chip DMA controller (DMAC) support"
8 depends on CPU_SH3 || CPU_SH4 6 depends on CPU_SH3 || CPU_SH4
9 select SH_DMA_API
10 default n 7 default n
11 8
12config SH_DMA_IRQ_MULTI 9config SH_DMA_IRQ_MULTI
@@ -19,6 +16,15 @@ config SH_DMA_IRQ_MULTI
19 CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ 16 CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \
20 CPU_SUBTYPE_SH7760 17 CPU_SUBTYPE_SH7760
21 18
19config SH_DMA_API
20 depends on SH_DMA
21 bool "SuperH DMA API support"
22 default n
23 help
 24 SH_DMA_API enables the traditional SuperH DMA API.
 25 Do not enable it if you intend to use the dmaengine framework;
 26 in that case enable DMA_ENGINE and SH_DMAE instead.
27
22config NR_ONCHIP_DMA_CHANNELS 28config NR_ONCHIP_DMA_CHANNELS
23 int 29 int
24 depends on SH_DMA 30 depends on SH_DMA
diff --git a/arch/sh/drivers/dma/Makefile b/arch/sh/drivers/dma/Makefile
index c6068137b46f..d88c9484762c 100644
--- a/arch/sh/drivers/dma/Makefile
+++ b/arch/sh/drivers/dma/Makefile
@@ -2,8 +2,7 @@
2# Makefile for the SuperH DMA specific kernel interface routines under Linux. 2# Makefile for the SuperH DMA specific kernel interface routines under Linux.
3# 3#
4 4
5obj-$(CONFIG_SH_DMA_API) += dma-api.o dma-sysfs.o 5obj-$(CONFIG_SH_DMA_API) += dma-sh.o dma-api.o dma-sysfs.o
6obj-$(CONFIG_SH_DMA) += dma-sh.o
7obj-$(CONFIG_PVR2_DMA) += dma-pvr2.o 6obj-$(CONFIG_PVR2_DMA) += dma-pvr2.o
8obj-$(CONFIG_G2_DMA) += dma-g2.o 7obj-$(CONFIG_G2_DMA) += dma-g2.o
9obj-$(CONFIG_SH_DMABRG) += dmabrg.o 8obj-$(CONFIG_SH_DMABRG) += dmabrg.o
diff --git a/arch/sh/include/asm/dma-sh.h b/arch/sh/include/asm/dma-sh.h
index 68a5f4cb0343..78eed3e0bdf5 100644
--- a/arch/sh/include/asm/dma-sh.h
+++ b/arch/sh/include/asm/dma-sh.h
@@ -116,4 +116,17 @@ static u32 dma_base_addr[] __maybe_unused = {
116#define CHCR 0x0C 116#define CHCR 0x0C
117#define DMAOR 0x40 117#define DMAOR 0x40
118 118
119/*
120 * for dma engine
121 *
122 * SuperH DMA mode
123 */
124#define SHDMA_MIX_IRQ (1 << 1)
125#define SHDMA_DMAOR1 (1 << 2)
126#define SHDMA_DMAE1 (1 << 3)
127
128struct sh_dmae_pdata {
129 unsigned int mode;
130};
131
119#endif /* __DMA_SH_H */ 132#endif /* __DMA_SH_H */
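
[Editor's note] The new sh_dmae_pdata structure is presumably filled in by board code as platform data for the shdma engine driver; a minimal, hypothetical example:

#include <asm/dma-sh.h>

/* hypothetical board file fragment: route DMAC interrupts through one mixed IRQ */
static struct sh_dmae_pdata sh_dmae_platform_data = {
	.mode = SHDMA_MIX_IRQ,
};
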
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index e63cf7d441e1..139d4c1a33a7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog;
40#define NMI_INVALID 3 40#define NMI_INVALID 3
41 41
42struct ctl_table; 42struct ctl_table;
43struct file; 43extern int proc_nmi_enabled(struct ctl_table *, int ,
44extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
45 void __user *, size_t *, loff_t *); 44 void __user *, size_t *, loff_t *);
46extern int unknown_nmi_panic; 45extern int unknown_nmi_panic;
47 46
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index cb66a22d98ad..7ff61d6a188a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
508/* 508/*
509 * proc handler for /proc/sys/kernel/nmi 509 * proc handler for /proc/sys/kernel/nmi
510 */ 510 */
511int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, 511int proc_nmi_enabled(struct ctl_table *table, int write,
512 void __user *buffer, size_t *length, loff_t *ppos) 512 void __user *buffer, size_t *length, loff_t *ppos)
513{ 513{
514 int old_state; 514 int old_state;
515 515
516 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; 516 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
517 old_state = nmi_watchdog_enabled; 517 old_state = nmi_watchdog_enabled;
518 proc_dointvec(table, write, file, buffer, length, ppos); 518 proc_dointvec(table, write, buffer, length, ppos);
519 if (!!old_state == !!nmi_watchdog_enabled) 519 if (!!old_state == !!nmi_watchdog_enabled)
520 return 0; 520 return 0;
521 521
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index cf53a78e2dcf..8cb4974ff599 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
228} 228}
229 229
230#ifdef CONFIG_SYSCTL 230#ifdef CONFIG_SYSCTL
231
232static int
233vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
234 void __user *buffer, size_t *lenp, loff_t *ppos)
235{
236 return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
237}
238
239static ctl_table kernel_table2[] = { 231static ctl_table kernel_table2[] = {
240 { .procname = "vsyscall64", 232 { .procname = "vsyscall64",
241 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), 233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
242 .mode = 0644, 234 .mode = 0644,
243 .proc_handler = vsyscall_sysctl_change }, 235 .proc_handler = proc_dointvec },
244 {} 236 {}
245}; 237};
246 238
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82728f2c6d55..f4cee9028cf0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
167 info.si_errno = 0; 167 info.si_errno = 0;
168 info.si_code = si_code; 168 info.si_code = si_code;
169 info.si_addr = (void __user *)address; 169 info.si_addr = (void __user *)address;
170 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
170 171
171 force_sig_info(si_signo, &info, tsk); 172 force_sig_info(si_signo, &info, tsk);
172} 173}
@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
790} 791}
791 792
792static void 793static void
793do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) 794do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
795 unsigned int fault)
794{ 796{
795 struct task_struct *tsk = current; 797 struct task_struct *tsk = current;
796 struct mm_struct *mm = tsk->mm; 798 struct mm_struct *mm = tsk->mm;
799 int code = BUS_ADRERR;
797 800
798 up_read(&mm->mmap_sem); 801 up_read(&mm->mmap_sem);
799 802
@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
809 tsk->thread.error_code = error_code; 812 tsk->thread.error_code = error_code;
810 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
811 814
812 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 815#ifdef CONFIG_MEMORY_FAILURE
816 if (fault & VM_FAULT_HWPOISON) {
817 printk(KERN_ERR
818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
819 tsk->comm, tsk->pid, address);
820 code = BUS_MCEERR_AR;
821 }
822#endif
823 force_sig_info_fault(SIGBUS, code, address, tsk);
813} 824}
814 825
815static noinline void 826static noinline void
@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
819 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
820 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
821 } else { 832 } else {
822 if (fault & VM_FAULT_SIGBUS) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
823 do_sigbus(regs, error_code, address); 834 do_sigbus(regs, error_code, address, fault);
824 else 835 else
825 BUG(); 836 BUG();
826 } 837 }
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
index d8fb39145986..e5aeb2b79e6f 100644
--- a/crypto/async_tx/Kconfig
+++ b/crypto/async_tx/Kconfig
@@ -14,3 +14,12 @@ config ASYNC_MEMSET
14 tristate 14 tristate
15 select ASYNC_CORE 15 select ASYNC_CORE
16 16
17config ASYNC_PQ
18 tristate
19 select ASYNC_CORE
20
21config ASYNC_RAID6_RECOV
22 tristate
23 select ASYNC_CORE
24 select ASYNC_PQ
25
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
index 27baa7d52fbc..d1e0e6f72bc1 100644
--- a/crypto/async_tx/Makefile
+++ b/crypto/async_tx/Makefile
@@ -2,3 +2,6 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o
2obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o 2obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
3obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o 3obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
4obj-$(CONFIG_ASYNC_XOR) += async_xor.o 4obj-$(CONFIG_ASYNC_XOR) += async_xor.o
5obj-$(CONFIG_ASYNC_PQ) += async_pq.o
6obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o
7obj-$(CONFIG_ASYNC_RAID6_TEST) += raid6test.o
diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index ddccfb01c416..0ec1fb69d4ea 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -33,28 +33,31 @@
33 * async_memcpy - attempt to copy memory with a dma engine. 33 * async_memcpy - attempt to copy memory with a dma engine.
34 * @dest: destination page 34 * @dest: destination page
35 * @src: src page 35 * @src: src page
36 * @offset: offset in pages to start transaction 36 * @dest_offset: offset into 'dest' to start transaction
37 * @src_offset: offset into 'src' to start transaction
37 * @len: length in bytes 38 * @len: length in bytes
38 * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, 39 * @submit: submission / completion modifiers
39 * @depend_tx: memcpy depends on the result of this transaction 40 *
40 * @cb_fn: function to call when the memcpy completes 41 * honored flags: ASYNC_TX_ACK
41 * @cb_param: parameter to pass to the callback routine
42 */ 42 */
43struct dma_async_tx_descriptor * 43struct dma_async_tx_descriptor *
44async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, 44async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
45 unsigned int src_offset, size_t len, enum async_tx_flags flags, 45 unsigned int src_offset, size_t len,
46 struct dma_async_tx_descriptor *depend_tx, 46 struct async_submit_ctl *submit)
47 dma_async_tx_callback cb_fn, void *cb_param)
48{ 47{
49 struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY, 48 struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMCPY,
50 &dest, 1, &src, 1, len); 49 &dest, 1, &src, 1, len);
51 struct dma_device *device = chan ? chan->device : NULL; 50 struct dma_device *device = chan ? chan->device : NULL;
52 struct dma_async_tx_descriptor *tx = NULL; 51 struct dma_async_tx_descriptor *tx = NULL;
53 52
54 if (device) { 53 if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
55 dma_addr_t dma_dest, dma_src; 54 dma_addr_t dma_dest, dma_src;
56 unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; 55 unsigned long dma_prep_flags = 0;
57 56
57 if (submit->cb_fn)
58 dma_prep_flags |= DMA_PREP_INTERRUPT;
59 if (submit->flags & ASYNC_TX_FENCE)
60 dma_prep_flags |= DMA_PREP_FENCE;
58 dma_dest = dma_map_page(device->dev, dest, dest_offset, len, 61 dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
59 DMA_FROM_DEVICE); 62 DMA_FROM_DEVICE);
60 63
@@ -67,13 +70,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
67 70
68 if (tx) { 71 if (tx) {
69 pr_debug("%s: (async) len: %zu\n", __func__, len); 72 pr_debug("%s: (async) len: %zu\n", __func__, len);
70 async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); 73 async_tx_submit(chan, tx, submit);
71 } else { 74 } else {
72 void *dest_buf, *src_buf; 75 void *dest_buf, *src_buf;
73 pr_debug("%s: (sync) len: %zu\n", __func__, len); 76 pr_debug("%s: (sync) len: %zu\n", __func__, len);
74 77
75 /* wait for any prerequisite operations */ 78 /* wait for any prerequisite operations */
76 async_tx_quiesce(&depend_tx); 79 async_tx_quiesce(&submit->depend_tx);
77 80
78 dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset; 81 dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
79 src_buf = kmap_atomic(src, KM_USER1) + src_offset; 82 src_buf = kmap_atomic(src, KM_USER1) + src_offset;
@@ -83,26 +86,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
83 kunmap_atomic(dest_buf, KM_USER0); 86 kunmap_atomic(dest_buf, KM_USER0);
84 kunmap_atomic(src_buf, KM_USER1); 87 kunmap_atomic(src_buf, KM_USER1);
85 88
86 async_tx_sync_epilog(cb_fn, cb_param); 89 async_tx_sync_epilog(submit);
87 } 90 }
88 91
89 return tx; 92 return tx;
90} 93}
91EXPORT_SYMBOL_GPL(async_memcpy); 94EXPORT_SYMBOL_GPL(async_memcpy);
92 95
93static int __init async_memcpy_init(void)
94{
95 return 0;
96}
97
98static void __exit async_memcpy_exit(void)
99{
100 do { } while (0);
101}
102
103module_init(async_memcpy_init);
104module_exit(async_memcpy_exit);
105
106MODULE_AUTHOR("Intel Corporation"); 96MODULE_AUTHOR("Intel Corporation");
107MODULE_DESCRIPTION("asynchronous memcpy api"); 97MODULE_DESCRIPTION("asynchronous memcpy api");
108MODULE_LICENSE("GPL"); 98MODULE_LICENSE("GPL");
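
[Editor's note] With this change a caller bundles its flags, dependency and callback into a struct async_submit_ctl instead of passing them individually. A sketch, assuming the init_async_submit() helper introduced elsewhere in the same API rework (page names are illustrative):

#include <linux/async_tx.h>

/* sketch: copy one page with the reworked async_memcpy() interface */
static void example_copy_page(struct page *dest_page, struct page *src_page)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;

	/* no dependency, no completion callback, no scribble buffer */
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, NULL, NULL, NULL);

	tx = async_memcpy(dest_page, src_page, 0, 0, PAGE_SIZE, &submit);

	/* wait for the copy whether it ran on a DMA channel or synchronously */
	async_tx_quiesce(&tx);
}
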
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
index 5b5eb99bb244..58e4a8752aee 100644
--- a/crypto/async_tx/async_memset.c
+++ b/crypto/async_tx/async_memset.c
@@ -35,26 +35,26 @@
35 * @val: fill value 35 * @val: fill value
36 * @offset: offset in pages to start transaction 36 * @offset: offset in pages to start transaction
37 * @len: length in bytes 37 * @len: length in bytes
38 * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK 38 *
39 * @depend_tx: memset depends on the result of this transaction 39 * honored flags: ASYNC_TX_ACK
40 * @cb_fn: function to call when the memcpy completes
41 * @cb_param: parameter to pass to the callback routine
42 */ 40 */
43struct dma_async_tx_descriptor * 41struct dma_async_tx_descriptor *
44async_memset(struct page *dest, int val, unsigned int offset, 42async_memset(struct page *dest, int val, unsigned int offset, size_t len,
45 size_t len, enum async_tx_flags flags, 43 struct async_submit_ctl *submit)
46 struct dma_async_tx_descriptor *depend_tx,
47 dma_async_tx_callback cb_fn, void *cb_param)
48{ 44{
49 struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET, 45 struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET,
50 &dest, 1, NULL, 0, len); 46 &dest, 1, NULL, 0, len);
51 struct dma_device *device = chan ? chan->device : NULL; 47 struct dma_device *device = chan ? chan->device : NULL;
52 struct dma_async_tx_descriptor *tx = NULL; 48 struct dma_async_tx_descriptor *tx = NULL;
53 49
54 if (device) { 50 if (device && is_dma_fill_aligned(device, offset, 0, len)) {
55 dma_addr_t dma_dest; 51 dma_addr_t dma_dest;
56 unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; 52 unsigned long dma_prep_flags = 0;
57 53
54 if (submit->cb_fn)
55 dma_prep_flags |= DMA_PREP_INTERRUPT;
56 if (submit->flags & ASYNC_TX_FENCE)
57 dma_prep_flags |= DMA_PREP_FENCE;
58 dma_dest = dma_map_page(device->dev, dest, offset, len, 58 dma_dest = dma_map_page(device->dev, dest, offset, len,
59 DMA_FROM_DEVICE); 59 DMA_FROM_DEVICE);
60 60
@@ -64,38 +64,25 @@ async_memset(struct page *dest, int val, unsigned int offset,
64 64
65 if (tx) { 65 if (tx) {
66 pr_debug("%s: (async) len: %zu\n", __func__, len); 66 pr_debug("%s: (async) len: %zu\n", __func__, len);
67 async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); 67 async_tx_submit(chan, tx, submit);
68 } else { /* run the memset synchronously */ 68 } else { /* run the memset synchronously */
69 void *dest_buf; 69 void *dest_buf;
70 pr_debug("%s: (sync) len: %zu\n", __func__, len); 70 pr_debug("%s: (sync) len: %zu\n", __func__, len);
71 71
72 dest_buf = (void *) (((char *) page_address(dest)) + offset); 72 dest_buf = page_address(dest) + offset;
73 73
74 /* wait for any prerequisite operations */ 74 /* wait for any prerequisite operations */
75 async_tx_quiesce(&depend_tx); 75 async_tx_quiesce(&submit->depend_tx);
76 76
77 memset(dest_buf, val, len); 77 memset(dest_buf, val, len);
78 78
79 async_tx_sync_epilog(cb_fn, cb_param); 79 async_tx_sync_epilog(submit);
80 } 80 }
81 81
82 return tx; 82 return tx;
83} 83}
84EXPORT_SYMBOL_GPL(async_memset); 84EXPORT_SYMBOL_GPL(async_memset);
85 85
86static int __init async_memset_init(void)
87{
88 return 0;
89}
90
91static void __exit async_memset_exit(void)
92{
93 do { } while (0);
94}
95
96module_init(async_memset_init);
97module_exit(async_memset_exit);
98
99MODULE_AUTHOR("Intel Corporation"); 86MODULE_AUTHOR("Intel Corporation");
100MODULE_DESCRIPTION("asynchronous memset api"); 87MODULE_DESCRIPTION("asynchronous memset api");
101MODULE_LICENSE("GPL"); 88MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
new file mode 100644
index 000000000000..b88db6d1dc65
--- /dev/null
+++ b/crypto/async_tx/async_pq.c
@@ -0,0 +1,395 @@
1/*
2 * Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com>
3 * Copyright(c) 2009 Intel Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * The full GNU General Public License is included in this distribution in the
20 * file called COPYING.
21 */
22#include <linux/kernel.h>
23#include <linux/interrupt.h>
24#include <linux/dma-mapping.h>
25#include <linux/raid/pq.h>
26#include <linux/async_tx.h>
27
28/**
29 * scribble - space to hold throwaway P buffer for synchronous gen_syndrome
30 */
31static struct page *scribble;
32
33static bool is_raid6_zero_block(struct page *p)
34{
35 return p == (void *) raid6_empty_zero_page;
36}
37
38/* the struct page *blocks[] parameter passed to async_gen_syndrome()
39 * and async_syndrome_val() contains the 'P' destination address at
40 * blocks[disks-2] and the 'Q' destination address at blocks[disks-1]
41 *
42 * note: these are macros as they are used as lvalues
43 */
44#define P(b, d) (b[d-2])
45#define Q(b, d) (b[d-1])
46
47/**
48 * do_async_gen_syndrome - asynchronously calculate P and/or Q
49 */
50static __async_inline struct dma_async_tx_descriptor *
51do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
52 const unsigned char *scfs, unsigned int offset, int disks,
53 size_t len, dma_addr_t *dma_src,
54 struct async_submit_ctl *submit)
55{
56 struct dma_async_tx_descriptor *tx = NULL;
57 struct dma_device *dma = chan->device;
58 enum dma_ctrl_flags dma_flags = 0;
59 enum async_tx_flags flags_orig = submit->flags;
60 dma_async_tx_callback cb_fn_orig = submit->cb_fn;
61 dma_async_tx_callback cb_param_orig = submit->cb_param;
62 int src_cnt = disks - 2;
63 unsigned char coefs[src_cnt];
64 unsigned short pq_src_cnt;
65 dma_addr_t dma_dest[2];
66 int src_off = 0;
67 int idx;
68 int i;
69
70 /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
71 if (P(blocks, disks))
72 dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset,
73 len, DMA_BIDIRECTIONAL);
74 else
75 dma_flags |= DMA_PREP_PQ_DISABLE_P;
76 if (Q(blocks, disks))
77 dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset,
78 len, DMA_BIDIRECTIONAL);
79 else
80 dma_flags |= DMA_PREP_PQ_DISABLE_Q;
81
82 /* convert source addresses being careful to collapse 'empty'
83 * sources and update the coefficients accordingly
84 */
85 for (i = 0, idx = 0; i < src_cnt; i++) {
86 if (is_raid6_zero_block(blocks[i]))
87 continue;
88 dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len,
89 DMA_TO_DEVICE);
90 coefs[idx] = scfs[i];
91 idx++;
92 }
93 src_cnt = idx;
94
95 while (src_cnt > 0) {
96 submit->flags = flags_orig;
97 pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
98 /* if we are submitting additional pqs, leave the chain open,
99 * clear the callback parameters, and leave the destination
100 * buffers mapped
101 */
102 if (src_cnt > pq_src_cnt) {
103 submit->flags &= ~ASYNC_TX_ACK;
104 submit->flags |= ASYNC_TX_FENCE;
105 dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
106 submit->cb_fn = NULL;
107 submit->cb_param = NULL;
108 } else {
109 dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP;
110 submit->cb_fn = cb_fn_orig;
111 submit->cb_param = cb_param_orig;
112 if (cb_fn_orig)
113 dma_flags |= DMA_PREP_INTERRUPT;
114 }
115 if (submit->flags & ASYNC_TX_FENCE)
116 dma_flags |= DMA_PREP_FENCE;
117
118 /* Since we have clobbered the src_list we are committed
119 * to doing this asynchronously. Drivers force forward
120 * progress in case they can not provide a descriptor
121 */
122 for (;;) {
123 tx = dma->device_prep_dma_pq(chan, dma_dest,
124 &dma_src[src_off],
125 pq_src_cnt,
126 &coefs[src_off], len,
127 dma_flags);
128 if (likely(tx))
129 break;
130 async_tx_quiesce(&submit->depend_tx);
131 dma_async_issue_pending(chan);
132 }
133
134 async_tx_submit(chan, tx, submit);
135 submit->depend_tx = tx;
136
137 /* drop completed sources */
138 src_cnt -= pq_src_cnt;
139 src_off += pq_src_cnt;
140
141 dma_flags |= DMA_PREP_CONTINUE;
142 }
143
144 return tx;
145}
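For reference, a worked example of the splitting loop above (the figures are illustrative, not taken from any particular engine): if dma_maxpq() reports 8 and 13 non-empty sources remain after the zero-block collapse, the first pass issues a descriptor over sources 0..7 with the callback cleared and destination unmapping skipped; the second pass then runs with DMA_PREP_CONTINUE set, where dma_maxpq(dma, dma_flags) may report a smaller limit on engines that must re-read the partial P/Q as extra sources, and submits the remaining 5 sources as a continuation that accumulates into the same P/Q pages. Only the final descriptor carries the caller's callback and performs the destination unmap.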
146
147/**
148 * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome
149 */
150static void
151do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
152 size_t len, struct async_submit_ctl *submit)
153{
154 void **srcs;
155 int i;
156
157 if (submit->scribble)
158 srcs = submit->scribble;
159 else
160 srcs = (void **) blocks;
161
162 for (i = 0; i < disks; i++) {
163 if (is_raid6_zero_block(blocks[i])) {
164 BUG_ON(i > disks - 3); /* P or Q can't be zero */
165 srcs[i] = blocks[i];
166 } else
167 srcs[i] = page_address(blocks[i]) + offset;
168 }
169 raid6_call.gen_syndrome(disks, len, srcs);
170 async_tx_sync_epilog(submit);
171}
172
173/**
174 * async_gen_syndrome - asynchronously calculate a raid6 syndrome
175 * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
176 * @offset: common offset into each block (src and dest) to start transaction
177 * @disks: number of blocks (including missing P or Q, see below)
178 * @len: length of operation in bytes
179 * @submit: submission/completion modifiers
180 *
181 * General note: This routine assumes a field of GF(2^8) with a
182 * primitive polynomial of 0x11d and a generator of {02}.
183 *
184 * 'disks' note: callers can optionally omit either P or Q (but not
185 * both) from the calculation by setting blocks[disks-2] or
186 * blocks[disks-1] to NULL. When P or Q is omitted 'len' must be <=
187 * PAGE_SIZE as a temporary buffer of this size is used in the
188 * synchronous path. 'disks' always accounts for both destination
189 * buffers.
190 *
191 * 'blocks' note: if submit->scribble is NULL then the contents of
192 * 'blocks' may be overwritten
193 */
194struct dma_async_tx_descriptor *
195async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
196 size_t len, struct async_submit_ctl *submit)
197{
198 int src_cnt = disks - 2;
199 struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
200 &P(blocks, disks), 2,
201 blocks, src_cnt, len);
202 struct dma_device *device = chan ? chan->device : NULL;
203 dma_addr_t *dma_src = NULL;
204
205 BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
206
207 if (submit->scribble)
208 dma_src = submit->scribble;
209 else if (sizeof(dma_addr_t) <= sizeof(struct page *))
210 dma_src = (dma_addr_t *) blocks;
211
212 if (dma_src && device &&
213 (src_cnt <= dma_maxpq(device, 0) ||
214 dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
215 is_dma_pq_aligned(device, offset, 0, len)) {
216 /* run the p+q asynchronously */
217 pr_debug("%s: (async) disks: %d len: %zu\n",
218 __func__, disks, len);
219 return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset,
220 disks, len, dma_src, submit);
221 }
222
223 /* run the pq synchronously */
224 pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len);
225
226 /* wait for any prerequisite operations */
227 async_tx_quiesce(&submit->depend_tx);
228
229 if (!P(blocks, disks)) {
230 P(blocks, disks) = scribble;
231 BUG_ON(len + offset > PAGE_SIZE);
232 }
233 if (!Q(blocks, disks)) {
234 Q(blocks, disks) = scribble;
235 BUG_ON(len + offset > PAGE_SIZE);
236 }
237 do_sync_gen_syndrome(blocks, offset, disks, len, submit);
238
239 return NULL;
240}
241EXPORT_SYMBOL_GPL(async_gen_syndrome);
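As a concrete reference for the field parameters quoted in the kernel-doc above (GF(2^8), primitive polynomial 0x11d, generator {02}), the following minimal user-space sketch shows the per-byte arithmetic that gen_syndrome performs. It is illustrative only and not part of this patch; the file name pq_demo.c and the three sample bytes are made up.

/* pq_demo.c - build with: cc -o pq_demo pq_demo.c */
#include <stdio.h>
#include <stdint.h>

/* multiply two GF(2^8) elements modulo the RAID-6 polynomial 0x11d */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b) {
		if (b & 1)
			p ^= a;
		/* a *= x, reducing by x^8 = x^4 + x^3 + x^2 + 1 (0x1d) on overflow */
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
		b >>= 1;
	}
	return p;
}

int main(void)
{
	uint8_t d[3] = { 0xde, 0xad, 0xbe };	/* one byte from each of three data disks */
	uint8_t p = 0, q = 0, g = 1;		/* g walks the powers of the generator {02} */
	int i;

	for (i = 0; i < 3; i++) {
		p ^= d[i];			/* P = D0 ^ D1 ^ D2 */
		q ^= gf_mul(g, d[i]);		/* Q = g^0*D0 ^ g^1*D1 ^ g^2*D2 */
		g = gf_mul(g, 2);
	}
	printf("P = %02x, Q = %02x\n", p, q);
	return 0;
}

The scfs[] coefficients passed to do_async_gen_syndrome() play the role of the g^i weights in this sketch; async_gen_syndrome() passes raid6_gfexp, i.e. the generator's power table.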
242
243/**
244 * async_syndrome_val - asynchronously validate a raid6 syndrome
245 * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
246 * @offset: common offset into each block (src and dest) to start transaction
247 * @disks: number of blocks (including missing P or Q, see below)
248 * @len: length of operation in bytes
249 * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set
250 * @spare: temporary result buffer for the synchronous case
251 * @submit: submission / completion modifiers
252 *
253 * The same notes from async_gen_syndrome apply to the 'blocks',
254 * and 'disks' parameters of this routine. The synchronous path
255 * requires a temporary result buffer and submit->scribble to be
256 * specified.
257 */
258struct dma_async_tx_descriptor *
259async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
260 size_t len, enum sum_check_flags *pqres, struct page *spare,
261 struct async_submit_ctl *submit)
262{
263 struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ_VAL,
264 NULL, 0, blocks, disks,
265 len);
266 struct dma_device *device = chan ? chan->device : NULL;
267 struct dma_async_tx_descriptor *tx;
268 enum dma_ctrl_flags dma_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
269 dma_addr_t *dma_src = NULL;
270
271 BUG_ON(disks < 4);
272
273 if (submit->scribble)
274 dma_src = submit->scribble;
275 else if (sizeof(dma_addr_t) <= sizeof(struct page *))
276 dma_src = (dma_addr_t *) blocks;
277
278 if (dma_src && device && disks <= dma_maxpq(device, 0) &&
279 is_dma_pq_aligned(device, offset, 0, len)) {
280 struct device *dev = device->dev;
281 dma_addr_t *pq = &dma_src[disks-2];
282 int i;
283
284 pr_debug("%s: (async) disks: %d len: %zu\n",
285 __func__, disks, len);
286 if (!P(blocks, disks))
287 dma_flags |= DMA_PREP_PQ_DISABLE_P;
288 if (!Q(blocks, disks))
289 dma_flags |= DMA_PREP_PQ_DISABLE_Q;
290 if (submit->flags & ASYNC_TX_FENCE)
291 dma_flags |= DMA_PREP_FENCE;
292 for (i = 0; i < disks; i++)
293 if (likely(blocks[i])) {
294 BUG_ON(is_raid6_zero_block(blocks[i]));
295 dma_src[i] = dma_map_page(dev, blocks[i],
296 offset, len,
297 DMA_TO_DEVICE);
298 }
299
300 for (;;) {
301 tx = device->device_prep_dma_pq_val(chan, pq, dma_src,
302 disks - 2,
303 raid6_gfexp,
304 len, pqres,
305 dma_flags);
306 if (likely(tx))
307 break;
308 async_tx_quiesce(&submit->depend_tx);
309 dma_async_issue_pending(chan);
310 }
311 async_tx_submit(chan, tx, submit);
312
313 return tx;
314 } else {
315 struct page *p_src = P(blocks, disks);
316 struct page *q_src = Q(blocks, disks);
317 enum async_tx_flags flags_orig = submit->flags;
318 dma_async_tx_callback cb_fn_orig = submit->cb_fn;
319 void *scribble = submit->scribble;
320 void *cb_param_orig = submit->cb_param;
321 void *p, *q, *s;
322
323 pr_debug("%s: (sync) disks: %d len: %zu\n",
324 __func__, disks, len);
325
326 /* caller must provide a temporary result buffer and
327 * allow the input parameters to be preserved
328 */
329 BUG_ON(!spare || !scribble);
330
331 /* wait for any prerequisite operations */
332 async_tx_quiesce(&submit->depend_tx);
333
334 /* recompute p and/or q into the temporary buffer and then
335 * check to see the result matches the current value
336 */
337 tx = NULL;
338 *pqres = 0;
339 if (p_src) {
340 init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
341 NULL, NULL, scribble);
342 tx = async_xor(spare, blocks, offset, disks-2, len, submit);
343 async_tx_quiesce(&tx);
344 p = page_address(p_src) + offset;
345 s = page_address(spare) + offset;
346 *pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
347 }
348
349 if (q_src) {
350 P(blocks, disks) = NULL;
351 Q(blocks, disks) = spare;
352 init_async_submit(submit, 0, NULL, NULL, NULL, scribble);
353 tx = async_gen_syndrome(blocks, offset, disks, len, submit);
354 async_tx_quiesce(&tx);
355 q = page_address(q_src) + offset;
356 s = page_address(spare) + offset;
357 *pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
358 }
359
360 /* restore P, Q and submit */
361 P(blocks, disks) = p_src;
362 Q(blocks, disks) = q_src;
363
364 submit->cb_fn = cb_fn_orig;
365 submit->cb_param = cb_param_orig;
366 submit->flags = flags_orig;
367 async_tx_sync_epilog(submit);
368
369 return NULL;
370 }
371}
372EXPORT_SYMBOL_GPL(async_syndrome_val);
373
374static int __init async_pq_init(void)
375{
376 scribble = alloc_page(GFP_KERNEL);
377
378 if (scribble)
379 return 0;
380
381 pr_err("%s: failed to allocate required spare page\n", __func__);
382
383 return -ENOMEM;
384}
385
386static void __exit async_pq_exit(void)
387{
388 put_page(scribble);
389}
390
391module_init(async_pq_init);
392module_exit(async_pq_exit);
393
394MODULE_DESCRIPTION("asynchronous raid6 syndrome generation/validation");
395MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
new file mode 100644
index 000000000000..6d73dde4786d
--- /dev/null
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -0,0 +1,468 @@
1/*
2 * Asynchronous RAID-6 recovery calculations ASYNC_TX API.
3 * Copyright(c) 2009 Intel Corporation
4 *
5 * based on raid6recov.c:
6 * Copyright 2002 H. Peter Anvin
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2 of the License, or (at your option)
11 * any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 * more details.
17 *
18 * You should have received a copy of the GNU General Public License along with
19 * this program; if not, write to the Free Software Foundation, Inc., 51
20 * Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23#include <linux/kernel.h>
24#include <linux/interrupt.h>
25#include <linux/dma-mapping.h>
26#include <linux/raid/pq.h>
27#include <linux/async_tx.h>
28
29static struct dma_async_tx_descriptor *
30async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
31 size_t len, struct async_submit_ctl *submit)
32{
33 struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
34 &dest, 1, srcs, 2, len);
35 struct dma_device *dma = chan ? chan->device : NULL;
36 const u8 *amul, *bmul;
37 u8 ax, bx;
38 u8 *a, *b, *c;
39
40 if (dma) {
41 dma_addr_t dma_dest[2];
42 dma_addr_t dma_src[2];
43 struct device *dev = dma->dev;
44 struct dma_async_tx_descriptor *tx;
45 enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
46
47 if (submit->flags & ASYNC_TX_FENCE)
48 dma_flags |= DMA_PREP_FENCE;
49 dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
50 dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
51 dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
52 tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef,
53 len, dma_flags);
54 if (tx) {
55 async_tx_submit(chan, tx, submit);
56 return tx;
57 }
58
59 /* could not get a descriptor, unmap and fall through to
60 * the synchronous path
61 */
62 dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
63 dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
64 dma_unmap_page(dev, dma_src[1], len, DMA_TO_DEVICE);
65 }
66
67 /* run the operation synchronously */
68 async_tx_quiesce(&submit->depend_tx);
69 amul = raid6_gfmul[coef[0]];
70 bmul = raid6_gfmul[coef[1]];
71 a = page_address(srcs[0]);
72 b = page_address(srcs[1]);
73 c = page_address(dest);
74
75 while (len--) {
76 ax = amul[*a++];
77 bx = bmul[*b++];
78 *c++ = ax ^ bx;
79 }
80
81 return NULL;
82}
83
84static struct dma_async_tx_descriptor *
85async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
86 struct async_submit_ctl *submit)
87{
88 struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
89 &dest, 1, &src, 1, len);
90 struct dma_device *dma = chan ? chan->device : NULL;
91 const u8 *qmul; /* Q multiplier table */
92 u8 *d, *s;
93
94 if (dma) {
95 dma_addr_t dma_dest[2];
96 dma_addr_t dma_src[1];
97 struct device *dev = dma->dev;
98 struct dma_async_tx_descriptor *tx;
99 enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
100
101 if (submit->flags & ASYNC_TX_FENCE)
102 dma_flags |= DMA_PREP_FENCE;
103 dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
104 dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
105 tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
106 len, dma_flags);
107 if (tx) {
108 async_tx_submit(chan, tx, submit);
109 return tx;
110 }
111
112 /* could not get a descriptor, unmap and fall through to
113 * the synchronous path
114 */
115 dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
116 dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
117 }
118
119 /* no channel available, or failed to allocate a descriptor, so
120 * perform the operation synchronously
121 */
122 async_tx_quiesce(&submit->depend_tx);
123 qmul = raid6_gfmul[coef];
124 d = page_address(dest);
125 s = page_address(src);
126
127 while (len--)
128 *d++ = qmul[*s++];
129
130 return NULL;
131}
132
133static struct dma_async_tx_descriptor *
134__2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks,
135 struct async_submit_ctl *submit)
136{
137 struct dma_async_tx_descriptor *tx = NULL;
138 struct page *p, *q, *a, *b;
139 struct page *srcs[2];
140 unsigned char coef[2];
141 enum async_tx_flags flags = submit->flags;
142 dma_async_tx_callback cb_fn = submit->cb_fn;
143 void *cb_param = submit->cb_param;
144 void *scribble = submit->scribble;
145
146 p = blocks[4-2];
147 q = blocks[4-1];
148
149 a = blocks[faila];
150 b = blocks[failb];
151
152 /* in the 4 disk case P + Pxy == P and Q + Qxy == Q */
153 /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
154 srcs[0] = p;
155 srcs[1] = q;
156 coef[0] = raid6_gfexi[failb-faila];
157 coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
158 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
159 tx = async_sum_product(b, srcs, coef, bytes, submit);
160
161 /* Dy = P+Pxy+Dx */
162 srcs[0] = p;
163 srcs[1] = b;
164 init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn,
165 cb_param, scribble);
166 tx = async_xor(a, srcs, 0, 2, bytes, submit);
167
168 return tx;
169
170}
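For reference, the algebra behind the Dx/Dy comments in this routine (and in the 5-disk and n-disk variants below), restated in the same notation rather than taken from the patch itself: with faila < failb, g the generator {02}, and Pxy/Qxy the syndromes computed with zero substituted for the two missing blocks, the surviving information gives

	P + Pxy = Dx + Dy
	Q + Qxy = g^failb * Dx + g^faila * Dy

where Dx is the block async_sum_product() regenerates into the failb slot and Dy the one the final async_xor() produces in the faila slot. Solving the pair over GF(2^8):

	Dx = A*(P+Pxy) + B*(Q+Qxy),	A = 1 / (g^(failb-faila) + 1)
					B = 1 / (g^faila + g^failb)
	Dy = (P+Pxy) + Dx

which is what coef[0] = raid6_gfexi[failb-faila] and coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]] encode (raid6_gfexi[] holds inverses of g^i + 1, raid6_gfinv[] plain inverses). In the 4 disk case handled here Pxy and Qxy are identically zero, so P and Q feed async_sum_product() directly.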
171
172static struct dma_async_tx_descriptor *
173__2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks,
174 struct async_submit_ctl *submit)
175{
176 struct dma_async_tx_descriptor *tx = NULL;
177 struct page *p, *q, *g, *dp, *dq;
178 struct page *srcs[2];
179 unsigned char coef[2];
180 enum async_tx_flags flags = submit->flags;
181 dma_async_tx_callback cb_fn = submit->cb_fn;
182 void *cb_param = submit->cb_param;
183 void *scribble = submit->scribble;
184 int uninitialized_var(good);
185 int i;
186
187 for (i = 0; i < 3; i++) {
188 if (i == faila || i == failb)
189 continue;
190 else {
191 good = i;
192 break;
193 }
194 }
195 BUG_ON(i >= 3);
196
197 p = blocks[5-2];
198 q = blocks[5-1];
199 g = blocks[good];
200
201 /* Compute syndrome with zero for the missing data pages
202 * Use the dead data pages as temporary storage for delta p and
203 * delta q
204 */
205 dp = blocks[faila];
206 dq = blocks[failb];
207
208 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
209 tx = async_memcpy(dp, g, 0, 0, bytes, submit);
210 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
211 tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
212
213 /* compute P + Pxy */
214 srcs[0] = dp;
215 srcs[1] = p;
216 init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
217 NULL, NULL, scribble);
218 tx = async_xor(dp, srcs, 0, 2, bytes, submit);
219
220 /* compute Q + Qxy */
221 srcs[0] = dq;
222 srcs[1] = q;
223 init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
224 NULL, NULL, scribble);
225 tx = async_xor(dq, srcs, 0, 2, bytes, submit);
226
227 /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
228 srcs[0] = dp;
229 srcs[1] = dq;
230 coef[0] = raid6_gfexi[failb-faila];
231 coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
232 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
233 tx = async_sum_product(dq, srcs, coef, bytes, submit);
234
235 /* Dy = P+Pxy+Dx */
236 srcs[0] = dp;
237 srcs[1] = dq;
238 init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
239 cb_param, scribble);
240 tx = async_xor(dp, srcs, 0, 2, bytes, submit);
241
242 return tx;
243}
244
245static struct dma_async_tx_descriptor *
246__2data_recov_n(int disks, size_t bytes, int faila, int failb,
247 struct page **blocks, struct async_submit_ctl *submit)
248{
249 struct dma_async_tx_descriptor *tx = NULL;
250 struct page *p, *q, *dp, *dq;
251 struct page *srcs[2];
252 unsigned char coef[2];
253 enum async_tx_flags flags = submit->flags;
254 dma_async_tx_callback cb_fn = submit->cb_fn;
255 void *cb_param = submit->cb_param;
256 void *scribble = submit->scribble;
257
258 p = blocks[disks-2];
259 q = blocks[disks-1];
260
261 /* Compute syndrome with zero for the missing data pages
262 * Use the dead data pages as temporary storage for
263 * delta p and delta q
264 */
265 dp = blocks[faila];
266 blocks[faila] = (void *)raid6_empty_zero_page;
267 blocks[disks-2] = dp;
268 dq = blocks[failb];
269 blocks[failb] = (void *)raid6_empty_zero_page;
270 blocks[disks-1] = dq;
271
272 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
273 tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
274
275 /* Restore pointer table */
276 blocks[faila] = dp;
277 blocks[failb] = dq;
278 blocks[disks-2] = p;
279 blocks[disks-1] = q;
280
281 /* compute P + Pxy */
282 srcs[0] = dp;
283 srcs[1] = p;
284 init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
285 NULL, NULL, scribble);
286 tx = async_xor(dp, srcs, 0, 2, bytes, submit);
287
288 /* compute Q + Qxy */
289 srcs[0] = dq;
290 srcs[1] = q;
291 init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
292 NULL, NULL, scribble);
293 tx = async_xor(dq, srcs, 0, 2, bytes, submit);
294
295 /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
296 srcs[0] = dp;
297 srcs[1] = dq;
298 coef[0] = raid6_gfexi[failb-faila];
299 coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
300 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
301 tx = async_sum_product(dq, srcs, coef, bytes, submit);
302
303 /* Dy = P+Pxy+Dx */
304 srcs[0] = dp;
305 srcs[1] = dq;
306 init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
307 cb_param, scribble);
308 tx = async_xor(dp, srcs, 0, 2, bytes, submit);
309
310 return tx;
311}
312
313/**
314 * async_raid6_2data_recov - asynchronously calculate two missing data blocks
315 * @disks: number of disks in the RAID-6 array
316 * @bytes: block size
317 * @faila: first failed drive index
318 * @failb: second failed drive index
319 * @blocks: array of source pointers where the last two entries are p and q
320 * @submit: submission/completion modifiers
321 */
322struct dma_async_tx_descriptor *
323async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
324 struct page **blocks, struct async_submit_ctl *submit)
325{
326 BUG_ON(faila == failb);
327 if (failb < faila)
328 swap(faila, failb);
329
330 pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
331
332 /* we need to preserve the contents of 'blocks' for the async
333 * case, so punt to synchronous if a scribble buffer is not available
334 */
335 if (!submit->scribble) {
336 void **ptrs = (void **) blocks;
337 int i;
338
339 async_tx_quiesce(&submit->depend_tx);
340 for (i = 0; i < disks; i++)
341 ptrs[i] = page_address(blocks[i]);
342
343 raid6_2data_recov(disks, bytes, faila, failb, ptrs);
344
345 async_tx_sync_epilog(submit);
346
347 return NULL;
348 }
349
350 switch (disks) {
351 case 4:
352 /* dma devices do not uniformly understand a zero source pq
353 * operation (in contrast to the synchronous case), so
354 * explicitly handle the 4 disk special case
355 */
356 return __2data_recov_4(bytes, faila, failb, blocks, submit);
357 case 5:
358 /* dma devices do not uniformly understand a single
359 * source pq operation (in contrast to the synchronous
360 * case), so explicitly handle the 5 disk special case
361 */
362 return __2data_recov_5(bytes, faila, failb, blocks, submit);
363 default:
364 return __2data_recov_n(disks, bytes, faila, failb, blocks, submit);
365 }
366}
367EXPORT_SYMBOL_GPL(async_raid6_2data_recov);
368
369/**
370 * async_raid6_datap_recov - asynchronously calculate a data and the 'p' block
371 * @disks: number of disks in the RAID-6 array
372 * @bytes: block size
373 * @faila: failed drive index
374 * @blocks: array of source pointers where the last two entries are p and q
375 * @submit: submission/completion modifiers
376 */
377struct dma_async_tx_descriptor *
378async_raid6_datap_recov(int disks, size_t bytes, int faila,
379 struct page **blocks, struct async_submit_ctl *submit)
380{
381 struct dma_async_tx_descriptor *tx = NULL;
382 struct page *p, *q, *dq;
383 u8 coef;
384 enum async_tx_flags flags = submit->flags;
385 dma_async_tx_callback cb_fn = submit->cb_fn;
386 void *cb_param = submit->cb_param;
387 void *scribble = submit->scribble;
388 struct page *srcs[2];
389
390 pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
391
392 /* we need to preserve the contents of 'blocks' for the async
393 * case, so punt to synchronous if a scribble buffer is not available
394 */
395 if (!scribble) {
396 void **ptrs = (void **) blocks;
397 int i;
398
399 async_tx_quiesce(&submit->depend_tx);
400 for (i = 0; i < disks; i++)
401 ptrs[i] = page_address(blocks[i]);
402
403 raid6_datap_recov(disks, bytes, faila, ptrs);
404
405 async_tx_sync_epilog(submit);
406
407 return NULL;
408 }
409
410 p = blocks[disks-2];
411 q = blocks[disks-1];
412
413 /* Compute syndrome with zero for the missing data page
414 * Use the dead data page as temporary storage for delta q
415 */
416 dq = blocks[faila];
417 blocks[faila] = (void *)raid6_empty_zero_page;
418 blocks[disks-1] = dq;
419
420 /* in the 4 disk case we only need to perform a single source
421 * multiplication
422 */
423 if (disks == 4) {
424 int good = faila == 0 ? 1 : 0;
425 struct page *g = blocks[good];
426
427 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
428 scribble);
429 tx = async_memcpy(p, g, 0, 0, bytes, submit);
430
431 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
432 scribble);
433 tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
434 } else {
435 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
436 scribble);
437 tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
438 }
439
440 /* Restore pointer table */
441 blocks[faila] = dq;
442 blocks[disks-1] = q;
443
444 /* calculate g^{-faila} */
445 coef = raid6_gfinv[raid6_gfexp[faila]];
446
447 srcs[0] = dq;
448 srcs[1] = q;
449 init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
450 NULL, NULL, scribble);
451 tx = async_xor(dq, srcs, 0, 2, bytes, submit);
452
453 init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
454 tx = async_mult(dq, dq, coef, bytes, submit);
455
456 srcs[0] = p;
457 srcs[1] = dq;
458 init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
459 cb_param, scribble);
460 tx = async_xor(p, srcs, 0, 2, bytes, submit);
461
462 return tx;
463}
464EXPORT_SYMBOL_GPL(async_raid6_datap_recov);
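The same style of restatement for the data+P case above, with Dx standing for the missing data block: regenerating the syndrome with a zero page in the faila slot leaves Pxy in the P block and Qxy in dq, and the surviving Q gives

	Q + Qxy = g^faila * Dx   =>   Dx = g^(-faila) * (Q + Qxy)
	P = Pxy + Dx

so the async_mult() coefficient raid6_gfinv[raid6_gfexp[faila]] is g^(-faila), and the closing async_xor() folds the recovered block into the freshly computed Pxy to restore P.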
465
466MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
467MODULE_DESCRIPTION("asynchronous RAID-6 recovery api");
468MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
index 06eb6cc09fef..f9cdf04fe7c0 100644
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -42,16 +42,21 @@ static void __exit async_tx_exit(void)
42 async_dmaengine_put(); 42 async_dmaengine_put();
43} 43}
44 44
45module_init(async_tx_init);
46module_exit(async_tx_exit);
47
45/** 48/**
46 * __async_tx_find_channel - find a channel to carry out the operation or let 49 * __async_tx_find_channel - find a channel to carry out the operation or let
47 * the transaction execute synchronously 50 * the transaction execute synchronously
48 * @depend_tx: transaction dependency 51 * @submit: transaction dependency and submission modifiers
49 * @tx_type: transaction type 52 * @tx_type: transaction type
50 */ 53 */
51struct dma_chan * 54struct dma_chan *
52__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, 55__async_tx_find_channel(struct async_submit_ctl *submit,
53 enum dma_transaction_type tx_type) 56 enum dma_transaction_type tx_type)
54{ 57{
58 struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
59
55 /* see if we can keep the chain on one channel */ 60 /* see if we can keep the chain on one channel */
56 if (depend_tx && 61 if (depend_tx &&
57 dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) 62 dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
@@ -59,17 +64,6 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
59 return async_dma_find_channel(tx_type); 64 return async_dma_find_channel(tx_type);
60} 65}
61EXPORT_SYMBOL_GPL(__async_tx_find_channel); 66EXPORT_SYMBOL_GPL(__async_tx_find_channel);
62#else
63static int __init async_tx_init(void)
64{
65 printk(KERN_INFO "async_tx: api initialized (sync-only)\n");
66 return 0;
67}
68
69static void __exit async_tx_exit(void)
70{
71 do { } while (0);
72}
73#endif 67#endif
74 68
75 69
@@ -83,10 +77,14 @@ static void
83async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, 77async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
84 struct dma_async_tx_descriptor *tx) 78 struct dma_async_tx_descriptor *tx)
85{ 79{
86 struct dma_chan *chan; 80 struct dma_chan *chan = depend_tx->chan;
87 struct dma_device *device; 81 struct dma_device *device = chan->device;
88 struct dma_async_tx_descriptor *intr_tx = (void *) ~0; 82 struct dma_async_tx_descriptor *intr_tx = (void *) ~0;
89 83
84 #ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
85 BUG();
86 #endif
87
90 /* first check to see if we can still append to depend_tx */ 88 /* first check to see if we can still append to depend_tx */
91 spin_lock_bh(&depend_tx->lock); 89 spin_lock_bh(&depend_tx->lock);
92 if (depend_tx->parent && depend_tx->chan == tx->chan) { 90 if (depend_tx->parent && depend_tx->chan == tx->chan) {
@@ -96,11 +94,11 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
96 } 94 }
97 spin_unlock_bh(&depend_tx->lock); 95 spin_unlock_bh(&depend_tx->lock);
98 96
99 if (!intr_tx) 97 /* attached dependency, flush the parent channel */
98 if (!intr_tx) {
99 device->device_issue_pending(chan);
100 return; 100 return;
101 101 }
102 chan = depend_tx->chan;
103 device = chan->device;
104 102
105 /* see if we can schedule an interrupt 103 /* see if we can schedule an interrupt
106 * otherwise poll for completion 104 * otherwise poll for completion
@@ -134,6 +132,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
134 intr_tx->tx_submit(intr_tx); 132 intr_tx->tx_submit(intr_tx);
135 async_tx_ack(intr_tx); 133 async_tx_ack(intr_tx);
136 } 134 }
135 device->device_issue_pending(chan);
137 } else { 136 } else {
138 if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) 137 if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
139 panic("%s: DMA_ERROR waiting for depend_tx\n", 138 panic("%s: DMA_ERROR waiting for depend_tx\n",
@@ -144,13 +143,14 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
144 143
145 144
146/** 145/**
147 * submit_disposition - while holding depend_tx->lock we must avoid submitting 146 * submit_disposition - flags for routing an incoming operation
148 * new operations to prevent a circular locking dependency with
149 * drivers that already hold a channel lock when calling
150 * async_tx_run_dependencies.
151 * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock 147 * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock
152 * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch 148 * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch
153 * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly 149 * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly
150 *
151 * while holding depend_tx->lock we must avoid submitting new operations
152 * to prevent a circular locking dependency with drivers that already
153 * hold a channel lock when calling async_tx_run_dependencies.
154 */ 154 */
155enum submit_disposition { 155enum submit_disposition {
156 ASYNC_TX_SUBMITTED, 156 ASYNC_TX_SUBMITTED,
@@ -160,11 +160,12 @@ enum submit_disposition {
160 160
161void 161void
162async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, 162async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
163 enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, 163 struct async_submit_ctl *submit)
164 dma_async_tx_callback cb_fn, void *cb_param)
165{ 164{
166 tx->callback = cb_fn; 165 struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
167 tx->callback_param = cb_param; 166
167 tx->callback = submit->cb_fn;
168 tx->callback_param = submit->cb_param;
168 169
169 if (depend_tx) { 170 if (depend_tx) {
170 enum submit_disposition s; 171 enum submit_disposition s;
@@ -220,30 +221,29 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
220 tx->tx_submit(tx); 221 tx->tx_submit(tx);
221 } 222 }
222 223
223 if (flags & ASYNC_TX_ACK) 224 if (submit->flags & ASYNC_TX_ACK)
224 async_tx_ack(tx); 225 async_tx_ack(tx);
225 226
226 if (depend_tx && (flags & ASYNC_TX_DEP_ACK)) 227 if (depend_tx)
227 async_tx_ack(depend_tx); 228 async_tx_ack(depend_tx);
228} 229}
229EXPORT_SYMBOL_GPL(async_tx_submit); 230EXPORT_SYMBOL_GPL(async_tx_submit);
230 231
231/** 232/**
232 * async_trigger_callback - schedules the callback function to be run after 233 * async_trigger_callback - schedules the callback function to be run
233 * any dependent operations have been completed. 234 * @submit: submission and completion parameters
234 * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK 235 *
235 * @depend_tx: 'callback' requires the completion of this transaction 236 * honored flags: ASYNC_TX_ACK
236 * @cb_fn: function to call after depend_tx completes 237 *
237 * @cb_param: parameter to pass to the callback routine 238 * The callback is run after any dependent operations have completed.
238 */ 239 */
239struct dma_async_tx_descriptor * 240struct dma_async_tx_descriptor *
240async_trigger_callback(enum async_tx_flags flags, 241async_trigger_callback(struct async_submit_ctl *submit)
241 struct dma_async_tx_descriptor *depend_tx,
242 dma_async_tx_callback cb_fn, void *cb_param)
243{ 242{
244 struct dma_chan *chan; 243 struct dma_chan *chan;
245 struct dma_device *device; 244 struct dma_device *device;
246 struct dma_async_tx_descriptor *tx; 245 struct dma_async_tx_descriptor *tx;
246 struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
247 247
248 if (depend_tx) { 248 if (depend_tx) {
249 chan = depend_tx->chan; 249 chan = depend_tx->chan;
@@ -262,14 +262,14 @@ async_trigger_callback(enum async_tx_flags flags,
262 if (tx) { 262 if (tx) {
263 pr_debug("%s: (async)\n", __func__); 263 pr_debug("%s: (async)\n", __func__);
264 264
265 async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); 265 async_tx_submit(chan, tx, submit);
266 } else { 266 } else {
267 pr_debug("%s: (sync)\n", __func__); 267 pr_debug("%s: (sync)\n", __func__);
268 268
269 /* wait for any prerequisite operations */ 269 /* wait for any prerequisite operations */
270 async_tx_quiesce(&depend_tx); 270 async_tx_quiesce(&submit->depend_tx);
271 271
272 async_tx_sync_epilog(cb_fn, cb_param); 272 async_tx_sync_epilog(submit);
273 } 273 }
274 274
275 return tx; 275 return tx;
@@ -295,9 +295,6 @@ void async_tx_quiesce(struct dma_async_tx_descriptor **tx)
295} 295}
296EXPORT_SYMBOL_GPL(async_tx_quiesce); 296EXPORT_SYMBOL_GPL(async_tx_quiesce);
297 297
298module_init(async_tx_init);
299module_exit(async_tx_exit);
300
301MODULE_AUTHOR("Intel Corporation"); 298MODULE_AUTHOR("Intel Corporation");
302MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API"); 299MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API");
303MODULE_LICENSE("GPL"); 300MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 90dd3f8bd283..b459a9034aac 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -33,19 +33,16 @@
33/* do_async_xor - dma map the pages and perform the xor with an engine */ 33/* do_async_xor - dma map the pages and perform the xor with an engine */
34static __async_inline struct dma_async_tx_descriptor * 34static __async_inline struct dma_async_tx_descriptor *
35do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, 35do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
36 unsigned int offset, int src_cnt, size_t len, 36 unsigned int offset, int src_cnt, size_t len, dma_addr_t *dma_src,
37 enum async_tx_flags flags, 37 struct async_submit_ctl *submit)
38 struct dma_async_tx_descriptor *depend_tx,
39 dma_async_tx_callback cb_fn, void *cb_param)
40{ 38{
41 struct dma_device *dma = chan->device; 39 struct dma_device *dma = chan->device;
42 dma_addr_t *dma_src = (dma_addr_t *) src_list;
43 struct dma_async_tx_descriptor *tx = NULL; 40 struct dma_async_tx_descriptor *tx = NULL;
44 int src_off = 0; 41 int src_off = 0;
45 int i; 42 int i;
46 dma_async_tx_callback _cb_fn; 43 dma_async_tx_callback cb_fn_orig = submit->cb_fn;
47 void *_cb_param; 44 void *cb_param_orig = submit->cb_param;
48 enum async_tx_flags async_flags; 45 enum async_tx_flags flags_orig = submit->flags;
49 enum dma_ctrl_flags dma_flags; 46 enum dma_ctrl_flags dma_flags;
50 int xor_src_cnt; 47 int xor_src_cnt;
51 dma_addr_t dma_dest; 48 dma_addr_t dma_dest;
@@ -63,25 +60,27 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
63 } 60 }
64 61
65 while (src_cnt) { 62 while (src_cnt) {
66 async_flags = flags; 63 submit->flags = flags_orig;
67 dma_flags = 0; 64 dma_flags = 0;
68 xor_src_cnt = min(src_cnt, dma->max_xor); 65 xor_src_cnt = min(src_cnt, (int)dma->max_xor);
69 /* if we are submitting additional xors, leave the chain open, 66 /* if we are submitting additional xors, leave the chain open,
70 * clear the callback parameters, and leave the destination 67 * clear the callback parameters, and leave the destination
71 * buffer mapped 68 * buffer mapped
72 */ 69 */
73 if (src_cnt > xor_src_cnt) { 70 if (src_cnt > xor_src_cnt) {
74 async_flags &= ~ASYNC_TX_ACK; 71 submit->flags &= ~ASYNC_TX_ACK;
72 submit->flags |= ASYNC_TX_FENCE;
75 dma_flags = DMA_COMPL_SKIP_DEST_UNMAP; 73 dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
76 _cb_fn = NULL; 74 submit->cb_fn = NULL;
77 _cb_param = NULL; 75 submit->cb_param = NULL;
78 } else { 76 } else {
79 _cb_fn = cb_fn; 77 submit->cb_fn = cb_fn_orig;
80 _cb_param = cb_param; 78 submit->cb_param = cb_param_orig;
81 } 79 }
82 if (_cb_fn) 80 if (submit->cb_fn)
83 dma_flags |= DMA_PREP_INTERRUPT; 81 dma_flags |= DMA_PREP_INTERRUPT;
84 82 if (submit->flags & ASYNC_TX_FENCE)
83 dma_flags |= DMA_PREP_FENCE;
85 /* Since we have clobbered the src_list we are committed 84 /* Since we have clobbered the src_list we are committed
86 * to doing this asynchronously. Drivers force forward progress 85 * to doing this asynchronously. Drivers force forward progress
87 * in case they can not provide a descriptor 86 * in case they can not provide a descriptor
@@ -90,7 +89,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
90 xor_src_cnt, len, dma_flags); 89 xor_src_cnt, len, dma_flags);
91 90
92 if (unlikely(!tx)) 91 if (unlikely(!tx))
93 async_tx_quiesce(&depend_tx); 92 async_tx_quiesce(&submit->depend_tx);
94 93
 95 /* spin wait for the preceding transactions to complete */ 94 /* spin wait for the preceding transactions to complete */
96 while (unlikely(!tx)) { 95 while (unlikely(!tx)) {
@@ -101,11 +100,8 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
101 dma_flags); 100 dma_flags);
102 } 101 }
103 102
104 async_tx_submit(chan, tx, async_flags, depend_tx, _cb_fn, 103 async_tx_submit(chan, tx, submit);
105 _cb_param); 104 submit->depend_tx = tx;
106
107 depend_tx = tx;
108 flags |= ASYNC_TX_DEP_ACK;
109 105
110 if (src_cnt > xor_src_cnt) { 106 if (src_cnt > xor_src_cnt) {
111 /* drop completed sources */ 107 /* drop completed sources */
@@ -124,23 +120,27 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
124 120
125static void 121static void
126do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, 122do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
127 int src_cnt, size_t len, enum async_tx_flags flags, 123 int src_cnt, size_t len, struct async_submit_ctl *submit)
128 dma_async_tx_callback cb_fn, void *cb_param)
129{ 124{
130 int i; 125 int i;
131 int xor_src_cnt; 126 int xor_src_cnt;
132 int src_off = 0; 127 int src_off = 0;
133 void *dest_buf; 128 void *dest_buf;
134 void **srcs = (void **) src_list; 129 void **srcs;
130
131 if (submit->scribble)
132 srcs = submit->scribble;
133 else
134 srcs = (void **) src_list;
135 135
136 /* reuse the 'src_list' array to convert to buffer pointers */ 136 /* convert to buffer pointers */
137 for (i = 0; i < src_cnt; i++) 137 for (i = 0; i < src_cnt; i++)
138 srcs[i] = page_address(src_list[i]) + offset; 138 srcs[i] = page_address(src_list[i]) + offset;
139 139
140 /* set destination address */ 140 /* set destination address */
141 dest_buf = page_address(dest) + offset; 141 dest_buf = page_address(dest) + offset;
142 142
143 if (flags & ASYNC_TX_XOR_ZERO_DST) 143 if (submit->flags & ASYNC_TX_XOR_ZERO_DST)
144 memset(dest_buf, 0, len); 144 memset(dest_buf, 0, len);
145 145
146 while (src_cnt > 0) { 146 while (src_cnt > 0) {
@@ -153,61 +153,70 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
153 src_off += xor_src_cnt; 153 src_off += xor_src_cnt;
154 } 154 }
155 155
156 async_tx_sync_epilog(cb_fn, cb_param); 156 async_tx_sync_epilog(submit);
157} 157}
158 158
159/** 159/**
160 * async_xor - attempt to xor a set of blocks with a dma engine. 160 * async_xor - attempt to xor a set of blocks with a dma engine.
161 * xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST
162 * flag must be set to not include dest data in the calculation. The
163 * assumption with dma eninges is that they only use the destination
164 * buffer as a source when it is explicity specified in the source list.
165 * @dest: destination page 161 * @dest: destination page
166 * @src_list: array of source pages (if the dest is also a source it must be 162 * @src_list: array of source pages
167 * at index zero). The contents of this array may be overwritten. 163 * @offset: common src/dst offset to start transaction
168 * @offset: offset in pages to start transaction
169 * @src_cnt: number of source pages 164 * @src_cnt: number of source pages
170 * @len: length in bytes 165 * @len: length in bytes
171 * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, 166 * @submit: submission / completion modifiers
172 * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK 167 *
173 * @depend_tx: xor depends on the result of this transaction. 168 * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
174 * @cb_fn: function to call when the xor completes 169 *
175 * @cb_param: parameter to pass to the callback routine 170 * xor_blocks always uses the dest as a source so the
171 * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
172 * the calculation. The assumption with dma engines is that they only
173 * use the destination buffer as a source when it is explicitly specified
174 * in the source list.
175 *
176 * src_list note: if the dest is also a source it must be at index zero.
177 * The contents of this array will be overwritten if a scribble region
178 * is not specified.
176 */ 179 */
177struct dma_async_tx_descriptor * 180struct dma_async_tx_descriptor *
178async_xor(struct page *dest, struct page **src_list, unsigned int offset, 181async_xor(struct page *dest, struct page **src_list, unsigned int offset,
179 int src_cnt, size_t len, enum async_tx_flags flags, 182 int src_cnt, size_t len, struct async_submit_ctl *submit)
180 struct dma_async_tx_descriptor *depend_tx,
181 dma_async_tx_callback cb_fn, void *cb_param)
182{ 183{
183 struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR, 184 struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR,
184 &dest, 1, src_list, 185 &dest, 1, src_list,
185 src_cnt, len); 186 src_cnt, len);
187 dma_addr_t *dma_src = NULL;
188
186 BUG_ON(src_cnt <= 1); 189 BUG_ON(src_cnt <= 1);
187 190
188 if (chan) { 191 if (submit->scribble)
192 dma_src = submit->scribble;
193 else if (sizeof(dma_addr_t) <= sizeof(struct page *))
194 dma_src = (dma_addr_t *) src_list;
195
196 if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) {
189 /* run the xor asynchronously */ 197 /* run the xor asynchronously */
190 pr_debug("%s (async): len: %zu\n", __func__, len); 198 pr_debug("%s (async): len: %zu\n", __func__, len);
191 199
192 return do_async_xor(chan, dest, src_list, offset, src_cnt, len, 200 return do_async_xor(chan, dest, src_list, offset, src_cnt, len,
193 flags, depend_tx, cb_fn, cb_param); 201 dma_src, submit);
194 } else { 202 } else {
195 /* run the xor synchronously */ 203 /* run the xor synchronously */
196 pr_debug("%s (sync): len: %zu\n", __func__, len); 204 pr_debug("%s (sync): len: %zu\n", __func__, len);
205 WARN_ONCE(chan, "%s: no space for dma address conversion\n",
206 __func__);
197 207
198 /* in the sync case the dest is an implied source 208 /* in the sync case the dest is an implied source
199 * (assumes the dest is the first source) 209 * (assumes the dest is the first source)
200 */ 210 */
201 if (flags & ASYNC_TX_XOR_DROP_DST) { 211 if (submit->flags & ASYNC_TX_XOR_DROP_DST) {
202 src_cnt--; 212 src_cnt--;
203 src_list++; 213 src_list++;
204 } 214 }
205 215
206 /* wait for any prerequisite operations */ 216 /* wait for any prerequisite operations */
207 async_tx_quiesce(&depend_tx); 217 async_tx_quiesce(&submit->depend_tx);
208 218
209 do_sync_xor(dest, src_list, offset, src_cnt, len, 219 do_sync_xor(dest, src_list, offset, src_cnt, len, submit);
210 flags, cb_fn, cb_param);
211 220
212 return NULL; 221 return NULL;
213 } 222 }
@@ -222,104 +231,94 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len)
222} 231}
223 232
224/** 233/**
225 * async_xor_zero_sum - attempt a xor parity check with a dma engine. 234 * async_xor_val - attempt a xor parity check with a dma engine.
226 * @dest: destination page used if the xor is performed synchronously 235 * @dest: destination page used if the xor is performed synchronously
227 * @src_list: array of source pages. The dest page must be listed as a source 236 * @src_list: array of source pages
228 * at index zero. The contents of this array may be overwritten.
229 * @offset: offset in pages to start transaction 237 * @offset: offset in pages to start transaction
230 * @src_cnt: number of source pages 238 * @src_cnt: number of source pages
231 * @len: length in bytes 239 * @len: length in bytes
232 * @result: 0 if sum == 0 else non-zero 240 * @result: 0 if sum == 0 else non-zero
233 * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK 241 * @submit: submission / completion modifiers
234 * @depend_tx: xor depends on the result of this transaction. 242 *
235 * @cb_fn: function to call when the xor completes 243 * honored flags: ASYNC_TX_ACK
236 * @cb_param: parameter to pass to the callback routine 244 *
245 * src_list note: if the dest is also a source it must be at index zero.
246 * The contents of this array will be overwritten if a scribble region
247 * is not specified.
237 */ 248 */
238struct dma_async_tx_descriptor * 249struct dma_async_tx_descriptor *
239async_xor_zero_sum(struct page *dest, struct page **src_list, 250async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
240 unsigned int offset, int src_cnt, size_t len, 251 int src_cnt, size_t len, enum sum_check_flags *result,
241 u32 *result, enum async_tx_flags flags, 252 struct async_submit_ctl *submit)
242 struct dma_async_tx_descriptor *depend_tx,
243 dma_async_tx_callback cb_fn, void *cb_param)
244{ 253{
245 struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM, 254 struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL,
246 &dest, 1, src_list, 255 &dest, 1, src_list,
247 src_cnt, len); 256 src_cnt, len);
248 struct dma_device *device = chan ? chan->device : NULL; 257 struct dma_device *device = chan ? chan->device : NULL;
249 struct dma_async_tx_descriptor *tx = NULL; 258 struct dma_async_tx_descriptor *tx = NULL;
259 dma_addr_t *dma_src = NULL;
250 260
251 BUG_ON(src_cnt <= 1); 261 BUG_ON(src_cnt <= 1);
252 262
253 if (device && src_cnt <= device->max_xor) { 263 if (submit->scribble)
254 dma_addr_t *dma_src = (dma_addr_t *) src_list; 264 dma_src = submit->scribble;
255 unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; 265 else if (sizeof(dma_addr_t) <= sizeof(struct page *))
266 dma_src = (dma_addr_t *) src_list;
267
268 if (dma_src && device && src_cnt <= device->max_xor &&
269 is_dma_xor_aligned(device, offset, 0, len)) {
270 unsigned long dma_prep_flags = 0;
256 int i; 271 int i;
257 272
258 pr_debug("%s: (async) len: %zu\n", __func__, len); 273 pr_debug("%s: (async) len: %zu\n", __func__, len);
259 274
275 if (submit->cb_fn)
276 dma_prep_flags |= DMA_PREP_INTERRUPT;
277 if (submit->flags & ASYNC_TX_FENCE)
278 dma_prep_flags |= DMA_PREP_FENCE;
260 for (i = 0; i < src_cnt; i++) 279 for (i = 0; i < src_cnt; i++)
261 dma_src[i] = dma_map_page(device->dev, src_list[i], 280 dma_src[i] = dma_map_page(device->dev, src_list[i],
262 offset, len, DMA_TO_DEVICE); 281 offset, len, DMA_TO_DEVICE);
263 282
264 tx = device->device_prep_dma_zero_sum(chan, dma_src, src_cnt, 283 tx = device->device_prep_dma_xor_val(chan, dma_src, src_cnt,
265 len, result, 284 len, result,
266 dma_prep_flags); 285 dma_prep_flags);
267 if (unlikely(!tx)) { 286 if (unlikely(!tx)) {
268 async_tx_quiesce(&depend_tx); 287 async_tx_quiesce(&submit->depend_tx);
269 288
270 while (!tx) { 289 while (!tx) {
271 dma_async_issue_pending(chan); 290 dma_async_issue_pending(chan);
272 tx = device->device_prep_dma_zero_sum(chan, 291 tx = device->device_prep_dma_xor_val(chan,
273 dma_src, src_cnt, len, result, 292 dma_src, src_cnt, len, result,
274 dma_prep_flags); 293 dma_prep_flags);
275 } 294 }
276 } 295 }
277 296
278 async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); 297 async_tx_submit(chan, tx, submit);
279 } else { 298 } else {
280 unsigned long xor_flags = flags; 299 enum async_tx_flags flags_orig = submit->flags;
281 300
282 pr_debug("%s: (sync) len: %zu\n", __func__, len); 301 pr_debug("%s: (sync) len: %zu\n", __func__, len);
302 WARN_ONCE(device && src_cnt <= device->max_xor,
303 "%s: no space for dma address conversion\n",
304 __func__);
283 305
284 xor_flags |= ASYNC_TX_XOR_DROP_DST; 306 submit->flags |= ASYNC_TX_XOR_DROP_DST;
285 xor_flags &= ~ASYNC_TX_ACK; 307 submit->flags &= ~ASYNC_TX_ACK;
286 308
287 tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags, 309 tx = async_xor(dest, src_list, offset, src_cnt, len, submit);
288 depend_tx, NULL, NULL);
289 310
290 async_tx_quiesce(&tx); 311 async_tx_quiesce(&tx);
291 312
292 *result = page_is_zero(dest, offset, len) ? 0 : 1; 313 *result = !page_is_zero(dest, offset, len) << SUM_CHECK_P;
293 314
294 async_tx_sync_epilog(cb_fn, cb_param); 315 async_tx_sync_epilog(submit);
316 submit->flags = flags_orig;
295 } 317 }
296 318
297 return tx; 319 return tx;
298} 320}
299EXPORT_SYMBOL_GPL(async_xor_zero_sum); 321EXPORT_SYMBOL_GPL(async_xor_val);
300
301static int __init async_xor_init(void)
302{
303 #ifdef CONFIG_ASYNC_TX_DMA
304 /* To conserve stack space the input src_list (array of page pointers)
305 * is reused to hold the array of dma addresses passed to the driver.
306 * This conversion is only possible when dma_addr_t is less than the
307 * the size of a pointer. HIGHMEM64G is known to violate this
308 * assumption.
309 */
310 BUILD_BUG_ON(sizeof(dma_addr_t) > sizeof(struct page *));
311 #endif
312
313 return 0;
314}
315
316static void __exit async_xor_exit(void)
317{
318 do { } while (0);
319}
320
321module_init(async_xor_init);
322module_exit(async_xor_exit);
323 322
324MODULE_AUTHOR("Intel Corporation"); 323MODULE_AUTHOR("Intel Corporation");
325 MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api"); 324 MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api");
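Tying the reworked interface together, here is a hedged sketch of how a client would now drive async_xor() with a struct async_submit_ctl; it mirrors the pattern used by raid6test.c added below, and the function and variable names (xor_example, xor_example_cb) are invented for illustration:

#include <linux/async_tx.h>
#include <linux/completion.h>

static void xor_example_cb(void *param)
{
	complete(param);		/* wake up the submitter */
}

/* XOR src_cnt pages into dest, treating dest as pure output, and wait */
static void xor_example(struct page *dest, struct page **srcs, int src_cnt,
			size_t len)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	addr_conv_t addr_conv[src_cnt];	/* scribble space for address conversion */
	struct completion cmp;

	init_completion(&cmp);
	/* dest is a pure output (ASYNC_TX_XOR_ZERO_DST); ack the descriptor when done */
	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, NULL,
			  xor_example_cb, &cmp, addr_conv);
	tx = async_xor(dest, srcs, 0, src_cnt, len, &submit);
	async_tx_issue_pending(tx);	/* tolerates tx == NULL on the synchronous path */
	wait_for_completion(&cmp);
}

The scribble argument is what lets async_xor() convert page pointers to dma addresses (or hold the void * list in the synchronous path) without clobbering the caller's srcs[] array.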
diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c
new file mode 100644
index 000000000000..3ec27c7e62ea
--- /dev/null
+++ b/crypto/async_tx/raid6test.c
@@ -0,0 +1,240 @@
1/*
2 * asynchronous raid6 recovery self test
3 * Copyright (c) 2009, Intel Corporation.
4 *
5 * based on drivers/md/raid6test/test.c:
6 * Copyright 2002-2007 H. Peter Anvin
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 */
22#include <linux/async_tx.h>
23#include <linux/random.h>
24
25#undef pr
26#define pr(fmt, args...) pr_info("raid6test: " fmt, ##args)
27
28#define NDISKS 16 /* Including P and Q */
29
30static struct page *dataptrs[NDISKS];
31static addr_conv_t addr_conv[NDISKS];
32static struct page *data[NDISKS+3];
33static struct page *spare;
34static struct page *recovi;
35static struct page *recovj;
36
37static void callback(void *param)
38{
39 struct completion *cmp = param;
40
41 complete(cmp);
42}
43
44static void makedata(int disks)
45{
46 int i, j;
47
48 for (i = 0; i < disks; i++) {
 49 for (j = 0; j < PAGE_SIZE; j += sizeof(u32)) {
50 u32 *p = page_address(data[i]) + j;
51
52 *p = random32();
53 }
54
55 dataptrs[i] = data[i];
56 }
57}
58
59static char disk_type(int d, int disks)
60{
61 if (d == disks - 2)
62 return 'P';
63 else if (d == disks - 1)
64 return 'Q';
65 else
66 return 'D';
67}
68
69/* Recover two failed blocks. */
70static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, struct page **ptrs)
71{
72 struct async_submit_ctl submit;
73 struct completion cmp;
74 struct dma_async_tx_descriptor *tx = NULL;
75 enum sum_check_flags result = ~0;
76
77 if (faila > failb)
78 swap(faila, failb);
79
80 if (failb == disks-1) {
81 if (faila == disks-2) {
82 /* P+Q failure. Just rebuild the syndrome. */
83 init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
84 tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
85 } else {
86 struct page *blocks[disks];
87 struct page *dest;
88 int count = 0;
89 int i;
90
91 /* data+Q failure. Reconstruct data from P,
92 * then rebuild syndrome
93 */
94 for (i = disks; i-- ; ) {
95 if (i == faila || i == failb)
96 continue;
97 blocks[count++] = ptrs[i];
98 }
99 dest = ptrs[faila];
100 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
101 NULL, NULL, addr_conv);
102 tx = async_xor(dest, blocks, 0, count, bytes, &submit);
103
104 init_async_submit(&submit, 0, tx, NULL, NULL, addr_conv);
105 tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
106 }
107 } else {
108 if (failb == disks-2) {
109 /* data+P failure. */
110 init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
111 tx = async_raid6_datap_recov(disks, bytes, faila, ptrs, &submit);
112 } else {
113 /* data+data failure. */
114 init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
115 tx = async_raid6_2data_recov(disks, bytes, faila, failb, ptrs, &submit);
116 }
117 }
118 init_completion(&cmp);
119 init_async_submit(&submit, ASYNC_TX_ACK, tx, callback, &cmp, addr_conv);
120 tx = async_syndrome_val(ptrs, 0, disks, bytes, &result, spare, &submit);
121 async_tx_issue_pending(tx);
122
123 if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0)
124 pr("%s: timeout! (faila: %d failb: %d disks: %d)\n",
125 __func__, faila, failb, disks);
126
127 if (result != 0)
128 pr("%s: validation failure! faila: %d failb: %d sum_check_flags: %x\n",
129 __func__, faila, failb, result);
130}
131
132static int test_disks(int i, int j, int disks)
133{
134 int erra, errb;
135
136 memset(page_address(recovi), 0xf0, PAGE_SIZE);
137 memset(page_address(recovj), 0xba, PAGE_SIZE);
138
139 dataptrs[i] = recovi;
140 dataptrs[j] = recovj;
141
142 raid6_dual_recov(disks, PAGE_SIZE, i, j, dataptrs);
143
144 erra = memcmp(page_address(data[i]), page_address(recovi), PAGE_SIZE);
145 errb = memcmp(page_address(data[j]), page_address(recovj), PAGE_SIZE);
146
147 pr("%s(%d, %d): faila=%3d(%c) failb=%3d(%c) %s\n",
148 __func__, i, j, i, disk_type(i, disks), j, disk_type(j, disks),
149 (!erra && !errb) ? "OK" : !erra ? "ERRB" : !errb ? "ERRA" : "ERRAB");
150
151 dataptrs[i] = data[i];
152 dataptrs[j] = data[j];
153
154 return erra || errb;
155}
156
157static int test(int disks, int *tests)
158{
159 struct dma_async_tx_descriptor *tx;
160 struct async_submit_ctl submit;
161 struct completion cmp;
162 int err = 0;
163 int i, j;
164
165 recovi = data[disks];
166 recovj = data[disks+1];
167 spare = data[disks+2];
168
169 makedata(disks);
170
171 /* Nuke syndromes */
172 memset(page_address(data[disks-2]), 0xee, PAGE_SIZE);
173 memset(page_address(data[disks-1]), 0xee, PAGE_SIZE);
174
175 /* Generate assumed good syndrome */
176 init_completion(&cmp);
177 init_async_submit(&submit, ASYNC_TX_ACK, NULL, callback, &cmp, addr_conv);
178 tx = async_gen_syndrome(dataptrs, 0, disks, PAGE_SIZE, &submit);
179 async_tx_issue_pending(tx);
180
181 if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0) {
182 pr("error: initial gen_syndrome(%d) timed out\n", disks);
183 return 1;
184 }
185
186 pr("testing the %d-disk case...\n", disks);
187 for (i = 0; i < disks-1; i++)
188 for (j = i+1; j < disks; j++) {
189 (*tests)++;
190 err += test_disks(i, j, disks);
191 }
192
193 return err;
194}
195
196
197static int raid6_test(void)
198{
199 int err = 0;
200 int tests = 0;
201 int i;
202
203 for (i = 0; i < NDISKS+3; i++) {
204 data[i] = alloc_page(GFP_KERNEL);
205 if (!data[i]) {
206 while (i--)
207 put_page(data[i]);
208 return -ENOMEM;
209 }
210 }
211
212 /* the 4-disk and 5-disk cases are special for the recovery code */
213 if (NDISKS > 4)
214 err += test(4, &tests);
215 if (NDISKS > 5)
216 err += test(5, &tests);
217 err += test(NDISKS, &tests);
218
219 pr("\n");
220 pr("complete (%d tests, %d failure%s)\n",
221 tests, err, err == 1 ? "" : "s");
222
223 for (i = 0; i < NDISKS+3; i++)
224 put_page(data[i]);
225
226 return 0;
227}
228
229static void raid6_test_exit(void)
230{
231}
232
233/* when compiled-in, wait for drivers to load first (assumes dma drivers
234 * are also compiled-in)
235 */
236late_initcall(raid6_test);
237module_exit(raid6_test_exit);
238MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
239MODULE_DESCRIPTION("asynchronous RAID-6 recovery self tests");
240MODULE_LICENSE("GPL");
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 71d1b9bab70b..614da5b8613a 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3412,7 +3412,7 @@ static int cdrom_print_info(const char *header, int val, char *info,
3412 return 0; 3412 return 0;
3413} 3413}
3414 3414
3415static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp, 3415static int cdrom_sysctl_info(ctl_table *ctl, int write,
3416 void __user *buffer, size_t *lenp, loff_t *ppos) 3416 void __user *buffer, size_t *lenp, loff_t *ppos)
3417{ 3417{
3418 int pos; 3418 int pos;
@@ -3489,7 +3489,7 @@ static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
3489 goto done; 3489 goto done;
3490doit: 3490doit:
3491 mutex_unlock(&cdrom_mutex); 3491 mutex_unlock(&cdrom_mutex);
3492 return proc_dostring(ctl, write, filp, buffer, lenp, ppos); 3492 return proc_dostring(ctl, write, buffer, lenp, ppos);
3493done: 3493done:
3494 printk(KERN_INFO "cdrom: info buffer too small\n"); 3494 printk(KERN_INFO "cdrom: info buffer too small\n");
3495 goto doit; 3495 goto doit;
@@ -3525,12 +3525,12 @@ static void cdrom_update_settings(void)
3525 mutex_unlock(&cdrom_mutex); 3525 mutex_unlock(&cdrom_mutex);
3526} 3526}
3527 3527
3528static int cdrom_sysctl_handler(ctl_table *ctl, int write, struct file * filp, 3528static int cdrom_sysctl_handler(ctl_table *ctl, int write,
3529 void __user *buffer, size_t *lenp, loff_t *ppos) 3529 void __user *buffer, size_t *lenp, loff_t *ppos)
3530{ 3530{
3531 int ret; 3531 int ret;
3532 3532
3533 ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 3533 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
3534 3534
3535 if (write) { 3535 if (write) {
3536 3536
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 6a06913b01d3..08a6f50ae791 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -1087,6 +1087,14 @@ config MMTIMER
1087 The mmtimer device allows direct userspace access to the 1087 The mmtimer device allows direct userspace access to the
1088 Altix system timer. 1088 Altix system timer.
1089 1089
1090config UV_MMTIMER
1091 tristate "UV_MMTIMER Memory mapped RTC for SGI UV"
1092 depends on X86_UV
1093 default m
1094 help
1095 The uv_mmtimer device allows direct userspace access to the
1096 UV system timer.
1097
1090source "drivers/char/tpm/Kconfig" 1098source "drivers/char/tpm/Kconfig"
1091 1099
1092config TELCLOCK 1100config TELCLOCK
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 66f779ad4f4c..19a79dd79eee 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_RAW_DRIVER) += raw.o
58obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 58obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
59obj-$(CONFIG_MSPEC) += mspec.o 59obj-$(CONFIG_MSPEC) += mspec.o
60obj-$(CONFIG_MMTIMER) += mmtimer.o 60obj-$(CONFIG_MMTIMER) += mmtimer.o
61obj-$(CONFIG_UV_MMTIMER) += uv_mmtimer.o
61obj-$(CONFIG_VIOTAPE) += viotape.o 62obj-$(CONFIG_VIOTAPE) += viotape.o
62obj-$(CONFIG_HVCS) += hvcs.o 63obj-$(CONFIG_HVCS) += hvcs.o
63obj-$(CONFIG_IBM_BSR) += bsr.o 64obj-$(CONFIG_IBM_BSR) += bsr.o
diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c
index 0a01329451e4..e3dd24bff514 100644
--- a/drivers/char/bfin-otp.c
+++ b/drivers/char/bfin-otp.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * Blackfin On-Chip OTP Memory Interface 2 * Blackfin On-Chip OTP Memory Interface
3 * Supports BF52x/BF54x
4 * 3 *
5 * Copyright 2007-2008 Analog Devices Inc. 4 * Copyright 2007-2009 Analog Devices Inc.
6 * 5 *
7 * Enter bugs at http://blackfin.uclinux.org/ 6 * Enter bugs at http://blackfin.uclinux.org/
8 * 7 *
@@ -17,8 +16,10 @@
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/mutex.h> 17#include <linux/mutex.h>
19#include <linux/types.h> 18#include <linux/types.h>
19#include <mtd/mtd-abi.h>
20 20
21#include <asm/blackfin.h> 21#include <asm/blackfin.h>
22#include <asm/bfrom.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#define stamp(fmt, args...) pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args) 25#define stamp(fmt, args...) pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args)
@@ -30,39 +31,6 @@
30 31
31static DEFINE_MUTEX(bfin_otp_lock); 32static DEFINE_MUTEX(bfin_otp_lock);
32 33
33/* OTP Boot ROM functions */
34#define _BOOTROM_OTP_COMMAND 0xEF000018
35#define _BOOTROM_OTP_READ 0xEF00001A
36#define _BOOTROM_OTP_WRITE 0xEF00001C
37
38static u32 (* const otp_command)(u32 command, u32 value) = (void *)_BOOTROM_OTP_COMMAND;
39static u32 (* const otp_read)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_READ;
40static u32 (* const otp_write)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_WRITE;
41
42/* otp_command(): defines for "command" */
43#define OTP_INIT 0x00000001
44#define OTP_CLOSE 0x00000002
45
46/* otp_{read,write}(): defines for "flags" */
47#define OTP_LOWER_HALF 0x00000000 /* select upper/lower 64-bit half (bit 0) */
48#define OTP_UPPER_HALF 0x00000001
49#define OTP_NO_ECC 0x00000010 /* do not use ECC */
50#define OTP_LOCK 0x00000020 /* sets page protection bit for page */
51#define OTP_ACCESS_READ 0x00001000
52#define OTP_ACCESS_READWRITE 0x00002000
53
54/* Return values for all functions */
55#define OTP_SUCCESS 0x00000000
56#define OTP_MASTER_ERROR 0x001
57#define OTP_WRITE_ERROR 0x003
58#define OTP_READ_ERROR 0x005
59#define OTP_ACC_VIO_ERROR 0x009
60#define OTP_DATA_MULT_ERROR 0x011
61#define OTP_ECC_MULT_ERROR 0x021
62#define OTP_PREV_WR_ERROR 0x041
63#define OTP_DATA_SB_WARN 0x100
64#define OTP_ECC_SB_WARN 0x200
65
66/** 34/**
67 * bfin_otp_read - Read OTP pages 35 * bfin_otp_read - Read OTP pages
68 * 36 *
@@ -86,9 +54,11 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
86 page = *pos / (sizeof(u64) * 2); 54 page = *pos / (sizeof(u64) * 2);
87 while (bytes_done < count) { 55 while (bytes_done < count) {
88 flags = (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF); 56 flags = (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF);
89 stamp("processing page %i (%s)", page, (flags == OTP_UPPER_HALF ? "upper" : "lower")); 57 stamp("processing page %i (0x%x:%s)", page, flags,
90 ret = otp_read(page, flags, &content); 58 (flags & OTP_UPPER_HALF ? "upper" : "lower"));
59 ret = bfrom_OtpRead(page, flags, &content);
91 if (ret & OTP_MASTER_ERROR) { 60 if (ret & OTP_MASTER_ERROR) {
61 stamp("error from otp: 0x%x", ret);
92 bytes_done = -EIO; 62 bytes_done = -EIO;
93 break; 63 break;
94 } 64 }
@@ -96,7 +66,7 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
96 bytes_done = -EFAULT; 66 bytes_done = -EFAULT;
97 break; 67 break;
98 } 68 }
99 if (flags == OTP_UPPER_HALF) 69 if (flags & OTP_UPPER_HALF)
100 ++page; 70 ++page;
101 bytes_done += sizeof(content); 71 bytes_done += sizeof(content);
102 *pos += sizeof(content); 72 *pos += sizeof(content);
@@ -108,14 +78,53 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
108} 78}
109 79
110#ifdef CONFIG_BFIN_OTP_WRITE_ENABLE 80#ifdef CONFIG_BFIN_OTP_WRITE_ENABLE
81static bool allow_writes;
82
83/**
84 * bfin_otp_init_timing - setup OTP timing parameters
85 *
86 * Required before doing any write operation. Algorithms from HRM.
87 */
88static u32 bfin_otp_init_timing(void)
89{
90 u32 tp1, tp2, tp3, timing;
91
92 tp1 = get_sclk() / 1000000;
93 tp2 = (2 * get_sclk() / 10000000) << 8;
94 tp3 = (0x1401) << 15;
95 timing = tp1 | tp2 | tp3;
96 if (bfrom_OtpCommand(OTP_INIT, timing))
97 return 0;
98
99 return timing;
100}
101
102/**
103 * bfin_otp_deinit_timing - set timings to only allow reads
104 *
105 * Should be called after all writes are done.
106 */
107static void bfin_otp_deinit_timing(u32 timing)
108{
109 /* mask bits [31:15] so that any attempts to write fail */
110 bfrom_OtpCommand(OTP_CLOSE, 0);
111 bfrom_OtpCommand(OTP_INIT, timing & ~(-1 << 15));
112 bfrom_OtpCommand(OTP_CLOSE, 0);
113}
114
111/** 115/**
112 * bfin_otp_write - Write OTP pages 116 * bfin_otp_write - write OTP pages
113 * 117 *
114 * All writes must be in half page chunks (half page == 64 bits). 118 * All writes must be in half page chunks (half page == 64 bits).
115 */ 119 */
116static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t count, loff_t *pos) 120static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t count, loff_t *pos)
117{ 121{
118 stampit(); 122 ssize_t bytes_done;
123 u32 timing, page, base_flags, flags, ret;
124 u64 content;
125
126 if (!allow_writes)
127 return -EACCES;
119 128
120 if (count % sizeof(u64)) 129 if (count % sizeof(u64))
121 return -EMSGSIZE; 130 return -EMSGSIZE;
@@ -123,20 +132,96 @@ static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t
123 if (mutex_lock_interruptible(&bfin_otp_lock)) 132 if (mutex_lock_interruptible(&bfin_otp_lock))
124 return -ERESTARTSYS; 133 return -ERESTARTSYS;
125 134
126 /* need otp_init() documentation before this can be implemented */ 135 stampit();
136
137 timing = bfin_otp_init_timing();
138 if (timing == 0) {
139 mutex_unlock(&bfin_otp_lock);
140 return -EIO;
141 }
142
143 base_flags = OTP_CHECK_FOR_PREV_WRITE;
144
145 bytes_done = 0;
146 page = *pos / (sizeof(u64) * 2);
147 while (bytes_done < count) {
148 flags = base_flags | (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF);
149 stamp("processing page %i (0x%x:%s) from %p", page, flags,
150 (flags & OTP_UPPER_HALF ? "upper" : "lower"), buff + bytes_done);
151 if (copy_from_user(&content, buff + bytes_done, sizeof(content))) {
152 bytes_done = -EFAULT;
153 break;
154 }
155 ret = bfrom_OtpWrite(page, flags, &content);
156 if (ret & OTP_MASTER_ERROR) {
157 stamp("error from otp: 0x%x", ret);
158 bytes_done = -EIO;
159 break;
160 }
161 if (flags & OTP_UPPER_HALF)
162 ++page;
163 bytes_done += sizeof(content);
164 *pos += sizeof(content);
165 }
166
167 bfin_otp_deinit_timing(timing);
127 168
128 mutex_unlock(&bfin_otp_lock); 169 mutex_unlock(&bfin_otp_lock);
129 170
171 return bytes_done;
172}
173
174static long bfin_otp_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
175{
176 stampit();
177
178 switch (cmd) {
179 case OTPLOCK: {
180 u32 timing;
181 int ret = -EIO;
182
183 if (!allow_writes)
184 return -EACCES;
185
186 if (mutex_lock_interruptible(&bfin_otp_lock))
187 return -ERESTARTSYS;
188
189 timing = bfin_otp_init_timing();
190 if (timing) {
191 u32 otp_result = bfrom_OtpWrite(arg, OTP_LOCK, NULL);
192 stamp("locking page %lu resulted in 0x%x", arg, otp_result);
193 if (!(otp_result & OTP_MASTER_ERROR))
194 ret = 0;
195
196 bfin_otp_deinit_timing(timing);
197 }
198
199 mutex_unlock(&bfin_otp_lock);
200
201 return ret;
202 }
203
204 case MEMLOCK:
205 allow_writes = false;
206 return 0;
207
208 case MEMUNLOCK:
209 allow_writes = true;
210 return 0;
211 }
212
130 return -EINVAL; 213 return -EINVAL;
131} 214}
132#else 215#else
133# define bfin_otp_write NULL 216# define bfin_otp_write NULL
217# define bfin_otp_ioctl NULL
134#endif 218#endif
135 219
136static struct file_operations bfin_otp_fops = { 220static struct file_operations bfin_otp_fops = {
137 .owner = THIS_MODULE, 221 .owner = THIS_MODULE,
138 .read = bfin_otp_read, 222 .unlocked_ioctl = bfin_otp_ioctl,
139 .write = bfin_otp_write, 223 .read = bfin_otp_read,
224 .write = bfin_otp_write,
140}; 225};
141 226
142static struct miscdevice bfin_otp_misc_device = { 227static struct miscdevice bfin_otp_misc_device = {
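The write path above only accepts multiples of 64 bits and is gated behind the MEMLOCK/MEMUNLOCK/OTPLOCK ioctls pulled in from <mtd/mtd-abi.h>. A hedged userspace sketch of the intended flow follows; the /dev/bfin-otp node name and passing the OTP page number directly as the OTPLOCK argument are assumptions read off the driver code above.

/* Hedged userspace sketch, not from the patch: enable writes, program one
 * 64-bit half page, lock that OTP page, then disable writes again. */
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <mtd/mtd-abi.h>

static int program_half_page(unsigned long page, uint64_t value)
{
	int fd = open("/dev/bfin-otp", O_RDWR);

	if (fd < 0)
		return -1;

	ioctl(fd, MEMUNLOCK, 0);		/* allow_writes = true */
	/* each OTP page is two 64-bit halves; seek to this page's lower half */
	lseek(fd, page * 2 * sizeof(uint64_t), SEEK_SET);
	write(fd, &value, sizeof(value));	/* must be a multiple of 8 bytes */
	ioctl(fd, OTPLOCK, page);		/* set the page protection bit */
	ioctl(fd, MEMLOCK, 0);			/* allow_writes = false */

	close(fd);
	return 0;
}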
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 4a9f3492b921..70a770ac0138 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -166,9 +166,8 @@ static irqreturn_t hpet_interrupt(int irq, void *data)
166 unsigned long m, t; 166 unsigned long m, t;
167 167
168 t = devp->hd_ireqfreq; 168 t = devp->hd_ireqfreq;
169 m = read_counter(&devp->hd_hpet->hpet_mc); 169 m = read_counter(&devp->hd_timer->hpet_compare);
170 write_counter(t + m + devp->hd_hpets->hp_delta, 170 write_counter(t + m, &devp->hd_timer->hpet_compare);
171 &devp->hd_timer->hpet_compare);
172 } 171 }
173 172
174 if (devp->hd_flags & HPET_SHARED_IRQ) 173 if (devp->hd_flags & HPET_SHARED_IRQ)
@@ -504,21 +503,25 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp)
504 g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK; 503 g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK;
505 504
506 if (devp->hd_flags & HPET_PERIODIC) { 505 if (devp->hd_flags & HPET_PERIODIC) {
507 write_counter(t, &timer->hpet_compare);
508 g |= Tn_TYPE_CNF_MASK; 506 g |= Tn_TYPE_CNF_MASK;
509 v |= Tn_TYPE_CNF_MASK; 507 v |= Tn_TYPE_CNF_MASK | Tn_VAL_SET_CNF_MASK;
510 writeq(v, &timer->hpet_config);
511 v |= Tn_VAL_SET_CNF_MASK;
512 writeq(v, &timer->hpet_config); 508 writeq(v, &timer->hpet_config);
513 local_irq_save(flags); 509 local_irq_save(flags);
514 510
515 /* NOTE: what we modify here is a hidden accumulator 511 /*
512 * NOTE: First we modify the hidden accumulator
516 * register supported by periodic-capable comparators. 513 * register supported by periodic-capable comparators.
517 * We never want to modify the (single) counter; that 514 * We never want to modify the (single) counter; that
518 * would affect all the comparators. 515 * would affect all the comparators. The value written
516 * is the counter value when the first interrupt is due.
519 */ 517 */
520 m = read_counter(&hpet->hpet_mc); 518 m = read_counter(&hpet->hpet_mc);
521 write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); 519 write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare);
520 /*
521 * Then we modify the comparator, indicating the period
522 * for subsequent interrupt.
523 */
524 write_counter(t, &timer->hpet_compare);
522 } else { 525 } else {
523 local_irq_save(flags); 526 local_irq_save(flags);
524 m = read_counter(&hpet->hpet_mc); 527 m = read_counter(&hpet->hpet_mc);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 0aede1d6a9ea..6c8b65d069e5 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -690,7 +690,7 @@ static ssize_t read_zero(struct file * file, char __user * buf,
690 690
691 if (chunk > PAGE_SIZE) 691 if (chunk > PAGE_SIZE)
692 chunk = PAGE_SIZE; /* Just for latency reasons */ 692 chunk = PAGE_SIZE; /* Just for latency reasons */
693 unwritten = clear_user(buf, chunk); 693 unwritten = __clear_user(buf, chunk);
694 written += chunk - unwritten; 694 written += chunk - unwritten;
695 if (unwritten) 695 if (unwritten)
696 break; 696 break;
diff --git a/drivers/char/mwave/mwavedd.c b/drivers/char/mwave/mwavedd.c
index 94ad2c3bfc4a..a4ec50c95072 100644
--- a/drivers/char/mwave/mwavedd.c
+++ b/drivers/char/mwave/mwavedd.c
@@ -281,12 +281,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
281 case IOCTL_MW_REGISTER_IPC: { 281 case IOCTL_MW_REGISTER_IPC: {
282 unsigned int ipcnum = (unsigned int) ioarg; 282 unsigned int ipcnum = (unsigned int) ioarg;
283 283
284 PRINTK_3(TRACE_MWAVE,
285 "mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC"
286 " ipcnum %x entry usIntCount %x\n",
287 ipcnum,
288 pDrvData->IPCs[ipcnum].usIntCount);
289
290 if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) { 284 if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) {
291 PRINTK_ERROR(KERN_ERR_MWAVE 285 PRINTK_ERROR(KERN_ERR_MWAVE
292 "mwavedd::mwave_ioctl:" 286 "mwavedd::mwave_ioctl:"
@@ -295,6 +289,12 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
295 ipcnum); 289 ipcnum);
296 return -EINVAL; 290 return -EINVAL;
297 } 291 }
292 PRINTK_3(TRACE_MWAVE,
293 "mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC"
294 " ipcnum %x entry usIntCount %x\n",
295 ipcnum,
296 pDrvData->IPCs[ipcnum].usIntCount);
297
298 lock_kernel(); 298 lock_kernel();
299 pDrvData->IPCs[ipcnum].bIsHere = FALSE; 299 pDrvData->IPCs[ipcnum].bIsHere = FALSE;
300 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; 300 pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
@@ -310,11 +310,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
310 case IOCTL_MW_GET_IPC: { 310 case IOCTL_MW_GET_IPC: {
311 unsigned int ipcnum = (unsigned int) ioarg; 311 unsigned int ipcnum = (unsigned int) ioarg;
312 312
313 PRINTK_3(TRACE_MWAVE,
314 "mwavedd::mwave_ioctl IOCTL_MW_GET_IPC"
315 " ipcnum %x, usIntCount %x\n",
316 ipcnum,
317 pDrvData->IPCs[ipcnum].usIntCount);
318 if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) { 313 if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) {
319 PRINTK_ERROR(KERN_ERR_MWAVE 314 PRINTK_ERROR(KERN_ERR_MWAVE
320 "mwavedd::mwave_ioctl:" 315 "mwavedd::mwave_ioctl:"
@@ -322,6 +317,11 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
322 " Invalid ipcnum %x\n", ipcnum); 317 " Invalid ipcnum %x\n", ipcnum);
323 return -EINVAL; 318 return -EINVAL;
324 } 319 }
320 PRINTK_3(TRACE_MWAVE,
321 "mwavedd::mwave_ioctl IOCTL_MW_GET_IPC"
322 " ipcnum %x, usIntCount %x\n",
323 ipcnum,
324 pDrvData->IPCs[ipcnum].usIntCount);
325 325
326 lock_kernel(); 326 lock_kernel();
327 if (pDrvData->IPCs[ipcnum].bIsEnabled == TRUE) { 327 if (pDrvData->IPCs[ipcnum].bIsEnabled == TRUE) {
diff --git a/drivers/char/random.c b/drivers/char/random.c
index d8a9255e1a3f..04b505e5a5e2 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1231,7 +1231,7 @@ static char sysctl_bootid[16];
1231 * as an ASCII string in the standard UUID format. If accessed via the 1231 * as an ASCII string in the standard UUID format. If accessed via the
1232 * sysctl system call, it is returned as 16 bytes of binary data. 1232 * sysctl system call, it is returned as 16 bytes of binary data.
1233 */ 1233 */
1234static int proc_do_uuid(ctl_table *table, int write, struct file *filp, 1234static int proc_do_uuid(ctl_table *table, int write,
1235 void __user *buffer, size_t *lenp, loff_t *ppos) 1235 void __user *buffer, size_t *lenp, loff_t *ppos)
1236{ 1236{
1237 ctl_table fake_table; 1237 ctl_table fake_table;
@@ -1254,7 +1254,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
1254 fake_table.data = buf; 1254 fake_table.data = buf;
1255 fake_table.maxlen = sizeof(buf); 1255 fake_table.maxlen = sizeof(buf);
1256 1256
1257 return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos); 1257 return proc_dostring(&fake_table, write, buffer, lenp, ppos);
1258} 1258}
1259 1259
1260static int uuid_strategy(ctl_table *table, 1260static int uuid_strategy(ctl_table *table,
diff --git a/drivers/char/rio/rioctrl.c b/drivers/char/rio/rioctrl.c
index eecee0f576d2..74339559f0b9 100644
--- a/drivers/char/rio/rioctrl.c
+++ b/drivers/char/rio/rioctrl.c
@@ -873,7 +873,7 @@ int riocontrol(struct rio_info *p, dev_t dev, int cmd, unsigned long arg, int su
873 /* 873 /*
874 ** It is important that the product code is an unsigned object! 874 ** It is important that the product code is an unsigned object!
875 */ 875 */
876 if (DownLoad.ProductCode > MAX_PRODUCT) { 876 if (DownLoad.ProductCode >= MAX_PRODUCT) {
877 rio_dprintk(RIO_DEBUG_CTRL, "RIO_DOWNLOAD: Bad product code %d passed\n", DownLoad.ProductCode); 877 rio_dprintk(RIO_DEBUG_CTRL, "RIO_DOWNLOAD: Bad product code %d passed\n", DownLoad.ProductCode);
878 p->RIOError.Error = NO_SUCH_PRODUCT; 878 p->RIOError.Error = NO_SUCH_PRODUCT;
879 return -ENXIO; 879 return -ENXIO;
diff --git a/drivers/char/uv_mmtimer.c b/drivers/char/uv_mmtimer.c
new file mode 100644
index 000000000000..867b67be9f0a
--- /dev/null
+++ b/drivers/char/uv_mmtimer.c
@@ -0,0 +1,216 @@
1/*
2 * Timer device implementation for SGI UV platform.
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 2009 Silicon Graphics, Inc. All rights reserved.
9 *
10 */
11
12#include <linux/types.h>
13#include <linux/kernel.h>
14#include <linux/ioctl.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mmtimer.h>
21#include <linux/miscdevice.h>
22#include <linux/posix-timers.h>
23#include <linux/interrupt.h>
24#include <linux/time.h>
25#include <linux/math64.h>
26#include <linux/smp_lock.h>
27
28#include <asm/genapic.h>
29#include <asm/uv/uv_hub.h>
30#include <asm/uv/bios.h>
31#include <asm/uv/uv.h>
32
33MODULE_AUTHOR("Dimitri Sivanich <sivanich@sgi.com>");
34MODULE_DESCRIPTION("SGI UV Memory Mapped RTC Timer");
35MODULE_LICENSE("GPL");
36
37/* name of the device, usually in /dev */
38#define UV_MMTIMER_NAME "mmtimer"
39#define UV_MMTIMER_DESC "SGI UV Memory Mapped RTC Timer"
40#define UV_MMTIMER_VERSION "1.0"
41
42static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd,
43 unsigned long arg);
44static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma);
45
46/*
47 * Period in femtoseconds (10^-15 s)
48 */
49static unsigned long uv_mmtimer_femtoperiod;
50
51static const struct file_operations uv_mmtimer_fops = {
52 .owner = THIS_MODULE,
53 .mmap = uv_mmtimer_mmap,
54 .unlocked_ioctl = uv_mmtimer_ioctl,
55};
56
57/**
58 * uv_mmtimer_ioctl - ioctl interface for /dev/uv_mmtimer
59 * @file: file structure for the device
60 * @cmd: command to execute
61 * @arg: optional argument to command
62 *
63 * Executes the command specified by @cmd. Returns 0 for success, < 0 for
64 * failure.
65 *
66 * Valid commands:
67 *
68 * %MMTIMER_GETOFFSET - Should return the offset (relative to the start
69 * of the page where the registers are mapped) for the counter in question.
70 *
71 * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15)
72 * seconds
73 *
74 * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address
75 * specified by @arg
76 *
77 * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter
78 *
79 * %MMTIMER_MMAPAVAIL - Returns 1 if registers can be mmap'd into userspace
80 *
81 * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it
82 * in the address specified by @arg.
83 */
84static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd,
85 unsigned long arg)
86{
87 int ret = 0;
88
89 switch (cmd) {
90 case MMTIMER_GETOFFSET: /* offset of the counter */
91 /*
92 * UV RTC register is on its own page
93 */
94 if (PAGE_SIZE <= (1 << 16))
95 ret = ((UV_LOCAL_MMR_BASE | UVH_RTC) & (PAGE_SIZE-1))
96 / 8;
97 else
98 ret = -ENOSYS;
99 break;
100
101 case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */
102 if (copy_to_user((unsigned long __user *)arg,
103 &uv_mmtimer_femtoperiod, sizeof(unsigned long)))
104 ret = -EFAULT;
105 break;
106
107 case MMTIMER_GETFREQ: /* frequency in Hz */
108 if (copy_to_user((unsigned long __user *)arg,
109 &sn_rtc_cycles_per_second,
110 sizeof(unsigned long)))
111 ret = -EFAULT;
112 break;
113
114 case MMTIMER_GETBITS: /* number of bits in the clock */
115 ret = hweight64(UVH_RTC_REAL_TIME_CLOCK_MASK);
116 break;
117
118 case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */
119 ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0;
120 break;
121
122 case MMTIMER_GETCOUNTER:
123 if (copy_to_user((unsigned long __user *)arg,
124 (unsigned long *)uv_local_mmr_address(UVH_RTC),
125 sizeof(unsigned long)))
126 ret = -EFAULT;
127 break;
128 default:
129 ret = -ENOTTY;
130 break;
131 }
132 return ret;
133}
134
135/**
136 * uv_mmtimer_mmap - maps the clock's registers into userspace
137 * @file: file structure for the device
138 * @vma: VMA to map the registers into
139 *
140 * Calls remap_pfn_range() to map the clock's registers into
141 * the calling process' address space.
142 */
143static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma)
144{
145 unsigned long uv_mmtimer_addr;
146
147 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
148 return -EINVAL;
149
150 if (vma->vm_flags & VM_WRITE)
151 return -EPERM;
152
153 if (PAGE_SIZE > (1 << 16))
154 return -ENOSYS;
155
156 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
157
158 uv_mmtimer_addr = UV_LOCAL_MMR_BASE | UVH_RTC;
159 uv_mmtimer_addr &= ~(PAGE_SIZE - 1);
160 uv_mmtimer_addr &= 0xfffffffffffffffUL;
161
162 if (remap_pfn_range(vma, vma->vm_start, uv_mmtimer_addr >> PAGE_SHIFT,
163 PAGE_SIZE, vma->vm_page_prot)) {
164 printk(KERN_ERR "remap_pfn_range failed in uv_mmtimer_mmap\n");
165 return -EAGAIN;
166 }
167
168 return 0;
169}
170
171static struct miscdevice uv_mmtimer_miscdev = {
172 MISC_DYNAMIC_MINOR,
173 UV_MMTIMER_NAME,
174 &uv_mmtimer_fops
175};
176
177
178/**
179 * uv_mmtimer_init - device initialization routine
180 *
181 * Does initial setup for the uv_mmtimer device.
182 */
183static int __init uv_mmtimer_init(void)
184{
185 if (!is_uv_system()) {
186 printk(KERN_ERR "%s: Hardware unsupported\n", UV_MMTIMER_NAME);
187 return -1;
188 }
189
190 /*
191 * Sanity check the cycles/sec variable
192 */
193 if (sn_rtc_cycles_per_second < 100000) {
194 printk(KERN_ERR "%s: unable to determine clock frequency\n",
195 UV_MMTIMER_NAME);
196 return -1;
197 }
198
199 uv_mmtimer_femtoperiod = ((unsigned long)1E15 +
200 sn_rtc_cycles_per_second / 2) /
201 sn_rtc_cycles_per_second;
202
203 if (misc_register(&uv_mmtimer_miscdev)) {
204 printk(KERN_ERR "%s: failed to register device\n",
205 UV_MMTIMER_NAME);
206 return -1;
207 }
208
209 printk(KERN_INFO "%s: v%s, %ld MHz\n", UV_MMTIMER_DESC,
210 UV_MMTIMER_VERSION,
211 sn_rtc_cycles_per_second/(unsigned long)1E6);
212
213 return 0;
214}
215
216module_init(uv_mmtimer_init);
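The command set mirrors the existing Altix mmtimer interface, so userspace can either read the counter with MMTIMER_GETCOUNTER or mmap() the register page and index it with the offset returned by MMTIMER_GETOFFSET (in units of unsigned long, per the driver above). A hedged sketch follows; the /dev/mmtimer node name follows UV_MMTIMER_NAME and <linux/mmtimer.h> is assumed to provide the ioctl numbers.

/* Hedged userspace sketch, not from the patch: query the RTC frequency,
 * then read the free-running counter through the read-only mapping. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/mmtimer.h>

int main(void)
{
	unsigned long freq;
	volatile unsigned long *page;
	int off, fd = open("/dev/mmtimer", O_RDONLY);

	if (fd < 0)
		return 1;

	ioctl(fd, MMTIMER_GETFREQ, &freq);
	off = ioctl(fd, MMTIMER_GETOFFSET, 0);
	if (off < 0)
		return 1;

	page = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
	if (page == MAP_FAILED)
		return 1;

	printf("UV RTC: %lu ticks @ %lu Hz\n", page[off], freq);

	munmap((void *)page, getpagesize());
	close(fd);
	return 0;
}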
diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c
index 25b743abfb59..52e6bb70a490 100644
--- a/drivers/dca/dca-core.c
+++ b/drivers/dca/dca-core.c
@@ -28,7 +28,7 @@
28#include <linux/device.h> 28#include <linux/device.h>
29#include <linux/dca.h> 29#include <linux/dca.h>
30 30
31#define DCA_VERSION "1.8" 31#define DCA_VERSION "1.12.1"
32 32
33MODULE_VERSION(DCA_VERSION); 33MODULE_VERSION(DCA_VERSION);
34MODULE_LICENSE("GPL"); 34MODULE_LICENSE("GPL");
@@ -36,20 +36,92 @@ MODULE_AUTHOR("Intel Corporation");
36 36
37static DEFINE_SPINLOCK(dca_lock); 37static DEFINE_SPINLOCK(dca_lock);
38 38
39static LIST_HEAD(dca_providers); 39static LIST_HEAD(dca_domains);
40 40
41static struct dca_provider *dca_find_provider_by_dev(struct device *dev) 41static struct pci_bus *dca_pci_rc_from_dev(struct device *dev)
42{ 42{
43 struct dca_provider *dca, *ret = NULL; 43 struct pci_dev *pdev = to_pci_dev(dev);
44 struct pci_bus *bus = pdev->bus;
44 45
45 list_for_each_entry(dca, &dca_providers, node) { 46 while (bus->parent)
46 if ((!dev) || (dca->ops->dev_managed(dca, dev))) { 47 bus = bus->parent;
47 ret = dca; 48
48 break; 49 return bus;
49 } 50}
51
52static struct dca_domain *dca_allocate_domain(struct pci_bus *rc)
53{
54 struct dca_domain *domain;
55
56 domain = kzalloc(sizeof(*domain), GFP_NOWAIT);
57 if (!domain)
58 return NULL;
59
60 INIT_LIST_HEAD(&domain->dca_providers);
61 domain->pci_rc = rc;
62
63 return domain;
64}
65
66static void dca_free_domain(struct dca_domain *domain)
67{
68 list_del(&domain->node);
69 kfree(domain);
70}
71
72static struct dca_domain *dca_find_domain(struct pci_bus *rc)
73{
74 struct dca_domain *domain;
75
76 list_for_each_entry(domain, &dca_domains, node)
77 if (domain->pci_rc == rc)
78 return domain;
79
80 return NULL;
81}
82
83static struct dca_domain *dca_get_domain(struct device *dev)
84{
85 struct pci_bus *rc;
86 struct dca_domain *domain;
87
88 rc = dca_pci_rc_from_dev(dev);
89 domain = dca_find_domain(rc);
90
91 if (!domain) {
92 domain = dca_allocate_domain(rc);
93 if (domain)
94 list_add(&domain->node, &dca_domains);
95 }
96
97 return domain;
98}
99
100static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
101{
102 struct dca_provider *dca;
103 struct pci_bus *rc;
104 struct dca_domain *domain;
105
106 if (dev) {
107 rc = dca_pci_rc_from_dev(dev);
108 domain = dca_find_domain(rc);
109 if (!domain)
110 return NULL;
111 } else {
112 if (!list_empty(&dca_domains))
113 domain = list_first_entry(&dca_domains,
114 struct dca_domain,
115 node);
116 else
117 return NULL;
50 } 118 }
51 119
52 return ret; 120 list_for_each_entry(dca, &domain->dca_providers, node)
121 if ((!dev) || (dca->ops->dev_managed(dca, dev)))
122 return dca;
123
124 return NULL;
53} 125}
54 126
55/** 127/**
@@ -61,6 +133,8 @@ int dca_add_requester(struct device *dev)
61 struct dca_provider *dca; 133 struct dca_provider *dca;
62 int err, slot = -ENODEV; 134 int err, slot = -ENODEV;
63 unsigned long flags; 135 unsigned long flags;
136 struct pci_bus *pci_rc;
137 struct dca_domain *domain;
64 138
65 if (!dev) 139 if (!dev)
66 return -EFAULT; 140 return -EFAULT;
@@ -74,7 +148,14 @@ int dca_add_requester(struct device *dev)
74 return -EEXIST; 148 return -EEXIST;
75 } 149 }
76 150
77 list_for_each_entry(dca, &dca_providers, node) { 151 pci_rc = dca_pci_rc_from_dev(dev);
152 domain = dca_find_domain(pci_rc);
153 if (!domain) {
154 spin_unlock_irqrestore(&dca_lock, flags);
155 return -ENODEV;
156 }
157
158 list_for_each_entry(dca, &domain->dca_providers, node) {
78 slot = dca->ops->add_requester(dca, dev); 159 slot = dca->ops->add_requester(dca, dev);
79 if (slot >= 0) 160 if (slot >= 0)
80 break; 161 break;
@@ -222,13 +303,19 @@ int register_dca_provider(struct dca_provider *dca, struct device *dev)
222{ 303{
223 int err; 304 int err;
224 unsigned long flags; 305 unsigned long flags;
306 struct dca_domain *domain;
225 307
226 err = dca_sysfs_add_provider(dca, dev); 308 err = dca_sysfs_add_provider(dca, dev);
227 if (err) 309 if (err)
228 return err; 310 return err;
229 311
230 spin_lock_irqsave(&dca_lock, flags); 312 spin_lock_irqsave(&dca_lock, flags);
231 list_add(&dca->node, &dca_providers); 313 domain = dca_get_domain(dev);
314 if (!domain) {
315 spin_unlock_irqrestore(&dca_lock, flags);
316 return -ENODEV;
317 }
318 list_add(&dca->node, &domain->dca_providers);
232 spin_unlock_irqrestore(&dca_lock, flags); 319 spin_unlock_irqrestore(&dca_lock, flags);
233 320
234 blocking_notifier_call_chain(&dca_provider_chain, 321 blocking_notifier_call_chain(&dca_provider_chain,
@@ -241,15 +328,24 @@ EXPORT_SYMBOL_GPL(register_dca_provider);
241 * unregister_dca_provider - remove a dca provider 328 * unregister_dca_provider - remove a dca provider
242 * @dca - struct created by alloc_dca_provider() 329 * @dca - struct created by alloc_dca_provider()
243 */ 330 */
244void unregister_dca_provider(struct dca_provider *dca) 331void unregister_dca_provider(struct dca_provider *dca, struct device *dev)
245{ 332{
246 unsigned long flags; 333 unsigned long flags;
334 struct pci_bus *pci_rc;
335 struct dca_domain *domain;
247 336
248 blocking_notifier_call_chain(&dca_provider_chain, 337 blocking_notifier_call_chain(&dca_provider_chain,
249 DCA_PROVIDER_REMOVE, NULL); 338 DCA_PROVIDER_REMOVE, NULL);
250 339
251 spin_lock_irqsave(&dca_lock, flags); 340 spin_lock_irqsave(&dca_lock, flags);
341
252 list_del(&dca->node); 342 list_del(&dca->node);
343
344 pci_rc = dca_pci_rc_from_dev(dev);
345 domain = dca_find_domain(pci_rc);
346 if (list_empty(&domain->dca_providers))
347 dca_free_domain(domain);
348
253 spin_unlock_irqrestore(&dca_lock, flags); 349 spin_unlock_irqrestore(&dca_lock, flags);
254 350
255 dca_sysfs_remove_provider(dca); 351 dca_sysfs_remove_provider(dca);
@@ -276,7 +372,7 @@ EXPORT_SYMBOL_GPL(dca_unregister_notify);
276 372
277static int __init dca_init(void) 373static int __init dca_init(void)
278{ 374{
279 printk(KERN_ERR "dca service started, version %s\n", DCA_VERSION); 375 pr_info("dca service started, version %s\n", DCA_VERSION);
280 return dca_sysfs_init(); 376 return dca_sysfs_init();
281} 377}
282 378
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 81e1020fb514..5903a88351bf 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -17,11 +17,15 @@ if DMADEVICES
17 17
18comment "DMA Devices" 18comment "DMA Devices"
19 19
20config ASYNC_TX_DISABLE_CHANNEL_SWITCH
21 bool
22
20config INTEL_IOATDMA 23config INTEL_IOATDMA
21 tristate "Intel I/OAT DMA support" 24 tristate "Intel I/OAT DMA support"
22 depends on PCI && X86 25 depends on PCI && X86
23 select DMA_ENGINE 26 select DMA_ENGINE
24 select DCA 27 select DCA
28 select ASYNC_TX_DISABLE_CHANNEL_SWITCH
25 help 29 help
26 Enable support for the Intel(R) I/OAT DMA engine present 30 Enable support for the Intel(R) I/OAT DMA engine present
27 in recent Intel Xeon chipsets. 31 in recent Intel Xeon chipsets.
@@ -97,6 +101,14 @@ config TXX9_DMAC
97 Support the TXx9 SoC internal DMA controller. This can be 101 Support the TXx9 SoC internal DMA controller. This can be
98 integrated in chips such as the Toshiba TX4927/38/39. 102 integrated in chips such as the Toshiba TX4927/38/39.
99 103
104config SH_DMAE
105 tristate "Renesas SuperH DMAC support"
106 depends on SUPERH && SH_DMA
107 depends on !SH_DMA_API
108 select DMA_ENGINE
109 help
110 Enable support for the Renesas SuperH DMA controllers.
111
100config DMA_ENGINE 112config DMA_ENGINE
101 bool 113 bool
102 114
@@ -116,7 +128,7 @@ config NET_DMA
116 128
117config ASYNC_TX_DMA 129config ASYNC_TX_DMA
118 bool "Async_tx: Offload support for the async_tx api" 130 bool "Async_tx: Offload support for the async_tx api"
119 depends on DMA_ENGINE && !HIGHMEM64G 131 depends on DMA_ENGINE
120 help 132 help
121 This allows the async_tx api to take advantage of offload engines for 133 This allows the async_tx api to take advantage of offload engines for
122 memcpy, memset, xor, and raid6 p+q operations. If your platform has 134 memcpy, memset, xor, and raid6 p+q operations. If your platform has
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 40e1e0083571..eca71ba78ae9 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,8 +1,7 @@
1obj-$(CONFIG_DMA_ENGINE) += dmaengine.o 1obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
2obj-$(CONFIG_NET_DMA) += iovlock.o 2obj-$(CONFIG_NET_DMA) += iovlock.o
3obj-$(CONFIG_DMATEST) += dmatest.o 3obj-$(CONFIG_DMATEST) += dmatest.o
4obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o 4obj-$(CONFIG_INTEL_IOATDMA) += ioat/
5ioatdma-objs := ioat.o ioat_dma.o ioat_dca.o
6obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o 5obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
7obj-$(CONFIG_FSL_DMA) += fsldma.o 6obj-$(CONFIG_FSL_DMA) += fsldma.o
8obj-$(CONFIG_MV_XOR) += mv_xor.o 7obj-$(CONFIG_MV_XOR) += mv_xor.o
@@ -10,3 +9,4 @@ obj-$(CONFIG_DW_DMAC) += dw_dmac.o
10obj-$(CONFIG_AT_HDMAC) += at_hdmac.o 9obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
11obj-$(CONFIG_MX3_IPU) += ipu/ 10obj-$(CONFIG_MX3_IPU) += ipu/
12obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o 11obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o
12obj-$(CONFIG_SH_DMAE) += shdma.o
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index c8522e6f1ad2..7585c4164bd5 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -87,6 +87,7 @@ static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan,
87 desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys); 87 desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys);
88 if (desc) { 88 if (desc) {
89 memset(desc, 0, sizeof(struct at_desc)); 89 memset(desc, 0, sizeof(struct at_desc));
90 INIT_LIST_HEAD(&desc->tx_list);
90 dma_async_tx_descriptor_init(&desc->txd, chan); 91 dma_async_tx_descriptor_init(&desc->txd, chan);
91 /* txd.flags will be overwritten in prep functions */ 92 /* txd.flags will be overwritten in prep functions */
92 desc->txd.flags = DMA_CTRL_ACK; 93 desc->txd.flags = DMA_CTRL_ACK;
@@ -150,11 +151,11 @@ static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc)
150 struct at_desc *child; 151 struct at_desc *child;
151 152
152 spin_lock_bh(&atchan->lock); 153 spin_lock_bh(&atchan->lock);
153 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 154 list_for_each_entry(child, &desc->tx_list, desc_node)
154 dev_vdbg(chan2dev(&atchan->chan_common), 155 dev_vdbg(chan2dev(&atchan->chan_common),
155 "moving child desc %p to freelist\n", 156 "moving child desc %p to freelist\n",
156 child); 157 child);
157 list_splice_init(&desc->txd.tx_list, &atchan->free_list); 158 list_splice_init(&desc->tx_list, &atchan->free_list);
158 dev_vdbg(chan2dev(&atchan->chan_common), 159 dev_vdbg(chan2dev(&atchan->chan_common),
159 "moving desc %p to freelist\n", desc); 160 "moving desc %p to freelist\n", desc);
160 list_add(&desc->desc_node, &atchan->free_list); 161 list_add(&desc->desc_node, &atchan->free_list);
@@ -247,30 +248,33 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
247 param = txd->callback_param; 248 param = txd->callback_param;
248 249
249 /* move children to free_list */ 250 /* move children to free_list */
250 list_splice_init(&txd->tx_list, &atchan->free_list); 251 list_splice_init(&desc->tx_list, &atchan->free_list);
251 /* move myself to free_list */ 252 /* move myself to free_list */
252 list_move(&desc->desc_node, &atchan->free_list); 253 list_move(&desc->desc_node, &atchan->free_list);
253 254
254 /* unmap dma addresses */ 255 /* unmap dma addresses */
255 if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { 256 if (!atchan->chan_common.private) {
256 if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) 257 struct device *parent = chan2parent(&atchan->chan_common);
257 dma_unmap_single(chan2parent(&atchan->chan_common), 258 if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
258 desc->lli.daddr, 259 if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
259 desc->len, DMA_FROM_DEVICE); 260 dma_unmap_single(parent,
260 else 261 desc->lli.daddr,
261 dma_unmap_page(chan2parent(&atchan->chan_common), 262 desc->len, DMA_FROM_DEVICE);
262 desc->lli.daddr, 263 else
263 desc->len, DMA_FROM_DEVICE); 264 dma_unmap_page(parent,
264 } 265 desc->lli.daddr,
265 if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { 266 desc->len, DMA_FROM_DEVICE);
266 if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) 267 }
267 dma_unmap_single(chan2parent(&atchan->chan_common), 268 if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
268 desc->lli.saddr, 269 if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
269 desc->len, DMA_TO_DEVICE); 270 dma_unmap_single(parent,
270 else 271 desc->lli.saddr,
271 dma_unmap_page(chan2parent(&atchan->chan_common), 272 desc->len, DMA_TO_DEVICE);
272 desc->lli.saddr, 273 else
273 desc->len, DMA_TO_DEVICE); 274 dma_unmap_page(parent,
275 desc->lli.saddr,
276 desc->len, DMA_TO_DEVICE);
277 }
274 } 278 }
275 279
276 /* 280 /*
@@ -334,7 +338,7 @@ static void atc_cleanup_descriptors(struct at_dma_chan *atchan)
334 /* This one is currently in progress */ 338 /* This one is currently in progress */
335 return; 339 return;
336 340
337 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 341 list_for_each_entry(child, &desc->tx_list, desc_node)
338 if (!(child->lli.ctrla & ATC_DONE)) 342 if (!(child->lli.ctrla & ATC_DONE))
339 /* Currently in progress */ 343 /* Currently in progress */
340 return; 344 return;
@@ -407,7 +411,7 @@ static void atc_handle_error(struct at_dma_chan *atchan)
407 dev_crit(chan2dev(&atchan->chan_common), 411 dev_crit(chan2dev(&atchan->chan_common),
408 " cookie: %d\n", bad_desc->txd.cookie); 412 " cookie: %d\n", bad_desc->txd.cookie);
409 atc_dump_lli(atchan, &bad_desc->lli); 413 atc_dump_lli(atchan, &bad_desc->lli);
410 list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) 414 list_for_each_entry(child, &bad_desc->tx_list, desc_node)
411 atc_dump_lli(atchan, &child->lli); 415 atc_dump_lli(atchan, &child->lli);
412 416
413 /* Pretend the descriptor completed successfully */ 417 /* Pretend the descriptor completed successfully */
@@ -587,7 +591,7 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
587 prev->lli.dscr = desc->txd.phys; 591 prev->lli.dscr = desc->txd.phys;
588 /* insert the link descriptor to the LD ring */ 592 /* insert the link descriptor to the LD ring */
589 list_add_tail(&desc->desc_node, 593 list_add_tail(&desc->desc_node,
590 &first->txd.tx_list); 594 &first->tx_list);
591 } 595 }
592 prev = desc; 596 prev = desc;
593 } 597 }
@@ -646,8 +650,6 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
646 650
647 reg_width = atslave->reg_width; 651 reg_width = atslave->reg_width;
648 652
649 sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
650
651 ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla; 653 ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla;
652 ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN; 654 ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN;
653 655
@@ -687,7 +689,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
687 prev->lli.dscr = desc->txd.phys; 689 prev->lli.dscr = desc->txd.phys;
688 /* insert the link descriptor to the LD ring */ 690 /* insert the link descriptor to the LD ring */
689 list_add_tail(&desc->desc_node, 691 list_add_tail(&desc->desc_node,
690 &first->txd.tx_list); 692 &first->tx_list);
691 } 693 }
692 prev = desc; 694 prev = desc;
693 total_len += len; 695 total_len += len;
@@ -729,7 +731,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
729 prev->lli.dscr = desc->txd.phys; 731 prev->lli.dscr = desc->txd.phys;
730 /* insert the link descriptor to the LD ring */ 732 /* insert the link descriptor to the LD ring */
731 list_add_tail(&desc->desc_node, 733 list_add_tail(&desc->desc_node,
732 &first->txd.tx_list); 734 &first->tx_list);
733 } 735 }
734 prev = desc; 736 prev = desc;
735 total_len += len; 737 total_len += len;
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 4c972afc49ec..495457e3dc4b 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -165,6 +165,7 @@ struct at_desc {
165 struct at_lli lli; 165 struct at_lli lli;
166 166
167 /* THEN values for driver housekeeping */ 167 /* THEN values for driver housekeeping */
168 struct list_head tx_list;
168 struct dma_async_tx_descriptor txd; 169 struct dma_async_tx_descriptor txd;
169 struct list_head desc_node; 170 struct list_head desc_node;
170 size_t len; 171 size_t len;
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 5a87384ea4ff..bd0b248de2cf 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -608,6 +608,40 @@ void dmaengine_put(void)
608} 608}
609EXPORT_SYMBOL(dmaengine_put); 609EXPORT_SYMBOL(dmaengine_put);
610 610
611static bool device_has_all_tx_types(struct dma_device *device)
612{
613 /* A device that satisfies this test has channels that will never cause
614 * an async_tx channel switch event as all possible operation types can
615 * be handled.
616 */
617 #ifdef CONFIG_ASYNC_TX_DMA
618 if (!dma_has_cap(DMA_INTERRUPT, device->cap_mask))
619 return false;
620 #endif
621
622 #if defined(CONFIG_ASYNC_MEMCPY) || defined(CONFIG_ASYNC_MEMCPY_MODULE)
623 if (!dma_has_cap(DMA_MEMCPY, device->cap_mask))
624 return false;
625 #endif
626
627 #if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE)
628 if (!dma_has_cap(DMA_MEMSET, device->cap_mask))
629 return false;
630 #endif
631
632 #if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE)
633 if (!dma_has_cap(DMA_XOR, device->cap_mask))
634 return false;
635 #endif
636
637 #if defined(CONFIG_ASYNC_PQ) || defined(CONFIG_ASYNC_PQ_MODULE)
638 if (!dma_has_cap(DMA_PQ, device->cap_mask))
639 return false;
640 #endif
641
642 return true;
643}
644
611static int get_dma_id(struct dma_device *device) 645static int get_dma_id(struct dma_device *device)
612{ 646{
613 int rc; 647 int rc;
@@ -644,8 +678,12 @@ int dma_async_device_register(struct dma_device *device)
644 !device->device_prep_dma_memcpy); 678 !device->device_prep_dma_memcpy);
645 BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) && 679 BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) &&
646 !device->device_prep_dma_xor); 680 !device->device_prep_dma_xor);
647 BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) && 681 BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) &&
648 !device->device_prep_dma_zero_sum); 682 !device->device_prep_dma_xor_val);
683 BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) &&
684 !device->device_prep_dma_pq);
685 BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
686 !device->device_prep_dma_pq_val);
649 BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && 687 BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
650 !device->device_prep_dma_memset); 688 !device->device_prep_dma_memset);
651 BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && 689 BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
@@ -661,6 +699,12 @@ int dma_async_device_register(struct dma_device *device)
661 BUG_ON(!device->device_issue_pending); 699 BUG_ON(!device->device_issue_pending);
662 BUG_ON(!device->dev); 700 BUG_ON(!device->dev);
663 701
702 /* note: this only matters in the
703 * CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y case
704 */
705 if (device_has_all_tx_types(device))
706 dma_cap_set(DMA_ASYNC_TX, device->cap_mask);
707
664 idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); 708 idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL);
665 if (!idr_ref) 709 if (!idr_ref)
666 return -ENOMEM; 710 return -ENOMEM;
@@ -933,55 +977,29 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
933{ 977{
934 tx->chan = chan; 978 tx->chan = chan;
935 spin_lock_init(&tx->lock); 979 spin_lock_init(&tx->lock);
936 INIT_LIST_HEAD(&tx->tx_list);
937} 980}
938EXPORT_SYMBOL(dma_async_tx_descriptor_init); 981EXPORT_SYMBOL(dma_async_tx_descriptor_init);
939 982
940/* dma_wait_for_async_tx - spin wait for a transaction to complete 983/* dma_wait_for_async_tx - spin wait for a transaction to complete
941 * @tx: in-flight transaction to wait on 984 * @tx: in-flight transaction to wait on
942 *
943 * This routine assumes that tx was obtained from a call to async_memcpy,
944 * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped
945 * and submitted). Walking the parent chain is only meant to cover for DMA
946 * drivers that do not implement the DMA_INTERRUPT capability and may race with
947 * the driver's descriptor cleanup routine.
948 */ 985 */
949enum dma_status 986enum dma_status
950dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) 987dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
951{ 988{
952 enum dma_status status; 989 unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
953 struct dma_async_tx_descriptor *iter;
954 struct dma_async_tx_descriptor *parent;
955 990
956 if (!tx) 991 if (!tx)
957 return DMA_SUCCESS; 992 return DMA_SUCCESS;
958 993
959 WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" 994 while (tx->cookie == -EBUSY) {
960 " %s\n", __func__, dma_chan_name(tx->chan)); 995 if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
961 996 pr_err("%s timeout waiting for descriptor submission\n",
962 /* poll through the dependency chain, return when tx is complete */ 997 __func__);
963 do { 998 return DMA_ERROR;
964 iter = tx; 999 }
965 1000 cpu_relax();
966 /* find the root of the unsubmitted dependency chain */ 1001 }
967 do { 1002 return dma_sync_wait(tx->chan, tx->cookie);
968 parent = iter->parent;
969 if (!parent)
970 break;
971 else
972 iter = parent;
973 } while (parent);
974
975 /* there is a small window for ->parent == NULL and
976 * ->cookie == -EBUSY
977 */
978 while (iter->cookie == -EBUSY)
979 cpu_relax();
980
981 status = dma_sync_wait(iter->chan, iter->cookie);
982 } while (status == DMA_IN_PROGRESS || (iter != tx));
983
984 return status;
985} 1003}
986EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); 1004EXPORT_SYMBOL_GPL(dma_wait_for_async_tx);
987 1005
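device_has_all_tx_types() only has an effect when CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y: a device is tagged DMA_ASYNC_TX, and thus eligible for async_tx use without channel switching, only when it advertises every operation type the enabled async_tx clients may request. A hedged sketch of a driver registering such a device; the function and variable names are illustrative, not from the patch.

#include <linux/dmaengine.h>

/* Hedged sketch, not from the patch: a driver that offloads every async_tx
 * operation type sets all the relevant capabilities before registration, so
 * device_has_all_tx_types() will mark it DMA_ASYNC_TX. */
static int my_adma_register(struct dma_device *dma)
{
	dma_cap_set(DMA_MEMCPY, dma->cap_mask);
	dma_cap_set(DMA_MEMSET, dma->cap_mask);
	dma_cap_set(DMA_XOR, dma->cap_mask);
	dma_cap_set(DMA_PQ, dma->cap_mask);
	dma_cap_set(DMA_INTERRUPT, dma->cap_mask);

	/* the matching ->device_prep_dma_* hooks must also be provided,
	 * or dma_async_device_register() will BUG_ON() */
	return dma_async_device_register(dma);
}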
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index d93017fc7872..a32a4cf7b1e0 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -48,6 +48,11 @@ module_param(xor_sources, uint, S_IRUGO);
48MODULE_PARM_DESC(xor_sources, 48MODULE_PARM_DESC(xor_sources,
49 "Number of xor source buffers (default: 3)"); 49 "Number of xor source buffers (default: 3)");
50 50
51static unsigned int pq_sources = 3;
52module_param(pq_sources, uint, S_IRUGO);
53MODULE_PARM_DESC(pq_sources,
54 "Number of p+q source buffers (default: 3)");
55
51/* 56/*
52 * Initialization patterns. All bytes in the source buffer has bit 7 57 * Initialization patterns. All bytes in the source buffer has bit 7
53 * set, all bytes in the destination buffer has bit 7 cleared. 58 * set, all bytes in the destination buffer has bit 7 cleared.
@@ -232,6 +237,7 @@ static int dmatest_func(void *data)
232 dma_cookie_t cookie; 237 dma_cookie_t cookie;
233 enum dma_status status; 238 enum dma_status status;
234 enum dma_ctrl_flags flags; 239 enum dma_ctrl_flags flags;
240 u8 pq_coefs[pq_sources];
235 int ret; 241 int ret;
236 int src_cnt; 242 int src_cnt;
237 int dst_cnt; 243 int dst_cnt;
@@ -248,6 +254,11 @@ static int dmatest_func(void *data)
248 else if (thread->type == DMA_XOR) { 254 else if (thread->type == DMA_XOR) {
249 src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ 255 src_cnt = xor_sources | 1; /* force odd to ensure dst = src */
250 dst_cnt = 1; 256 dst_cnt = 1;
257 } else if (thread->type == DMA_PQ) {
258 src_cnt = pq_sources | 1; /* force odd to ensure dst = src */
259 dst_cnt = 2;
260 for (i = 0; i < pq_sources; i++)
261 pq_coefs[i] = 1;
251 } else 262 } else
252 goto err_srcs; 263 goto err_srcs;
253 264
@@ -283,6 +294,7 @@ static int dmatest_func(void *data)
283 dma_addr_t dma_dsts[dst_cnt]; 294 dma_addr_t dma_dsts[dst_cnt];
284 struct completion cmp; 295 struct completion cmp;
285 unsigned long tmo = msecs_to_jiffies(3000); 296 unsigned long tmo = msecs_to_jiffies(3000);
297 u8 align = 0;
286 298
287 total_tests++; 299 total_tests++;
288 300
@@ -290,6 +302,18 @@ static int dmatest_func(void *data)
290 src_off = dmatest_random() % (test_buf_size - len + 1); 302 src_off = dmatest_random() % (test_buf_size - len + 1);
291 dst_off = dmatest_random() % (test_buf_size - len + 1); 303 dst_off = dmatest_random() % (test_buf_size - len + 1);
292 304
305 /* honor alignment restrictions */
306 if (thread->type == DMA_MEMCPY)
307 align = dev->copy_align;
308 else if (thread->type == DMA_XOR)
309 align = dev->xor_align;
310 else if (thread->type == DMA_PQ)
311 align = dev->pq_align;
312
313 len = (len >> align) << align;
314 src_off = (src_off >> align) << align;
315 dst_off = (dst_off >> align) << align;
316
293 dmatest_init_srcs(thread->srcs, src_off, len); 317 dmatest_init_srcs(thread->srcs, src_off, len);
294 dmatest_init_dsts(thread->dsts, dst_off, len); 318 dmatest_init_dsts(thread->dsts, dst_off, len);
295 319
@@ -306,6 +330,7 @@ static int dmatest_func(void *data)
306 DMA_BIDIRECTIONAL); 330 DMA_BIDIRECTIONAL);
307 } 331 }
308 332
333
309 if (thread->type == DMA_MEMCPY) 334 if (thread->type == DMA_MEMCPY)
310 tx = dev->device_prep_dma_memcpy(chan, 335 tx = dev->device_prep_dma_memcpy(chan,
311 dma_dsts[0] + dst_off, 336 dma_dsts[0] + dst_off,
@@ -316,6 +341,15 @@ static int dmatest_func(void *data)
316 dma_dsts[0] + dst_off, 341 dma_dsts[0] + dst_off,
317 dma_srcs, xor_sources, 342 dma_srcs, xor_sources,
318 len, flags); 343 len, flags);
344 else if (thread->type == DMA_PQ) {
345 dma_addr_t dma_pq[dst_cnt];
346
347 for (i = 0; i < dst_cnt; i++)
348 dma_pq[i] = dma_dsts[i] + dst_off;
349 tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs,
350 pq_sources, pq_coefs,
351 len, flags);
352 }
319 353
320 if (!tx) { 354 if (!tx) {
321 for (i = 0; i < src_cnt; i++) 355 for (i = 0; i < src_cnt; i++)
@@ -459,6 +493,8 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty
459 op = "copy"; 493 op = "copy";
460 else if (type == DMA_XOR) 494 else if (type == DMA_XOR)
461 op = "xor"; 495 op = "xor";
496 else if (type == DMA_PQ)
497 op = "pq";
462 else 498 else
463 return -EINVAL; 499 return -EINVAL;
464 500
@@ -514,6 +550,10 @@ static int dmatest_add_channel(struct dma_chan *chan)
514 cnt = dmatest_add_threads(dtc, DMA_XOR); 550 cnt = dmatest_add_threads(dtc, DMA_XOR);
515 thread_count += cnt > 0 ? cnt : 0; 551 thread_count += cnt > 0 ? cnt : 0;
516 } 552 }
553 if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
554 cnt = dmatest_add_threads(dtc, DMA_PQ);
555 thread_count += cnt > 0 ?: 0;
556 }
517 557
518 pr_info("dmatest: Started %u threads using %s\n", 558 pr_info("dmatest: Started %u threads using %s\n",
519 thread_count, dma_chan_name(chan)); 559 thread_count, dma_chan_name(chan));
diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 933c143b6a74..2eea823516a7 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -116,7 +116,7 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc)
116{ 116{
117 struct dw_desc *child; 117 struct dw_desc *child;
118 118
119 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 119 list_for_each_entry(child, &desc->tx_list, desc_node)
120 dma_sync_single_for_cpu(chan2parent(&dwc->chan), 120 dma_sync_single_for_cpu(chan2parent(&dwc->chan),
121 child->txd.phys, sizeof(child->lli), 121 child->txd.phys, sizeof(child->lli),
122 DMA_TO_DEVICE); 122 DMA_TO_DEVICE);
@@ -137,11 +137,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
137 dwc_sync_desc_for_cpu(dwc, desc); 137 dwc_sync_desc_for_cpu(dwc, desc);
138 138
139 spin_lock_bh(&dwc->lock); 139 spin_lock_bh(&dwc->lock);
140 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 140 list_for_each_entry(child, &desc->tx_list, desc_node)
141 dev_vdbg(chan2dev(&dwc->chan), 141 dev_vdbg(chan2dev(&dwc->chan),
142 "moving child desc %p to freelist\n", 142 "moving child desc %p to freelist\n",
143 child); 143 child);
144 list_splice_init(&desc->txd.tx_list, &dwc->free_list); 144 list_splice_init(&desc->tx_list, &dwc->free_list);
145 dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc); 145 dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc);
146 list_add(&desc->desc_node, &dwc->free_list); 146 list_add(&desc->desc_node, &dwc->free_list);
147 spin_unlock_bh(&dwc->lock); 147 spin_unlock_bh(&dwc->lock);
@@ -209,19 +209,28 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc)
209 param = txd->callback_param; 209 param = txd->callback_param;
210 210
211 dwc_sync_desc_for_cpu(dwc, desc); 211 dwc_sync_desc_for_cpu(dwc, desc);
212 list_splice_init(&txd->tx_list, &dwc->free_list); 212 list_splice_init(&desc->tx_list, &dwc->free_list);
213 list_move(&desc->desc_node, &dwc->free_list); 213 list_move(&desc->desc_node, &dwc->free_list);
214 214
215 /* 215 if (!dwc->chan.private) {
216 * We use dma_unmap_page() regardless of how the buffers were 216 struct device *parent = chan2parent(&dwc->chan);
217 * mapped before they were submitted... 217 if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
218 */ 218 if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
219 if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) 219 dma_unmap_single(parent, desc->lli.dar,
220 dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar, 220 desc->len, DMA_FROM_DEVICE);
221 desc->len, DMA_FROM_DEVICE); 221 else
222 if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) 222 dma_unmap_page(parent, desc->lli.dar,
223 dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar, 223 desc->len, DMA_FROM_DEVICE);
224 desc->len, DMA_TO_DEVICE); 224 }
225 if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
226 if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
227 dma_unmap_single(parent, desc->lli.sar,
228 desc->len, DMA_TO_DEVICE);
229 else
230 dma_unmap_page(parent, desc->lli.sar,
231 desc->len, DMA_TO_DEVICE);
232 }
233 }
225 234
226 /* 235 /*
227 * The API requires that no submissions are done from a 236 * The API requires that no submissions are done from a
@@ -289,7 +298,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
289 /* This one is currently in progress */ 298 /* This one is currently in progress */
290 return; 299 return;
291 300
292 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 301 list_for_each_entry(child, &desc->tx_list, desc_node)
293 if (child->lli.llp == llp) 302 if (child->lli.llp == llp)
294 /* Currently in progress */ 303 /* Currently in progress */
295 return; 304 return;
@@ -356,7 +365,7 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
356 dev_printk(KERN_CRIT, chan2dev(&dwc->chan), 365 dev_printk(KERN_CRIT, chan2dev(&dwc->chan),
357 " cookie: %d\n", bad_desc->txd.cookie); 366 " cookie: %d\n", bad_desc->txd.cookie);
358 dwc_dump_lli(dwc, &bad_desc->lli); 367 dwc_dump_lli(dwc, &bad_desc->lli);
359 list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) 368 list_for_each_entry(child, &bad_desc->tx_list, desc_node)
360 dwc_dump_lli(dwc, &child->lli); 369 dwc_dump_lli(dwc, &child->lli);
361 370
362 /* Pretend the descriptor completed successfully */ 371 /* Pretend the descriptor completed successfully */
@@ -608,7 +617,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
608 prev->txd.phys, sizeof(prev->lli), 617 prev->txd.phys, sizeof(prev->lli),
609 DMA_TO_DEVICE); 618 DMA_TO_DEVICE);
610 list_add_tail(&desc->desc_node, 619 list_add_tail(&desc->desc_node,
611 &first->txd.tx_list); 620 &first->tx_list);
612 } 621 }
613 prev = desc; 622 prev = desc;
614 } 623 }
@@ -658,8 +667,6 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
658 reg_width = dws->reg_width; 667 reg_width = dws->reg_width;
659 prev = first = NULL; 668 prev = first = NULL;
660 669
661 sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
662
663 switch (direction) { 670 switch (direction) {
664 case DMA_TO_DEVICE: 671 case DMA_TO_DEVICE:
665 ctllo = (DWC_DEFAULT_CTLLO 672 ctllo = (DWC_DEFAULT_CTLLO
@@ -700,7 +707,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
700 sizeof(prev->lli), 707 sizeof(prev->lli),
701 DMA_TO_DEVICE); 708 DMA_TO_DEVICE);
702 list_add_tail(&desc->desc_node, 709 list_add_tail(&desc->desc_node,
703 &first->txd.tx_list); 710 &first->tx_list);
704 } 711 }
705 prev = desc; 712 prev = desc;
706 total_len += len; 713 total_len += len;
@@ -746,7 +753,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
746 sizeof(prev->lli), 753 sizeof(prev->lli),
747 DMA_TO_DEVICE); 754 DMA_TO_DEVICE);
748 list_add_tail(&desc->desc_node, 755 list_add_tail(&desc->desc_node,
749 &first->txd.tx_list); 756 &first->tx_list);
750 } 757 }
751 prev = desc; 758 prev = desc;
752 total_len += len; 759 total_len += len;
@@ -902,6 +909,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
902 break; 909 break;
903 } 910 }
904 911
912 INIT_LIST_HEAD(&desc->tx_list);
905 dma_async_tx_descriptor_init(&desc->txd, chan); 913 dma_async_tx_descriptor_init(&desc->txd, chan);
906 desc->txd.tx_submit = dwc_tx_submit; 914 desc->txd.tx_submit = dwc_tx_submit;
907 desc->txd.flags = DMA_CTRL_ACK; 915 desc->txd.flags = DMA_CTRL_ACK;
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index 13a580767031..d9a939f67f46 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -217,6 +217,7 @@ struct dw_desc {
217 217
218 /* THEN values for driver housekeeping */ 218 /* THEN values for driver housekeeping */
219 struct list_head desc_node; 219 struct list_head desc_node;
220 struct list_head tx_list;
220 struct dma_async_tx_descriptor txd; 221 struct dma_async_tx_descriptor txd;
221 size_t len; 222 size_t len;
222}; 223};
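Both dw_dmac above and fsldma below adopt the same housekeeping pattern in this series: the list of child descriptors for a multi-segment transaction moves off the generic dma_async_tx_descriptor and onto a driver-private tx_list in the software descriptor. A condensed sketch of the pattern with hypothetical names, assuming only <linux/list.h> and <linux/dmaengine.h>:

	struct sketch_desc {
		struct list_head desc_node;	/* channel free/active lists */
		struct list_head tx_list;	/* children of one transaction */
		struct dma_async_tx_descriptor txd;
	};

	/* alloc_chan_resources(): initialise once per descriptor */
	INIT_LIST_HEAD(&desc->tx_list);
	dma_async_tx_descriptor_init(&desc->txd, chan);

	/* prep_*(): every descriptor after the first hangs off first->tx_list */
	list_add_tail(&desc->desc_node, &first->tx_list);

	/* completion: the whole chain is recycled with a single splice */
	list_splice_init(&desc->tx_list, &free_list);
	list_move(&desc->desc_node, &free_list);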
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index ef87a8984145..296f9e747fac 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -34,6 +34,7 @@
34#include <linux/dmapool.h> 34#include <linux/dmapool.h>
35#include <linux/of_platform.h> 35#include <linux/of_platform.h>
36 36
37#include <asm/fsldma.h>
37#include "fsldma.h" 38#include "fsldma.h"
38 39
39static void dma_init(struct fsl_dma_chan *fsl_chan) 40static void dma_init(struct fsl_dma_chan *fsl_chan)
@@ -280,28 +281,40 @@ static void fsl_chan_set_dest_loop_size(struct fsl_dma_chan *fsl_chan, int size)
280} 281}
281 282
282/** 283/**
283 * fsl_chan_toggle_ext_pause - Toggle channel external pause status 284 * fsl_chan_set_request_count - Set DMA Request Count for external control
284 * @fsl_chan : Freescale DMA channel 285 * @fsl_chan : Freescale DMA channel
285 * @size : Pause control size, 0 for disable external pause control. 286 * @size : Number of bytes to transfer in a single request
286 * The maximum is 1024. 287 *
288 * The Freescale DMA channel can be controlled by the external signal DREQ#.
289 * The DMA request count is how many bytes are allowed to transfer before
290 * pausing the channel, after which a new assertion of DREQ# resumes channel
291 * operation.
287 * 292 *
288 * The Freescale DMA channel can be controlled by the external 293 * A size of 0 disables external pause control. The maximum size is 1024.
289 * signal DREQ#. The pause control size is how many bytes are allowed
290 * to transfer before pausing the channel, after which a new assertion
291 * of DREQ# resumes channel operation.
292 */ 294 */
293static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int size) 295static void fsl_chan_set_request_count(struct fsl_dma_chan *fsl_chan, int size)
294{ 296{
295 if (size > 1024) 297 BUG_ON(size > 1024);
296 return; 298 DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
299 DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32)
300 | ((__ilog2(size) << 24) & 0x0f000000),
301 32);
302}
297 303
298 if (size) { 304/**
299 DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, 305 * fsl_chan_toggle_ext_pause - Toggle channel external pause status
300 DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) 306 * @fsl_chan : Freescale DMA channel
301 | ((__ilog2(size) << 24) & 0x0f000000), 307 * @enable : 0 is disabled, 1 is enabled.
302 32); 308 *
309 * The Freescale DMA channel can be controlled by the external signal DREQ#.
310 * The DMA Request Count feature should be used in addition to this feature
311 * to set the number of bytes to transfer before pausing the channel.
312 */
313static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int enable)
314{
315 if (enable)
303 fsl_chan->feature |= FSL_DMA_CHAN_PAUSE_EXT; 316 fsl_chan->feature |= FSL_DMA_CHAN_PAUSE_EXT;
304 } else 317 else
305 fsl_chan->feature &= ~FSL_DMA_CHAN_PAUSE_EXT; 318 fsl_chan->feature &= ~FSL_DMA_CHAN_PAUSE_EXT;
306} 319}
307 320
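A quick worked example of the encoding in fsl_chan_set_request_count() above: the request count is stored as a power of two in mode-register bits [27:24], so

	request_count = 1024  ->  __ilog2(1024) = 10
	(10 << 24) & 0x0f000000 = 0x0a000000	(MR[27:24] = 0xa)

The BUG_ON() enforces the documented 1024-byte maximum, while enabling or disabling external pause itself is handled by the separate, simplified fsl_chan_toggle_ext_pause(fsl_chan, enable).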
@@ -326,7 +339,8 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable)
326static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx) 339static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
327{ 340{
328 struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan); 341 struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan);
329 struct fsl_desc_sw *desc; 342 struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
343 struct fsl_desc_sw *child;
330 unsigned long flags; 344 unsigned long flags;
331 dma_cookie_t cookie; 345 dma_cookie_t cookie;
332 346
@@ -334,7 +348,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
334 spin_lock_irqsave(&fsl_chan->desc_lock, flags); 348 spin_lock_irqsave(&fsl_chan->desc_lock, flags);
335 349
336 cookie = fsl_chan->common.cookie; 350 cookie = fsl_chan->common.cookie;
337 list_for_each_entry(desc, &tx->tx_list, node) { 351 list_for_each_entry(child, &desc->tx_list, node) {
338 cookie++; 352 cookie++;
339 if (cookie < 0) 353 if (cookie < 0)
340 cookie = 1; 354 cookie = 1;
@@ -343,8 +357,8 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
343 } 357 }
344 358
345 fsl_chan->common.cookie = cookie; 359 fsl_chan->common.cookie = cookie;
346 append_ld_queue(fsl_chan, tx_to_fsl_desc(tx)); 360 append_ld_queue(fsl_chan, desc);
347 list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev); 361 list_splice_init(&desc->tx_list, fsl_chan->ld_queue.prev);
348 362
349 spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); 363 spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
350 364
@@ -366,6 +380,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(
366 desc_sw = dma_pool_alloc(fsl_chan->desc_pool, GFP_ATOMIC, &pdesc); 380 desc_sw = dma_pool_alloc(fsl_chan->desc_pool, GFP_ATOMIC, &pdesc);
367 if (desc_sw) { 381 if (desc_sw) {
368 memset(desc_sw, 0, sizeof(struct fsl_desc_sw)); 382 memset(desc_sw, 0, sizeof(struct fsl_desc_sw));
383 INIT_LIST_HEAD(&desc_sw->tx_list);
369 dma_async_tx_descriptor_init(&desc_sw->async_tx, 384 dma_async_tx_descriptor_init(&desc_sw->async_tx,
370 &fsl_chan->common); 385 &fsl_chan->common);
371 desc_sw->async_tx.tx_submit = fsl_dma_tx_submit; 386 desc_sw->async_tx.tx_submit = fsl_dma_tx_submit;
@@ -455,7 +470,7 @@ fsl_dma_prep_interrupt(struct dma_chan *chan, unsigned long flags)
455 new->async_tx.flags = flags; 470 new->async_tx.flags = flags;
456 471
457 /* Insert the link descriptor to the LD ring */ 472 /* Insert the link descriptor to the LD ring */
458 list_add_tail(&new->node, &new->async_tx.tx_list); 473 list_add_tail(&new->node, &new->tx_list);
459 474
460 /* Set End-of-link to the last link descriptor of new list*/ 475 /* Set End-of-link to the last link descriptor of new list*/
461 set_ld_eol(fsl_chan, new); 476 set_ld_eol(fsl_chan, new);
@@ -513,7 +528,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
513 dma_dest += copy; 528 dma_dest += copy;
514 529
515 /* Insert the link descriptor to the LD ring */ 530 /* Insert the link descriptor to the LD ring */
516 list_add_tail(&new->node, &first->async_tx.tx_list); 531 list_add_tail(&new->node, &first->tx_list);
517 } while (len); 532 } while (len);
518 533
519 new->async_tx.flags = flags; /* client is in control of this ack */ 534 new->async_tx.flags = flags; /* client is in control of this ack */
@@ -528,7 +543,7 @@ fail:
528 if (!first) 543 if (!first)
529 return NULL; 544 return NULL;
530 545
531 list = &first->async_tx.tx_list; 546 list = &first->tx_list;
532 list_for_each_entry_safe_reverse(new, prev, list, node) { 547 list_for_each_entry_safe_reverse(new, prev, list, node) {
533 list_del(&new->node); 548 list_del(&new->node);
534 dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys); 549 dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
@@ -538,6 +553,229 @@ fail:
538} 553}
539 554
540/** 555/**
556 * fsl_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
557 * @chan: DMA channel
558 * @sgl: scatterlist to transfer to/from
 559 * @sg_len: number of entries in @sgl
560 * @direction: DMA direction
561 * @flags: DMAEngine flags
562 *
563 * Prepare a set of descriptors for a DMA_SLAVE transaction. Following the
564 * DMA_SLAVE API, this gets the device-specific information from the
565 * chan->private variable.
566 */
567static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg(
568 struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len,
569 enum dma_data_direction direction, unsigned long flags)
570{
571 struct fsl_dma_chan *fsl_chan;
572 struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL;
573 struct fsl_dma_slave *slave;
574 struct list_head *tx_list;
575 size_t copy;
576
577 int i;
578 struct scatterlist *sg;
579 size_t sg_used;
580 size_t hw_used;
581 struct fsl_dma_hw_addr *hw;
582 dma_addr_t dma_dst, dma_src;
583
584 if (!chan)
585 return NULL;
586
587 if (!chan->private)
588 return NULL;
589
590 fsl_chan = to_fsl_chan(chan);
591 slave = chan->private;
592
593 if (list_empty(&slave->addresses))
594 return NULL;
595
596 hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry);
597 hw_used = 0;
598
599 /*
600 * Build the hardware transaction to copy from the scatterlist to
601 * the hardware, or from the hardware to the scatterlist
602 *
603 * If you are copying from the hardware to the scatterlist and it
604 * takes two hardware entries to fill an entire page, then both
605 * hardware entries will be coalesced into the same page
606 *
607 * If you are copying from the scatterlist to the hardware and a
608 * single page can fill two hardware entries, then the data will
609 * be read out of the page into the first hardware entry, and so on
610 */
611 for_each_sg(sgl, sg, sg_len, i) {
612 sg_used = 0;
613
614 /* Loop until the entire scatterlist entry is used */
615 while (sg_used < sg_dma_len(sg)) {
616
617 /*
618 * If we've used up the current hardware address/length
619 * pair, we need to load a new one
620 *
621 * This is done in a while loop so that descriptors with
622 * length == 0 will be skipped
623 */
624 while (hw_used >= hw->length) {
625
626 /*
627 * If the current hardware entry is the last
628 * entry in the list, we're finished
629 */
630 if (list_is_last(&hw->entry, &slave->addresses))
631 goto finished;
632
633 /* Get the next hardware address/length pair */
634 hw = list_entry(hw->entry.next,
635 struct fsl_dma_hw_addr, entry);
636 hw_used = 0;
637 }
638
639 /* Allocate the link descriptor from DMA pool */
640 new = fsl_dma_alloc_descriptor(fsl_chan);
641 if (!new) {
642 dev_err(fsl_chan->dev, "No free memory for "
643 "link descriptor\n");
644 goto fail;
645 }
646#ifdef FSL_DMA_LD_DEBUG
647 dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new);
648#endif
649
650 /*
651 * Calculate the maximum number of bytes to transfer,
652 * making sure it is less than the DMA controller limit
653 */
654 copy = min_t(size_t, sg_dma_len(sg) - sg_used,
655 hw->length - hw_used);
656 copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT);
657
658 /*
659 * DMA_FROM_DEVICE
660 * from the hardware to the scatterlist
661 *
662 * DMA_TO_DEVICE
663 * from the scatterlist to the hardware
664 */
665 if (direction == DMA_FROM_DEVICE) {
666 dma_src = hw->address + hw_used;
667 dma_dst = sg_dma_address(sg) + sg_used;
668 } else {
669 dma_src = sg_dma_address(sg) + sg_used;
670 dma_dst = hw->address + hw_used;
671 }
672
673 /* Fill in the descriptor */
674 set_desc_cnt(fsl_chan, &new->hw, copy);
675 set_desc_src(fsl_chan, &new->hw, dma_src);
676 set_desc_dest(fsl_chan, &new->hw, dma_dst);
677
678 /*
679 * If this is not the first descriptor, chain the
680 * current descriptor after the previous descriptor
681 */
682 if (!first) {
683 first = new;
684 } else {
685 set_desc_next(fsl_chan, &prev->hw,
686 new->async_tx.phys);
687 }
688
689 new->async_tx.cookie = 0;
690 async_tx_ack(&new->async_tx);
691
692 prev = new;
693 sg_used += copy;
694 hw_used += copy;
695
696 /* Insert the link descriptor into the LD ring */
697 list_add_tail(&new->node, &first->tx_list);
698 }
699 }
700
701finished:
702
703 /* All of the hardware address/length pairs had length == 0 */
704 if (!first || !new)
705 return NULL;
706
707 new->async_tx.flags = flags;
708 new->async_tx.cookie = -EBUSY;
709
710 /* Set End-of-link to the last link descriptor of new list */
711 set_ld_eol(fsl_chan, new);
712
713 /* Enable extra controller features */
714 if (fsl_chan->set_src_loop_size)
715 fsl_chan->set_src_loop_size(fsl_chan, slave->src_loop_size);
716
717 if (fsl_chan->set_dest_loop_size)
718 fsl_chan->set_dest_loop_size(fsl_chan, slave->dst_loop_size);
719
720 if (fsl_chan->toggle_ext_start)
721 fsl_chan->toggle_ext_start(fsl_chan, slave->external_start);
722
723 if (fsl_chan->toggle_ext_pause)
724 fsl_chan->toggle_ext_pause(fsl_chan, slave->external_pause);
725
726 if (fsl_chan->set_request_count)
727 fsl_chan->set_request_count(fsl_chan, slave->request_count);
728
729 return &first->async_tx;
730
731fail:
732 /* If first was not set, then we failed to allocate the very first
733 * descriptor, and we're done */
734 if (!first)
735 return NULL;
736
737 /*
738 * First is set, so all of the descriptors we allocated have been added
739 * to first->tx_list, INCLUDING "first" itself. Therefore we
740 * must traverse the list backwards freeing each descriptor in turn
741 *
742 * We're re-using variables for the loop, oh well
743 */
744 tx_list = &first->tx_list;
745 list_for_each_entry_safe_reverse(new, prev, tx_list, node) {
746 list_del_init(&new->node);
747 dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
748 }
749
750 return NULL;
751}
752
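A hedged usage sketch for the DMA_SLAVE path added above: the controller-side addresses are handed in through chan->private as a struct fsl_dma_slave (from the new asm/fsldma.h) whose addresses list holds struct fsl_dma_hw_addr entries. The device, buffer, length and slave pointer below are placeholders, not from the patch; error handling is omitted:

	struct scatterlist sg;
	struct dma_async_tx_descriptor *tx;
	dma_cookie_t cookie;

	chan->private = slave;		/* consumed by fsl_dma_prep_slave_sg() */
	sg_init_one(&sg, buf, LEN);
	dma_map_sg(chan->device->dev, &sg, 1, DMA_TO_DEVICE);

	tx = chan->device->device_prep_slave_sg(chan, &sg, 1, DMA_TO_DEVICE,
						DMA_PREP_INTERRUPT);
	cookie = tx->tx_submit(tx);
	chan->device->device_issue_pending(chan);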
753static void fsl_dma_device_terminate_all(struct dma_chan *chan)
754{
755 struct fsl_dma_chan *fsl_chan;
756 struct fsl_desc_sw *desc, *tmp;
757 unsigned long flags;
758
759 if (!chan)
760 return;
761
762 fsl_chan = to_fsl_chan(chan);
763
764 /* Halt the DMA engine */
765 dma_halt(fsl_chan);
766
767 spin_lock_irqsave(&fsl_chan->desc_lock, flags);
768
769 /* Remove and free all of the descriptors in the LD queue */
770 list_for_each_entry_safe(desc, tmp, &fsl_chan->ld_queue, node) {
771 list_del(&desc->node);
772 dma_pool_free(fsl_chan->desc_pool, desc, desc->async_tx.phys);
773 }
774
775 spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
776}
777
778/**
541 * fsl_dma_update_completed_cookie - Update the completed cookie. 779 * fsl_dma_update_completed_cookie - Update the completed cookie.
542 * @fsl_chan : Freescale DMA channel 780 * @fsl_chan : Freescale DMA channel
543 */ 781 */
@@ -883,6 +1121,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev,
883 new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; 1121 new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start;
884 new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size; 1122 new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size;
885 new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size; 1123 new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size;
1124 new_fsl_chan->set_request_count = fsl_chan_set_request_count;
886 } 1125 }
887 1126
888 spin_lock_init(&new_fsl_chan->desc_lock); 1127 spin_lock_init(&new_fsl_chan->desc_lock);
@@ -962,12 +1201,15 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev,
962 1201
963 dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask); 1202 dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask);
964 dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask); 1203 dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask);
1204 dma_cap_set(DMA_SLAVE, fdev->common.cap_mask);
965 fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources; 1205 fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources;
966 fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources; 1206 fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources;
967 fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt; 1207 fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt;
968 fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy; 1208 fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy;
969 fdev->common.device_is_tx_complete = fsl_dma_is_complete; 1209 fdev->common.device_is_tx_complete = fsl_dma_is_complete;
970 fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending; 1210 fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending;
1211 fdev->common.device_prep_slave_sg = fsl_dma_prep_slave_sg;
1212 fdev->common.device_terminate_all = fsl_dma_device_terminate_all;
971 fdev->common.dev = &dev->dev; 1213 fdev->common.dev = &dev->dev;
972 1214
973 fdev->irq = irq_of_parse_and_map(dev->node, 0); 1215 fdev->irq = irq_of_parse_and_map(dev->node, 0);
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index dc7f26865797..0df14cbb8ca3 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -90,6 +90,7 @@ struct fsl_dma_ld_hw {
90struct fsl_desc_sw { 90struct fsl_desc_sw {
91 struct fsl_dma_ld_hw hw; 91 struct fsl_dma_ld_hw hw;
92 struct list_head node; 92 struct list_head node;
93 struct list_head tx_list;
93 struct dma_async_tx_descriptor async_tx; 94 struct dma_async_tx_descriptor async_tx;
94 struct list_head *ld; 95 struct list_head *ld;
95 void *priv; 96 void *priv;
@@ -143,10 +144,11 @@ struct fsl_dma_chan {
143 struct tasklet_struct tasklet; 144 struct tasklet_struct tasklet;
144 u32 feature; 145 u32 feature;
145 146
146 void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int size); 147 void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int enable);
147 void (*toggle_ext_start)(struct fsl_dma_chan *fsl_chan, int enable); 148 void (*toggle_ext_start)(struct fsl_dma_chan *fsl_chan, int enable);
148 void (*set_src_loop_size)(struct fsl_dma_chan *fsl_chan, int size); 149 void (*set_src_loop_size)(struct fsl_dma_chan *fsl_chan, int size);
149 void (*set_dest_loop_size)(struct fsl_dma_chan *fsl_chan, int size); 150 void (*set_dest_loop_size)(struct fsl_dma_chan *fsl_chan, int size);
151 void (*set_request_count)(struct fsl_dma_chan *fsl_chan, int size);
150}; 152};
151 153
152#define to_fsl_chan(chan) container_of(chan, struct fsl_dma_chan, common) 154#define to_fsl_chan(chan) container_of(chan, struct fsl_dma_chan, common)
diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c
deleted file mode 100644
index 2225bb6ba3d1..000000000000
--- a/drivers/dma/ioat.c
+++ /dev/null
@@ -1,202 +0,0 @@
1/*
2 * Intel I/OAT DMA Linux driver
3 * Copyright(c) 2007 - 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * The full GNU General Public License is included in this distribution in
19 * the file called "COPYING".
20 *
21 */
22
23/*
24 * This driver supports an Intel I/OAT DMA engine, which does asynchronous
25 * copy operations.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/pci.h>
31#include <linux/interrupt.h>
32#include <linux/dca.h>
33#include "ioatdma.h"
34#include "ioatdma_registers.h"
35#include "ioatdma_hw.h"
36
37MODULE_VERSION(IOAT_DMA_VERSION);
38MODULE_LICENSE("GPL");
39MODULE_AUTHOR("Intel Corporation");
40
41static struct pci_device_id ioat_pci_tbl[] = {
42 /* I/OAT v1 platforms */
43 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
44 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB) },
45 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) },
46 { PCI_DEVICE(PCI_VENDOR_ID_UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
47
48 /* I/OAT v2 platforms */
49 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) },
50
51 /* I/OAT v3 platforms */
52 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) },
53 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) },
54 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) },
55 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) },
56 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) },
57 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
58 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
59 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
60 { 0, }
61};
62
63struct ioat_device {
64 struct pci_dev *pdev;
65 void __iomem *iobase;
66 struct ioatdma_device *dma;
67 struct dca_provider *dca;
68};
69
70static int __devinit ioat_probe(struct pci_dev *pdev,
71 const struct pci_device_id *id);
72static void __devexit ioat_remove(struct pci_dev *pdev);
73
74static int ioat_dca_enabled = 1;
75module_param(ioat_dca_enabled, int, 0644);
76MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)");
77
78static struct pci_driver ioat_pci_driver = {
79 .name = "ioatdma",
80 .id_table = ioat_pci_tbl,
81 .probe = ioat_probe,
82 .remove = __devexit_p(ioat_remove),
83};
84
85static int __devinit ioat_probe(struct pci_dev *pdev,
86 const struct pci_device_id *id)
87{
88 void __iomem *iobase;
89 struct ioat_device *device;
90 unsigned long mmio_start, mmio_len;
91 int err;
92
93 err = pci_enable_device(pdev);
94 if (err)
95 goto err_enable_device;
96
97 err = pci_request_regions(pdev, ioat_pci_driver.name);
98 if (err)
99 goto err_request_regions;
100
101 err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
102 if (err)
103 err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
104 if (err)
105 goto err_set_dma_mask;
106
107 err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
108 if (err)
109 err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
110 if (err)
111 goto err_set_dma_mask;
112
113 mmio_start = pci_resource_start(pdev, 0);
114 mmio_len = pci_resource_len(pdev, 0);
115 iobase = ioremap(mmio_start, mmio_len);
116 if (!iobase) {
117 err = -ENOMEM;
118 goto err_ioremap;
119 }
120
121 device = kzalloc(sizeof(*device), GFP_KERNEL);
122 if (!device) {
123 err = -ENOMEM;
124 goto err_kzalloc;
125 }
126 device->pdev = pdev;
127 pci_set_drvdata(pdev, device);
128 device->iobase = iobase;
129
130 pci_set_master(pdev);
131
132 switch (readb(iobase + IOAT_VER_OFFSET)) {
133 case IOAT_VER_1_2:
134 device->dma = ioat_dma_probe(pdev, iobase);
135 if (device->dma && ioat_dca_enabled)
136 device->dca = ioat_dca_init(pdev, iobase);
137 break;
138 case IOAT_VER_2_0:
139 device->dma = ioat_dma_probe(pdev, iobase);
140 if (device->dma && ioat_dca_enabled)
141 device->dca = ioat2_dca_init(pdev, iobase);
142 break;
143 case IOAT_VER_3_0:
144 device->dma = ioat_dma_probe(pdev, iobase);
145 if (device->dma && ioat_dca_enabled)
146 device->dca = ioat3_dca_init(pdev, iobase);
147 break;
148 default:
149 err = -ENODEV;
150 break;
151 }
152 if (!device->dma)
153 err = -ENODEV;
154
155 if (err)
156 goto err_version;
157
158 return 0;
159
160err_version:
161 kfree(device);
162err_kzalloc:
163 iounmap(iobase);
164err_ioremap:
165err_set_dma_mask:
166 pci_release_regions(pdev);
167 pci_disable_device(pdev);
168err_request_regions:
169err_enable_device:
170 return err;
171}
172
173static void __devexit ioat_remove(struct pci_dev *pdev)
174{
175 struct ioat_device *device = pci_get_drvdata(pdev);
176
177 dev_err(&pdev->dev, "Removing dma and dca services\n");
178 if (device->dca) {
179 unregister_dca_provider(device->dca);
180 free_dca_provider(device->dca);
181 device->dca = NULL;
182 }
183
184 if (device->dma) {
185 ioat_dma_remove(device->dma);
186 device->dma = NULL;
187 }
188
189 kfree(device);
190}
191
192static int __init ioat_init_module(void)
193{
194 return pci_register_driver(&ioat_pci_driver);
195}
196module_init(ioat_init_module);
197
198static void __exit ioat_exit_module(void)
199{
200 pci_unregister_driver(&ioat_pci_driver);
201}
202module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioat/Makefile b/drivers/dma/ioat/Makefile
new file mode 100644
index 000000000000..8997d3fb9051
--- /dev/null
+++ b/drivers/dma/ioat/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
2ioatdma-objs := pci.o dma.o dma_v2.o dma_v3.o dca.o
diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat/dca.c
index c012a1e15043..69d02615c4d6 100644
--- a/drivers/dma/ioat_dca.c
+++ b/drivers/dma/ioat/dca.c
@@ -33,8 +33,8 @@
33#define cpu_physical_id(cpu) (cpuid_ebx(1) >> 24) 33#define cpu_physical_id(cpu) (cpuid_ebx(1) >> 24)
34#endif 34#endif
35 35
36#include "ioatdma.h" 36#include "dma.h"
37#include "ioatdma_registers.h" 37#include "registers.h"
38 38
39/* 39/*
40 * Bit 7 of a tag map entry is the "valid" bit, if it is set then bits 0:6 40 * Bit 7 of a tag map entry is the "valid" bit, if it is set then bits 0:6
@@ -242,7 +242,8 @@ static struct dca_ops ioat_dca_ops = {
242}; 242};
243 243
244 244
245struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase) 245struct dca_provider * __devinit
246ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
246{ 247{
247 struct dca_provider *dca; 248 struct dca_provider *dca;
248 struct ioat_dca_priv *ioatdca; 249 struct ioat_dca_priv *ioatdca;
@@ -407,7 +408,8 @@ static int ioat2_dca_count_dca_slots(void __iomem *iobase, u16 dca_offset)
407 return slots; 408 return slots;
408} 409}
409 410
410struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase) 411struct dca_provider * __devinit
412ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase)
411{ 413{
412 struct dca_provider *dca; 414 struct dca_provider *dca;
413 struct ioat_dca_priv *ioatdca; 415 struct ioat_dca_priv *ioatdca;
@@ -602,7 +604,8 @@ static int ioat3_dca_count_dca_slots(void *iobase, u16 dca_offset)
602 return slots; 604 return slots;
603} 605}
604 606
605struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase) 607struct dca_provider * __devinit
608ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase)
606{ 609{
607 struct dca_provider *dca; 610 struct dca_provider *dca;
608 struct ioat_dca_priv *ioatdca; 611 struct ioat_dca_priv *ioatdca;
diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c
new file mode 100644
index 000000000000..c524d36d3c2e
--- /dev/null
+++ b/drivers/dma/ioat/dma.c
@@ -0,0 +1,1238 @@
1/*
2 * Intel I/OAT DMA Linux driver
3 * Copyright(c) 2004 - 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * The full GNU General Public License is included in this distribution in
19 * the file called "COPYING".
20 *
21 */
22
23/*
24 * This driver supports an Intel I/OAT DMA engine, which does asynchronous
25 * copy operations.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/pci.h>
31#include <linux/interrupt.h>
32#include <linux/dmaengine.h>
33#include <linux/delay.h>
34#include <linux/dma-mapping.h>
35#include <linux/workqueue.h>
36#include <linux/i7300_idle.h>
37#include "dma.h"
38#include "registers.h"
39#include "hw.h"
40
41int ioat_pending_level = 4;
42module_param(ioat_pending_level, int, 0644);
43MODULE_PARM_DESC(ioat_pending_level,
44 "high-water mark for pushing ioat descriptors (default: 4)");
45
46/* internal functions */
47static void ioat1_cleanup(struct ioat_dma_chan *ioat);
48static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat);
49
50/**
51 * ioat_dma_do_interrupt - handler used for single vector interrupt mode
52 * @irq: interrupt id
53 * @data: interrupt data
54 */
55static irqreturn_t ioat_dma_do_interrupt(int irq, void *data)
56{
57 struct ioatdma_device *instance = data;
58 struct ioat_chan_common *chan;
59 unsigned long attnstatus;
60 int bit;
61 u8 intrctrl;
62
63 intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
64
65 if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
66 return IRQ_NONE;
67
68 if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
69 writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
70 return IRQ_NONE;
71 }
72
73 attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
74 for_each_bit(bit, &attnstatus, BITS_PER_LONG) {
75 chan = ioat_chan_by_index(instance, bit);
76 tasklet_schedule(&chan->cleanup_task);
77 }
78
79 writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
80 return IRQ_HANDLED;
81}
82
83/**
84 * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode
85 * @irq: interrupt id
86 * @data: interrupt data
87 */
88static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data)
89{
90 struct ioat_chan_common *chan = data;
91
92 tasklet_schedule(&chan->cleanup_task);
93
94 return IRQ_HANDLED;
95}
96
97static void ioat1_cleanup_tasklet(unsigned long data);
98
99/* common channel initialization */
100void ioat_init_channel(struct ioatdma_device *device,
101 struct ioat_chan_common *chan, int idx,
102 void (*timer_fn)(unsigned long),
103 void (*tasklet)(unsigned long),
104 unsigned long ioat)
105{
106 struct dma_device *dma = &device->common;
107
108 chan->device = device;
109 chan->reg_base = device->reg_base + (0x80 * (idx + 1));
110 spin_lock_init(&chan->cleanup_lock);
111 chan->common.device = dma;
112 list_add_tail(&chan->common.device_node, &dma->channels);
113 device->idx[idx] = chan;
114 init_timer(&chan->timer);
115 chan->timer.function = timer_fn;
116 chan->timer.data = ioat;
117 tasklet_init(&chan->cleanup_task, tasklet, ioat);
118 tasklet_disable(&chan->cleanup_task);
119}
120
121static void ioat1_timer_event(unsigned long data);
122
123/**
 124 * ioat1_enumerate_channels - find and initialize the device's channels
125 * @device: the device to be enumerated
126 */
127static int ioat1_enumerate_channels(struct ioatdma_device *device)
128{
129 u8 xfercap_scale;
130 u32 xfercap;
131 int i;
132 struct ioat_dma_chan *ioat;
133 struct device *dev = &device->pdev->dev;
134 struct dma_device *dma = &device->common;
135
136 INIT_LIST_HEAD(&dma->channels);
137 dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
138 dma->chancnt &= 0x1f; /* bits [4:0] valid */
139 if (dma->chancnt > ARRAY_SIZE(device->idx)) {
140 dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n",
141 dma->chancnt, ARRAY_SIZE(device->idx));
142 dma->chancnt = ARRAY_SIZE(device->idx);
143 }
144 xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
145 xfercap_scale &= 0x1f; /* bits [4:0] valid */
146 xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
147 dev_dbg(dev, "%s: xfercap = %d\n", __func__, xfercap);
148
149#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL
150 if (i7300_idle_platform_probe(NULL, NULL, 1) == 0)
151 dma->chancnt--;
152#endif
153 for (i = 0; i < dma->chancnt; i++) {
154 ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL);
155 if (!ioat)
156 break;
157
158 ioat_init_channel(device, &ioat->base, i,
159 ioat1_timer_event,
160 ioat1_cleanup_tasklet,
161 (unsigned long) ioat);
162 ioat->xfercap = xfercap;
163 spin_lock_init(&ioat->desc_lock);
164 INIT_LIST_HEAD(&ioat->free_desc);
165 INIT_LIST_HEAD(&ioat->used_desc);
166 }
167 dma->chancnt = i;
168 return i;
169}
170
171/**
172 * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended
173 * descriptors to hw
174 * @chan: DMA channel handle
175 */
176static inline void
177__ioat1_dma_memcpy_issue_pending(struct ioat_dma_chan *ioat)
178{
179 void __iomem *reg_base = ioat->base.reg_base;
180
181 dev_dbg(to_dev(&ioat->base), "%s: pending: %d\n",
182 __func__, ioat->pending);
183 ioat->pending = 0;
184 writeb(IOAT_CHANCMD_APPEND, reg_base + IOAT1_CHANCMD_OFFSET);
185}
186
187static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan)
188{
189 struct ioat_dma_chan *ioat = to_ioat_chan(chan);
190
191 if (ioat->pending > 0) {
192 spin_lock_bh(&ioat->desc_lock);
193 __ioat1_dma_memcpy_issue_pending(ioat);
194 spin_unlock_bh(&ioat->desc_lock);
195 }
196}
197
198/**
199 * ioat1_reset_channel - restart a channel
200 * @ioat: IOAT DMA channel handle
201 */
202static void ioat1_reset_channel(struct ioat_dma_chan *ioat)
203{
204 struct ioat_chan_common *chan = &ioat->base;
205 void __iomem *reg_base = chan->reg_base;
206 u32 chansts, chanerr;
207
208 dev_warn(to_dev(chan), "reset\n");
209 chanerr = readl(reg_base + IOAT_CHANERR_OFFSET);
210 chansts = *chan->completion & IOAT_CHANSTS_STATUS;
211 if (chanerr) {
212 dev_err(to_dev(chan),
213 "chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n",
214 chan_num(chan), chansts, chanerr);
215 writel(chanerr, reg_base + IOAT_CHANERR_OFFSET);
216 }
217
218 /*
219 * whack it upside the head with a reset
220 * and wait for things to settle out.
221 * force the pending count to a really big negative
222 * to make sure no one forces an issue_pending
223 * while we're waiting.
224 */
225
226 ioat->pending = INT_MIN;
227 writeb(IOAT_CHANCMD_RESET,
228 reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
229 set_bit(IOAT_RESET_PENDING, &chan->state);
230 mod_timer(&chan->timer, jiffies + RESET_DELAY);
231}
232
233static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx)
234{
235 struct dma_chan *c = tx->chan;
236 struct ioat_dma_chan *ioat = to_ioat_chan(c);
237 struct ioat_desc_sw *desc = tx_to_ioat_desc(tx);
238 struct ioat_chan_common *chan = &ioat->base;
239 struct ioat_desc_sw *first;
240 struct ioat_desc_sw *chain_tail;
241 dma_cookie_t cookie;
242
243 spin_lock_bh(&ioat->desc_lock);
244 /* cookie incr and addition to used_list must be atomic */
245 cookie = c->cookie;
246 cookie++;
247 if (cookie < 0)
248 cookie = 1;
249 c->cookie = cookie;
250 tx->cookie = cookie;
251 dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie);
252
253 /* write address into NextDescriptor field of last desc in chain */
254 first = to_ioat_desc(desc->tx_list.next);
255 chain_tail = to_ioat_desc(ioat->used_desc.prev);
256 /* make descriptor updates globally visible before chaining */
257 wmb();
258 chain_tail->hw->next = first->txd.phys;
259 list_splice_tail_init(&desc->tx_list, &ioat->used_desc);
260 dump_desc_dbg(ioat, chain_tail);
261 dump_desc_dbg(ioat, first);
262
263 if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state))
264 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
265
266 ioat->active += desc->hw->tx_cnt;
267 ioat->pending += desc->hw->tx_cnt;
268 if (ioat->pending >= ioat_pending_level)
269 __ioat1_dma_memcpy_issue_pending(ioat);
270 spin_unlock_bh(&ioat->desc_lock);
271
272 return cookie;
273}
274
275/**
276 * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair
277 * @ioat: the channel supplying the memory pool for the descriptors
278 * @flags: allocation flags
279 */
280static struct ioat_desc_sw *
281ioat_dma_alloc_descriptor(struct ioat_dma_chan *ioat, gfp_t flags)
282{
283 struct ioat_dma_descriptor *desc;
284 struct ioat_desc_sw *desc_sw;
285 struct ioatdma_device *ioatdma_device;
286 dma_addr_t phys;
287
288 ioatdma_device = ioat->base.device;
289 desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys);
290 if (unlikely(!desc))
291 return NULL;
292
293 desc_sw = kzalloc(sizeof(*desc_sw), flags);
294 if (unlikely(!desc_sw)) {
295 pci_pool_free(ioatdma_device->dma_pool, desc, phys);
296 return NULL;
297 }
298
299 memset(desc, 0, sizeof(*desc));
300
301 INIT_LIST_HEAD(&desc_sw->tx_list);
302 dma_async_tx_descriptor_init(&desc_sw->txd, &ioat->base.common);
303 desc_sw->txd.tx_submit = ioat1_tx_submit;
304 desc_sw->hw = desc;
305 desc_sw->txd.phys = phys;
306 set_desc_id(desc_sw, -1);
307
308 return desc_sw;
309}
310
311static int ioat_initial_desc_count = 256;
312module_param(ioat_initial_desc_count, int, 0644);
313MODULE_PARM_DESC(ioat_initial_desc_count,
314 "ioat1: initial descriptors per channel (default: 256)");
315/**
316 * ioat1_dma_alloc_chan_resources - returns the number of allocated descriptors
317 * @chan: the channel to be filled out
318 */
319static int ioat1_dma_alloc_chan_resources(struct dma_chan *c)
320{
321 struct ioat_dma_chan *ioat = to_ioat_chan(c);
322 struct ioat_chan_common *chan = &ioat->base;
323 struct ioat_desc_sw *desc;
324 u32 chanerr;
325 int i;
326 LIST_HEAD(tmp_list);
327
328 /* have we already been set up? */
329 if (!list_empty(&ioat->free_desc))
330 return ioat->desccount;
331
332 /* Setup register to interrupt and write completion status on error */
333 writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET);
334
335 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
336 if (chanerr) {
337 dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr);
338 writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
339 }
340
341 /* Allocate descriptors */
342 for (i = 0; i < ioat_initial_desc_count; i++) {
343 desc = ioat_dma_alloc_descriptor(ioat, GFP_KERNEL);
344 if (!desc) {
345 dev_err(to_dev(chan), "Only %d initial descriptors\n", i);
346 break;
347 }
348 set_desc_id(desc, i);
349 list_add_tail(&desc->node, &tmp_list);
350 }
351 spin_lock_bh(&ioat->desc_lock);
352 ioat->desccount = i;
353 list_splice(&tmp_list, &ioat->free_desc);
354 spin_unlock_bh(&ioat->desc_lock);
355
356 /* allocate a completion writeback area */
357 /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
358 chan->completion = pci_pool_alloc(chan->device->completion_pool,
359 GFP_KERNEL, &chan->completion_dma);
360 memset(chan->completion, 0, sizeof(*chan->completion));
361 writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF,
362 chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
363 writel(((u64) chan->completion_dma) >> 32,
364 chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
365
366 tasklet_enable(&chan->cleanup_task);
367 ioat1_dma_start_null_desc(ioat); /* give chain to dma device */
368 dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n",
369 __func__, ioat->desccount);
370 return ioat->desccount;
371}
372
373/**
374 * ioat1_dma_free_chan_resources - release all the descriptors
375 * @chan: the channel to be cleaned
376 */
377static void ioat1_dma_free_chan_resources(struct dma_chan *c)
378{
379 struct ioat_dma_chan *ioat = to_ioat_chan(c);
380 struct ioat_chan_common *chan = &ioat->base;
381 struct ioatdma_device *ioatdma_device = chan->device;
382 struct ioat_desc_sw *desc, *_desc;
383 int in_use_descs = 0;
384
385 /* Before freeing channel resources first check
386 * if they have been previously allocated for this channel.
387 */
388 if (ioat->desccount == 0)
389 return;
390
391 tasklet_disable(&chan->cleanup_task);
392 del_timer_sync(&chan->timer);
393 ioat1_cleanup(ioat);
394
395 /* Delay 100ms after reset to allow internal DMA logic to quiesce
396 * before removing DMA descriptor resources.
397 */
398 writeb(IOAT_CHANCMD_RESET,
399 chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
400 mdelay(100);
401
402 spin_lock_bh(&ioat->desc_lock);
403 list_for_each_entry_safe(desc, _desc, &ioat->used_desc, node) {
404 dev_dbg(to_dev(chan), "%s: freeing %d from used list\n",
405 __func__, desc_id(desc));
406 dump_desc_dbg(ioat, desc);
407 in_use_descs++;
408 list_del(&desc->node);
409 pci_pool_free(ioatdma_device->dma_pool, desc->hw,
410 desc->txd.phys);
411 kfree(desc);
412 }
413 list_for_each_entry_safe(desc, _desc,
414 &ioat->free_desc, node) {
415 list_del(&desc->node);
416 pci_pool_free(ioatdma_device->dma_pool, desc->hw,
417 desc->txd.phys);
418 kfree(desc);
419 }
420 spin_unlock_bh(&ioat->desc_lock);
421
422 pci_pool_free(ioatdma_device->completion_pool,
423 chan->completion,
424 chan->completion_dma);
425
426 /* one is ok since we left it on there on purpose */
427 if (in_use_descs > 1)
428 dev_err(to_dev(chan), "Freeing %d in use descriptors!\n",
429 in_use_descs - 1);
430
431 chan->last_completion = 0;
432 chan->completion_dma = 0;
433 ioat->pending = 0;
434 ioat->desccount = 0;
435}
436
437/**
438 * ioat1_dma_get_next_descriptor - return the next available descriptor
439 * @ioat: IOAT DMA channel handle
440 *
441 * Gets the next descriptor from the chain, and must be called with the
442 * channel's desc_lock held. Allocates more descriptors if the channel
443 * has run out.
444 */
445static struct ioat_desc_sw *
446ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat)
447{
448 struct ioat_desc_sw *new;
449
450 if (!list_empty(&ioat->free_desc)) {
451 new = to_ioat_desc(ioat->free_desc.next);
452 list_del(&new->node);
453 } else {
454 /* try to get another desc */
455 new = ioat_dma_alloc_descriptor(ioat, GFP_ATOMIC);
456 if (!new) {
457 dev_err(to_dev(&ioat->base), "alloc failed\n");
458 return NULL;
459 }
460 }
461 dev_dbg(to_dev(&ioat->base), "%s: allocated: %d\n",
462 __func__, desc_id(new));
463 prefetch(new->hw);
464 return new;
465}
466
467static struct dma_async_tx_descriptor *
468ioat1_dma_prep_memcpy(struct dma_chan *c, dma_addr_t dma_dest,
469 dma_addr_t dma_src, size_t len, unsigned long flags)
470{
471 struct ioat_dma_chan *ioat = to_ioat_chan(c);
472 struct ioat_desc_sw *desc;
473 size_t copy;
474 LIST_HEAD(chain);
475 dma_addr_t src = dma_src;
476 dma_addr_t dest = dma_dest;
477 size_t total_len = len;
478 struct ioat_dma_descriptor *hw = NULL;
479 int tx_cnt = 0;
480
481 spin_lock_bh(&ioat->desc_lock);
482 desc = ioat1_dma_get_next_descriptor(ioat);
483 do {
484 if (!desc)
485 break;
486
487 tx_cnt++;
488 copy = min_t(size_t, len, ioat->xfercap);
489
490 hw = desc->hw;
491 hw->size = copy;
492 hw->ctl = 0;
493 hw->src_addr = src;
494 hw->dst_addr = dest;
495
496 list_add_tail(&desc->node, &chain);
497
498 len -= copy;
499 dest += copy;
500 src += copy;
501 if (len) {
502 struct ioat_desc_sw *next;
503
504 async_tx_ack(&desc->txd);
505 next = ioat1_dma_get_next_descriptor(ioat);
506 hw->next = next ? next->txd.phys : 0;
507 dump_desc_dbg(ioat, desc);
508 desc = next;
509 } else
510 hw->next = 0;
511 } while (len);
512
513 if (!desc) {
514 struct ioat_chan_common *chan = &ioat->base;
515
516 dev_err(to_dev(chan),
517 "chan%d - get_next_desc failed\n", chan_num(chan));
518 list_splice(&chain, &ioat->free_desc);
519 spin_unlock_bh(&ioat->desc_lock);
520 return NULL;
521 }
522 spin_unlock_bh(&ioat->desc_lock);
523
524 desc->txd.flags = flags;
525 desc->len = total_len;
526 list_splice(&chain, &desc->tx_list);
527 hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
528 hw->ctl_f.compl_write = 1;
529 hw->tx_cnt = tx_cnt;
530 dump_desc_dbg(ioat, desc);
531
532 return &desc->txd;
533}
534
535static void ioat1_cleanup_tasklet(unsigned long data)
536{
537 struct ioat_dma_chan *chan = (void *)data;
538
539 ioat1_cleanup(chan);
540 writew(IOAT_CHANCTRL_RUN, chan->base.reg_base + IOAT_CHANCTRL_OFFSET);
541}
542
543void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
544 size_t len, struct ioat_dma_descriptor *hw)
545{
546 struct pci_dev *pdev = chan->device->pdev;
547 size_t offset = len - hw->size;
548
549 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
550 ioat_unmap(pdev, hw->dst_addr - offset, len,
551 PCI_DMA_FROMDEVICE, flags, 1);
552
553 if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP))
554 ioat_unmap(pdev, hw->src_addr - offset, len,
555 PCI_DMA_TODEVICE, flags, 0);
556}
557
558unsigned long ioat_get_current_completion(struct ioat_chan_common *chan)
559{
560 unsigned long phys_complete;
561 u64 completion;
562
563 completion = *chan->completion;
564 phys_complete = ioat_chansts_to_addr(completion);
565
566 dev_dbg(to_dev(chan), "%s: phys_complete: %#llx\n", __func__,
567 (unsigned long long) phys_complete);
568
569 if (is_ioat_halted(completion)) {
570 u32 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
571 dev_err(to_dev(chan), "Channel halted, chanerr = %x\n",
572 chanerr);
573
574 /* TODO do something to salvage the situation */
575 }
576
577 return phys_complete;
578}
579
580bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
581 unsigned long *phys_complete)
582{
583 *phys_complete = ioat_get_current_completion(chan);
584 if (*phys_complete == chan->last_completion)
585 return false;
586 clear_bit(IOAT_COMPLETION_ACK, &chan->state);
587 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
588
589 return true;
590}
591
592static void __cleanup(struct ioat_dma_chan *ioat, unsigned long phys_complete)
593{
594 struct ioat_chan_common *chan = &ioat->base;
595 struct list_head *_desc, *n;
596 struct dma_async_tx_descriptor *tx;
597
598 dev_dbg(to_dev(chan), "%s: phys_complete: %lx\n",
599 __func__, phys_complete);
600 list_for_each_safe(_desc, n, &ioat->used_desc) {
601 struct ioat_desc_sw *desc;
602
603 prefetch(n);
604 desc = list_entry(_desc, typeof(*desc), node);
605 tx = &desc->txd;
606 /*
607 * Incoming DMA requests may use multiple descriptors,
608 * due to exceeding xfercap, perhaps. If so, only the
609 * last one will have a cookie, and require unmapping.
610 */
611 dump_desc_dbg(ioat, desc);
612 if (tx->cookie) {
613 chan->completed_cookie = tx->cookie;
614 tx->cookie = 0;
615 ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
616 ioat->active -= desc->hw->tx_cnt;
617 if (tx->callback) {
618 tx->callback(tx->callback_param);
619 tx->callback = NULL;
620 }
621 }
622
623 if (tx->phys != phys_complete) {
624 /*
625 * a completed entry, but not the last, so clean
626 * up if the client is done with the descriptor
627 */
628 if (async_tx_test_ack(tx))
629 list_move_tail(&desc->node, &ioat->free_desc);
630 } else {
631 /*
632 * last used desc. Do not remove, so we can
633 * append from it.
634 */
635
636 /* if nothing else is pending, cancel the
637 * completion timeout
638 */
639 if (n == &ioat->used_desc) {
640 dev_dbg(to_dev(chan),
641 "%s cancel completion timeout\n",
642 __func__);
643 clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
644 }
645
646 /* TODO check status bits? */
647 break;
648 }
649 }
650
651 chan->last_completion = phys_complete;
652}
653
654/**
 655 * ioat1_cleanup - clean up finished descriptors
656 * @chan: ioat channel to be cleaned up
657 *
658 * To prevent lock contention we defer cleanup when the locks are
659 * contended with a terminal timeout that forces cleanup and catches
660 * completion notification errors.
661 */
662static void ioat1_cleanup(struct ioat_dma_chan *ioat)
663{
664 struct ioat_chan_common *chan = &ioat->base;
665 unsigned long phys_complete;
666
667 prefetch(chan->completion);
668
669 if (!spin_trylock_bh(&chan->cleanup_lock))
670 return;
671
672 if (!ioat_cleanup_preamble(chan, &phys_complete)) {
673 spin_unlock_bh(&chan->cleanup_lock);
674 return;
675 }
676
677 if (!spin_trylock_bh(&ioat->desc_lock)) {
678 spin_unlock_bh(&chan->cleanup_lock);
679 return;
680 }
681
682 __cleanup(ioat, phys_complete);
683
684 spin_unlock_bh(&ioat->desc_lock);
685 spin_unlock_bh(&chan->cleanup_lock);
686}
687
688static void ioat1_timer_event(unsigned long data)
689{
690 struct ioat_dma_chan *ioat = (void *) data;
691 struct ioat_chan_common *chan = &ioat->base;
692
693 dev_dbg(to_dev(chan), "%s: state: %lx\n", __func__, chan->state);
694
695 spin_lock_bh(&chan->cleanup_lock);
696 if (test_and_clear_bit(IOAT_RESET_PENDING, &chan->state)) {
697 struct ioat_desc_sw *desc;
698
699 spin_lock_bh(&ioat->desc_lock);
700
701 /* restart active descriptors */
702 desc = to_ioat_desc(ioat->used_desc.prev);
703 ioat_set_chainaddr(ioat, desc->txd.phys);
704 ioat_start(chan);
705
706 ioat->pending = 0;
707 set_bit(IOAT_COMPLETION_PENDING, &chan->state);
708 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
709 spin_unlock_bh(&ioat->desc_lock);
710 } else if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
711 unsigned long phys_complete;
712
713 spin_lock_bh(&ioat->desc_lock);
714 /* if we haven't made progress and we have already
715 * acknowledged a pending completion once, then be more
716 * forceful with a restart
717 */
718 if (ioat_cleanup_preamble(chan, &phys_complete))
719 __cleanup(ioat, phys_complete);
720 else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
721 ioat1_reset_channel(ioat);
722 else {
723 u64 status = ioat_chansts(chan);
724
725 /* manually update the last completion address */
726 if (ioat_chansts_to_addr(status) != 0)
727 *chan->completion = status;
728
729 set_bit(IOAT_COMPLETION_ACK, &chan->state);
730 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
731 }
732 spin_unlock_bh(&ioat->desc_lock);
733 }
734 spin_unlock_bh(&chan->cleanup_lock);
735}
736
737static enum dma_status
738ioat1_dma_is_complete(struct dma_chan *c, dma_cookie_t cookie,
739 dma_cookie_t *done, dma_cookie_t *used)
740{
741 struct ioat_dma_chan *ioat = to_ioat_chan(c);
742
743 if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
744 return DMA_SUCCESS;
745
746 ioat1_cleanup(ioat);
747
748 return ioat_is_complete(c, cookie, done, used);
749}
750
751static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat)
752{
753 struct ioat_chan_common *chan = &ioat->base;
754 struct ioat_desc_sw *desc;
755 struct ioat_dma_descriptor *hw;
756
757 spin_lock_bh(&ioat->desc_lock);
758
759 desc = ioat1_dma_get_next_descriptor(ioat);
760
761 if (!desc) {
762 dev_err(to_dev(chan),
763 "Unable to start null desc - get next desc failed\n");
764 spin_unlock_bh(&ioat->desc_lock);
765 return;
766 }
767
768 hw = desc->hw;
769 hw->ctl = 0;
770 hw->ctl_f.null = 1;
771 hw->ctl_f.int_en = 1;
772 hw->ctl_f.compl_write = 1;
773 /* set size to non-zero value (channel returns error when size is 0) */
774 hw->size = NULL_DESC_BUFFER_SIZE;
775 hw->src_addr = 0;
776 hw->dst_addr = 0;
777 async_tx_ack(&desc->txd);
778 hw->next = 0;
779 list_add_tail(&desc->node, &ioat->used_desc);
780 dump_desc_dbg(ioat, desc);
781
782 ioat_set_chainaddr(ioat, desc->txd.phys);
783 ioat_start(chan);
784 spin_unlock_bh(&ioat->desc_lock);
785}
786
787/*
 788 * Perform an IOAT transaction to verify the HW works.
789 */
790#define IOAT_TEST_SIZE 2000
791
792static void __devinit ioat_dma_test_callback(void *dma_async_param)
793{
794 struct completion *cmp = dma_async_param;
795
796 complete(cmp);
797}
798
799/**
 800 * ioat_dma_self_test - Perform an IOAT transaction to verify the HW works.
801 * @device: device to be tested
802 */
803int __devinit ioat_dma_self_test(struct ioatdma_device *device)
804{
805 int i;
806 u8 *src;
807 u8 *dest;
808 struct dma_device *dma = &device->common;
809 struct device *dev = &device->pdev->dev;
810 struct dma_chan *dma_chan;
811 struct dma_async_tx_descriptor *tx;
812 dma_addr_t dma_dest, dma_src;
813 dma_cookie_t cookie;
814 int err = 0;
815 struct completion cmp;
816 unsigned long tmo;
817 unsigned long flags;
818
819 src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
820 if (!src)
821 return -ENOMEM;
822 dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
823 if (!dest) {
824 kfree(src);
825 return -ENOMEM;
826 }
827
828 /* Fill in src buffer */
829 for (i = 0; i < IOAT_TEST_SIZE; i++)
830 src[i] = (u8)i;
831
832 /* Start copy, using first DMA channel */
833 dma_chan = container_of(dma->channels.next, struct dma_chan,
834 device_node);
835 if (dma->device_alloc_chan_resources(dma_chan) < 1) {
836 dev_err(dev, "selftest cannot allocate chan resource\n");
837 err = -ENODEV;
838 goto out;
839 }
840
841 dma_src = dma_map_single(dev, src, IOAT_TEST_SIZE, DMA_TO_DEVICE);
842 dma_dest = dma_map_single(dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE);
843 flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE |
844 DMA_PREP_INTERRUPT;
845 tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
846 IOAT_TEST_SIZE, flags);
847 if (!tx) {
848 dev_err(dev, "Self-test prep failed, disabling\n");
849 err = -ENODEV;
850 goto free_resources;
851 }
852
853 async_tx_ack(tx);
854 init_completion(&cmp);
855 tx->callback = ioat_dma_test_callback;
856 tx->callback_param = &cmp;
857 cookie = tx->tx_submit(tx);
858 if (cookie < 0) {
859 dev_err(dev, "Self-test setup failed, disabling\n");
860 err = -ENODEV;
861 goto free_resources;
862 }
863 dma->device_issue_pending(dma_chan);
864
865 tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
866
867 if (tmo == 0 ||
868 dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL)
869 != DMA_SUCCESS) {
870 dev_err(dev, "Self-test copy timed out, disabling\n");
871 err = -ENODEV;
872 goto free_resources;
873 }
874 if (memcmp(src, dest, IOAT_TEST_SIZE)) {
875 dev_err(dev, "Self-test copy failed compare, disabling\n");
876 err = -ENODEV;
877 goto free_resources;
878 }
879
880free_resources:
881 dma->device_free_chan_resources(dma_chan);
882out:
883 kfree(src);
884 kfree(dest);
885 return err;
886}
887
888static char ioat_interrupt_style[32] = "msix";
889module_param_string(ioat_interrupt_style, ioat_interrupt_style,
890 sizeof(ioat_interrupt_style), 0644);
891MODULE_PARM_DESC(ioat_interrupt_style,
892 "set ioat interrupt style: msix (default), "
893 "msix-single-vector, msi, intx)");
894
895/**
896 * ioat_dma_setup_interrupts - setup interrupt handler
897 * @device: ioat device
898 */
899static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
900{
901 struct ioat_chan_common *chan;
902 struct pci_dev *pdev = device->pdev;
903 struct device *dev = &pdev->dev;
904 struct msix_entry *msix;
905 int i, j, msixcnt;
906 int err = -EINVAL;
907 u8 intrctrl = 0;
908
909 if (!strcmp(ioat_interrupt_style, "msix"))
910 goto msix;
911 if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
912 goto msix_single_vector;
913 if (!strcmp(ioat_interrupt_style, "msi"))
914 goto msi;
915 if (!strcmp(ioat_interrupt_style, "intx"))
916 goto intx;
917 dev_err(dev, "invalid ioat_interrupt_style %s\n", ioat_interrupt_style);
918 goto err_no_irq;
919
920msix:
921 /* The number of MSI-X vectors should equal the number of channels */
922 msixcnt = device->common.chancnt;
923 for (i = 0; i < msixcnt; i++)
924 device->msix_entries[i].entry = i;
925
926 err = pci_enable_msix(pdev, device->msix_entries, msixcnt);
927 if (err < 0)
928 goto msi;
929 if (err > 0)
930 goto msix_single_vector;
931
932 for (i = 0; i < msixcnt; i++) {
933 msix = &device->msix_entries[i];
934 chan = ioat_chan_by_index(device, i);
935 err = devm_request_irq(dev, msix->vector,
936 ioat_dma_do_interrupt_msix, 0,
937 "ioat-msix", chan);
938 if (err) {
939 for (j = 0; j < i; j++) {
940 msix = &device->msix_entries[j];
941 chan = ioat_chan_by_index(device, j);
942 devm_free_irq(dev, msix->vector, chan);
943 }
944 goto msix_single_vector;
945 }
946 }
947 intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
948 goto done;
949
950msix_single_vector:
951 msix = &device->msix_entries[0];
952 msix->entry = 0;
953 err = pci_enable_msix(pdev, device->msix_entries, 1);
954 if (err)
955 goto msi;
956
957 err = devm_request_irq(dev, msix->vector, ioat_dma_do_interrupt, 0,
958 "ioat-msix", device);
959 if (err) {
960 pci_disable_msix(pdev);
961 goto msi;
962 }
963 goto done;
964
965msi:
966 err = pci_enable_msi(pdev);
967 if (err)
968 goto intx;
969
970 err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt, 0,
971 "ioat-msi", device);
972 if (err) {
973 pci_disable_msi(pdev);
974 goto intx;
975 }
976 goto done;
977
978intx:
979 err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt,
980 IRQF_SHARED, "ioat-intx", device);
981 if (err)
982 goto err_no_irq;
983
984done:
985 if (device->intr_quirk)
986 device->intr_quirk(device);
987 intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN;
988 writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET);
989 return 0;
990
991err_no_irq:
992 /* Disable all interrupt generation */
993 writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
994 dev_err(dev, "no usable interrupts\n");
995 return err;
996}
997
998static void ioat_disable_interrupts(struct ioatdma_device *device)
999{
1000 /* Disable all interrupt generation */
1001 writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
1002}
1003
1004int __devinit ioat_probe(struct ioatdma_device *device)
1005{
1006 int err = -ENODEV;
1007 struct dma_device *dma = &device->common;
1008 struct pci_dev *pdev = device->pdev;
1009 struct device *dev = &pdev->dev;
1010
1011 /* DMA coherent memory pool for DMA descriptor allocations */
1012 device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
1013 sizeof(struct ioat_dma_descriptor),
1014 64, 0);
1015 if (!device->dma_pool) {
1016 err = -ENOMEM;
1017 goto err_dma_pool;
1018 }
1019
1020 device->completion_pool = pci_pool_create("completion_pool", pdev,
1021 sizeof(u64), SMP_CACHE_BYTES,
1022 SMP_CACHE_BYTES);
1023
1024 if (!device->completion_pool) {
1025 err = -ENOMEM;
1026 goto err_completion_pool;
1027 }
1028
1029 device->enumerate_channels(device);
1030
1031 dma_cap_set(DMA_MEMCPY, dma->cap_mask);
1032 dma->dev = &pdev->dev;
1033
1034 if (!dma->chancnt) {
1035 dev_err(dev, "zero channels detected\n");
1036 goto err_setup_interrupts;
1037 }
1038
1039 err = ioat_dma_setup_interrupts(device);
1040 if (err)
1041 goto err_setup_interrupts;
1042
1043 err = device->self_test(device);
1044 if (err)
1045 goto err_self_test;
1046
1047 return 0;
1048
1049err_self_test:
1050 ioat_disable_interrupts(device);
1051err_setup_interrupts:
1052 pci_pool_destroy(device->completion_pool);
1053err_completion_pool:
1054 pci_pool_destroy(device->dma_pool);
1055err_dma_pool:
1056 return err;
1057}
1058
1059int __devinit ioat_register(struct ioatdma_device *device)
1060{
1061 int err = dma_async_device_register(&device->common);
1062
1063 if (err) {
1064 ioat_disable_interrupts(device);
1065 pci_pool_destroy(device->completion_pool);
1066 pci_pool_destroy(device->dma_pool);
1067 }
1068
1069 return err;
1070}
1071
1072/* ioat1_intr_quirk - fix up dma ctrl register to enable / disable msi */
1073static void ioat1_intr_quirk(struct ioatdma_device *device)
1074{
1075 struct pci_dev *pdev = device->pdev;
1076 u32 dmactrl;
1077
1078 pci_read_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, &dmactrl);
1079 if (pdev->msi_enabled)
1080 dmactrl |= IOAT_PCI_DMACTRL_MSI_EN;
1081 else
1082 dmactrl &= ~IOAT_PCI_DMACTRL_MSI_EN;
1083 pci_write_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, dmactrl);
1084}
1085
1086static ssize_t ring_size_show(struct dma_chan *c, char *page)
1087{
1088 struct ioat_dma_chan *ioat = to_ioat_chan(c);
1089
1090 return sprintf(page, "%d\n", ioat->desccount);
1091}
1092static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size);
1093
1094static ssize_t ring_active_show(struct dma_chan *c, char *page)
1095{
1096 struct ioat_dma_chan *ioat = to_ioat_chan(c);
1097
1098 return sprintf(page, "%d\n", ioat->active);
1099}
1100static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active);
1101
1102static ssize_t cap_show(struct dma_chan *c, char *page)
1103{
1104 struct dma_device *dma = c->device;
1105
1106 return sprintf(page, "copy%s%s%s%s%s%s\n",
1107 dma_has_cap(DMA_PQ, dma->cap_mask) ? " pq" : "",
1108 dma_has_cap(DMA_PQ_VAL, dma->cap_mask) ? " pq_val" : "",
1109 dma_has_cap(DMA_XOR, dma->cap_mask) ? " xor" : "",
1110 dma_has_cap(DMA_XOR_VAL, dma->cap_mask) ? " xor_val" : "",
1111 dma_has_cap(DMA_MEMSET, dma->cap_mask) ? " fill" : "",
1112 dma_has_cap(DMA_INTERRUPT, dma->cap_mask) ? " intr" : "");
1113
1114}
1115struct ioat_sysfs_entry ioat_cap_attr = __ATTR_RO(cap);
1116
1117static ssize_t version_show(struct dma_chan *c, char *page)
1118{
1119 struct dma_device *dma = c->device;
1120 struct ioatdma_device *device = to_ioatdma_device(dma);
1121
1122 return sprintf(page, "%d.%d\n",
1123 device->version >> 4, device->version & 0xf);
1124}
1125struct ioat_sysfs_entry ioat_version_attr = __ATTR_RO(version);
1126
1127static struct attribute *ioat1_attrs[] = {
1128 &ring_size_attr.attr,
1129 &ring_active_attr.attr,
1130 &ioat_cap_attr.attr,
1131 &ioat_version_attr.attr,
1132 NULL,
1133};
1134
1135static ssize_t
1136ioat_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1137{
1138 struct ioat_sysfs_entry *entry;
1139 struct ioat_chan_common *chan;
1140
1141 entry = container_of(attr, struct ioat_sysfs_entry, attr);
1142 chan = container_of(kobj, struct ioat_chan_common, kobj);
1143
1144 if (!entry->show)
1145 return -EIO;
1146 return entry->show(&chan->common, page);
1147}
1148
1149struct sysfs_ops ioat_sysfs_ops = {
1150 .show = ioat_attr_show,
1151};
1152
1153static struct kobj_type ioat1_ktype = {
1154 .sysfs_ops = &ioat_sysfs_ops,
1155 .default_attrs = ioat1_attrs,
1156};
1157
1158void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type)
1159{
1160 struct dma_device *dma = &device->common;
1161 struct dma_chan *c;
1162
1163 list_for_each_entry(c, &dma->channels, device_node) {
1164 struct ioat_chan_common *chan = to_chan_common(c);
1165 struct kobject *parent = &c->dev->device.kobj;
1166 int err;
1167
1168 err = kobject_init_and_add(&chan->kobj, type, parent, "quickdata");
1169 if (err) {
1170 dev_warn(to_dev(chan),
1171 "sysfs init error (%d), continuing...\n", err);
1172 kobject_put(&chan->kobj);
1173 set_bit(IOAT_KOBJ_INIT_FAIL, &chan->state);
1174 }
1175 }
1176}
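/*
 * Illustration, not part of this patch: once the "quickdata" kobject is added
 * under each channel's device node, the attributes defined above become
 * readable from userspace.  Assuming the usual /sys/class/dma layout, a
 * session might look like (values are examples only):
 *
 *	# cat /sys/class/dma/dma0chan0/quickdata/cap
 *	copy
 *	# cat /sys/class/dma/dma0chan0/quickdata/ring_size
 *	256
 */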
1177
1178void ioat_kobject_del(struct ioatdma_device *device)
1179{
1180 struct dma_device *dma = &device->common;
1181 struct dma_chan *c;
1182
1183 list_for_each_entry(c, &dma->channels, device_node) {
1184 struct ioat_chan_common *chan = to_chan_common(c);
1185
1186 if (!test_bit(IOAT_KOBJ_INIT_FAIL, &chan->state)) {
1187 kobject_del(&chan->kobj);
1188 kobject_put(&chan->kobj);
1189 }
1190 }
1191}
1192
1193int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca)
1194{
1195 struct pci_dev *pdev = device->pdev;
1196 struct dma_device *dma;
1197 int err;
1198
1199 device->intr_quirk = ioat1_intr_quirk;
1200 device->enumerate_channels = ioat1_enumerate_channels;
1201 device->self_test = ioat_dma_self_test;
1202 dma = &device->common;
1203 dma->device_prep_dma_memcpy = ioat1_dma_prep_memcpy;
1204 dma->device_issue_pending = ioat1_dma_memcpy_issue_pending;
1205 dma->device_alloc_chan_resources = ioat1_dma_alloc_chan_resources;
1206 dma->device_free_chan_resources = ioat1_dma_free_chan_resources;
1207 dma->device_is_tx_complete = ioat1_dma_is_complete;
1208
1209 err = ioat_probe(device);
1210 if (err)
1211 return err;
1212 ioat_set_tcp_copy_break(4096);
1213 err = ioat_register(device);
1214 if (err)
1215 return err;
1216 ioat_kobject_add(device, &ioat1_ktype);
1217
1218 if (dca)
1219 device->dca = ioat_dca_init(pdev, device->reg_base);
1220
1221 return err;
1222}
1223
1224void __devexit ioat_dma_remove(struct ioatdma_device *device)
1225{
1226 struct dma_device *dma = &device->common;
1227
1228 ioat_disable_interrupts(device);
1229
1230 ioat_kobject_del(device);
1231
1232 dma_async_device_unregister(dma);
1233
1234 pci_pool_destroy(device->dma_pool);
1235 pci_pool_destroy(device->completion_pool);
1236
1237 INIT_LIST_HEAD(&dma->channels);
1238}
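For context only: a minimal sketch of how a kernel client might exercise a
memcpy channel registered by ioat_register() through the generic dmaengine
client API of this era. It is not part of the patch; the buffer size, the
polling loop, and the use of dma_async_memcpy_buf_to_buf() as the offload
helper are illustrative assumptions.

#include <linux/dmaengine.h>
#include <linux/slab.h>
#include <linux/string.h>

static int ioat_memcpy_example(void)
{
	struct dma_chan *chan;
	dma_cookie_t cookie;
	void *src, *dst;
	int err = -ENODEV;

	src = kzalloc(PAGE_SIZE, GFP_KERNEL);
	dst = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!src || !dst) {
		err = -ENOMEM;
		goto out;
	}
	memset(src, 0xa5, PAGE_SIZE);

	dmaengine_get();			/* become a dmaengine client */
	chan = dma_find_channel(DMA_MEMCPY);	/* any channel advertising memcpy */
	if (!chan)
		goto put;

	cookie = dma_async_memcpy_buf_to_buf(chan, dst, src, PAGE_SIZE);
	if (cookie < 0) {
		err = cookie;
		goto put;
	}
	dma_async_issue_pending(chan);		/* kick the hardware */

	/* poll for completion; a real client would sleep or use a callback */
	while (dma_async_is_tx_complete(chan, cookie, NULL, NULL) == DMA_IN_PROGRESS)
		cpu_relax();

	err = memcmp(src, dst, PAGE_SIZE) ? -EIO : 0;
put:
	dmaengine_put();
out:
	kfree(dst);
	kfree(src);
	return err;
}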
diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h
new file mode 100644
index 000000000000..c14fdfeb7f33
--- /dev/null
+++ b/drivers/dma/ioat/dma.h
@@ -0,0 +1,337 @@
1/*
2 * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59
16 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * The full GNU General Public License is included in this distribution in the
19 * file called COPYING.
20 */
21#ifndef IOATDMA_H
22#define IOATDMA_H
23
24#include <linux/dmaengine.h>
25#include "hw.h"
26#include "registers.h"
27#include <linux/init.h>
28#include <linux/dmapool.h>
29#include <linux/cache.h>
30#include <linux/pci_ids.h>
31#include <net/tcp.h>
32
33#define IOAT_DMA_VERSION "4.00"
34
35#define IOAT_LOW_COMPLETION_MASK 0xffffffc0
36#define IOAT_DMA_DCA_ANY_CPU ~0
37
38#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common)
39#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
40#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, txd)
41#define to_dev(ioat_chan) (&(ioat_chan)->device->pdev->dev)
42
43#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
44
45/*
46 * workaround for IOAT ver.3.0 null descriptor issue
47 * (channel returns error when size is 0)
48 */
49#define NULL_DESC_BUFFER_SIZE 1
50
51/**
52 * struct ioatdma_device - internal representation of a IOAT device
53 * @pdev: PCI-Express device
54 * @reg_base: MMIO register space base address
55 * @dma_pool: for allocating DMA descriptors
 * @completion_pool: for allocating completion writeback areas
56 * @common: embedded struct dma_device
57 * @version: version of ioatdma device
58 * @msix_entries: irq handlers
59 * @idx: per channel data
60 * @dca: direct cache access context
61 * @intr_quirk: interrupt setup quirk (for ioat_v1 devices)
62 * @enumerate_channels: hw version specific channel enumeration
63 * @cleanup_tasklet: select between the v2 and v3 cleanup routines
64 * @timer_fn: select between the v2 and v3 timer watchdog routines
65 * @self_test: hardware version specific self test for each supported op type
66 *
67 * Note: the v3 cleanup routine supports raid operations
68 */
69struct ioatdma_device {
70 struct pci_dev *pdev;
71 void __iomem *reg_base;
72 struct pci_pool *dma_pool;
73 struct pci_pool *completion_pool;
74 struct dma_device common;
75 u8 version;
76 struct msix_entry msix_entries[4];
77 struct ioat_chan_common *idx[4];
78 struct dca_provider *dca;
79 void (*intr_quirk)(struct ioatdma_device *device);
80 int (*enumerate_channels)(struct ioatdma_device *device);
81 void (*cleanup_tasklet)(unsigned long data);
82 void (*timer_fn)(unsigned long data);
83 int (*self_test)(struct ioatdma_device *device);
84};
85
86struct ioat_chan_common {
87 struct dma_chan common;
88 void __iomem *reg_base;
89 unsigned long last_completion;
90 spinlock_t cleanup_lock;
91 dma_cookie_t completed_cookie;
92 unsigned long state;
93 #define IOAT_COMPLETION_PENDING 0
94 #define IOAT_COMPLETION_ACK 1
95 #define IOAT_RESET_PENDING 2
96 #define IOAT_KOBJ_INIT_FAIL 3
97 struct timer_list timer;
98 #define COMPLETION_TIMEOUT msecs_to_jiffies(100)
99 #define IDLE_TIMEOUT msecs_to_jiffies(2000)
100 #define RESET_DELAY msecs_to_jiffies(100)
101 struct ioatdma_device *device;
102 dma_addr_t completion_dma;
103 u64 *completion;
104 struct tasklet_struct cleanup_task;
105 struct kobject kobj;
106};
107
108struct ioat_sysfs_entry {
109 struct attribute attr;
110 ssize_t (*show)(struct dma_chan *, char *);
111};
112
113/**
114 * struct ioat_dma_chan - internal representation of a DMA channel
115 */
116struct ioat_dma_chan {
117 struct ioat_chan_common base;
118
119 size_t xfercap; /* XFERCAP register value expanded out */
120
121 spinlock_t desc_lock;
122 struct list_head free_desc;
123 struct list_head used_desc;
124
125 int pending;
126 u16 desccount;
127 u16 active;
128};
129
130static inline struct ioat_chan_common *to_chan_common(struct dma_chan *c)
131{
132 return container_of(c, struct ioat_chan_common, common);
133}
134
135static inline struct ioat_dma_chan *to_ioat_chan(struct dma_chan *c)
136{
137 struct ioat_chan_common *chan = to_chan_common(c);
138
139 return container_of(chan, struct ioat_dma_chan, base);
140}
141
142/**
143 * ioat_is_complete - poll the status of an ioat transaction
144 * @c: channel handle
145 * @cookie: transaction identifier
146 * @done: if set, updated with last completed transaction
147 * @used: if set, updated with last used transaction
148 */
149static inline enum dma_status
150ioat_is_complete(struct dma_chan *c, dma_cookie_t cookie,
151 dma_cookie_t *done, dma_cookie_t *used)
152{
153 struct ioat_chan_common *chan = to_chan_common(c);
154 dma_cookie_t last_used;
155 dma_cookie_t last_complete;
156
157 last_used = c->cookie;
158 last_complete = chan->completed_cookie;
159
160 if (done)
161 *done = last_complete;
162 if (used)
163 *used = last_used;
164
165 return dma_async_is_complete(cookie, last_complete, last_used);
166}
167
168/* wrapper around hardware descriptor format + additional software fields */
169
170/**
171 * struct ioat_desc_sw - wrapper around hardware descriptor
172 * @hw: hardware DMA descriptor (for memcpy)
173 * @node: this descriptor will either be on the free list,
174 * or attached to a transaction list (tx_list)
 * @len: total transaction length, kept for unmapping at completion time
 * @tx_list: list of descriptors that make up this transaction
175 * @txd: the generic software descriptor for all engines
176 * @id: identifier for debug
177 */
178struct ioat_desc_sw {
179 struct ioat_dma_descriptor *hw;
180 struct list_head node;
181 size_t len;
182 struct list_head tx_list;
183 struct dma_async_tx_descriptor txd;
184 #ifdef DEBUG
185 int id;
186 #endif
187};
188
189#ifdef DEBUG
190#define set_desc_id(desc, i) ((desc)->id = (i))
191#define desc_id(desc) ((desc)->id)
192#else
193#define set_desc_id(desc, i)
194#define desc_id(desc) (0)
195#endif
196
197static inline void
198__dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw,
199 struct dma_async_tx_descriptor *tx, int id)
200{
201 struct device *dev = to_dev(chan);
202
203 dev_dbg(dev, "desc[%d]: (%#llx->%#llx) cookie: %d flags: %#x"
204 " ctl: %#x (op: %d int_en: %d compl: %d)\n", id,
205 (unsigned long long) tx->phys,
206 (unsigned long long) hw->next, tx->cookie, tx->flags,
207 hw->ctl, hw->ctl_f.op, hw->ctl_f.int_en, hw->ctl_f.compl_write);
208}
209
210#define dump_desc_dbg(c, d) \
211 ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; })
212
213static inline void ioat_set_tcp_copy_break(unsigned long copybreak)
214{
215 #ifdef CONFIG_NET_DMA
216 sysctl_tcp_dma_copybreak = copybreak;
217 #endif
218}
219
220static inline struct ioat_chan_common *
221ioat_chan_by_index(struct ioatdma_device *device, int index)
222{
223 return device->idx[index];
224}
225
226static inline u64 ioat_chansts(struct ioat_chan_common *chan)
227{
228 u8 ver = chan->device->version;
229 u64 status;
230 u32 status_lo;
231
232 /* We need to read the low address first as this causes the
233 * chipset to latch the upper bits for the subsequent read
234 */
235 status_lo = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_LOW(ver));
236 status = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_HIGH(ver));
237 status <<= 32;
238 status |= status_lo;
239
240 return status;
241}
242
243static inline void ioat_start(struct ioat_chan_common *chan)
244{
245 u8 ver = chan->device->version;
246
247 writeb(IOAT_CHANCMD_START, chan->reg_base + IOAT_CHANCMD_OFFSET(ver));
248}
249
250static inline u64 ioat_chansts_to_addr(u64 status)
251{
252 return status & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
253}
254
255static inline u32 ioat_chanerr(struct ioat_chan_common *chan)
256{
257 return readl(chan->reg_base + IOAT_CHANERR_OFFSET);
258}
259
260static inline void ioat_suspend(struct ioat_chan_common *chan)
261{
262 u8 ver = chan->device->version;
263
264 writeb(IOAT_CHANCMD_SUSPEND, chan->reg_base + IOAT_CHANCMD_OFFSET(ver));
265}
266
267static inline void ioat_set_chainaddr(struct ioat_dma_chan *ioat, u64 addr)
268{
269 struct ioat_chan_common *chan = &ioat->base;
270
271 writel(addr & 0x00000000FFFFFFFF,
272 chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
273 writel(addr >> 32,
274 chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
275}
276
277static inline bool is_ioat_active(unsigned long status)
278{
279 return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_ACTIVE);
280}
281
282static inline bool is_ioat_idle(unsigned long status)
283{
284 return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_DONE);
285}
286
287static inline bool is_ioat_halted(unsigned long status)
288{
289 return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_HALTED);
290}
291
292static inline bool is_ioat_suspended(unsigned long status)
293{
294 return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_SUSPENDED);
295}
296
297/* channel was fatally programmed */
298static inline bool is_ioat_bug(unsigned long err)
299{
300 return !!(err & (IOAT_CHANERR_SRC_ADDR_ERR|IOAT_CHANERR_DEST_ADDR_ERR|
301 IOAT_CHANERR_NEXT_ADDR_ERR|IOAT_CHANERR_CONTROL_ERR|
302 IOAT_CHANERR_LENGTH_ERR));
303}
304
305static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len,
306 int direction, enum dma_ctrl_flags flags, bool dst)
307{
308 if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) ||
309 (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE)))
310 pci_unmap_single(pdev, addr, len, direction);
311 else
312 pci_unmap_page(pdev, addr, len, direction);
313}
314
315int __devinit ioat_probe(struct ioatdma_device *device);
316int __devinit ioat_register(struct ioatdma_device *device);
317int __devinit ioat1_dma_probe(struct ioatdma_device *dev, int dca);
318int __devinit ioat_dma_self_test(struct ioatdma_device *device);
319void __devexit ioat_dma_remove(struct ioatdma_device *device);
320struct dca_provider * __devinit ioat_dca_init(struct pci_dev *pdev,
321 void __iomem *iobase);
322unsigned long ioat_get_current_completion(struct ioat_chan_common *chan);
323void ioat_init_channel(struct ioatdma_device *device,
324 struct ioat_chan_common *chan, int idx,
325 void (*timer_fn)(unsigned long),
326 void (*tasklet)(unsigned long),
327 unsigned long ioat);
328void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
329 size_t len, struct ioat_dma_descriptor *hw);
330bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
331 unsigned long *phys_complete);
332void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type);
333void ioat_kobject_del(struct ioatdma_device *device);
334extern struct sysfs_ops ioat_sysfs_ops;
335extern struct ioat_sysfs_entry ioat_version_attr;
336extern struct ioat_sysfs_entry ioat_cap_attr;
337#endif /* IOATDMA_H */
diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c
new file mode 100644
index 000000000000..96ffab7d37a7
--- /dev/null
+++ b/drivers/dma/ioat/dma_v2.c
@@ -0,0 +1,871 @@
1/*
2 * Intel I/OAT DMA Linux driver
3 * Copyright(c) 2004 - 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * The full GNU General Public License is included in this distribution in
19 * the file called "COPYING".
20 *
21 */
22
23/*
24 * This driver supports an Intel I/OAT DMA engine (versions >= 2), which
25 * does asynchronous data movement and checksumming operations.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/pci.h>
31#include <linux/interrupt.h>
32#include <linux/dmaengine.h>
33#include <linux/delay.h>
34#include <linux/dma-mapping.h>
35#include <linux/workqueue.h>
36#include <linux/i7300_idle.h>
37#include "dma.h"
38#include "dma_v2.h"
39#include "registers.h"
40#include "hw.h"
41
42int ioat_ring_alloc_order = 8;
43module_param(ioat_ring_alloc_order, int, 0644);
44MODULE_PARM_DESC(ioat_ring_alloc_order,
45 "ioat2+: allocate 2^n descriptors per channel"
46 " (default: 8 max: 16)");
47static int ioat_ring_max_alloc_order = IOAT_MAX_ORDER;
48module_param(ioat_ring_max_alloc_order, int, 0644);
49MODULE_PARM_DESC(ioat_ring_max_alloc_order,
50 "ioat2+: upper limit for ring size (default: 16)");
51
52void __ioat2_issue_pending(struct ioat2_dma_chan *ioat)
53{
54 void __iomem *reg_base = ioat->base.reg_base;
55
56 ioat->pending = 0;
57 ioat->dmacount += ioat2_ring_pending(ioat);
58 ioat->issued = ioat->head;
59 /* make descriptor updates globally visible before notifying channel */
60 wmb();
61 writew(ioat->dmacount, reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
62 dev_dbg(to_dev(&ioat->base),
63 "%s: head: %#x tail: %#x issued: %#x count: %#x\n",
64 __func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
65}
66
67void ioat2_issue_pending(struct dma_chan *chan)
68{
69 struct ioat2_dma_chan *ioat = to_ioat2_chan(chan);
70
71 spin_lock_bh(&ioat->ring_lock);
72 if (ioat->pending == 1)
73 __ioat2_issue_pending(ioat);
74 spin_unlock_bh(&ioat->ring_lock);
75}
76
77/**
78 * ioat2_update_pending - log pending descriptors
79 * @ioat: ioat2+ channel
80 *
81 * Set pending to '1' unless pending is already set to '2'; pending == 2
82 * indicates that submission is temporarily blocked due to an in-flight
83 * reset.  If we are already above the ioat_pending_level threshold then
84 * just issue pending.
85 *
86 * Called with ring_lock held.
87 */
88static void ioat2_update_pending(struct ioat2_dma_chan *ioat)
89{
90 if (unlikely(ioat->pending == 2))
91 return;
92 else if (ioat2_ring_pending(ioat) > ioat_pending_level)
93 __ioat2_issue_pending(ioat);
94 else
95 ioat->pending = 1;
96}
97
98static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat)
99{
100 struct ioat_ring_ent *desc;
101 struct ioat_dma_descriptor *hw;
102 int idx;
103
104 if (ioat2_ring_space(ioat) < 1) {
105 dev_err(to_dev(&ioat->base),
106 "Unable to start null desc - ring full\n");
107 return;
108 }
109
110 dev_dbg(to_dev(&ioat->base), "%s: head: %#x tail: %#x issued: %#x\n",
111 __func__, ioat->head, ioat->tail, ioat->issued);
112 idx = ioat2_desc_alloc(ioat, 1);
113 desc = ioat2_get_ring_ent(ioat, idx);
114
115 hw = desc->hw;
116 hw->ctl = 0;
117 hw->ctl_f.null = 1;
118 hw->ctl_f.int_en = 1;
119 hw->ctl_f.compl_write = 1;
120 /* set size to non-zero value (channel returns error when size is 0) */
121 hw->size = NULL_DESC_BUFFER_SIZE;
122 hw->src_addr = 0;
123 hw->dst_addr = 0;
124 async_tx_ack(&desc->txd);
125 ioat2_set_chainaddr(ioat, desc->txd.phys);
126 dump_desc_dbg(ioat, desc);
127 __ioat2_issue_pending(ioat);
128}
129
130static void ioat2_start_null_desc(struct ioat2_dma_chan *ioat)
131{
132 spin_lock_bh(&ioat->ring_lock);
133 __ioat2_start_null_desc(ioat);
134 spin_unlock_bh(&ioat->ring_lock);
135}
136
137static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
138{
139 struct ioat_chan_common *chan = &ioat->base;
140 struct dma_async_tx_descriptor *tx;
141 struct ioat_ring_ent *desc;
142 bool seen_current = false;
143 u16 active;
144 int i;
145
146 dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
147 __func__, ioat->head, ioat->tail, ioat->issued);
148
149 active = ioat2_ring_active(ioat);
150 for (i = 0; i < active && !seen_current; i++) {
151 prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
152 desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
153 tx = &desc->txd;
154 dump_desc_dbg(ioat, desc);
155 if (tx->cookie) {
156 ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
157 chan->completed_cookie = tx->cookie;
158 tx->cookie = 0;
159 if (tx->callback) {
160 tx->callback(tx->callback_param);
161 tx->callback = NULL;
162 }
163 }
164
165 if (tx->phys == phys_complete)
166 seen_current = true;
167 }
168 ioat->tail += i;
169 BUG_ON(!seen_current); /* no active descs have written a completion? */
170
171 chan->last_completion = phys_complete;
172 if (ioat->head == ioat->tail) {
173 dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
174 __func__);
175 clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
176 mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
177 }
178}
179
180/**
181 * ioat2_cleanup - clean finished descriptors (advance tail pointer)
182 * @ioat: ioat2+ channel (ring) to be cleaned up
183 */
184static void ioat2_cleanup(struct ioat2_dma_chan *ioat)
185{
186 struct ioat_chan_common *chan = &ioat->base;
187 unsigned long phys_complete;
188
189 prefetch(chan->completion);
190
191 if (!spin_trylock_bh(&chan->cleanup_lock))
192 return;
193
194 if (!ioat_cleanup_preamble(chan, &phys_complete)) {
195 spin_unlock_bh(&chan->cleanup_lock);
196 return;
197 }
198
199 if (!spin_trylock_bh(&ioat->ring_lock)) {
200 spin_unlock_bh(&chan->cleanup_lock);
201 return;
202 }
203
204 __cleanup(ioat, phys_complete);
205
206 spin_unlock_bh(&ioat->ring_lock);
207 spin_unlock_bh(&chan->cleanup_lock);
208}
209
210void ioat2_cleanup_tasklet(unsigned long data)
211{
212 struct ioat2_dma_chan *ioat = (void *) data;
213
214 ioat2_cleanup(ioat);
215 writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
216}
217
218void __ioat2_restart_chan(struct ioat2_dma_chan *ioat)
219{
220 struct ioat_chan_common *chan = &ioat->base;
221
222 /* set the tail to be re-issued */
223 ioat->issued = ioat->tail;
224 ioat->dmacount = 0;
225 set_bit(IOAT_COMPLETION_PENDING, &chan->state);
226 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
227
228 dev_dbg(to_dev(chan),
229 "%s: head: %#x tail: %#x issued: %#x count: %#x\n",
230 __func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
231
232 if (ioat2_ring_pending(ioat)) {
233 struct ioat_ring_ent *desc;
234
235 desc = ioat2_get_ring_ent(ioat, ioat->tail);
236 ioat2_set_chainaddr(ioat, desc->txd.phys);
237 __ioat2_issue_pending(ioat);
238 } else
239 __ioat2_start_null_desc(ioat);
240}
241
242static void ioat2_restart_channel(struct ioat2_dma_chan *ioat)
243{
244 struct ioat_chan_common *chan = &ioat->base;
245 unsigned long phys_complete;
246 u32 status;
247
248 status = ioat_chansts(chan);
249 if (is_ioat_active(status) || is_ioat_idle(status))
250 ioat_suspend(chan);
251 while (is_ioat_active(status) || is_ioat_idle(status)) {
252 status = ioat_chansts(chan);
253 cpu_relax();
254 }
255
256 if (ioat_cleanup_preamble(chan, &phys_complete))
257 __cleanup(ioat, phys_complete);
258
259 __ioat2_restart_chan(ioat);
260}
261
262void ioat2_timer_event(unsigned long data)
263{
264 struct ioat2_dma_chan *ioat = (void *) data;
265 struct ioat_chan_common *chan = &ioat->base;
266
267 spin_lock_bh(&chan->cleanup_lock);
268 if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
269 unsigned long phys_complete;
270 u64 status;
271
272 spin_lock_bh(&ioat->ring_lock);
273 status = ioat_chansts(chan);
274
275 /* when halted due to errors check for channel
276 * programming errors before advancing the completion state
277 */
278 if (is_ioat_halted(status)) {
279 u32 chanerr;
280
281 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
282 BUG_ON(is_ioat_bug(chanerr));
283 }
284
285 /* if we haven't made progress and we have already
286 * acknowledged a pending completion once, then be more
287 * forceful with a restart
288 */
289 if (ioat_cleanup_preamble(chan, &phys_complete))
290 __cleanup(ioat, phys_complete);
291 else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
292 ioat2_restart_channel(ioat);
293 else {
294 set_bit(IOAT_COMPLETION_ACK, &chan->state);
295 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
296 }
297 spin_unlock_bh(&ioat->ring_lock);
298 } else {
299 u16 active;
300
301 /* if the ring is idle, empty, and oversized try to step
302 * down the size
303 */
304 spin_lock_bh(&ioat->ring_lock);
305 active = ioat2_ring_active(ioat);
306 if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
307 reshape_ring(ioat, ioat->alloc_order-1);
308 spin_unlock_bh(&ioat->ring_lock);
309
310 /* keep shrinking until we get back to our minimum
311 * default size
312 */
313 if (ioat->alloc_order > ioat_get_alloc_order())
314 mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
315 }
316 spin_unlock_bh(&chan->cleanup_lock);
317}
318
319/**
320 * ioat2_enumerate_channels - find and initialize the device's channels
321 * @device: the device to be enumerated
322 */
323int ioat2_enumerate_channels(struct ioatdma_device *device)
324{
325 struct ioat2_dma_chan *ioat;
326 struct device *dev = &device->pdev->dev;
327 struct dma_device *dma = &device->common;
328 u8 xfercap_log;
329 int i;
330
331 INIT_LIST_HEAD(&dma->channels);
332 dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
333 dma->chancnt &= 0x1f; /* bits [4:0] valid */
334 if (dma->chancnt > ARRAY_SIZE(device->idx)) {
335 dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n",
336 dma->chancnt, ARRAY_SIZE(device->idx));
337 dma->chancnt = ARRAY_SIZE(device->idx);
338 }
339 xfercap_log = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
340 xfercap_log &= 0x1f; /* bits [4:0] valid */
341 if (xfercap_log == 0)
342 return 0;
343 dev_dbg(dev, "%s: xfercap = %d\n", __func__, 1 << xfercap_log);
344
345 /* FIXME which i/oat version is i7300? */
346#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL
347 if (i7300_idle_platform_probe(NULL, NULL, 1) == 0)
348 dma->chancnt--;
349#endif
350 for (i = 0; i < dma->chancnt; i++) {
351 ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL);
352 if (!ioat)
353 break;
354
355 ioat_init_channel(device, &ioat->base, i,
356 device->timer_fn,
357 device->cleanup_tasklet,
358 (unsigned long) ioat);
359 ioat->xfercap_log = xfercap_log;
360 spin_lock_init(&ioat->ring_lock);
361 }
362 dma->chancnt = i;
363 return i;
364}
365
366static dma_cookie_t ioat2_tx_submit_unlock(struct dma_async_tx_descriptor *tx)
367{
368 struct dma_chan *c = tx->chan;
369 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
370 struct ioat_chan_common *chan = &ioat->base;
371 dma_cookie_t cookie = c->cookie;
372
373 cookie++;
374 if (cookie < 0)
375 cookie = 1;
376 tx->cookie = cookie;
377 c->cookie = cookie;
378 dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie);
379
380 if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state))
381 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
382 ioat2_update_pending(ioat);
383 spin_unlock_bh(&ioat->ring_lock);
384
385 return cookie;
386}
387
388static struct ioat_ring_ent *ioat2_alloc_ring_ent(struct dma_chan *chan, gfp_t flags)
389{
390 struct ioat_dma_descriptor *hw;
391 struct ioat_ring_ent *desc;
392 struct ioatdma_device *dma;
393 dma_addr_t phys;
394
395 dma = to_ioatdma_device(chan->device);
396 hw = pci_pool_alloc(dma->dma_pool, flags, &phys);
397 if (!hw)
398 return NULL;
399 memset(hw, 0, sizeof(*hw));
400
401 desc = kmem_cache_alloc(ioat2_cache, flags);
402 if (!desc) {
403 pci_pool_free(dma->dma_pool, hw, phys);
404 return NULL;
405 }
406 memset(desc, 0, sizeof(*desc));
407
408 dma_async_tx_descriptor_init(&desc->txd, chan);
409 desc->txd.tx_submit = ioat2_tx_submit_unlock;
410 desc->hw = hw;
411 desc->txd.phys = phys;
412 return desc;
413}
414
415static void ioat2_free_ring_ent(struct ioat_ring_ent *desc, struct dma_chan *chan)
416{
417 struct ioatdma_device *dma;
418
419 dma = to_ioatdma_device(chan->device);
420 pci_pool_free(dma->dma_pool, desc->hw, desc->txd.phys);
421 kmem_cache_free(ioat2_cache, desc);
422}
423
424static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gfp_t flags)
425{
426 struct ioat_ring_ent **ring;
427 int descs = 1 << order;
428 int i;
429
430 if (order > ioat_get_max_alloc_order())
431 return NULL;
432
433 /* allocate the array to hold the software ring */
434 ring = kcalloc(descs, sizeof(*ring), flags);
435 if (!ring)
436 return NULL;
437 for (i = 0; i < descs; i++) {
438 ring[i] = ioat2_alloc_ring_ent(c, flags);
439 if (!ring[i]) {
440 while (i--)
441 ioat2_free_ring_ent(ring[i], c);
442 kfree(ring);
443 return NULL;
444 }
445 set_desc_id(ring[i], i);
446 }
447
448 /* link descs */
449 for (i = 0; i < descs-1; i++) {
450 struct ioat_ring_ent *next = ring[i+1];
451 struct ioat_dma_descriptor *hw = ring[i]->hw;
452
453 hw->next = next->txd.phys;
454 }
455 ring[i]->hw->next = ring[0]->txd.phys;
456
457 return ring;
458}
459
460/* ioat2_alloc_chan_resources - allocate/initialize ioat2 descriptor ring
461 * @c: dma channel to be initialized
462 */
463int ioat2_alloc_chan_resources(struct dma_chan *c)
464{
465 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
466 struct ioat_chan_common *chan = &ioat->base;
467 struct ioat_ring_ent **ring;
468 u32 chanerr;
469 int order;
470
471 /* have we already been set up? */
472 if (ioat->ring)
473 return 1 << ioat->alloc_order;
474
475 /* Setup register to interrupt and write completion status on error */
476 writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET);
477
478 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
479 if (chanerr) {
480 dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr);
481 writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
482 }
483
484 /* allocate a completion writeback area */
485 /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
486 chan->completion = pci_pool_alloc(chan->device->completion_pool,
487 GFP_KERNEL, &chan->completion_dma);
488 if (!chan->completion)
489 return -ENOMEM;
490
491 memset(chan->completion, 0, sizeof(*chan->completion));
492 writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF,
493 chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
494 writel(((u64) chan->completion_dma) >> 32,
495 chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
496
497 order = ioat_get_alloc_order();
498 ring = ioat2_alloc_ring(c, order, GFP_KERNEL);
499 if (!ring)
500 return -ENOMEM;
501
502 spin_lock_bh(&ioat->ring_lock);
503 ioat->ring = ring;
504 ioat->head = 0;
505 ioat->issued = 0;
506 ioat->tail = 0;
507 ioat->pending = 0;
508 ioat->alloc_order = order;
509 spin_unlock_bh(&ioat->ring_lock);
510
511 tasklet_enable(&chan->cleanup_task);
512 ioat2_start_null_desc(ioat);
513
514 return 1 << ioat->alloc_order;
515}
516
517bool reshape_ring(struct ioat2_dma_chan *ioat, int order)
518{
519 /* reshape differs from normal ring allocation in that we want
520 * to allocate a new software ring while only
521 * extending/truncating the hardware ring
522 */
523 struct ioat_chan_common *chan = &ioat->base;
524 struct dma_chan *c = &chan->common;
525 const u16 curr_size = ioat2_ring_mask(ioat) + 1;
526 const u16 active = ioat2_ring_active(ioat);
527 const u16 new_size = 1 << order;
528 struct ioat_ring_ent **ring;
529 u16 i;
530
531 if (order > ioat_get_max_alloc_order())
532 return false;
533
534 /* double check that we have at least 1 free descriptor */
535 if (active == curr_size)
536 return false;
537
538 /* when shrinking, verify that we can hold the current active
539 * set in the new ring
540 */
541 if (active >= new_size)
542 return false;
543
544 /* allocate the array to hold the software ring */
545 ring = kcalloc(new_size, sizeof(*ring), GFP_NOWAIT);
546 if (!ring)
547 return false;
548
549 /* allocate/trim descriptors as needed */
550 if (new_size > curr_size) {
551 /* copy current descriptors to the new ring */
552 for (i = 0; i < curr_size; i++) {
553 u16 curr_idx = (ioat->tail+i) & (curr_size-1);
554 u16 new_idx = (ioat->tail+i) & (new_size-1);
555
556 ring[new_idx] = ioat->ring[curr_idx];
557 set_desc_id(ring[new_idx], new_idx);
558 }
559
560 /* add new descriptors to the ring */
561 for (i = curr_size; i < new_size; i++) {
562 u16 new_idx = (ioat->tail+i) & (new_size-1);
563
564 ring[new_idx] = ioat2_alloc_ring_ent(c, GFP_NOWAIT);
565 if (!ring[new_idx]) {
566 while (i--) {
567 u16 new_idx = (ioat->tail+i) & (new_size-1);
568
569 ioat2_free_ring_ent(ring[new_idx], c);
570 }
571 kfree(ring);
572 return false;
573 }
574 set_desc_id(ring[new_idx], new_idx);
575 }
576
577 /* hw link new descriptors */
578 for (i = curr_size-1; i < new_size; i++) {
579 u16 new_idx = (ioat->tail+i) & (new_size-1);
580 struct ioat_ring_ent *next = ring[(new_idx+1) & (new_size-1)];
581 struct ioat_dma_descriptor *hw = ring[new_idx]->hw;
582
583 hw->next = next->txd.phys;
584 }
585 } else {
586 struct ioat_dma_descriptor *hw;
587 struct ioat_ring_ent *next;
588
589 /* copy current descriptors to the new ring, dropping the
590 * removed descriptors
591 */
592 for (i = 0; i < new_size; i++) {
593 u16 curr_idx = (ioat->tail+i) & (curr_size-1);
594 u16 new_idx = (ioat->tail+i) & (new_size-1);
595
596 ring[new_idx] = ioat->ring[curr_idx];
597 set_desc_id(ring[new_idx], new_idx);
598 }
599
600 /* free deleted descriptors */
601 for (i = new_size; i < curr_size; i++) {
602 struct ioat_ring_ent *ent;
603
604 ent = ioat2_get_ring_ent(ioat, ioat->tail+i);
605 ioat2_free_ring_ent(ent, c);
606 }
607
608 /* fix up hardware ring */
609 hw = ring[(ioat->tail+new_size-1) & (new_size-1)]->hw;
610 next = ring[(ioat->tail+new_size) & (new_size-1)];
611 hw->next = next->txd.phys;
612 }
613
614 dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n",
615 __func__, new_size);
616
617 kfree(ioat->ring);
618 ioat->ring = ring;
619 ioat->alloc_order = order;
620
621 return true;
622}
623
624/**
625 * ioat2_alloc_and_lock - common descriptor alloc boilerplate for ioat2,3 ops
626 * @idx: gets starting descriptor index on successful allocation
627 * @ioat: ioat2,3 channel (ring) to operate on
628 * @num_descs: allocation length
629 */
630int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs)
631{
632 struct ioat_chan_common *chan = &ioat->base;
633
634 spin_lock_bh(&ioat->ring_lock);
635 /* never allow the last descriptor to be consumed; we need at
636 * least one free at all times to allow for on-the-fly ring
637 * resizing.
638 */
639 while (unlikely(ioat2_ring_space(ioat) <= num_descs)) {
640 if (reshape_ring(ioat, ioat->alloc_order + 1) &&
641 ioat2_ring_space(ioat) > num_descs)
642 break;
643
644 if (printk_ratelimit())
645 dev_dbg(to_dev(chan),
646 "%s: ring full! num_descs: %d (%x:%x:%x)\n",
647 __func__, num_descs, ioat->head, ioat->tail,
648 ioat->issued);
649 spin_unlock_bh(&ioat->ring_lock);
650
651 /* progress reclaim in the allocation failure case; we may be
652 * called under bh_disabled so we need to trigger the timer
653 * event directly
654 */
655 spin_lock_bh(&chan->cleanup_lock);
656 if (time_after(jiffies, chan->timer.expires) &&
657 timer_pending(&chan->timer)) {
658 struct ioatdma_device *device = chan->device;
659
660 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
661 spin_unlock_bh(&chan->cleanup_lock);
662 device->timer_fn((unsigned long) ioat);
663 } else
664 spin_unlock_bh(&chan->cleanup_lock);
665 return -ENOMEM;
666 }
667
668 dev_dbg(to_dev(chan), "%s: num_descs: %d (%x:%x:%x)\n",
669 __func__, num_descs, ioat->head, ioat->tail, ioat->issued);
670
671 *idx = ioat2_desc_alloc(ioat, num_descs);
672 return 0; /* with ioat->ring_lock held */
673}
674
675struct dma_async_tx_descriptor *
676ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest,
677 dma_addr_t dma_src, size_t len, unsigned long flags)
678{
679 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
680 struct ioat_dma_descriptor *hw;
681 struct ioat_ring_ent *desc;
682 dma_addr_t dst = dma_dest;
683 dma_addr_t src = dma_src;
684 size_t total_len = len;
685 int num_descs;
686 u16 idx;
687 int i;
688
689 num_descs = ioat2_xferlen_to_descs(ioat, len);
690 if (likely(num_descs) &&
691 ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
692 /* pass */;
693 else
694 return NULL;
695 i = 0;
696 do {
697 size_t copy = min_t(size_t, len, 1 << ioat->xfercap_log);
698
699 desc = ioat2_get_ring_ent(ioat, idx + i);
700 hw = desc->hw;
701
702 hw->size = copy;
703 hw->ctl = 0;
704 hw->src_addr = src;
705 hw->dst_addr = dst;
706
707 len -= copy;
708 dst += copy;
709 src += copy;
710 dump_desc_dbg(ioat, desc);
711 } while (++i < num_descs);
712
713 desc->txd.flags = flags;
714 desc->len = total_len;
715 hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
716 hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
717 hw->ctl_f.compl_write = 1;
718 dump_desc_dbg(ioat, desc);
719 /* we leave the channel locked to ensure in order submission */
720
721 return &desc->txd;
722}
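/*
 * Worked example, illustration only: with xfercap_log == 12 (a hypothetical
 * 4KB per-descriptor limit), a 10000 byte request maps to
 * ioat2_xferlen_to_descs() == 3 and the loop above emits copies of 4096,
 * 4096 and 1808 bytes; only the final descriptor gets compl_write (and, when
 * DMA_PREP_INTERRUPT is requested, int_en) set, so one completion covers the
 * whole transaction.
 */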
723
724/**
725 * ioat2_free_chan_resources - release all the descriptors
726 * @c: the channel to be cleaned
727 */
728void ioat2_free_chan_resources(struct dma_chan *c)
729{
730 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
731 struct ioat_chan_common *chan = &ioat->base;
732 struct ioatdma_device *device = chan->device;
733 struct ioat_ring_ent *desc;
734 const u16 total_descs = 1 << ioat->alloc_order;
735 int descs;
736 int i;
737
738 /* Before freeing channel resources first check
739 * if they have been previously allocated for this channel.
740 */
741 if (!ioat->ring)
742 return;
743
744 tasklet_disable(&chan->cleanup_task);
745 del_timer_sync(&chan->timer);
746 device->cleanup_tasklet((unsigned long) ioat);
747
748 /* Delay 100ms after reset to allow internal DMA logic to quiesce
749 * before removing DMA descriptor resources.
750 */
751 writeb(IOAT_CHANCMD_RESET,
752 chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
753 mdelay(100);
754
755 spin_lock_bh(&ioat->ring_lock);
756 descs = ioat2_ring_space(ioat);
757 dev_dbg(to_dev(chan), "freeing %d idle descriptors\n", descs);
758 for (i = 0; i < descs; i++) {
759 desc = ioat2_get_ring_ent(ioat, ioat->head + i);
760 ioat2_free_ring_ent(desc, c);
761 }
762
763 if (descs < total_descs)
764 dev_err(to_dev(chan), "Freeing %d in use descriptors!\n",
765 total_descs - descs);
766
767 for (i = 0; i < total_descs - descs; i++) {
768 desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
769 dump_desc_dbg(ioat, desc);
770 ioat2_free_ring_ent(desc, c);
771 }
772
773 kfree(ioat->ring);
774 ioat->ring = NULL;
775 ioat->alloc_order = 0;
776 pci_pool_free(device->completion_pool, chan->completion,
777 chan->completion_dma);
778 spin_unlock_bh(&ioat->ring_lock);
779
780 chan->last_completion = 0;
781 chan->completion_dma = 0;
782 ioat->pending = 0;
783 ioat->dmacount = 0;
784}
785
786enum dma_status
787ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie,
788 dma_cookie_t *done, dma_cookie_t *used)
789{
790 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
791 struct ioatdma_device *device = ioat->base.device;
792
793 if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
794 return DMA_SUCCESS;
795
796 device->cleanup_tasklet((unsigned long) ioat);
797
798 return ioat_is_complete(c, cookie, done, used);
799}
800
801static ssize_t ring_size_show(struct dma_chan *c, char *page)
802{
803 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
804
805 return sprintf(page, "%d\n", (1 << ioat->alloc_order) & ~1);
806}
807static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size);
808
809static ssize_t ring_active_show(struct dma_chan *c, char *page)
810{
811 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
812
813 /* ...taken outside the lock, no need to be precise */
814 return sprintf(page, "%d\n", ioat2_ring_active(ioat));
815}
816static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active);
817
818static struct attribute *ioat2_attrs[] = {
819 &ring_size_attr.attr,
820 &ring_active_attr.attr,
821 &ioat_cap_attr.attr,
822 &ioat_version_attr.attr,
823 NULL,
824};
825
826struct kobj_type ioat2_ktype = {
827 .sysfs_ops = &ioat_sysfs_ops,
828 .default_attrs = ioat2_attrs,
829};
830
831int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca)
832{
833 struct pci_dev *pdev = device->pdev;
834 struct dma_device *dma;
835 struct dma_chan *c;
836 struct ioat_chan_common *chan;
837 int err;
838
839 device->enumerate_channels = ioat2_enumerate_channels;
840 device->cleanup_tasklet = ioat2_cleanup_tasklet;
841 device->timer_fn = ioat2_timer_event;
842 device->self_test = ioat_dma_self_test;
843 dma = &device->common;
844 dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
845 dma->device_issue_pending = ioat2_issue_pending;
846 dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
847 dma->device_free_chan_resources = ioat2_free_chan_resources;
848 dma->device_is_tx_complete = ioat2_is_complete;
849
850 err = ioat_probe(device);
851 if (err)
852 return err;
853 ioat_set_tcp_copy_break(2048);
854
855 list_for_each_entry(c, &dma->channels, device_node) {
856 chan = to_chan_common(c);
857 writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | IOAT_DMA_DCA_ANY_CPU,
858 chan->reg_base + IOAT_DCACTRL_OFFSET);
859 }
860
861 err = ioat_register(device);
862 if (err)
863 return err;
864
865 ioat_kobject_add(device, &ioat2_ktype);
866
867 if (dca)
868 device->dca = ioat2_dca_init(pdev, device->reg_base);
869
870 return err;
871}
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h
new file mode 100644
index 000000000000..1d849ef74d5f
--- /dev/null
+++ b/drivers/dma/ioat/dma_v2.h
@@ -0,0 +1,190 @@
1/*
2 * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59
16 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * The full GNU General Public License is included in this distribution in the
19 * file called COPYING.
20 */
21#ifndef IOATDMA_V2_H
22#define IOATDMA_V2_H
23
24#include <linux/dmaengine.h>
25#include "dma.h"
26#include "hw.h"
27
28
29extern int ioat_pending_level;
30extern int ioat_ring_alloc_order;
31
32/*
33 * workaround for IOAT ver.3.0 null descriptor issue
34 * (channel returns error when size is 0)
35 */
36#define NULL_DESC_BUFFER_SIZE 1
37
38#define IOAT_MAX_ORDER 16
39#define ioat_get_alloc_order() \
40 (min(ioat_ring_alloc_order, IOAT_MAX_ORDER))
41#define ioat_get_max_alloc_order() \
42 (min(ioat_ring_max_alloc_order, IOAT_MAX_ORDER))
43
44/* struct ioat2_dma_chan - ioat v2 / v3 channel attributes
45 * @base: common ioat channel parameters
46 * @xfercap_log: log2 of channel max transfer length (for fast division)
47 * @head: allocated index
48 * @issued: hardware notification point
49 * @tail: cleanup index
50 * @pending: lock free indicator for issued != head
51 * @dmacount: identical to 'head' except for occasionally resetting to zero
52 * @alloc_order: log2 of the number of allocated descriptors
53 * @ring: software ring buffer implementation of hardware ring
54 * @ring_lock: protects ring attributes
55 */
56struct ioat2_dma_chan {
57 struct ioat_chan_common base;
58 size_t xfercap_log;
59 u16 head;
60 u16 issued;
61 u16 tail;
62 u16 dmacount;
63 u16 alloc_order;
64 int pending;
65 struct ioat_ring_ent **ring;
66 spinlock_t ring_lock;
67};
68
69static inline struct ioat2_dma_chan *to_ioat2_chan(struct dma_chan *c)
70{
71 struct ioat_chan_common *chan = to_chan_common(c);
72
73 return container_of(chan, struct ioat2_dma_chan, base);
74}
75
76static inline u16 ioat2_ring_mask(struct ioat2_dma_chan *ioat)
77{
78 return (1 << ioat->alloc_order) - 1;
79}
80
81/* count of descriptors in flight with the engine */
82static inline u16 ioat2_ring_active(struct ioat2_dma_chan *ioat)
83{
84 return (ioat->head - ioat->tail) & ioat2_ring_mask(ioat);
85}
86
87/* count of descriptors pending submission to hardware */
88static inline u16 ioat2_ring_pending(struct ioat2_dma_chan *ioat)
89{
90 return (ioat->head - ioat->issued) & ioat2_ring_mask(ioat);
91}
92
93static inline u16 ioat2_ring_space(struct ioat2_dma_chan *ioat)
94{
95 u16 num_descs = ioat2_ring_mask(ioat) + 1;
96 u16 active = ioat2_ring_active(ioat);
97
98 BUG_ON(active > num_descs);
99
100 return num_descs - active;
101}
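/*
 * Illustration, not part of the driver: because the ring size is a power of
 * two, the u16 head/tail counters may wrap freely and the masked difference
 * still counts in-flight descriptors.  For example, with alloc_order == 2
 * (mask 0x3):
 *
 *	head = 0xfffe, tail = 0xfffd  ->  active = (head - tail) & 0x3 = 1
 *	head = 0x0001, tail = 0xfffe  ->  active = (head - tail) & 0x3 = 3
 */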
102
103/* assumes caller already checked space */
104static inline u16 ioat2_desc_alloc(struct ioat2_dma_chan *ioat, u16 len)
105{
106 ioat->head += len;
107 return ioat->head - len;
108}
109
110static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len)
111{
112 u16 num_descs = len >> ioat->xfercap_log;
113
114 num_descs += !!(len & ((1 << ioat->xfercap_log) - 1));
115 return num_descs;
116}
117
118/**
119 * struct ioat_ring_ent - wrapper around hardware descriptor
120 * @hw: hardware DMA descriptor (for memcpy)
121 * @fill: hardware fill descriptor
122 * @xor: hardware xor descriptor
123 * @xor_ex: hardware xor extension descriptor
124 * @pq: hardware pq descriptor
125 * @pq_ex: hardware pq extension descriptor
126 * @pqu: hardware pq update descriptor
127 * @raw: hardware raw (un-typed) descriptor
128 * @txd: the generic software descriptor for all engines
129 * @len: total transaction length for unmap
130 * @result: asynchronous result of validate operations
131 * @id: identifier for debug
132 */
133
134struct ioat_ring_ent {
135 union {
136 struct ioat_dma_descriptor *hw;
137 struct ioat_fill_descriptor *fill;
138 struct ioat_xor_descriptor *xor;
139 struct ioat_xor_ext_descriptor *xor_ex;
140 struct ioat_pq_descriptor *pq;
141 struct ioat_pq_ext_descriptor *pq_ex;
142 struct ioat_pq_update_descriptor *pqu;
143 struct ioat_raw_descriptor *raw;
144 };
145 size_t len;
146 struct dma_async_tx_descriptor txd;
147 enum sum_check_flags *result;
148 #ifdef DEBUG
149 int id;
150 #endif
151};
152
153static inline struct ioat_ring_ent *
154ioat2_get_ring_ent(struct ioat2_dma_chan *ioat, u16 idx)
155{
156 return ioat->ring[idx & ioat2_ring_mask(ioat)];
157}
158
159static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr)
160{
161 struct ioat_chan_common *chan = &ioat->base;
162
163 writel(addr & 0x00000000FFFFFFFF,
164 chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
165 writel(addr >> 32,
166 chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
167}
168
169int __devinit ioat2_dma_probe(struct ioatdma_device *dev, int dca);
170int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca);
171struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
172struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
173int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs);
174int ioat2_enumerate_channels(struct ioatdma_device *device);
175struct dma_async_tx_descriptor *
176ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest,
177 dma_addr_t dma_src, size_t len, unsigned long flags);
178void ioat2_issue_pending(struct dma_chan *chan);
179int ioat2_alloc_chan_resources(struct dma_chan *c);
180void ioat2_free_chan_resources(struct dma_chan *c);
181enum dma_status ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie,
182 dma_cookie_t *done, dma_cookie_t *used);
183void __ioat2_restart_chan(struct ioat2_dma_chan *ioat);
184bool reshape_ring(struct ioat2_dma_chan *ioat, int order);
185void __ioat2_issue_pending(struct ioat2_dma_chan *ioat);
186void ioat2_cleanup_tasklet(unsigned long data);
187void ioat2_timer_event(unsigned long data);
188extern struct kobj_type ioat2_ktype;
189extern struct kmem_cache *ioat2_cache;
190#endif /* IOATDMA_V2_H */
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
new file mode 100644
index 000000000000..35d1e33afd5b
--- /dev/null
+++ b/drivers/dma/ioat/dma_v3.c
@@ -0,0 +1,1223 @@
1/*
2 * This file is provided under a dual BSD/GPLv2 license. When using or
3 * redistributing this file, you may do so under either license.
4 *
5 * GPL LICENSE SUMMARY
6 *
7 * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms and conditions of the GNU General Public License,
11 * version 2, as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 * more details.
17 *
18 * You should have received a copy of the GNU General Public License along with
19 * this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 * The full GNU General Public License is included in this distribution in
23 * the file called "COPYING".
24 *
25 * BSD LICENSE
26 *
27 * Copyright(c) 2004-2009 Intel Corporation. All rights reserved.
28 *
29 * Redistribution and use in source and binary forms, with or without
30 * modification, are permitted provided that the following conditions are met:
31 *
32 * * Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * * Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in
36 * the documentation and/or other materials provided with the
37 * distribution.
38 * * Neither the name of Intel Corporation nor the names of its
39 * contributors may be used to endorse or promote products derived
40 * from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
43 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
46 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
47 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
48 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
49 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
50 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
51 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
52 * POSSIBILITY OF SUCH DAMAGE.
53 */
54
55/*
56 * Support routines for v3+ hardware
57 */
58
59#include <linux/pci.h>
60#include <linux/dmaengine.h>
61#include <linux/dma-mapping.h>
62#include "registers.h"
63#include "hw.h"
64#include "dma.h"
65#include "dma_v2.h"
66
67/* ioat hardware assumes at least two sources for raid operations */
68#define src_cnt_to_sw(x) ((x) + 2)
69#define src_cnt_to_hw(x) ((x) - 2)
70
71/* provide a lookup table for setting the source address in the base or
72 * extended descriptor of an xor or pq descriptor
73 */
74 static const u8 xor_idx_to_desc __read_mostly = 0xe0;
75static const u8 xor_idx_to_field[] __read_mostly = { 1, 4, 5, 6, 7, 0, 1, 2 };
76static const u8 pq_idx_to_desc __read_mostly = 0xf8;
77static const u8 pq_idx_to_field[] __read_mostly = { 1, 4, 5, 0, 1, 2, 4, 5 };
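/*
 * Example, illustration only: for an xor operation with 7 sources, source
 * index 6 selects the extended descriptor (bit 6 of xor_idx_to_desc is set)
 * and xor_idx_to_field[6] == 1 places the address in raw field 1 of that
 * extended descriptor, while the low source indexes stay in the base
 * descriptor's fields 1, 4, 5, 6 and 7.
 */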
78
79static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx)
80{
81 struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
82
83 return raw->field[xor_idx_to_field[idx]];
84}
85
86static void xor_set_src(struct ioat_raw_descriptor *descs[2],
87 dma_addr_t addr, u32 offset, int idx)
88{
89 struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
90
91 raw->field[xor_idx_to_field[idx]] = addr + offset;
92}
93
94static dma_addr_t pq_get_src(struct ioat_raw_descriptor *descs[2], int idx)
95{
96 struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1];
97
98 return raw->field[pq_idx_to_field[idx]];
99}
100
101static void pq_set_src(struct ioat_raw_descriptor *descs[2],
102 dma_addr_t addr, u32 offset, u8 coef, int idx)
103{
104 struct ioat_pq_descriptor *pq = (struct ioat_pq_descriptor *) descs[0];
105 struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1];
106
107 raw->field[pq_idx_to_field[idx]] = addr + offset;
108 pq->coef[idx] = coef;
109}
110
111static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
112 struct ioat_ring_ent *desc, int idx)
113{
114 struct ioat_chan_common *chan = &ioat->base;
115 struct pci_dev *pdev = chan->device->pdev;
116 size_t len = desc->len;
117 size_t offset = len - desc->hw->size;
118 struct dma_async_tx_descriptor *tx = &desc->txd;
119 enum dma_ctrl_flags flags = tx->flags;
120
121 switch (desc->hw->ctl_f.op) {
122 case IOAT_OP_COPY:
123 if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */
124 ioat_dma_unmap(chan, flags, len, desc->hw);
125 break;
126 case IOAT_OP_FILL: {
127 struct ioat_fill_descriptor *hw = desc->fill;
128
129 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
130 ioat_unmap(pdev, hw->dst_addr - offset, len,
131 PCI_DMA_FROMDEVICE, flags, 1);
132 break;
133 }
134 case IOAT_OP_XOR_VAL:
135 case IOAT_OP_XOR: {
136 struct ioat_xor_descriptor *xor = desc->xor;
137 struct ioat_ring_ent *ext;
138 struct ioat_xor_ext_descriptor *xor_ex = NULL;
139 int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt);
140 struct ioat_raw_descriptor *descs[2];
141 int i;
142
143 if (src_cnt > 5) {
144 ext = ioat2_get_ring_ent(ioat, idx + 1);
145 xor_ex = ext->xor_ex;
146 }
147
148 if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
149 descs[0] = (struct ioat_raw_descriptor *) xor;
150 descs[1] = (struct ioat_raw_descriptor *) xor_ex;
151 for (i = 0; i < src_cnt; i++) {
152 dma_addr_t src = xor_get_src(descs, i);
153
154 ioat_unmap(pdev, src - offset, len,
155 PCI_DMA_TODEVICE, flags, 0);
156 }
157
158 /* dest is a source in xor validate operations */
159 if (xor->ctl_f.op == IOAT_OP_XOR_VAL) {
160 ioat_unmap(pdev, xor->dst_addr - offset, len,
161 PCI_DMA_TODEVICE, flags, 1);
162 break;
163 }
164 }
165
166 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
167 ioat_unmap(pdev, xor->dst_addr - offset, len,
168 PCI_DMA_FROMDEVICE, flags, 1);
169 break;
170 }
171 case IOAT_OP_PQ_VAL:
172 case IOAT_OP_PQ: {
173 struct ioat_pq_descriptor *pq = desc->pq;
174 struct ioat_ring_ent *ext;
175 struct ioat_pq_ext_descriptor *pq_ex = NULL;
176 int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
177 struct ioat_raw_descriptor *descs[2];
178 int i;
179
180 if (src_cnt > 3) {
181 ext = ioat2_get_ring_ent(ioat, idx + 1);
182 pq_ex = ext->pq_ex;
183 }
184
185 /* in the 'continue' case don't unmap the dests as sources */
186 if (dmaf_p_disabled_continue(flags))
187 src_cnt--;
188 else if (dmaf_continue(flags))
189 src_cnt -= 3;
190
191 if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
192 descs[0] = (struct ioat_raw_descriptor *) pq;
193 descs[1] = (struct ioat_raw_descriptor *) pq_ex;
194 for (i = 0; i < src_cnt; i++) {
195 dma_addr_t src = pq_get_src(descs, i);
196
197 ioat_unmap(pdev, src - offset, len,
198 PCI_DMA_TODEVICE, flags, 0);
199 }
200
201 /* the dests are sources in pq validate operations */
202 if (pq->ctl_f.op == IOAT_OP_XOR_VAL) {
203 if (!(flags & DMA_PREP_PQ_DISABLE_P))
204 ioat_unmap(pdev, pq->p_addr - offset,
205 len, PCI_DMA_TODEVICE, flags, 0);
206 if (!(flags & DMA_PREP_PQ_DISABLE_Q))
207 ioat_unmap(pdev, pq->q_addr - offset,
208 len, PCI_DMA_TODEVICE, flags, 0);
209 break;
210 }
211 }
212
213 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
214 if (!(flags & DMA_PREP_PQ_DISABLE_P))
215 ioat_unmap(pdev, pq->p_addr - offset, len,
216 PCI_DMA_BIDIRECTIONAL, flags, 1);
217 if (!(flags & DMA_PREP_PQ_DISABLE_Q))
218 ioat_unmap(pdev, pq->q_addr - offset, len,
219 PCI_DMA_BIDIRECTIONAL, flags, 1);
220 }
221 break;
222 }
223 default:
224 dev_err(&pdev->dev, "%s: unknown op type: %#x\n",
225 __func__, desc->hw->ctl_f.op);
226 }
227}
228
229static bool desc_has_ext(struct ioat_ring_ent *desc)
230{
231 struct ioat_dma_descriptor *hw = desc->hw;
232
233 if (hw->ctl_f.op == IOAT_OP_XOR ||
234 hw->ctl_f.op == IOAT_OP_XOR_VAL) {
235 struct ioat_xor_descriptor *xor = desc->xor;
236
237 if (src_cnt_to_sw(xor->ctl_f.src_cnt) > 5)
238 return true;
239 } else if (hw->ctl_f.op == IOAT_OP_PQ ||
240 hw->ctl_f.op == IOAT_OP_PQ_VAL) {
241 struct ioat_pq_descriptor *pq = desc->pq;
242
243 if (src_cnt_to_sw(pq->ctl_f.src_cnt) > 3)
244 return true;
245 }
246
247 return false;
248}
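/* note: an op that needs an extended descriptor occupies two adjacent ring
 * slots; __cleanup() below relies on this test to step over the extra slot */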
249
250/**
251 * __cleanup - reclaim used descriptors
252 * @ioat: channel (ring) to clean
253 *
254 * The difference from the dma_v2.c __cleanup() is that this routine
255 * handles extended descriptors and dma-unmapping raid operations.
256 */
257static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
258{
259 struct ioat_chan_common *chan = &ioat->base;
260 struct ioat_ring_ent *desc;
261 bool seen_current = false;
262 u16 active;
263 int i;
264
265 dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
266 __func__, ioat->head, ioat->tail, ioat->issued);
267
268 active = ioat2_ring_active(ioat);
269 for (i = 0; i < active && !seen_current; i++) {
270 struct dma_async_tx_descriptor *tx;
271
272 prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
273 desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
274 dump_desc_dbg(ioat, desc);
275 tx = &desc->txd;
276 if (tx->cookie) {
277 chan->completed_cookie = tx->cookie;
278 ioat3_dma_unmap(ioat, desc, ioat->tail + i);
279 tx->cookie = 0;
280 if (tx->callback) {
281 tx->callback(tx->callback_param);
282 tx->callback = NULL;
283 }
284 }
285
286 if (tx->phys == phys_complete)
287 seen_current = true;
288
289 /* skip extended descriptors */
290 if (desc_has_ext(desc)) {
291 BUG_ON(i + 1 >= active);
292 i++;
293 }
294 }
295 ioat->tail += i;
296 BUG_ON(!seen_current); /* no active descs have written a completion? */
297 chan->last_completion = phys_complete;
298 if (ioat->head == ioat->tail) {
299 dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
300 __func__);
301 clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
302 mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
303 }
304}
305
306static void ioat3_cleanup(struct ioat2_dma_chan *ioat)
307{
308 struct ioat_chan_common *chan = &ioat->base;
309 unsigned long phys_complete;
310
311 prefetch(chan->completion);
312
313 if (!spin_trylock_bh(&chan->cleanup_lock))
314 return;
315
316 if (!ioat_cleanup_preamble(chan, &phys_complete)) {
317 spin_unlock_bh(&chan->cleanup_lock);
318 return;
319 }
320
321 if (!spin_trylock_bh(&ioat->ring_lock)) {
322 spin_unlock_bh(&chan->cleanup_lock);
323 return;
324 }
325
326 __cleanup(ioat, phys_complete);
327
328 spin_unlock_bh(&ioat->ring_lock);
329 spin_unlock_bh(&chan->cleanup_lock);
330}
331
332static void ioat3_cleanup_tasklet(unsigned long data)
333{
334 struct ioat2_dma_chan *ioat = (void *) data;
335
336 ioat3_cleanup(ioat);
337 writew(IOAT_CHANCTRL_RUN | IOAT3_CHANCTRL_COMPL_DCA_EN,
338 ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
339}
340
341static void ioat3_restart_channel(struct ioat2_dma_chan *ioat)
342{
343 struct ioat_chan_common *chan = &ioat->base;
344 unsigned long phys_complete;
345 u32 status;
346
347 status = ioat_chansts(chan);
348 if (is_ioat_active(status) || is_ioat_idle(status))
349 ioat_suspend(chan);
350 while (is_ioat_active(status) || is_ioat_idle(status)) {
351 status = ioat_chansts(chan);
352 cpu_relax();
353 }
354
355 if (ioat_cleanup_preamble(chan, &phys_complete))
356 __cleanup(ioat, phys_complete);
357
358 __ioat2_restart_chan(ioat);
359}
360
361static void ioat3_timer_event(unsigned long data)
362{
363 struct ioat2_dma_chan *ioat = (void *) data;
364 struct ioat_chan_common *chan = &ioat->base;
365
366 spin_lock_bh(&chan->cleanup_lock);
367 if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
368 unsigned long phys_complete;
369 u64 status;
370
371 spin_lock_bh(&ioat->ring_lock);
372 status = ioat_chansts(chan);
373
374 /* when halted due to errors check for channel
375 * programming errors before advancing the completion state
376 */
377 if (is_ioat_halted(status)) {
378 u32 chanerr;
379
380 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
381 BUG_ON(is_ioat_bug(chanerr));
382 }
383
384 /* if we haven't made progress and we have already
385 * acknowledged a pending completion once, then be more
386 * forceful with a restart
387 */
388 if (ioat_cleanup_preamble(chan, &phys_complete))
389 __cleanup(ioat, phys_complete);
390 else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
391 ioat3_restart_channel(ioat);
392 else {
393 set_bit(IOAT_COMPLETION_ACK, &chan->state);
394 mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
395 }
396 spin_unlock_bh(&ioat->ring_lock);
397 } else {
398 u16 active;
399
400 /* if the ring is idle, empty, and oversized try to step
401 * down the size
402 */
403 spin_lock_bh(&ioat->ring_lock);
404 active = ioat2_ring_active(ioat);
405 if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
406 reshape_ring(ioat, ioat->alloc_order-1);
407 spin_unlock_bh(&ioat->ring_lock);
408
409 /* keep shrinking until we get back to our minimum
410 * default size
411 */
412 if (ioat->alloc_order > ioat_get_alloc_order())
413 mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
414 }
415 spin_unlock_bh(&chan->cleanup_lock);
416}
417
418static enum dma_status
419ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie,
420 dma_cookie_t *done, dma_cookie_t *used)
421{
422 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
423
424 if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
425 return DMA_SUCCESS;
426
427 ioat3_cleanup(ioat);
428
429 return ioat_is_complete(c, cookie, done, used);
430}
431
432static struct dma_async_tx_descriptor *
433ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value,
434 size_t len, unsigned long flags)
435{
436 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
437 struct ioat_ring_ent *desc;
438 size_t total_len = len;
439 struct ioat_fill_descriptor *fill;
440 int num_descs;
441 u64 src_data = (0x0101010101010101ULL) * (value & 0xff);
442 u16 idx;
443 int i;
444
445 num_descs = ioat2_xferlen_to_descs(ioat, len);
446 if (likely(num_descs) &&
447 ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
448 /* pass */;
449 else
450 return NULL;
451 i = 0;
452 do {
453 size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
454
455 desc = ioat2_get_ring_ent(ioat, idx + i);
456 fill = desc->fill;
457
458 fill->size = xfer_size;
459 fill->src_data = src_data;
460 fill->dst_addr = dest;
461 fill->ctl = 0;
462 fill->ctl_f.op = IOAT_OP_FILL;
463
464 len -= xfer_size;
465 dest += xfer_size;
466 dump_desc_dbg(ioat, desc);
467 } while (++i < num_descs);
468
469 desc->txd.flags = flags;
470 desc->len = total_len;
471 fill->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
472 fill->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
473 fill->ctl_f.compl_write = 1;
474 dump_desc_dbg(ioat, desc);
475
476 /* we leave the channel locked to ensure in order submission */
477 return &desc->txd;
478}
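The fill pattern above is produced by multiplying the replicating constant 0x0101010101010101 by the low byte of value, which copies that byte into all eight lanes of the 64-bit word. A quick stand-alone check of the arithmetic (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int value = 0xab;
	uint64_t src_data = 0x0101010101010101ULL * (value & 0xff);

	/* prints abababababababab */
	printf("%016llx\n", (unsigned long long)src_data);
	return 0;
}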
479
480static struct dma_async_tx_descriptor *
481__ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result,
482 dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt,
483 size_t len, unsigned long flags)
484{
485 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
486 struct ioat_ring_ent *compl_desc;
487 struct ioat_ring_ent *desc;
488 struct ioat_ring_ent *ext;
489 size_t total_len = len;
490 struct ioat_xor_descriptor *xor;
491 struct ioat_xor_ext_descriptor *xor_ex = NULL;
492 struct ioat_dma_descriptor *hw;
493 u32 offset = 0;
494 int num_descs;
495 int with_ext;
496 int i;
497 u16 idx;
498 u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR;
499
500 BUG_ON(src_cnt < 2);
501
502 num_descs = ioat2_xferlen_to_descs(ioat, len);
503 /* we need 2x the number of descriptors to cover greater than 5
504 * sources
505 */
506 if (src_cnt > 5) {
507 with_ext = 1;
508 num_descs *= 2;
509 } else
510 with_ext = 0;
511
512 /* completion writes from the raid engine may pass completion
513 * writes from the legacy engine, so we need one extra null
514 * (legacy) descriptor to ensure all completion writes arrive in
515 * order.
516 */
517 if (likely(num_descs) &&
518 ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
519 /* pass */;
520 else
521 return NULL;
522 i = 0;
523 do {
524 struct ioat_raw_descriptor *descs[2];
525 size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
526 int s;
527
528 desc = ioat2_get_ring_ent(ioat, idx + i);
529 xor = desc->xor;
530
531 /* save a branch by unconditionally retrieving the
532		 * extended descriptor; xor_set_src() knows not to write
533 * to it in the single descriptor case
534 */
535 ext = ioat2_get_ring_ent(ioat, idx + i + 1);
536 xor_ex = ext->xor_ex;
537
538 descs[0] = (struct ioat_raw_descriptor *) xor;
539 descs[1] = (struct ioat_raw_descriptor *) xor_ex;
540 for (s = 0; s < src_cnt; s++)
541 xor_set_src(descs, src[s], offset, s);
542 xor->size = xfer_size;
543 xor->dst_addr = dest + offset;
544 xor->ctl = 0;
545 xor->ctl_f.op = op;
546 xor->ctl_f.src_cnt = src_cnt_to_hw(src_cnt);
547
548 len -= xfer_size;
549 offset += xfer_size;
550 dump_desc_dbg(ioat, desc);
551 } while ((i += 1 + with_ext) < num_descs);
552
553 /* last xor descriptor carries the unmap parameters and fence bit */
554 desc->txd.flags = flags;
555 desc->len = total_len;
556 if (result)
557 desc->result = result;
558 xor->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
559
560 /* completion descriptor carries interrupt bit */
561 compl_desc = ioat2_get_ring_ent(ioat, idx + i);
562 compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
563 hw = compl_desc->hw;
564 hw->ctl = 0;
565 hw->ctl_f.null = 1;
566 hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
567 hw->ctl_f.compl_write = 1;
568 hw->size = NULL_DESC_BUFFER_SIZE;
569 dump_desc_dbg(ioat, compl_desc);
570
571 /* we leave the channel locked to ensure in order submission */
572 return &desc->txd;
573}
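A rough sketch of the ring-slot accounting used above, assuming ioat2_xferlen_to_descs() rounds len up to xfercap-sized chunks (hypothetical helper, not part of the patch): one xor descriptor per chunk, doubled when more than 5 sources require an extended descriptor, plus the trailing null descriptor used for the ordered completion write.

#include <stddef.h>

int xor_ring_slots(size_t len, unsigned int src_cnt, unsigned int xfercap_log)
{
	size_t chunk = (size_t)1 << xfercap_log;
	int num_descs = (int)((len + chunk - 1) / chunk);

	if (src_cnt > 5)
		num_descs *= 2;		/* base + extended descriptor per chunk */
	return num_descs + 1;		/* plus one null completion descriptor */
}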
574
575static struct dma_async_tx_descriptor *
576ioat3_prep_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
577 unsigned int src_cnt, size_t len, unsigned long flags)
578{
579 return __ioat3_prep_xor_lock(chan, NULL, dest, src, src_cnt, len, flags);
580}
581
582struct dma_async_tx_descriptor *
583ioat3_prep_xor_val(struct dma_chan *chan, dma_addr_t *src,
584 unsigned int src_cnt, size_t len,
585 enum sum_check_flags *result, unsigned long flags)
586{
587	/* the cleanup routine only sets bits on validate failure; it
588 * does not clear bits on validate success... so clear it here
589 */
590 *result = 0;
591
592 return __ioat3_prep_xor_lock(chan, result, src[0], &src[1],
593 src_cnt - 1, len, flags);
594}
595
596static void
597dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct ioat_ring_ent *ext)
598{
599 struct device *dev = to_dev(&ioat->base);
600 struct ioat_pq_descriptor *pq = desc->pq;
601 struct ioat_pq_ext_descriptor *pq_ex = ext ? ext->pq_ex : NULL;
602 struct ioat_raw_descriptor *descs[] = { (void *) pq, (void *) pq_ex };
603 int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
604 int i;
605
606 dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x"
607 " sz: %#x ctl: %#x (op: %d int: %d compl: %d pq: '%s%s' src_cnt: %d)\n",
608 desc_id(desc), (unsigned long long) desc->txd.phys,
609 (unsigned long long) (pq_ex ? pq_ex->next : pq->next),
610 desc->txd.flags, pq->size, pq->ctl, pq->ctl_f.op, pq->ctl_f.int_en,
611 pq->ctl_f.compl_write,
612 pq->ctl_f.p_disable ? "" : "p", pq->ctl_f.q_disable ? "" : "q",
613 pq->ctl_f.src_cnt);
614 for (i = 0; i < src_cnt; i++)
615 dev_dbg(dev, "\tsrc[%d]: %#llx coef: %#x\n", i,
616 (unsigned long long) pq_get_src(descs, i), pq->coef[i]);
617 dev_dbg(dev, "\tP: %#llx\n", pq->p_addr);
618 dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr);
619}
620
621static struct dma_async_tx_descriptor *
622__ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
623 const dma_addr_t *dst, const dma_addr_t *src,
624 unsigned int src_cnt, const unsigned char *scf,
625 size_t len, unsigned long flags)
626{
627 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
628 struct ioat_chan_common *chan = &ioat->base;
629 struct ioat_ring_ent *compl_desc;
630 struct ioat_ring_ent *desc;
631 struct ioat_ring_ent *ext;
632 size_t total_len = len;
633 struct ioat_pq_descriptor *pq;
634 struct ioat_pq_ext_descriptor *pq_ex = NULL;
635 struct ioat_dma_descriptor *hw;
636 u32 offset = 0;
637 int num_descs;
638 int with_ext;
639 int i, s;
640 u16 idx;
641 u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ;
642
643 dev_dbg(to_dev(chan), "%s\n", __func__);
644 /* the engine requires at least two sources (we provide
645 * at least 1 implied source in the DMA_PREP_CONTINUE case)
646 */
647 BUG_ON(src_cnt + dmaf_continue(flags) < 2);
648
649 num_descs = ioat2_xferlen_to_descs(ioat, len);
650 /* we need 2x the number of descriptors to cover greater than 3
651 * sources
652 */
653 if (src_cnt > 3 || flags & DMA_PREP_CONTINUE) {
654 with_ext = 1;
655 num_descs *= 2;
656 } else
657 with_ext = 0;
658
659 /* completion writes from the raid engine may pass completion
660 * writes from the legacy engine, so we need one extra null
661 * (legacy) descriptor to ensure all completion writes arrive in
662 * order.
663 */
664 if (likely(num_descs) &&
665 ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
666 /* pass */;
667 else
668 return NULL;
669 i = 0;
670 do {
671 struct ioat_raw_descriptor *descs[2];
672 size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
673
674 desc = ioat2_get_ring_ent(ioat, idx + i);
675 pq = desc->pq;
676
677 /* save a branch by unconditionally retrieving the
678		 * extended descriptor; pq_set_src() knows not to write
679 * to it in the single descriptor case
680 */
681 ext = ioat2_get_ring_ent(ioat, idx + i + with_ext);
682 pq_ex = ext->pq_ex;
683
684 descs[0] = (struct ioat_raw_descriptor *) pq;
685 descs[1] = (struct ioat_raw_descriptor *) pq_ex;
686
687 for (s = 0; s < src_cnt; s++)
688 pq_set_src(descs, src[s], offset, scf[s], s);
689
690 /* see the comment for dma_maxpq in include/linux/dmaengine.h */
691 if (dmaf_p_disabled_continue(flags))
692 pq_set_src(descs, dst[1], offset, 1, s++);
693 else if (dmaf_continue(flags)) {
694 pq_set_src(descs, dst[0], offset, 0, s++);
695 pq_set_src(descs, dst[1], offset, 1, s++);
696 pq_set_src(descs, dst[1], offset, 0, s++);
697 }
698 pq->size = xfer_size;
699 pq->p_addr = dst[0] + offset;
700 pq->q_addr = dst[1] + offset;
701 pq->ctl = 0;
702 pq->ctl_f.op = op;
703 pq->ctl_f.src_cnt = src_cnt_to_hw(s);
704 pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P);
705 pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q);
706
707 len -= xfer_size;
708 offset += xfer_size;
709 } while ((i += 1 + with_ext) < num_descs);
710
711 /* last pq descriptor carries the unmap parameters and fence bit */
712 desc->txd.flags = flags;
713 desc->len = total_len;
714 if (result)
715 desc->result = result;
716 pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
717 dump_pq_desc_dbg(ioat, desc, ext);
718
719 /* completion descriptor carries interrupt bit */
720 compl_desc = ioat2_get_ring_ent(ioat, idx + i);
721 compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
722 hw = compl_desc->hw;
723 hw->ctl = 0;
724 hw->ctl_f.null = 1;
725 hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
726 hw->ctl_f.compl_write = 1;
727 hw->size = NULL_DESC_BUFFER_SIZE;
728 dump_desc_dbg(ioat, compl_desc);
729
730 /* we leave the channel locked to ensure in order submission */
731 return &desc->txd;
732}
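The continuation branches above append implied sources so the engine re-reads the previous P/Q results, following the dma_maxpq() rules referenced in the comment. A stand-alone sketch of that source accounting (assumed helper, not part of the patch):

/* mirrors the extra pq_set_src() calls in the continuation branches above */
unsigned int pq_hw_src_cnt(unsigned int sw_src_cnt, int p_disabled_continue,
			   int is_continue)
{
	if (p_disabled_continue)
		return sw_src_cnt + 1;	/* q is re-read as one extra source */
	if (is_continue)
		return sw_src_cnt + 3;	/* p, q, and q again with a zero coefficient */
	return sw_src_cnt;
}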
733
734static struct dma_async_tx_descriptor *
735ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
736 unsigned int src_cnt, const unsigned char *scf, size_t len,
737 unsigned long flags)
738{
739 /* handle the single source multiply case from the raid6
740 * recovery path
741 */
742 if (unlikely((flags & DMA_PREP_PQ_DISABLE_P) && src_cnt == 1)) {
743 dma_addr_t single_source[2];
744 unsigned char single_source_coef[2];
745
746 BUG_ON(flags & DMA_PREP_PQ_DISABLE_Q);
747 single_source[0] = src[0];
748 single_source[1] = src[0];
749 single_source_coef[0] = scf[0];
750 single_source_coef[1] = 0;
751
752 return __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2,
753 single_source_coef, len, flags);
754 } else
755 return __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, scf,
756 len, flags);
757}
758
759struct dma_async_tx_descriptor *
760ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
761 unsigned int src_cnt, const unsigned char *scf, size_t len,
762 enum sum_check_flags *pqres, unsigned long flags)
763{
764	/* the cleanup routine only sets bits on validate failure; it
765 * does not clear bits on validate success... so clear it here
766 */
767 *pqres = 0;
768
769 return __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len,
770 flags);
771}
772
773static struct dma_async_tx_descriptor *
774ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
775 unsigned int src_cnt, size_t len, unsigned long flags)
776{
777 unsigned char scf[src_cnt];
778 dma_addr_t pq[2];
779
780 memset(scf, 0, src_cnt);
781 flags |= DMA_PREP_PQ_DISABLE_Q;
782 pq[0] = dst;
783 pq[1] = ~0;
784
785 return __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len,
786 flags);
787}
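/* note: the pq engine always writes the plain xor of the sources to the P
 * address; with DMA_PREP_PQ_DISABLE_Q set and zeroed coefficients the
 * operation degenerates into an xor into pq[0], which is how xor support is
 * offered on hardware that only advertises the PQ capability */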
788
789struct dma_async_tx_descriptor *
790ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
791 unsigned int src_cnt, size_t len,
792 enum sum_check_flags *result, unsigned long flags)
793{
794 unsigned char scf[src_cnt];
795 dma_addr_t pq[2];
796
797	/* the cleanup routine only sets bits on validate failure; it
798 * does not clear bits on validate success... so clear it here
799 */
800 *result = 0;
801
802 memset(scf, 0, src_cnt);
803 flags |= DMA_PREP_PQ_DISABLE_Q;
804 pq[0] = src[0];
805 pq[1] = ~0;
806
807 return __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, scf,
808 len, flags);
809}
810
811static struct dma_async_tx_descriptor *
812ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags)
813{
814 struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
815 struct ioat_ring_ent *desc;
816 struct ioat_dma_descriptor *hw;
817 u16 idx;
818
819 if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0)
820 desc = ioat2_get_ring_ent(ioat, idx);
821 else
822 return NULL;
823
824 hw = desc->hw;
825 hw->ctl = 0;
826 hw->ctl_f.null = 1;
827 hw->ctl_f.int_en = 1;
828 hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
829 hw->ctl_f.compl_write = 1;
830 hw->size = NULL_DESC_BUFFER_SIZE;
831 hw->src_addr = 0;
832 hw->dst_addr = 0;
833
834 desc->txd.flags = flags;
835 desc->len = 1;
836
837 dump_desc_dbg(ioat, desc);
838
839 /* we leave the channel locked to ensure in order submission */
840 return &desc->txd;
841}
842
843static void __devinit ioat3_dma_test_callback(void *dma_async_param)
844{
845 struct completion *cmp = dma_async_param;
846
847 complete(cmp);
848}
849
850#define IOAT_NUM_SRC_TEST 6 /* must be <= 8 */
851static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
852{
853 int i, src_idx;
854 struct page *dest;
855 struct page *xor_srcs[IOAT_NUM_SRC_TEST];
856 struct page *xor_val_srcs[IOAT_NUM_SRC_TEST + 1];
857 dma_addr_t dma_srcs[IOAT_NUM_SRC_TEST + 1];
858 dma_addr_t dma_addr, dest_dma;
859 struct dma_async_tx_descriptor *tx;
860 struct dma_chan *dma_chan;
861 dma_cookie_t cookie;
862 u8 cmp_byte = 0;
863 u32 cmp_word;
864 u32 xor_val_result;
865 int err = 0;
866 struct completion cmp;
867 unsigned long tmo;
868 struct device *dev = &device->pdev->dev;
869 struct dma_device *dma = &device->common;
870
871 dev_dbg(dev, "%s\n", __func__);
872
873 if (!dma_has_cap(DMA_XOR, dma->cap_mask))
874 return 0;
875
876 for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) {
877 xor_srcs[src_idx] = alloc_page(GFP_KERNEL);
878 if (!xor_srcs[src_idx]) {
879 while (src_idx--)
880 __free_page(xor_srcs[src_idx]);
881 return -ENOMEM;
882 }
883 }
884
885 dest = alloc_page(GFP_KERNEL);
886 if (!dest) {
887 while (src_idx--)
888 __free_page(xor_srcs[src_idx]);
889 return -ENOMEM;
890 }
891
892 /* Fill in src buffers */
893 for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) {
894 u8 *ptr = page_address(xor_srcs[src_idx]);
895 for (i = 0; i < PAGE_SIZE; i++)
896 ptr[i] = (1 << src_idx);
897 }
898
899 for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++)
900 cmp_byte ^= (u8) (1 << src_idx);
901
902 cmp_word = (cmp_byte << 24) | (cmp_byte << 16) |
903 (cmp_byte << 8) | cmp_byte;
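	/* with IOAT_NUM_SRC_TEST == 6 and each source page filled with
	 * (1 << src_idx), the xor of all sources is 0x3f in every byte,
	 * so cmp_word becomes 0x3f3f3f3f */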
904
905 memset(page_address(dest), 0, PAGE_SIZE);
906
907 dma_chan = container_of(dma->channels.next, struct dma_chan,
908 device_node);
909 if (dma->device_alloc_chan_resources(dma_chan) < 1) {
910 err = -ENODEV;
911 goto out;
912 }
913
914 /* test xor */
915 dest_dma = dma_map_page(dev, dest, 0, PAGE_SIZE, DMA_FROM_DEVICE);
916 for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
917 dma_srcs[i] = dma_map_page(dev, xor_srcs[i], 0, PAGE_SIZE,
918 DMA_TO_DEVICE);
919 tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs,
920 IOAT_NUM_SRC_TEST, PAGE_SIZE,
921 DMA_PREP_INTERRUPT);
922
923 if (!tx) {
924 dev_err(dev, "Self-test xor prep failed\n");
925 err = -ENODEV;
926 goto free_resources;
927 }
928
929 async_tx_ack(tx);
930 init_completion(&cmp);
931 tx->callback = ioat3_dma_test_callback;
932 tx->callback_param = &cmp;
933 cookie = tx->tx_submit(tx);
934 if (cookie < 0) {
935 dev_err(dev, "Self-test xor setup failed\n");
936 err = -ENODEV;
937 goto free_resources;
938 }
939 dma->device_issue_pending(dma_chan);
940
941 tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
942
943 if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
944 dev_err(dev, "Self-test xor timed out\n");
945 err = -ENODEV;
946 goto free_resources;
947 }
948
949 dma_sync_single_for_cpu(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE);
950 for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) {
951 u32 *ptr = page_address(dest);
952 if (ptr[i] != cmp_word) {
953 dev_err(dev, "Self-test xor failed compare\n");
954 err = -ENODEV;
955 goto free_resources;
956 }
957 }
958 dma_sync_single_for_device(dev, dest_dma, PAGE_SIZE, DMA_TO_DEVICE);
959
960 /* skip validate if the capability is not present */
961 if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask))
962 goto free_resources;
963
964	/* validate the sources with the destination page */
965 for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
966 xor_val_srcs[i] = xor_srcs[i];
967 xor_val_srcs[i] = dest;
968
969 xor_val_result = 1;
970
971 for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
972 dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
973 DMA_TO_DEVICE);
974 tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
975 IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
976 &xor_val_result, DMA_PREP_INTERRUPT);
977 if (!tx) {
978 dev_err(dev, "Self-test zero prep failed\n");
979 err = -ENODEV;
980 goto free_resources;
981 }
982
983 async_tx_ack(tx);
984 init_completion(&cmp);
985 tx->callback = ioat3_dma_test_callback;
986 tx->callback_param = &cmp;
987 cookie = tx->tx_submit(tx);
988 if (cookie < 0) {
989 dev_err(dev, "Self-test zero setup failed\n");
990 err = -ENODEV;
991 goto free_resources;
992 }
993 dma->device_issue_pending(dma_chan);
994
995 tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
996
997 if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
998 dev_err(dev, "Self-test validate timed out\n");
999 err = -ENODEV;
1000 goto free_resources;
1001 }
1002
1003 if (xor_val_result != 0) {
1004 dev_err(dev, "Self-test validate failed compare\n");
1005 err = -ENODEV;
1006 goto free_resources;
1007 }
1008
1009 /* skip memset if the capability is not present */
1010 if (!dma_has_cap(DMA_MEMSET, dma_chan->device->cap_mask))
1011 goto free_resources;
1012
1013 /* test memset */
1014 dma_addr = dma_map_page(dev, dest, 0,
1015 PAGE_SIZE, DMA_FROM_DEVICE);
1016 tx = dma->device_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE,
1017 DMA_PREP_INTERRUPT);
1018 if (!tx) {
1019 dev_err(dev, "Self-test memset prep failed\n");
1020 err = -ENODEV;
1021 goto free_resources;
1022 }
1023
1024 async_tx_ack(tx);
1025 init_completion(&cmp);
1026 tx->callback = ioat3_dma_test_callback;
1027 tx->callback_param = &cmp;
1028 cookie = tx->tx_submit(tx);
1029 if (cookie < 0) {
1030 dev_err(dev, "Self-test memset setup failed\n");
1031 err = -ENODEV;
1032 goto free_resources;
1033 }
1034 dma->device_issue_pending(dma_chan);
1035
1036 tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
1037
1038 if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
1039 dev_err(dev, "Self-test memset timed out\n");
1040 err = -ENODEV;
1041 goto free_resources;
1042 }
1043
1044 for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) {
1045 u32 *ptr = page_address(dest);
1046 if (ptr[i]) {
1047 dev_err(dev, "Self-test memset failed compare\n");
1048 err = -ENODEV;
1049 goto free_resources;
1050 }
1051 }
1052
1053 /* test for non-zero parity sum */
1054 xor_val_result = 0;
1055 for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
1056 dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
1057 DMA_TO_DEVICE);
1058 tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
1059 IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
1060 &xor_val_result, DMA_PREP_INTERRUPT);
1061 if (!tx) {
1062 dev_err(dev, "Self-test 2nd zero prep failed\n");
1063 err = -ENODEV;
1064 goto free_resources;
1065 }
1066
1067 async_tx_ack(tx);
1068 init_completion(&cmp);
1069 tx->callback = ioat3_dma_test_callback;
1070 tx->callback_param = &cmp;
1071 cookie = tx->tx_submit(tx);
1072 if (cookie < 0) {
1073 dev_err(dev, "Self-test 2nd zero setup failed\n");
1074 err = -ENODEV;
1075 goto free_resources;
1076 }
1077 dma->device_issue_pending(dma_chan);
1078
1079 tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
1080
1081 if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
1082 dev_err(dev, "Self-test 2nd validate timed out\n");
1083 err = -ENODEV;
1084 goto free_resources;
1085 }
1086
1087 if (xor_val_result != SUM_CHECK_P_RESULT) {
1088 dev_err(dev, "Self-test validate failed compare\n");
1089 err = -ENODEV;
1090 goto free_resources;
1091 }
1092
1093free_resources:
1094 dma->device_free_chan_resources(dma_chan);
1095out:
1096 src_idx = IOAT_NUM_SRC_TEST;
1097 while (src_idx--)
1098 __free_page(xor_srcs[src_idx]);
1099 __free_page(dest);
1100 return err;
1101}
1102
1103static int __devinit ioat3_dma_self_test(struct ioatdma_device *device)
1104{
1105 int rc = ioat_dma_self_test(device);
1106
1107 if (rc)
1108 return rc;
1109
1110 rc = ioat_xor_val_self_test(device);
1111 if (rc)
1112 return rc;
1113
1114 return 0;
1115}
1116
1117int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
1118{
1119 struct pci_dev *pdev = device->pdev;
1120 struct dma_device *dma;
1121 struct dma_chan *c;
1122 struct ioat_chan_common *chan;
1123 bool is_raid_device = false;
1124 int err;
1125 u16 dev_id;
1126 u32 cap;
1127
1128 device->enumerate_channels = ioat2_enumerate_channels;
1129 device->self_test = ioat3_dma_self_test;
1130 dma = &device->common;
1131 dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
1132 dma->device_issue_pending = ioat2_issue_pending;
1133 dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
1134 dma->device_free_chan_resources = ioat2_free_chan_resources;
1135
1136 dma_cap_set(DMA_INTERRUPT, dma->cap_mask);
1137 dma->device_prep_dma_interrupt = ioat3_prep_interrupt_lock;
1138
1139 cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET);
1140 if (cap & IOAT_CAP_XOR) {
1141 is_raid_device = true;
1142 dma->max_xor = 8;
1143 dma->xor_align = 2;
1144
1145 dma_cap_set(DMA_XOR, dma->cap_mask);
1146 dma->device_prep_dma_xor = ioat3_prep_xor;
1147
1148 dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
1149 dma->device_prep_dma_xor_val = ioat3_prep_xor_val;
1150 }
1151 if (cap & IOAT_CAP_PQ) {
1152 is_raid_device = true;
1153 dma_set_maxpq(dma, 8, 0);
1154 dma->pq_align = 2;
1155
1156 dma_cap_set(DMA_PQ, dma->cap_mask);
1157 dma->device_prep_dma_pq = ioat3_prep_pq;
1158
1159 dma_cap_set(DMA_PQ_VAL, dma->cap_mask);
1160 dma->device_prep_dma_pq_val = ioat3_prep_pq_val;
1161
1162 if (!(cap & IOAT_CAP_XOR)) {
1163 dma->max_xor = 8;
1164 dma->xor_align = 2;
1165
1166 dma_cap_set(DMA_XOR, dma->cap_mask);
1167 dma->device_prep_dma_xor = ioat3_prep_pqxor;
1168
1169 dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
1170 dma->device_prep_dma_xor_val = ioat3_prep_pqxor_val;
1171 }
1172 }
1173 if (is_raid_device && (cap & IOAT_CAP_FILL_BLOCK)) {
1174 dma_cap_set(DMA_MEMSET, dma->cap_mask);
1175 dma->device_prep_dma_memset = ioat3_prep_memset_lock;
1176 }
1177
1178
1179 if (is_raid_device) {
1180 dma->device_is_tx_complete = ioat3_is_complete;
1181 device->cleanup_tasklet = ioat3_cleanup_tasklet;
1182 device->timer_fn = ioat3_timer_event;
1183 } else {
1184 dma->device_is_tx_complete = ioat2_is_complete;
1185 device->cleanup_tasklet = ioat2_cleanup_tasklet;
1186 device->timer_fn = ioat2_timer_event;
1187 }
1188
1189 /* -= IOAT ver.3 workarounds =- */
1190 /* Write CHANERRMSK_INT with 3E07h to mask out the errors
1191 * that can cause stability issues for IOAT ver.3
1192 */
1193 pci_write_config_dword(pdev, IOAT_PCI_CHANERRMASK_INT_OFFSET, 0x3e07);
1194
1195 /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
1196 * (workaround for spurious config parity error after restart)
1197 */
1198 pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id);
1199 if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0)
1200 pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10);
1201
1202 err = ioat_probe(device);
1203 if (err)
1204 return err;
1205 ioat_set_tcp_copy_break(262144);
1206
1207 list_for_each_entry(c, &dma->channels, device_node) {
1208 chan = to_chan_common(c);
1209 writel(IOAT_DMA_DCA_ANY_CPU,
1210 chan->reg_base + IOAT_DCACTRL_OFFSET);
1211 }
1212
1213 err = ioat_register(device);
1214 if (err)
1215 return err;
1216
1217 ioat_kobject_add(device, &ioat2_ktype);
1218
1219 if (dca)
1220 device->dca = ioat3_dca_init(pdev, device->reg_base);
1221
1222 return 0;
1223}
diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h
new file mode 100644
index 000000000000..99afb12bd409
--- /dev/null
+++ b/drivers/dma/ioat/hw.h
@@ -0,0 +1,215 @@
1/*
2 * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59
16 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * The full GNU General Public License is included in this distribution in the
19 * file called COPYING.
20 */
21#ifndef _IOAT_HW_H_
22#define _IOAT_HW_H_
23
24/* PCI Configuration Space Values */
25#define IOAT_PCI_VID 0x8086
26#define IOAT_MMIO_BAR 0
27
28/* CB device ID's */
29#define IOAT_PCI_DID_5000 0x1A38
30#define IOAT_PCI_DID_CNB 0x360B
31#define IOAT_PCI_DID_SCNB 0x65FF
32#define IOAT_PCI_DID_SNB 0x402F
33
34#define IOAT_PCI_RID 0x00
35#define IOAT_PCI_SVID 0x8086
36#define IOAT_PCI_SID 0x8086
37#define IOAT_VER_1_2 0x12 /* Version 1.2 */
38#define IOAT_VER_2_0 0x20 /* Version 2.0 */
39#define IOAT_VER_3_0 0x30 /* Version 3.0 */
40#define IOAT_VER_3_2 0x32 /* Version 3.2 */
41
42struct ioat_dma_descriptor {
43 uint32_t size;
44 union {
45 uint32_t ctl;
46 struct {
47 unsigned int int_en:1;
48 unsigned int src_snoop_dis:1;
49 unsigned int dest_snoop_dis:1;
50 unsigned int compl_write:1;
51 unsigned int fence:1;
52 unsigned int null:1;
53 unsigned int src_brk:1;
54 unsigned int dest_brk:1;
55 unsigned int bundle:1;
56 unsigned int dest_dca:1;
57 unsigned int hint:1;
58 unsigned int rsvd2:13;
59 #define IOAT_OP_COPY 0x00
60 unsigned int op:8;
61 } ctl_f;
62 };
63 uint64_t src_addr;
64 uint64_t dst_addr;
65 uint64_t next;
66 uint64_t rsv1;
67 uint64_t rsv2;
68 /* store some driver data in an unused portion of the descriptor */
69 union {
70 uint64_t user1;
71 uint64_t tx_cnt;
72 };
73 uint64_t user2;
74};
75
76struct ioat_fill_descriptor {
77 uint32_t size;
78 union {
79 uint32_t ctl;
80 struct {
81 unsigned int int_en:1;
82 unsigned int rsvd:1;
83 unsigned int dest_snoop_dis:1;
84 unsigned int compl_write:1;
85 unsigned int fence:1;
86 unsigned int rsvd2:2;
87 unsigned int dest_brk:1;
88 unsigned int bundle:1;
89 unsigned int rsvd4:15;
90 #define IOAT_OP_FILL 0x01
91 unsigned int op:8;
92 } ctl_f;
93 };
94 uint64_t src_data;
95 uint64_t dst_addr;
96 uint64_t next;
97 uint64_t rsv1;
98 uint64_t next_dst_addr;
99 uint64_t user1;
100 uint64_t user2;
101};
102
103struct ioat_xor_descriptor {
104 uint32_t size;
105 union {
106 uint32_t ctl;
107 struct {
108 unsigned int int_en:1;
109 unsigned int src_snoop_dis:1;
110 unsigned int dest_snoop_dis:1;
111 unsigned int compl_write:1;
112 unsigned int fence:1;
113 unsigned int src_cnt:3;
114 unsigned int bundle:1;
115 unsigned int dest_dca:1;
116 unsigned int hint:1;
117 unsigned int rsvd:13;
118 #define IOAT_OP_XOR 0x87
119 #define IOAT_OP_XOR_VAL 0x88
120 unsigned int op:8;
121 } ctl_f;
122 };
123 uint64_t src_addr;
124 uint64_t dst_addr;
125 uint64_t next;
126 uint64_t src_addr2;
127 uint64_t src_addr3;
128 uint64_t src_addr4;
129 uint64_t src_addr5;
130};
131
132struct ioat_xor_ext_descriptor {
133 uint64_t src_addr6;
134 uint64_t src_addr7;
135 uint64_t src_addr8;
136 uint64_t next;
137 uint64_t rsvd[4];
138};
139
140struct ioat_pq_descriptor {
141 uint32_t size;
142 union {
143 uint32_t ctl;
144 struct {
145 unsigned int int_en:1;
146 unsigned int src_snoop_dis:1;
147 unsigned int dest_snoop_dis:1;
148 unsigned int compl_write:1;
149 unsigned int fence:1;
150 unsigned int src_cnt:3;
151 unsigned int bundle:1;
152 unsigned int dest_dca:1;
153 unsigned int hint:1;
154 unsigned int p_disable:1;
155 unsigned int q_disable:1;
156 unsigned int rsvd:11;
157 #define IOAT_OP_PQ 0x89
158 #define IOAT_OP_PQ_VAL 0x8a
159 unsigned int op:8;
160 } ctl_f;
161 };
162 uint64_t src_addr;
163 uint64_t p_addr;
164 uint64_t next;
165 uint64_t src_addr2;
166 uint64_t src_addr3;
167 uint8_t coef[8];
168 uint64_t q_addr;
169};
170
171struct ioat_pq_ext_descriptor {
172 uint64_t src_addr4;
173 uint64_t src_addr5;
174 uint64_t src_addr6;
175 uint64_t next;
176 uint64_t src_addr7;
177 uint64_t src_addr8;
178 uint64_t rsvd[2];
179};
180
181struct ioat_pq_update_descriptor {
182 uint32_t size;
183 union {
184 uint32_t ctl;
185 struct {
186 unsigned int int_en:1;
187 unsigned int src_snoop_dis:1;
188 unsigned int dest_snoop_dis:1;
189 unsigned int compl_write:1;
190 unsigned int fence:1;
191 unsigned int src_cnt:3;
192 unsigned int bundle:1;
193 unsigned int dest_dca:1;
194 unsigned int hint:1;
195 unsigned int p_disable:1;
196 unsigned int q_disable:1;
197 unsigned int rsvd:3;
198 unsigned int coef:8;
199 #define IOAT_OP_PQ_UP 0x8b
200 unsigned int op:8;
201 } ctl_f;
202 };
203 uint64_t src_addr;
204 uint64_t p_addr;
205 uint64_t next;
206 uint64_t src_addr2;
207 uint64_t p_src;
208 uint64_t q_src;
209 uint64_t q_addr;
210};
211
212struct ioat_raw_descriptor {
213 uint64_t field[8];
214};
215#endif
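Every descriptor format above packs into the same 64-byte slot (eight 64-bit words), which is what lets the driver code earlier in this patch overlay them through ioat_raw_descriptor. A quick stand-alone size check (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

struct ioat_raw_descriptor {
	uint64_t field[8];
};

int main(void)
{
	/* eight 64-bit fields: one 64-byte hardware descriptor slot */
	printf("descriptor slot size: %zu bytes\n",
	       sizeof(struct ioat_raw_descriptor));
	return 0;
}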
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c
new file mode 100644
index 000000000000..d545fae30f37
--- /dev/null
+++ b/drivers/dma/ioat/pci.c
@@ -0,0 +1,210 @@
1/*
2 * Intel I/OAT DMA Linux driver
3 * Copyright(c) 2007 - 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * The full GNU General Public License is included in this distribution in
19 * the file called "COPYING".
20 *
21 */
22
23/*
24 * This driver supports an Intel I/OAT DMA engine, which does asynchronous
25 * copy operations.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/pci.h>
31#include <linux/interrupt.h>
32#include <linux/dca.h>
33#include "dma.h"
34#include "dma_v2.h"
35#include "registers.h"
36#include "hw.h"
37
38MODULE_VERSION(IOAT_DMA_VERSION);
39MODULE_LICENSE("Dual BSD/GPL");
40MODULE_AUTHOR("Intel Corporation");
41
42static struct pci_device_id ioat_pci_tbl[] = {
43 /* I/OAT v1 platforms */
44 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
45 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB) },
46 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) },
47 { PCI_VDEVICE(UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
48
49 /* I/OAT v2 platforms */
50 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) },
51
52 /* I/OAT v3 platforms */
53 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) },
54 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) },
55 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) },
56 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) },
57 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) },
58 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
59 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
60 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
61
62 /* I/OAT v3.2 platforms */
63 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF0) },
64 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF1) },
65 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF2) },
66 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF3) },
67 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF4) },
68 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF5) },
69 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF6) },
70 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF7) },
71 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF8) },
72 { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF9) },
73
74 { 0, }
75};
76MODULE_DEVICE_TABLE(pci, ioat_pci_tbl);
77
78static int __devinit ioat_pci_probe(struct pci_dev *pdev,
79 const struct pci_device_id *id);
80static void __devexit ioat_remove(struct pci_dev *pdev);
81
82static int ioat_dca_enabled = 1;
83module_param(ioat_dca_enabled, int, 0644);
84MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)");
85
86struct kmem_cache *ioat2_cache;
87
88#define DRV_NAME "ioatdma"
89
90static struct pci_driver ioat_pci_driver = {
91 .name = DRV_NAME,
92 .id_table = ioat_pci_tbl,
93 .probe = ioat_pci_probe,
94 .remove = __devexit_p(ioat_remove),
95};
96
97static struct ioatdma_device *
98alloc_ioatdma(struct pci_dev *pdev, void __iomem *iobase)
99{
100 struct device *dev = &pdev->dev;
101 struct ioatdma_device *d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
102
103 if (!d)
104 return NULL;
105 d->pdev = pdev;
106 d->reg_base = iobase;
107 return d;
108}
109
110static int __devinit ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
111{
112 void __iomem * const *iomap;
113 struct device *dev = &pdev->dev;
114 struct ioatdma_device *device;
115 int err;
116
117 err = pcim_enable_device(pdev);
118 if (err)
119 return err;
120
121 err = pcim_iomap_regions(pdev, 1 << IOAT_MMIO_BAR, DRV_NAME);
122 if (err)
123 return err;
124 iomap = pcim_iomap_table(pdev);
125 if (!iomap)
126 return -ENOMEM;
127
128 err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
129 if (err)
130 err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
131 if (err)
132 return err;
133
134 err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
135 if (err)
136 err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
137 if (err)
138 return err;
139
140 device = devm_kzalloc(dev, sizeof(*device), GFP_KERNEL);
141 if (!device)
142 return -ENOMEM;
143
144 pci_set_master(pdev);
145
146 device = alloc_ioatdma(pdev, iomap[IOAT_MMIO_BAR]);
147 if (!device)
148 return -ENOMEM;
149 pci_set_drvdata(pdev, device);
150
151 device->version = readb(device->reg_base + IOAT_VER_OFFSET);
152 if (device->version == IOAT_VER_1_2)
153 err = ioat1_dma_probe(device, ioat_dca_enabled);
154 else if (device->version == IOAT_VER_2_0)
155 err = ioat2_dma_probe(device, ioat_dca_enabled);
156 else if (device->version >= IOAT_VER_3_0)
157 err = ioat3_dma_probe(device, ioat_dca_enabled);
158 else
159 return -ENODEV;
160
161 if (err) {
162 dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n");
163 return -ENODEV;
164 }
165
166 return 0;
167}
168
169static void __devexit ioat_remove(struct pci_dev *pdev)
170{
171 struct ioatdma_device *device = pci_get_drvdata(pdev);
172
173 if (!device)
174 return;
175
176 dev_err(&pdev->dev, "Removing dma and dca services\n");
177 if (device->dca) {
178 unregister_dca_provider(device->dca, &pdev->dev);
179 free_dca_provider(device->dca);
180 device->dca = NULL;
181 }
182 ioat_dma_remove(device);
183}
184
185static int __init ioat_init_module(void)
186{
187 int err;
188
189 pr_info("%s: Intel(R) QuickData Technology Driver %s\n",
190 DRV_NAME, IOAT_DMA_VERSION);
191
192 ioat2_cache = kmem_cache_create("ioat2", sizeof(struct ioat_ring_ent),
193 0, SLAB_HWCACHE_ALIGN, NULL);
194 if (!ioat2_cache)
195 return -ENOMEM;
196
197 err = pci_register_driver(&ioat_pci_driver);
198 if (err)
199 kmem_cache_destroy(ioat2_cache);
200
201 return err;
202}
203module_init(ioat_init_module);
204
205static void __exit ioat_exit_module(void)
206{
207 pci_unregister_driver(&ioat_pci_driver);
208 kmem_cache_destroy(ioat2_cache);
209}
210module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioat/registers.h
index 49bc277424f8..63038e18ab03 100644
--- a/drivers/dma/ioatdma_registers.h
+++ b/drivers/dma/ioat/registers.h
@@ -64,18 +64,37 @@

 #define IOAT_DEVICE_STATUS_OFFSET		0x0E	/* 16-bit */
 #define IOAT_DEVICE_STATUS_DEGRADED_MODE	0x0001
+#define IOAT_DEVICE_MMIO_RESTRICTED		0x0002
+#define IOAT_DEVICE_MEMORY_BYPASS		0x0004
+#define IOAT_DEVICE_ADDRESS_REMAPPING		0x0008
+
+#define IOAT_DMA_CAP_OFFSET			0x10	/* 32-bit */
+#define IOAT_CAP_PAGE_BREAK			0x00000001
+#define IOAT_CAP_CRC				0x00000002
+#define IOAT_CAP_SKIP_MARKER			0x00000004
+#define IOAT_CAP_DCA				0x00000010
+#define IOAT_CAP_CRC_MOVE			0x00000020
+#define IOAT_CAP_FILL_BLOCK			0x00000040
+#define IOAT_CAP_APIC				0x00000080
+#define IOAT_CAP_XOR				0x00000100
+#define IOAT_CAP_PQ				0x00000200

 #define IOAT_CHANNEL_MMIO_SIZE		0x80	/* Each Channel MMIO space is this size */

 /* DMA Channel Registers */
 #define IOAT_CHANCTRL_OFFSET			0x00	/* 16-bit Channel Control Register */
 #define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK	0xF000
+#define IOAT3_CHANCTRL_COMPL_DCA_EN		0x0200
 #define IOAT_CHANCTRL_CHANNEL_IN_USE		0x0100
 #define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL	0x0020
 #define IOAT_CHANCTRL_ERR_INT_EN		0x0010
 #define IOAT_CHANCTRL_ANY_ERR_ABORT_EN		0x0008
 #define IOAT_CHANCTRL_ERR_COMPLETION_EN		0x0004
-#define IOAT_CHANCTRL_INT_DISABLE		0x0001
+#define IOAT_CHANCTRL_INT_REARM			0x0001
+#define IOAT_CHANCTRL_RUN			(IOAT_CHANCTRL_INT_REARM |\
+						 IOAT_CHANCTRL_ERR_COMPLETION_EN |\
+						 IOAT_CHANCTRL_ANY_ERR_ABORT_EN |\
+						 IOAT_CHANCTRL_ERR_INT_EN)

 #define IOAT_DMA_COMP_OFFSET			0x02	/* 16-bit DMA channel compatibility */
 #define IOAT_DMA_COMP_V1			0x0001	/* Compatibility with DMA version 1 */
@@ -94,14 +113,14 @@
 #define IOAT2_CHANSTS_OFFSET_HIGH	0x0C
 #define IOAT_CHANSTS_OFFSET_HIGH(ver)	((ver) < IOAT_VER_2_0 \
					? IOAT1_CHANSTS_OFFSET_HIGH : IOAT2_CHANSTS_OFFSET_HIGH)
-#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR	~0x3F
-#define IOAT_CHANSTS_SOFT_ERR			0x0000000000000010
-#define IOAT_CHANSTS_UNAFFILIATED_ERR		0x0000000000000008
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS	0x0000000000000007
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE	0x0
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE	0x1
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED	0x2
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED	0x3
+#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR	(~0x3fULL)
+#define IOAT_CHANSTS_SOFT_ERR			0x10ULL
+#define IOAT_CHANSTS_UNAFFILIATED_ERR		0x8ULL
+#define IOAT_CHANSTS_STATUS			0x7ULL
+#define IOAT_CHANSTS_ACTIVE			0x0
+#define IOAT_CHANSTS_DONE			0x1
+#define IOAT_CHANSTS_SUSPENDED			0x2
+#define IOAT_CHANSTS_HALTED			0x3



@@ -204,22 +223,27 @@
 #define IOAT_CDAR_OFFSET_HIGH		0x24

 #define IOAT_CHANERR_OFFSET			0x28	/* 32-bit Channel Error Register */
-#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR	0x0001
-#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR	0x0002
-#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR	0x0004
-#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR	0x0008
+#define IOAT_CHANERR_SRC_ADDR_ERR		0x0001
+#define IOAT_CHANERR_DEST_ADDR_ERR		0x0002
+#define IOAT_CHANERR_NEXT_ADDR_ERR		0x0004
+#define IOAT_CHANERR_NEXT_DESC_ALIGN_ERR	0x0008
 #define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR	0x0010
 #define IOAT_CHANERR_CHANCMD_ERR		0x0020
 #define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR	0x0040
 #define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR	0x0080
 #define IOAT_CHANERR_READ_DATA_ERR		0x0100
 #define IOAT_CHANERR_WRITE_DATA_ERR		0x0200
-#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR	0x0400
-#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR	0x0800
+#define IOAT_CHANERR_CONTROL_ERR		0x0400
+#define IOAT_CHANERR_LENGTH_ERR			0x0800
 #define IOAT_CHANERR_COMPLETION_ADDR_ERR	0x1000
 #define IOAT_CHANERR_INT_CONFIGURATION_ERR	0x2000
 #define IOAT_CHANERR_SOFT_ERR			0x4000
 #define IOAT_CHANERR_UNAFFILIATED_ERR		0x8000
+#define IOAT_CHANERR_XOR_P_OR_CRC_ERR		0x10000
+#define IOAT_CHANERR_XOR_Q_ERR			0x20000
+#define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR	0x40000
+
+#define IOAT_CHANERR_HANDLE_MASK	(IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR)

 #define IOAT_CHANERR_MASK_OFFSET		0x2C	/* 32-bit Channel Error Register */

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
deleted file mode 100644
index a600fc0f7962..000000000000
--- a/drivers/dma/ioat_dma.c
+++ /dev/null
@@ -1,1741 +0,0 @@
1/*
2 * Intel I/OAT DMA Linux driver
3 * Copyright(c) 2004 - 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * The full GNU General Public License is included in this distribution in
19 * the file called "COPYING".
20 *
21 */
22
23/*
24 * This driver supports an Intel I/OAT DMA engine, which does asynchronous
25 * copy operations.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/pci.h>
31#include <linux/interrupt.h>
32#include <linux/dmaengine.h>
33#include <linux/delay.h>
34#include <linux/dma-mapping.h>
35#include <linux/workqueue.h>
36#include <linux/i7300_idle.h>
37#include "ioatdma.h"
38#include "ioatdma_registers.h"
39#include "ioatdma_hw.h"
40
41#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
42#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common)
43#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
44#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx)
45
46#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
47static int ioat_pending_level = 4;
48module_param(ioat_pending_level, int, 0644);
49MODULE_PARM_DESC(ioat_pending_level,
50 "high-water mark for pushing ioat descriptors (default: 4)");
51
52#define RESET_DELAY msecs_to_jiffies(100)
53#define WATCHDOG_DELAY round_jiffies(msecs_to_jiffies(2000))
54static void ioat_dma_chan_reset_part2(struct work_struct *work);
55static void ioat_dma_chan_watchdog(struct work_struct *work);
56
57/*
58 * workaround for IOAT ver.3.0 null descriptor issue
59 * (channel returns error when size is 0)
60 */
61#define NULL_DESC_BUFFER_SIZE 1
62
63/* internal functions */
64static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan);
65static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan);
66
67static struct ioat_desc_sw *
68ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan);
69static struct ioat_desc_sw *
70ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan);
71
72static inline struct ioat_dma_chan *ioat_lookup_chan_by_index(
73 struct ioatdma_device *device,
74 int index)
75{
76 return device->idx[index];
77}
78
79/**
80 * ioat_dma_do_interrupt - handler used for single vector interrupt mode
81 * @irq: interrupt id
82 * @data: interrupt data
83 */
84static irqreturn_t ioat_dma_do_interrupt(int irq, void *data)
85{
86 struct ioatdma_device *instance = data;
87 struct ioat_dma_chan *ioat_chan;
88 unsigned long attnstatus;
89 int bit;
90 u8 intrctrl;
91
92 intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
93
94 if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
95 return IRQ_NONE;
96
97 if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
98 writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
99 return IRQ_NONE;
100 }
101
102 attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
103 for_each_bit(bit, &attnstatus, BITS_PER_LONG) {
104 ioat_chan = ioat_lookup_chan_by_index(instance, bit);
105 tasklet_schedule(&ioat_chan->cleanup_task);
106 }
107
108 writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
109 return IRQ_HANDLED;
110}
111
112/**
113 * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode
114 * @irq: interrupt id
115 * @data: interrupt data
116 */
117static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data)
118{
119 struct ioat_dma_chan *ioat_chan = data;
120
121 tasklet_schedule(&ioat_chan->cleanup_task);
122
123 return IRQ_HANDLED;
124}
125
126static void ioat_dma_cleanup_tasklet(unsigned long data);
127
128/**
129 * ioat_dma_enumerate_channels - find and initialize the device's channels
130 * @device: the device to be enumerated
131 */
132static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
133{
134 u8 xfercap_scale;
135 u32 xfercap;
136 int i;
137 struct ioat_dma_chan *ioat_chan;
138
139 /*
140 * IOAT ver.3 workarounds
141 */
142 if (device->version == IOAT_VER_3_0) {
143 u32 chan_err_mask;
144 u16 dev_id;
145 u32 dmauncerrsts;
146
147 /*
148 * Write CHANERRMSK_INT with 3E07h to mask out the errors
149 * that can cause stability issues for IOAT ver.3
150 */
151 chan_err_mask = 0x3E07;
152 pci_write_config_dword(device->pdev,
153 IOAT_PCI_CHANERRMASK_INT_OFFSET,
154 chan_err_mask);
155
156 /*
157 * Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
158 * (workaround for spurious config parity error after restart)
159 */
160 pci_read_config_word(device->pdev,
161 IOAT_PCI_DEVICE_ID_OFFSET,
162 &dev_id);
163 if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) {
164 dmauncerrsts = 0x10;
165 pci_write_config_dword(device->pdev,
166 IOAT_PCI_DMAUNCERRSTS_OFFSET,
167 dmauncerrsts);
168 }
169 }
170
171 device->common.chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
172 xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
173 xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
174
175#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL
176 if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) {
177 device->common.chancnt--;
178 }
179#endif
180 for (i = 0; i < device->common.chancnt; i++) {
181 ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
182 if (!ioat_chan) {
183 device->common.chancnt = i;
184 break;
185 }
186
187 ioat_chan->device = device;
188 ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1));
189 ioat_chan->xfercap = xfercap;
190 ioat_chan->desccount = 0;
191 INIT_DELAYED_WORK(&ioat_chan->work, ioat_dma_chan_reset_part2);
192 if (ioat_chan->device->version == IOAT_VER_2_0)
193 writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE |
194 IOAT_DMA_DCA_ANY_CPU,
195 ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
196 else if (ioat_chan->device->version == IOAT_VER_3_0)
197 writel(IOAT_DMA_DCA_ANY_CPU,
198 ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
199 spin_lock_init(&ioat_chan->cleanup_lock);
200 spin_lock_init(&ioat_chan->desc_lock);
201 INIT_LIST_HEAD(&ioat_chan->free_desc);
202 INIT_LIST_HEAD(&ioat_chan->used_desc);
203 /* This should be made common somewhere in dmaengine.c */
204 ioat_chan->common.device = &device->common;
205 list_add_tail(&ioat_chan->common.device_node,
206 &device->common.channels);
207 device->idx[i] = ioat_chan;
208 tasklet_init(&ioat_chan->cleanup_task,
209 ioat_dma_cleanup_tasklet,
210 (unsigned long) ioat_chan);
211 tasklet_disable(&ioat_chan->cleanup_task);
212 }
213 return device->common.chancnt;
214}
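The xfercap computation above treats the XFERCAP register as a power-of-two scale, with a value of 0 meaning there is no per-descriptor transfer limit. A small stand-alone C sketch of that decode, using invented register values, might look like this:

#include <stdio.h>

/* Sketch of the XFERCAP decode in ioat_dma_enumerate_channels(): the
 * register holds a power-of-two scale, and 0 is expanded to all-ones
 * ("no limit"). The sample scale values below are illustrative only.
 */
static unsigned long decode_xfercap(unsigned char xfercap_scale)
{
	return xfercap_scale == 0 ? ~0UL : (1UL << xfercap_scale);
}

int main(void)
{
	unsigned char samples[] = { 0, 12, 20 };
	int i;

	for (i = 0; i < 3; i++)
		printf("scale %u -> xfercap %lu bytes\n",
		       (unsigned)samples[i], decode_xfercap(samples[i]));
	return 0;
}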
215
216/**
217 * ioat_dma_memcpy_issue_pending - push any appended but not yet issued
218 * descriptors to hw
219 * @chan: DMA channel handle
220 */
221static inline void __ioat1_dma_memcpy_issue_pending(
222 struct ioat_dma_chan *ioat_chan)
223{
224 ioat_chan->pending = 0;
225 writeb(IOAT_CHANCMD_APPEND, ioat_chan->reg_base + IOAT1_CHANCMD_OFFSET);
226}
227
228static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan)
229{
230 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
231
232 if (ioat_chan->pending > 0) {
233 spin_lock_bh(&ioat_chan->desc_lock);
234 __ioat1_dma_memcpy_issue_pending(ioat_chan);
235 spin_unlock_bh(&ioat_chan->desc_lock);
236 }
237}
238
239static inline void __ioat2_dma_memcpy_issue_pending(
240 struct ioat_dma_chan *ioat_chan)
241{
242 ioat_chan->pending = 0;
243 writew(ioat_chan->dmacount,
244 ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
245}
246
247static void ioat2_dma_memcpy_issue_pending(struct dma_chan *chan)
248{
249 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
250
251 if (ioat_chan->pending > 0) {
252 spin_lock_bh(&ioat_chan->desc_lock);
253 __ioat2_dma_memcpy_issue_pending(ioat_chan);
254 spin_unlock_bh(&ioat_chan->desc_lock);
255 }
256}
257
258
259/**
260 * ioat_dma_chan_reset_part2 - reinit the channel after a reset
261 */
262static void ioat_dma_chan_reset_part2(struct work_struct *work)
263{
264 struct ioat_dma_chan *ioat_chan =
265 container_of(work, struct ioat_dma_chan, work.work);
266 struct ioat_desc_sw *desc;
267
268 spin_lock_bh(&ioat_chan->cleanup_lock);
269 spin_lock_bh(&ioat_chan->desc_lock);
270
271 ioat_chan->completion_virt->low = 0;
272 ioat_chan->completion_virt->high = 0;
273 ioat_chan->pending = 0;
274
275 /*
276 * count the descriptors waiting, and be sure to do it
277 * right for both the CB1 linear list and the CB2 ring
278 */
279 ioat_chan->dmacount = 0;
280 if (ioat_chan->used_desc.prev) {
281 desc = to_ioat_desc(ioat_chan->used_desc.prev);
282 do {
283 ioat_chan->dmacount++;
284 desc = to_ioat_desc(desc->node.next);
285 } while (&desc->node != ioat_chan->used_desc.next);
286 }
287
288 /*
289 * write the new starting descriptor address
290 * this puts channel engine into ARMED state
291 */
292 desc = to_ioat_desc(ioat_chan->used_desc.prev);
293 switch (ioat_chan->device->version) {
294 case IOAT_VER_1_2:
295 writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
296 ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
297 writel(((u64) desc->async_tx.phys) >> 32,
298 ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
299
300 writeb(IOAT_CHANCMD_START, ioat_chan->reg_base
301 + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
302 break;
303 case IOAT_VER_2_0:
304 writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
305 ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
306 writel(((u64) desc->async_tx.phys) >> 32,
307 ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
308
309 /* tell the engine to go with what's left to be done */
310 writew(ioat_chan->dmacount,
311 ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
312
313 break;
314 }
315 dev_err(&ioat_chan->device->pdev->dev,
316 "chan%d reset - %d descs waiting, %d total desc\n",
317 chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
318
319 spin_unlock_bh(&ioat_chan->desc_lock);
320 spin_unlock_bh(&ioat_chan->cleanup_lock);
321}
322
323/**
324 * ioat_dma_reset_channel - restart a channel
325 * @ioat_chan: IOAT DMA channel handle
326 */
327static void ioat_dma_reset_channel(struct ioat_dma_chan *ioat_chan)
328{
329 u32 chansts, chanerr;
330
331 if (!ioat_chan->used_desc.prev)
332 return;
333
334 chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
335 chansts = (ioat_chan->completion_virt->low
336 & IOAT_CHANSTS_DMA_TRANSFER_STATUS);
337 if (chanerr) {
338 dev_err(&ioat_chan->device->pdev->dev,
339 "chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n",
340 chan_num(ioat_chan), chansts, chanerr);
341 writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
342 }
343
344 /*
345 * whack it upside the head with a reset
346 * and wait for things to settle out.
347 * force the pending count to a really big negative
348 * to make sure no one forces an issue_pending
349 * while we're waiting.
350 */
351
352 spin_lock_bh(&ioat_chan->desc_lock);
353 ioat_chan->pending = INT_MIN;
354 writeb(IOAT_CHANCMD_RESET,
355 ioat_chan->reg_base
356 + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
357 spin_unlock_bh(&ioat_chan->desc_lock);
358
359 /* schedule the 2nd half instead of sleeping a long time */
360 schedule_delayed_work(&ioat_chan->work, RESET_DELAY);
361}
362
363/**
364 * ioat_dma_chan_watchdog - watch for stuck channels
365 */
366static void ioat_dma_chan_watchdog(struct work_struct *work)
367{
368 struct ioatdma_device *device =
369 container_of(work, struct ioatdma_device, work.work);
370 struct ioat_dma_chan *ioat_chan;
371 int i;
372
373 union {
374 u64 full;
375 struct {
376 u32 low;
377 u32 high;
378 };
379 } completion_hw;
380 unsigned long compl_desc_addr_hw;
381
382 for (i = 0; i < device->common.chancnt; i++) {
383 ioat_chan = ioat_lookup_chan_by_index(device, i);
384
385 if (ioat_chan->device->version == IOAT_VER_1_2
386 /* have we started processing anything yet */
387 && ioat_chan->last_completion
388 /* has nothing completed since the last watchdog cycle? */
389 && (ioat_chan->last_completion ==
390 ioat_chan->watchdog_completion)
391 /* has TCP stuck on one cookie since last watchdog? */
392 && (ioat_chan->watchdog_tcp_cookie ==
393 ioat_chan->watchdog_last_tcp_cookie)
394 && (ioat_chan->watchdog_tcp_cookie !=
395 ioat_chan->completed_cookie)
396 /* is there something in the chain to be processed? */
397 /* CB1 chain always has at least the last one processed */
398 && (ioat_chan->used_desc.prev != ioat_chan->used_desc.next)
399 && ioat_chan->pending == 0) {
400
401 /*
402 * check the CHANSTS register for the last completed
403 * descriptor address.
404 * if it differs from the completion writeback,
405 * is non-zero,
406 * and has changed since the last watchdog pass,
407 * we can assume the channel
408 * is still working correctly
409 * and the problem lies in the completion writeback:
410 * update the completion writeback
411 * with the actual CHANSTS value.
412 * otherwise,
413 * try resetting the channel
414 */
415
416 completion_hw.low = readl(ioat_chan->reg_base +
417 IOAT_CHANSTS_OFFSET_LOW(ioat_chan->device->version));
418 completion_hw.high = readl(ioat_chan->reg_base +
419 IOAT_CHANSTS_OFFSET_HIGH(ioat_chan->device->version));
420#if (BITS_PER_LONG == 64)
421 compl_desc_addr_hw =
422 completion_hw.full
423 & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
424#else
425 compl_desc_addr_hw =
426 completion_hw.low & IOAT_LOW_COMPLETION_MASK;
427#endif
428
429 if ((compl_desc_addr_hw != 0)
430 && (compl_desc_addr_hw != ioat_chan->watchdog_completion)
431 && (compl_desc_addr_hw != ioat_chan->last_compl_desc_addr_hw)) {
432 ioat_chan->last_compl_desc_addr_hw = compl_desc_addr_hw;
433 ioat_chan->completion_virt->low = completion_hw.low;
434 ioat_chan->completion_virt->high = completion_hw.high;
435 } else {
436 ioat_dma_reset_channel(ioat_chan);
437 ioat_chan->watchdog_completion = 0;
438 ioat_chan->last_compl_desc_addr_hw = 0;
439 }
440
441 /*
442 * for version 2.0, if there are descriptors yet to be processed
443 * and the last completion hasn't changed since the last watchdog:
444 * if they haven't hit the pending level,
445 * issue the pending work to push them through;
446 * otherwise,
447 * try resetting the channel
448 */
449 } else if (ioat_chan->device->version == IOAT_VER_2_0
450 && ioat_chan->used_desc.prev
451 && ioat_chan->last_completion
452 && ioat_chan->last_completion == ioat_chan->watchdog_completion) {
453
454 if (ioat_chan->pending < ioat_pending_level)
455 ioat2_dma_memcpy_issue_pending(&ioat_chan->common);
456 else {
457 ioat_dma_reset_channel(ioat_chan);
458 ioat_chan->watchdog_completion = 0;
459 }
460 } else {
461 ioat_chan->last_compl_desc_addr_hw = 0;
462 ioat_chan->watchdog_completion
463 = ioat_chan->last_completion;
464 }
465
466 ioat_chan->watchdog_last_tcp_cookie =
467 ioat_chan->watchdog_tcp_cookie;
468 }
469
470 schedule_delayed_work(&device->work, WATCHDOG_DELAY);
471}
472
473static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx)
474{
475 struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
476 struct ioat_desc_sw *first = tx_to_ioat_desc(tx);
477 struct ioat_desc_sw *prev, *new;
478 struct ioat_dma_descriptor *hw;
479 dma_cookie_t cookie;
480 LIST_HEAD(new_chain);
481 u32 copy;
482 size_t len;
483 dma_addr_t src, dst;
484 unsigned long orig_flags;
485 unsigned int desc_count = 0;
486
487 /* src and dest and len are stored in the initial descriptor */
488 len = first->len;
489 src = first->src;
490 dst = first->dst;
491 orig_flags = first->async_tx.flags;
492 new = first;
493
494 spin_lock_bh(&ioat_chan->desc_lock);
495 prev = to_ioat_desc(ioat_chan->used_desc.prev);
496 prefetch(prev->hw);
497 do {
498 copy = min_t(size_t, len, ioat_chan->xfercap);
499
500 async_tx_ack(&new->async_tx);
501
502 hw = new->hw;
503 hw->size = copy;
504 hw->ctl = 0;
505 hw->src_addr = src;
506 hw->dst_addr = dst;
507 hw->next = 0;
508
509 /* chain together the physical address list for the HW */
510 wmb();
511 prev->hw->next = (u64) new->async_tx.phys;
512
513 len -= copy;
514 dst += copy;
515 src += copy;
516
517 list_add_tail(&new->node, &new_chain);
518 desc_count++;
519 prev = new;
520 } while (len && (new = ioat1_dma_get_next_descriptor(ioat_chan)));
521
522 if (!new) {
523 dev_err(&ioat_chan->device->pdev->dev,
524 "tx submit failed\n");
525 spin_unlock_bh(&ioat_chan->desc_lock);
526 return -ENOMEM;
527 }
528
529 hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
530 if (first->async_tx.callback) {
531 hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN;
532 if (first != new) {
533 /* move callback into the last desc */
534 new->async_tx.callback = first->async_tx.callback;
535 new->async_tx.callback_param
536 = first->async_tx.callback_param;
537 first->async_tx.callback = NULL;
538 first->async_tx.callback_param = NULL;
539 }
540 }
541
542 new->tx_cnt = desc_count;
543 new->async_tx.flags = orig_flags; /* client is in control of this ack */
544
545 /* store the original values for use in later cleanup */
546 if (new != first) {
547 new->src = first->src;
548 new->dst = first->dst;
549 new->len = first->len;
550 }
551
552 /* cookie incr and addition to used_list must be atomic */
553 cookie = ioat_chan->common.cookie;
554 cookie++;
555 if (cookie < 0)
556 cookie = 1;
557 ioat_chan->common.cookie = new->async_tx.cookie = cookie;
558
559 /* write address into NextDescriptor field of last desc in chain */
560 to_ioat_desc(ioat_chan->used_desc.prev)->hw->next =
561 first->async_tx.phys;
562 list_splice_tail(&new_chain, &ioat_chan->used_desc);
563
564 ioat_chan->dmacount += desc_count;
565 ioat_chan->pending += desc_count;
566 if (ioat_chan->pending >= ioat_pending_level)
567 __ioat1_dma_memcpy_issue_pending(ioat_chan);
568 spin_unlock_bh(&ioat_chan->desc_lock);
569
570 return cookie;
571}
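A transfer longer than the channel's xfercap is split across several hardware descriptors, and only the last descriptor in the chain carries the cookie and callback. A rough stand-alone C model of that chaining loop, with made-up sizes and addresses, might look like this:

#include <stdio.h>

/* Rough model of the splitting loop in ioat1_tx_submit(): each pass
 * covers min(remaining, xfercap) bytes. All values are illustrative.
 */
int main(void)
{
	unsigned long xfercap = 1UL << 20;		/* assume a 1 MiB per-descriptor cap */
	unsigned long len = 5UL * 1024 * 1024 + 123;	/* pretend 5 MiB + change to copy */
	unsigned long src = 0x10000000, dst = 0x20000000;
	unsigned int descs = 0;

	while (len) {
		unsigned long copy = len < xfercap ? len : xfercap;

		printf("desc %u: copy %lu bytes from %#lx to %#lx\n",
		       descs, copy, src, dst);
		len -= copy;
		src += copy;
		dst += copy;
		descs++;
	}
	printf("total descriptors: %u (only the last carries cookie/callback)\n",
	       descs);
	return 0;
}

With these sample numbers the loop emits six descriptors: five full-cap copies plus one short tail, which matches the cleanup code's assumption that intermediate descriptors have no cookie.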
572
573static dma_cookie_t ioat2_tx_submit(struct dma_async_tx_descriptor *tx)
574{
575 struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
576 struct ioat_desc_sw *first = tx_to_ioat_desc(tx);
577 struct ioat_desc_sw *new;
578 struct ioat_dma_descriptor *hw;
579 dma_cookie_t cookie;
580 u32 copy;
581 size_t len;
582 dma_addr_t src, dst;
583 unsigned long orig_flags;
584 unsigned int desc_count = 0;
585
586 /* src and dest and len are stored in the initial descriptor */
587 len = first->len;
588 src = first->src;
589 dst = first->dst;
590 orig_flags = first->async_tx.flags;
591 new = first;
592
593 /*
594 * ioat_chan->desc_lock is still held in the version 2 path;
595 * it gets unlocked at the end of this function
596 */
597 do {
598 copy = min_t(size_t, len, ioat_chan->xfercap);
599
600 async_tx_ack(&new->async_tx);
601
602 hw = new->hw;
603 hw->size = copy;
604 hw->ctl = 0;
605 hw->src_addr = src;
606 hw->dst_addr = dst;
607
608 len -= copy;
609 dst += copy;
610 src += copy;
611 desc_count++;
612 } while (len && (new = ioat2_dma_get_next_descriptor(ioat_chan)));
613
614 if (!new) {
615 dev_err(&ioat_chan->device->pdev->dev,
616 "tx submit failed\n");
617 spin_unlock_bh(&ioat_chan->desc_lock);
618 return -ENOMEM;
619 }
620
621 hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
622 if (first->async_tx.callback) {
623 hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN;
624 if (first != new) {
625 /* move callback into the last desc */
626 new->async_tx.callback = first->async_tx.callback;
627 new->async_tx.callback_param
628 = first->async_tx.callback_param;
629 first->async_tx.callback = NULL;
630 first->async_tx.callback_param = NULL;
631 }
632 }
633
634 new->tx_cnt = desc_count;
635 new->async_tx.flags = orig_flags; /* client is in control of this ack */
636
637 /* store the original values for use in later cleanup */
638 if (new != first) {
639 new->src = first->src;
640 new->dst = first->dst;
641 new->len = first->len;
642 }
643
644 /* cookie incr and addition to used_list must be atomic */
645 cookie = ioat_chan->common.cookie;
646 cookie++;
647 if (cookie < 0)
648 cookie = 1;
649 ioat_chan->common.cookie = new->async_tx.cookie = cookie;
650
651 ioat_chan->dmacount += desc_count;
652 ioat_chan->pending += desc_count;
653 if (ioat_chan->pending >= ioat_pending_level)
654 __ioat2_dma_memcpy_issue_pending(ioat_chan);
655 spin_unlock_bh(&ioat_chan->desc_lock);
656
657 return cookie;
658}
659
660/**
661 * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair
662 * @ioat_chan: the channel supplying the memory pool for the descriptors
663 * @flags: allocation flags
664 */
665static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
666 struct ioat_dma_chan *ioat_chan,
667 gfp_t flags)
668{
669 struct ioat_dma_descriptor *desc;
670 struct ioat_desc_sw *desc_sw;
671 struct ioatdma_device *ioatdma_device;
672 dma_addr_t phys;
673
674 ioatdma_device = to_ioatdma_device(ioat_chan->common.device);
675 desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys);
676 if (unlikely(!desc))
677 return NULL;
678
679 desc_sw = kzalloc(sizeof(*desc_sw), flags);
680 if (unlikely(!desc_sw)) {
681 pci_pool_free(ioatdma_device->dma_pool, desc, phys);
682 return NULL;
683 }
684
685 memset(desc, 0, sizeof(*desc));
686 dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common);
687 switch (ioat_chan->device->version) {
688 case IOAT_VER_1_2:
689 desc_sw->async_tx.tx_submit = ioat1_tx_submit;
690 break;
691 case IOAT_VER_2_0:
692 case IOAT_VER_3_0:
693 desc_sw->async_tx.tx_submit = ioat2_tx_submit;
694 break;
695 }
696
697 desc_sw->hw = desc;
698 desc_sw->async_tx.phys = phys;
699
700 return desc_sw;
701}
702
703static int ioat_initial_desc_count = 256;
704module_param(ioat_initial_desc_count, int, 0644);
705MODULE_PARM_DESC(ioat_initial_desc_count,
706 "initial descriptors per channel (default: 256)");
707
708/**
709 * ioat2_dma_massage_chan_desc - link the descriptors into a circle
710 * @ioat_chan: the channel to be massaged
711 */
712static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan)
713{
714 struct ioat_desc_sw *desc, *_desc;
715
716 /* setup used_desc */
717 ioat_chan->used_desc.next = ioat_chan->free_desc.next;
718 ioat_chan->used_desc.prev = NULL;
719
720 /* pull free_desc out of the circle so that every node is a hw
721 * descriptor, but leave it pointing to the list
722 */
723 ioat_chan->free_desc.prev->next = ioat_chan->free_desc.next;
724 ioat_chan->free_desc.next->prev = ioat_chan->free_desc.prev;
725
726 /* circle link the hw descriptors */
727 desc = to_ioat_desc(ioat_chan->free_desc.next);
728 desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys;
729 list_for_each_entry_safe(desc, _desc, ioat_chan->free_desc.next, node) {
730 desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys;
731 }
732}
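The massage step points every hardware descriptor's next field at the physical address of the following descriptor and wraps the last back to the first, forming the ring the CB2 engine walks. A toy stand-alone C model of that circle linking, with fabricated addresses standing in for DMA addresses, might look like this:

#include <stdio.h>

/* Toy model of ioat2_dma_massage_chan_desc(): circle-link a small array
 * of descriptors through their hardware next pointers. Addresses are
 * fabricated for illustration.
 */
struct toy_desc {
	unsigned long phys;	/* pretend DMA address of this descriptor */
	unsigned long next;	/* hardware next-descriptor pointer */
};

int main(void)
{
	struct toy_desc ring[4];
	int i, n = 4;

	for (i = 0; i < n; i++)
		ring[i].phys = 0x1000 + 0x40 * i;
	for (i = 0; i < n; i++)
		ring[i].next = ring[(i + 1) % n].phys;	/* wrap last to first */
	for (i = 0; i < n; i++)
		printf("desc %d @ %#lx -> next %#lx\n",
		       i, ring[i].phys, ring[i].next);
	return 0;
}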
733
734/**
735 * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors
736 * @chan: the channel to be filled out
737 */
738static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
739{
740 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
741 struct ioat_desc_sw *desc;
742 u16 chanctrl;
743 u32 chanerr;
744 int i;
745 LIST_HEAD(tmp_list);
746
747 /* have we already been set up? */
748 if (!list_empty(&ioat_chan->free_desc))
749 return ioat_chan->desccount;
750
751 /* Setup register to interrupt and write completion status on error */
752 chanctrl = IOAT_CHANCTRL_ERR_INT_EN |
753 IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
754 IOAT_CHANCTRL_ERR_COMPLETION_EN;
755 writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
756
757 chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
758 if (chanerr) {
759 dev_err(&ioat_chan->device->pdev->dev,
760 "CHANERR = %x, clearing\n", chanerr);
761 writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
762 }
763
764 /* Allocate descriptors */
765 for (i = 0; i < ioat_initial_desc_count; i++) {
766 desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
767 if (!desc) {
768 dev_err(&ioat_chan->device->pdev->dev,
769 "Only %d initial descriptors\n", i);
770 break;
771 }
772 list_add_tail(&desc->node, &tmp_list);
773 }
774 spin_lock_bh(&ioat_chan->desc_lock);
775 ioat_chan->desccount = i;
776 list_splice(&tmp_list, &ioat_chan->free_desc);
777 if (ioat_chan->device->version != IOAT_VER_1_2)
778 ioat2_dma_massage_chan_desc(ioat_chan);
779 spin_unlock_bh(&ioat_chan->desc_lock);
780
781 /* allocate a completion writeback area */
782 /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
783 ioat_chan->completion_virt =
784 pci_pool_alloc(ioat_chan->device->completion_pool,
785 GFP_KERNEL,
786 &ioat_chan->completion_addr);
787 memset(ioat_chan->completion_virt, 0,
788 sizeof(*ioat_chan->completion_virt));
789 writel(((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF,
790 ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
791 writel(((u64) ioat_chan->completion_addr) >> 32,
792 ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
793
794 tasklet_enable(&ioat_chan->cleanup_task);
795 ioat_dma_start_null_desc(ioat_chan); /* give chain to dma device */
796 return ioat_chan->desccount;
797}
798
799/**
800 * ioat_dma_free_chan_resources - release all the descriptors
801 * @chan: the channel to be cleaned
802 */
803static void ioat_dma_free_chan_resources(struct dma_chan *chan)
804{
805 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
806 struct ioatdma_device *ioatdma_device = to_ioatdma_device(chan->device);
807 struct ioat_desc_sw *desc, *_desc;
808 int in_use_descs = 0;
809
810 /* Before freeing channel resources first check
811 * if they have been previously allocated for this channel.
812 */
813 if (ioat_chan->desccount == 0)
814 return;
815
816 tasklet_disable(&ioat_chan->cleanup_task);
817 ioat_dma_memcpy_cleanup(ioat_chan);
818
819 /* Delay 100ms after reset to allow internal DMA logic to quiesce
820 * before removing DMA descriptor resources.
821 */
822 writeb(IOAT_CHANCMD_RESET,
823 ioat_chan->reg_base
824 + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
825 mdelay(100);
826
827 spin_lock_bh(&ioat_chan->desc_lock);
828 switch (ioat_chan->device->version) {
829 case IOAT_VER_1_2:
830 list_for_each_entry_safe(desc, _desc,
831 &ioat_chan->used_desc, node) {
832 in_use_descs++;
833 list_del(&desc->node);
834 pci_pool_free(ioatdma_device->dma_pool, desc->hw,
835 desc->async_tx.phys);
836 kfree(desc);
837 }
838 list_for_each_entry_safe(desc, _desc,
839 &ioat_chan->free_desc, node) {
840 list_del(&desc->node);
841 pci_pool_free(ioatdma_device->dma_pool, desc->hw,
842 desc->async_tx.phys);
843 kfree(desc);
844 }
845 break;
846 case IOAT_VER_2_0:
847 case IOAT_VER_3_0:
848 list_for_each_entry_safe(desc, _desc,
849 ioat_chan->free_desc.next, node) {
850 list_del(&desc->node);
851 pci_pool_free(ioatdma_device->dma_pool, desc->hw,
852 desc->async_tx.phys);
853 kfree(desc);
854 }
855 desc = to_ioat_desc(ioat_chan->free_desc.next);
856 pci_pool_free(ioatdma_device->dma_pool, desc->hw,
857 desc->async_tx.phys);
858 kfree(desc);
859 INIT_LIST_HEAD(&ioat_chan->free_desc);
860 INIT_LIST_HEAD(&ioat_chan->used_desc);
861 break;
862 }
863 spin_unlock_bh(&ioat_chan->desc_lock);
864
865 pci_pool_free(ioatdma_device->completion_pool,
866 ioat_chan->completion_virt,
867 ioat_chan->completion_addr);
868
869 /* one is ok since we left it on there on purpose */
870 if (in_use_descs > 1)
871 dev_err(&ioat_chan->device->pdev->dev,
872 "Freeing %d in use descriptors!\n",
873 in_use_descs - 1);
874
875 ioat_chan->last_completion = ioat_chan->completion_addr = 0;
876 ioat_chan->pending = 0;
877 ioat_chan->dmacount = 0;
878 ioat_chan->desccount = 0;
879 ioat_chan->watchdog_completion = 0;
880 ioat_chan->last_compl_desc_addr_hw = 0;
881 ioat_chan->watchdog_tcp_cookie =
882 ioat_chan->watchdog_last_tcp_cookie = 0;
883}
884
885/**
886 * ioat1_dma_get_next_descriptor - return the next available descriptor
887 * @ioat_chan: IOAT DMA channel handle
888 *
889 * Gets the next descriptor from the chain, and must be called with the
890 * channel's desc_lock held. Allocates more descriptors if the channel
891 * has run out.
892 */
893static struct ioat_desc_sw *
894ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan)
895{
896 struct ioat_desc_sw *new;
897
898 if (!list_empty(&ioat_chan->free_desc)) {
899 new = to_ioat_desc(ioat_chan->free_desc.next);
900 list_del(&new->node);
901 } else {
902 /* try to get another desc */
903 new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
904 if (!new) {
905 dev_err(&ioat_chan->device->pdev->dev,
906 "alloc failed\n");
907 return NULL;
908 }
909 }
910
911 prefetch(new->hw);
912 return new;
913}
914
915static struct ioat_desc_sw *
916ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan)
917{
918 struct ioat_desc_sw *new;
919
920 /*
921 * used.prev points to where to start processing
922 * used.next points to next free descriptor
923 * if used.prev == NULL, there are none waiting to be processed
924 * if used.next == used.prev.prev, there is only one free descriptor,
925 * and we need to use it as a noop descriptor before
926 * linking in a new set of descriptors, since the device
927 * has probably already read the pointer to it
928 */
929 if (ioat_chan->used_desc.prev &&
930 ioat_chan->used_desc.next == ioat_chan->used_desc.prev->prev) {
931
932 struct ioat_desc_sw *desc;
933 struct ioat_desc_sw *noop_desc;
934 int i;
935
936 /* set up the noop descriptor */
937 noop_desc = to_ioat_desc(ioat_chan->used_desc.next);
938 /* set size to non-zero value (channel returns error when size is 0) */
939 noop_desc->hw->size = NULL_DESC_BUFFER_SIZE;
940 noop_desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
941 noop_desc->hw->src_addr = 0;
942 noop_desc->hw->dst_addr = 0;
943
944 ioat_chan->used_desc.next = ioat_chan->used_desc.next->next;
945 ioat_chan->pending++;
946 ioat_chan->dmacount++;
947
948 /* try to get a few more descriptors */
949 for (i = 16; i; i--) {
950 desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
951 if (!desc) {
952 dev_err(&ioat_chan->device->pdev->dev,
953 "alloc failed\n");
954 break;
955 }
956 list_add_tail(&desc->node, ioat_chan->used_desc.next);
957
958 desc->hw->next
959 = to_ioat_desc(desc->node.next)->async_tx.phys;
960 to_ioat_desc(desc->node.prev)->hw->next
961 = desc->async_tx.phys;
962 ioat_chan->desccount++;
963 }
964
965 ioat_chan->used_desc.next = noop_desc->node.next;
966 }
967 new = to_ioat_desc(ioat_chan->used_desc.next);
968 prefetch(new);
969 ioat_chan->used_desc.next = new->node.next;
970
971 if (ioat_chan->used_desc.prev == NULL)
972 ioat_chan->used_desc.prev = &new->node;
973
974 prefetch(new->hw);
975 return new;
976}
977
978static struct ioat_desc_sw *ioat_dma_get_next_descriptor(
979 struct ioat_dma_chan *ioat_chan)
980{
981 if (!ioat_chan)
982 return NULL;
983
984 switch (ioat_chan->device->version) {
985 case IOAT_VER_1_2:
986 return ioat1_dma_get_next_descriptor(ioat_chan);
987 case IOAT_VER_2_0:
988 case IOAT_VER_3_0:
989 return ioat2_dma_get_next_descriptor(ioat_chan);
990 }
991 return NULL;
992}
993
994static struct dma_async_tx_descriptor *ioat1_dma_prep_memcpy(
995 struct dma_chan *chan,
996 dma_addr_t dma_dest,
997 dma_addr_t dma_src,
998 size_t len,
999 unsigned long flags)
1000{
1001 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
1002 struct ioat_desc_sw *new;
1003
1004 spin_lock_bh(&ioat_chan->desc_lock);
1005 new = ioat_dma_get_next_descriptor(ioat_chan);
1006 spin_unlock_bh(&ioat_chan->desc_lock);
1007
1008 if (new) {
1009 new->len = len;
1010 new->dst = dma_dest;
1011 new->src = dma_src;
1012 new->async_tx.flags = flags;
1013 return &new->async_tx;
1014 } else {
1015 dev_err(&ioat_chan->device->pdev->dev,
1016 "chan%d - get_next_desc failed: %d descs waiting, %d total desc\n",
1017 chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
1018 return NULL;
1019 }
1020}
1021
1022static struct dma_async_tx_descriptor *ioat2_dma_prep_memcpy(
1023 struct dma_chan *chan,
1024 dma_addr_t dma_dest,
1025 dma_addr_t dma_src,
1026 size_t len,
1027 unsigned long flags)
1028{
1029 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
1030 struct ioat_desc_sw *new;
1031
1032 spin_lock_bh(&ioat_chan->desc_lock);
1033 new = ioat2_dma_get_next_descriptor(ioat_chan);
1034
1035 /*
1036 * leave ioat_chan->desc_lock held in the ioat 2 path;
1037 * it will get unlocked at the end of tx_submit
1038 */
1039
1040 if (new) {
1041 new->len = len;
1042 new->dst = dma_dest;
1043 new->src = dma_src;
1044 new->async_tx.flags = flags;
1045 return &new->async_tx;
1046 } else {
1047 spin_unlock_bh(&ioat_chan->desc_lock);
1048 dev_err(&ioat_chan->device->pdev->dev,
1049 "chan%d - get_next_desc failed: %d descs waiting, %d total desc\n",
1050 chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
1051 return NULL;
1052 }
1053}
1054
1055static void ioat_dma_cleanup_tasklet(unsigned long data)
1056{
1057 struct ioat_dma_chan *chan = (void *)data;
1058 ioat_dma_memcpy_cleanup(chan);
1059 writew(IOAT_CHANCTRL_INT_DISABLE,
1060 chan->reg_base + IOAT_CHANCTRL_OFFSET);
1061}
1062
1063static void
1064ioat_dma_unmap(struct ioat_dma_chan *ioat_chan, struct ioat_desc_sw *desc)
1065{
1066 if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
1067 if (desc->async_tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE)
1068 pci_unmap_single(ioat_chan->device->pdev,
1069 pci_unmap_addr(desc, dst),
1070 pci_unmap_len(desc, len),
1071 PCI_DMA_FROMDEVICE);
1072 else
1073 pci_unmap_page(ioat_chan->device->pdev,
1074 pci_unmap_addr(desc, dst),
1075 pci_unmap_len(desc, len),
1076 PCI_DMA_FROMDEVICE);
1077 }
1078
1079 if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
1080 if (desc->async_tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE)
1081 pci_unmap_single(ioat_chan->device->pdev,
1082 pci_unmap_addr(desc, src),
1083 pci_unmap_len(desc, len),
1084 PCI_DMA_TODEVICE);
1085 else
1086 pci_unmap_page(ioat_chan->device->pdev,
1087 pci_unmap_addr(desc, src),
1088 pci_unmap_len(desc, len),
1089 PCI_DMA_TODEVICE);
1090 }
1091}
1092
1093/**
1094 * ioat_dma_memcpy_cleanup - clean up finished descriptors
1095 * @chan: ioat channel to be cleaned up
1096 */
1097static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan)
1098{
1099 unsigned long phys_complete;
1100 struct ioat_desc_sw *desc, *_desc;
1101 dma_cookie_t cookie = 0;
1102 unsigned long desc_phys;
1103 struct ioat_desc_sw *latest_desc;
1104
1105 prefetch(ioat_chan->completion_virt);
1106
1107 if (!spin_trylock_bh(&ioat_chan->cleanup_lock))
1108 return;
1109
1110 /* The completion writeback can happen at any time,
1111 so reads by the driver need to be atomic operations.
1112 The descriptor physical addresses are limited to 32 bits
1113 when the CPU can only do a 32-bit mov. */
1114
1115#if (BITS_PER_LONG == 64)
1116 phys_complete =
1117 ioat_chan->completion_virt->full
1118 & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
1119#else
1120 phys_complete =
1121 ioat_chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK;
1122#endif
1123
1124 if ((ioat_chan->completion_virt->full
1125 & IOAT_CHANSTS_DMA_TRANSFER_STATUS) ==
1126 IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) {
1127 dev_err(&ioat_chan->device->pdev->dev,
1128 "Channel halted, chanerr = %x\n",
1129 readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET));
1130
1131 /* TODO do something to salvage the situation */
1132 }
1133
1134 if (phys_complete == ioat_chan->last_completion) {
1135 spin_unlock_bh(&ioat_chan->cleanup_lock);
1136 /*
1137 * perhaps we're stuck so hard that the watchdog can't go off?
1138 * try to catch it after 2 seconds
1139 */
1140 if (ioat_chan->device->version != IOAT_VER_3_0) {
1141 if (time_after(jiffies,
1142 ioat_chan->last_completion_time + HZ*WATCHDOG_DELAY)) {
1143 ioat_dma_chan_watchdog(&(ioat_chan->device->work.work));
1144 ioat_chan->last_completion_time = jiffies;
1145 }
1146 }
1147 return;
1148 }
1149 ioat_chan->last_completion_time = jiffies;
1150
1151 cookie = 0;
1152 if (!spin_trylock_bh(&ioat_chan->desc_lock)) {
1153 spin_unlock_bh(&ioat_chan->cleanup_lock);
1154 return;
1155 }
1156
1157 switch (ioat_chan->device->version) {
1158 case IOAT_VER_1_2:
1159 list_for_each_entry_safe(desc, _desc,
1160 &ioat_chan->used_desc, node) {
1161
1162 /*
1163 * Incoming DMA requests may use multiple descriptors,
1164 * due to exceeding xfercap, perhaps. If so, only the
1165 * last one will have a cookie, and require unmapping.
1166 */
1167 if (desc->async_tx.cookie) {
1168 cookie = desc->async_tx.cookie;
1169 ioat_dma_unmap(ioat_chan, desc);
1170 if (desc->async_tx.callback) {
1171 desc->async_tx.callback(desc->async_tx.callback_param);
1172 desc->async_tx.callback = NULL;
1173 }
1174 }
1175
1176 if (desc->async_tx.phys != phys_complete) {
1177 /*
1178 * a completed entry, but not the last, so clean
1179 * up if the client is done with the descriptor
1180 */
1181 if (async_tx_test_ack(&desc->async_tx)) {
1182 list_move_tail(&desc->node,
1183 &ioat_chan->free_desc);
1184 } else
1185 desc->async_tx.cookie = 0;
1186 } else {
1187 /*
1188 * last used desc. Do not remove, so we can
1189 * append from it, but don't look at it next
1190 * time, either
1191 */
1192 desc->async_tx.cookie = 0;
1193
1194 /* TODO check status bits? */
1195 break;
1196 }
1197 }
1198 break;
1199 case IOAT_VER_2_0:
1200 case IOAT_VER_3_0:
1201 /* has some other thread already cleaned up? */
1202 if (ioat_chan->used_desc.prev == NULL)
1203 break;
1204
1205 /* work backwards to find latest finished desc */
1206 desc = to_ioat_desc(ioat_chan->used_desc.next);
1207 latest_desc = NULL;
1208 do {
1209 desc = to_ioat_desc(desc->node.prev);
1210 desc_phys = (unsigned long)desc->async_tx.phys
1211 & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
1212 if (desc_phys == phys_complete) {
1213 latest_desc = desc;
1214 break;
1215 }
1216 } while (&desc->node != ioat_chan->used_desc.prev);
1217
1218 if (latest_desc != NULL) {
1219
1220 /* work forwards to clear finished descriptors */
1221 for (desc = to_ioat_desc(ioat_chan->used_desc.prev);
1222 &desc->node != latest_desc->node.next &&
1223 &desc->node != ioat_chan->used_desc.next;
1224 desc = to_ioat_desc(desc->node.next)) {
1225 if (desc->async_tx.cookie) {
1226 cookie = desc->async_tx.cookie;
1227 desc->async_tx.cookie = 0;
1228 ioat_dma_unmap(ioat_chan, desc);
1229 if (desc->async_tx.callback) {
1230 desc->async_tx.callback(desc->async_tx.callback_param);
1231 desc->async_tx.callback = NULL;
1232 }
1233 }
1234 }
1235
1236 /* move used.prev up beyond those that are finished */
1237 if (&desc->node == ioat_chan->used_desc.next)
1238 ioat_chan->used_desc.prev = NULL;
1239 else
1240 ioat_chan->used_desc.prev = &desc->node;
1241 }
1242 break;
1243 }
1244
1245 spin_unlock_bh(&ioat_chan->desc_lock);
1246
1247 ioat_chan->last_completion = phys_complete;
1248 if (cookie != 0)
1249 ioat_chan->completed_cookie = cookie;
1250
1251 spin_unlock_bh(&ioat_chan->cleanup_lock);
1252}
1253
1254/**
1255 * ioat_dma_is_complete - poll the status of an IOAT DMA transaction
1256 * @chan: IOAT DMA channel handle
1257 * @cookie: DMA transaction identifier
1258 * @done: if not %NULL, updated with last completed transaction
1259 * @used: if not %NULL, updated with last used transaction
1260 */
1261static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
1262 dma_cookie_t cookie,
1263 dma_cookie_t *done,
1264 dma_cookie_t *used)
1265{
1266 struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
1267 dma_cookie_t last_used;
1268 dma_cookie_t last_complete;
1269 enum dma_status ret;
1270
1271 last_used = chan->cookie;
1272 last_complete = ioat_chan->completed_cookie;
1273 ioat_chan->watchdog_tcp_cookie = cookie;
1274
1275 if (done)
1276 *done = last_complete;
1277 if (used)
1278 *used = last_used;
1279
1280 ret = dma_async_is_complete(cookie, last_complete, last_used);
1281 if (ret == DMA_SUCCESS)
1282 return ret;
1283
1284 ioat_dma_memcpy_cleanup(ioat_chan);
1285
1286 last_used = chan->cookie;
1287 last_complete = ioat_chan->completed_cookie;
1288
1289 if (done)
1290 *done = last_complete;
1291 if (used)
1292 *used = last_used;
1293
1294 return dma_async_is_complete(cookie, last_complete, last_used);
1295}
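The status check above compares the requested cookie against the last completed and last used cookies on the channel. The dmaengine helper dma_async_is_complete() performs roughly the window test sketched below: a cookie is still in progress only while it lies in the (last_complete, last_used] window, with wraparound handled. This is a simplified stand-alone restatement with illustrative cookie values, not the driver's code:

#include <stdio.h>

enum status { IN_PROGRESS, SUCCESS };

/* Simplified cookie-window test, roughly what dma_async_is_complete()
 * does for the poll above. All values are invented for illustration.
 */
static enum status is_complete(int cookie, int last_complete, int last_used)
{
	if (last_complete <= last_used) {
		if (cookie <= last_complete || cookie > last_used)
			return SUCCESS;
	} else {
		if (cookie <= last_complete && cookie > last_used)
			return SUCCESS;
	}
	return IN_PROGRESS;
}

int main(void)
{
	/* pretend cookies 1..7 were issued and hw has completed up to 5 */
	int last_complete = 5, last_used = 7, cookie;

	for (cookie = 1; cookie <= 7; cookie++)
		printf("cookie %d: %s\n", cookie,
		       is_complete(cookie, last_complete, last_used) == SUCCESS ?
		       "complete" : "in progress");
	return 0;
}

With those sample values, cookies 1 through 5 report complete and 6 and 7 remain in progress, which is why the driver runs cleanup and re-reads the cookies before answering.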
1296
1297static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan)
1298{
1299 struct ioat_desc_sw *desc;
1300
1301 spin_lock_bh(&ioat_chan->desc_lock);
1302
1303 desc = ioat_dma_get_next_descriptor(ioat_chan);
1304
1305 if (!desc) {
1306 dev_err(&ioat_chan->device->pdev->dev,
1307 "Unable to start null desc - get next desc failed\n");
1308 spin_unlock_bh(&ioat_chan->desc_lock);
1309 return;
1310 }
1311
1312 desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL
1313 | IOAT_DMA_DESCRIPTOR_CTL_INT_GN
1314 | IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
1315 /* set size to non-zero value (channel returns error when size is 0) */
1316 desc->hw->size = NULL_DESC_BUFFER_SIZE;
1317 desc->hw->src_addr = 0;
1318 desc->hw->dst_addr = 0;
1319 async_tx_ack(&desc->async_tx);
1320 switch (ioat_chan->device->version) {
1321 case IOAT_VER_1_2:
1322 desc->hw->next = 0;
1323 list_add_tail(&desc->node, &ioat_chan->used_desc);
1324
1325 writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
1326 ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
1327 writel(((u64) desc->async_tx.phys) >> 32,
1328 ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
1329
1330 writeb(IOAT_CHANCMD_START, ioat_chan->reg_base
1331 + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
1332 break;
1333 case IOAT_VER_2_0:
1334 case IOAT_VER_3_0:
1335 writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
1336 ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
1337 writel(((u64) desc->async_tx.phys) >> 32,
1338 ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
1339
1340 ioat_chan->dmacount++;
1341 __ioat2_dma_memcpy_issue_pending(ioat_chan);
1342 break;
1343 }
1344 spin_unlock_bh(&ioat_chan->desc_lock);
1345}
1346
1347/*
1348 * Perform an IOAT transaction to verify the HW works.
1349 */
1350#define IOAT_TEST_SIZE 2000
1351
1352static void ioat_dma_test_callback(void *dma_async_param)
1353{
1354 struct completion *cmp = dma_async_param;
1355
1356 complete(cmp);
1357}
1358
1359/**
1360 * ioat_dma_self_test - Perform an IOAT transaction to verify the HW works.
1361 * @device: device to be tested
1362 */
1363static int ioat_dma_self_test(struct ioatdma_device *device)
1364{
1365 int i;
1366 u8 *src;
1367 u8 *dest;
1368 struct dma_chan *dma_chan;
1369 struct dma_async_tx_descriptor *tx;
1370 dma_addr_t dma_dest, dma_src;
1371 dma_cookie_t cookie;
1372 int err = 0;
1373 struct completion cmp;
1374 unsigned long tmo;
1375 unsigned long flags;
1376
1377 src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
1378 if (!src)
1379 return -ENOMEM;
1380 dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
1381 if (!dest) {
1382 kfree(src);
1383 return -ENOMEM;
1384 }
1385
1386 /* Fill in src buffer */
1387 for (i = 0; i < IOAT_TEST_SIZE; i++)
1388 src[i] = (u8)i;
1389
1390 /* Start copy, using first DMA channel */
1391 dma_chan = container_of(device->common.channels.next,
1392 struct dma_chan,
1393 device_node);
1394 if (device->common.device_alloc_chan_resources(dma_chan) < 1) {
1395 dev_err(&device->pdev->dev,
1396 "selftest cannot allocate chan resource\n");
1397 err = -ENODEV;
1398 goto out;
1399 }
1400
1401 dma_src = dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE,
1402 DMA_TO_DEVICE);
1403 dma_dest = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE,
1404 DMA_FROM_DEVICE);
1405 flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE;
1406 tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
1407 IOAT_TEST_SIZE, flags);
1408 if (!tx) {
1409 dev_err(&device->pdev->dev,
1410 "Self-test prep failed, disabling\n");
1411 err = -ENODEV;
1412 goto free_resources;
1413 }
1414
1415 async_tx_ack(tx);
1416 init_completion(&cmp);
1417 tx->callback = ioat_dma_test_callback;
1418 tx->callback_param = &cmp;
1419 cookie = tx->tx_submit(tx);
1420 if (cookie < 0) {
1421 dev_err(&device->pdev->dev,
1422 "Self-test setup failed, disabling\n");
1423 err = -ENODEV;
1424 goto free_resources;
1425 }
1426 device->common.device_issue_pending(dma_chan);
1427
1428 tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
1429
1430 if (tmo == 0 ||
1431 device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL)
1432 != DMA_SUCCESS) {
1433 dev_err(&device->pdev->dev,
1434 "Self-test copy timed out, disabling\n");
1435 err = -ENODEV;
1436 goto free_resources;
1437 }
1438 if (memcmp(src, dest, IOAT_TEST_SIZE)) {
1439 dev_err(&device->pdev->dev,
1440 "Self-test copy failed compare, disabling\n");
1441 err = -ENODEV;
1442 goto free_resources;
1443 }
1444
1445free_resources:
1446 device->common.device_free_chan_resources(dma_chan);
1447out:
1448 kfree(src);
1449 kfree(dest);
1450 return err;
1451}
1452
1453static char ioat_interrupt_style[32] = "msix";
1454module_param_string(ioat_interrupt_style, ioat_interrupt_style,
1455 sizeof(ioat_interrupt_style), 0644);
1456MODULE_PARM_DESC(ioat_interrupt_style,
1457 "set ioat interrupt style: msix (default), "
1458 "msix-single-vector, msi, intx)");
1459
1460/**
1461 * ioat_dma_setup_interrupts - setup interrupt handler
1462 * @device: ioat device
1463 */
1464static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
1465{
1466 struct ioat_dma_chan *ioat_chan;
1467 int err, i, j, msixcnt;
1468 u8 intrctrl = 0;
1469
1470 if (!strcmp(ioat_interrupt_style, "msix"))
1471 goto msix;
1472 if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
1473 goto msix_single_vector;
1474 if (!strcmp(ioat_interrupt_style, "msi"))
1475 goto msi;
1476 if (!strcmp(ioat_interrupt_style, "intx"))
1477 goto intx;
1478 dev_err(&device->pdev->dev, "invalid ioat_interrupt_style %s\n",
1479 ioat_interrupt_style);
1480 goto err_no_irq;
1481
1482msix:
1483 /* The number of MSI-X vectors should equal the number of channels */
1484 msixcnt = device->common.chancnt;
1485 for (i = 0; i < msixcnt; i++)
1486 device->msix_entries[i].entry = i;
1487
1488 err = pci_enable_msix(device->pdev, device->msix_entries, msixcnt);
1489 if (err < 0)
1490 goto msi;
1491 if (err > 0)
1492 goto msix_single_vector;
1493
1494 for (i = 0; i < msixcnt; i++) {
1495 ioat_chan = ioat_lookup_chan_by_index(device, i);
1496 err = request_irq(device->msix_entries[i].vector,
1497 ioat_dma_do_interrupt_msix,
1498 0, "ioat-msix", ioat_chan);
1499 if (err) {
1500 for (j = 0; j < i; j++) {
1501 ioat_chan =
1502 ioat_lookup_chan_by_index(device, j);
1503 free_irq(device->msix_entries[j].vector,
1504 ioat_chan);
1505 }
1506 goto msix_single_vector;
1507 }
1508 }
1509 intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
1510 device->irq_mode = msix_multi_vector;
1511 goto done;
1512
1513msix_single_vector:
1514 device->msix_entries[0].entry = 0;
1515 err = pci_enable_msix(device->pdev, device->msix_entries, 1);
1516 if (err)
1517 goto msi;
1518
1519 err = request_irq(device->msix_entries[0].vector, ioat_dma_do_interrupt,
1520 0, "ioat-msix", device);
1521 if (err) {
1522 pci_disable_msix(device->pdev);
1523 goto msi;
1524 }
1525 device->irq_mode = msix_single_vector;
1526 goto done;
1527
1528msi:
1529 err = pci_enable_msi(device->pdev);
1530 if (err)
1531 goto intx;
1532
1533 err = request_irq(device->pdev->irq, ioat_dma_do_interrupt,
1534 0, "ioat-msi", device);
1535 if (err) {
1536 pci_disable_msi(device->pdev);
1537 goto intx;
1538 }
1539 /*
1540 * CB 1.2 devices need a bit set in configuration space to enable MSI
1541 */
1542 if (device->version == IOAT_VER_1_2) {
1543 u32 dmactrl;
1544 pci_read_config_dword(device->pdev,
1545 IOAT_PCI_DMACTRL_OFFSET, &dmactrl);
1546 dmactrl |= IOAT_PCI_DMACTRL_MSI_EN;
1547 pci_write_config_dword(device->pdev,
1548 IOAT_PCI_DMACTRL_OFFSET, dmactrl);
1549 }
1550 device->irq_mode = msi;
1551 goto done;
1552
1553intx:
1554 err = request_irq(device->pdev->irq, ioat_dma_do_interrupt,
1555 IRQF_SHARED, "ioat-intx", device);
1556 if (err)
1557 goto err_no_irq;
1558 device->irq_mode = intx;
1559
1560done:
1561 intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN;
1562 writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET);
1563 return 0;
1564
1565err_no_irq:
1566 /* Disable all interrupt generation */
1567 writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
1568 dev_err(&device->pdev->dev, "no usable interrupts\n");
1569 device->irq_mode = none;
1570 return -1;
1571}
1572
1573/**
1574 * ioat_dma_remove_interrupts - remove whatever interrupts were set
1575 * @device: ioat device
1576 */
1577static void ioat_dma_remove_interrupts(struct ioatdma_device *device)
1578{
1579 struct ioat_dma_chan *ioat_chan;
1580 int i;
1581
1582 /* Disable all interrupt generation */
1583 writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
1584
1585 switch (device->irq_mode) {
1586 case msix_multi_vector:
1587 for (i = 0; i < device->common.chancnt; i++) {
1588 ioat_chan = ioat_lookup_chan_by_index(device, i);
1589 free_irq(device->msix_entries[i].vector, ioat_chan);
1590 }
1591 pci_disable_msix(device->pdev);
1592 break;
1593 case msix_single_vector:
1594 free_irq(device->msix_entries[0].vector, device);
1595 pci_disable_msix(device->pdev);
1596 break;
1597 case msi:
1598 free_irq(device->pdev->irq, device);
1599 pci_disable_msi(device->pdev);
1600 break;
1601 case intx:
1602 free_irq(device->pdev->irq, device);
1603 break;
1604 case none:
1605 dev_warn(&device->pdev->dev,
1606 "call to %s without interrupts setup\n", __func__);
1607 }
1608 device->irq_mode = none;
1609}
1610
1611struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
1612 void __iomem *iobase)
1613{
1614 int err;
1615 struct ioatdma_device *device;
1616
1617 device = kzalloc(sizeof(*device), GFP_KERNEL);
1618 if (!device) {
1619 err = -ENOMEM;
1620 goto err_kzalloc;
1621 }
1622 device->pdev = pdev;
1623 device->reg_base = iobase;
1624 device->version = readb(device->reg_base + IOAT_VER_OFFSET);
1625
1626 /* DMA coherent memory pool for DMA descriptor allocations */
1627 device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
1628 sizeof(struct ioat_dma_descriptor),
1629 64, 0);
1630 if (!device->dma_pool) {
1631 err = -ENOMEM;
1632 goto err_dma_pool;
1633 }
1634
1635 device->completion_pool = pci_pool_create("completion_pool", pdev,
1636 sizeof(u64), SMP_CACHE_BYTES,
1637 SMP_CACHE_BYTES);
1638 if (!device->completion_pool) {
1639 err = -ENOMEM;
1640 goto err_completion_pool;
1641 }
1642
1643 INIT_LIST_HEAD(&device->common.channels);
1644 ioat_dma_enumerate_channels(device);
1645
1646 device->common.device_alloc_chan_resources =
1647 ioat_dma_alloc_chan_resources;
1648 device->common.device_free_chan_resources =
1649 ioat_dma_free_chan_resources;
1650 device->common.dev = &pdev->dev;
1651
1652 dma_cap_set(DMA_MEMCPY, device->common.cap_mask);
1653 device->common.device_is_tx_complete = ioat_dma_is_complete;
1654 switch (device->version) {
1655 case IOAT_VER_1_2:
1656 device->common.device_prep_dma_memcpy = ioat1_dma_prep_memcpy;
1657 device->common.device_issue_pending =
1658 ioat1_dma_memcpy_issue_pending;
1659 break;
1660 case IOAT_VER_2_0:
1661 case IOAT_VER_3_0:
1662 device->common.device_prep_dma_memcpy = ioat2_dma_prep_memcpy;
1663 device->common.device_issue_pending =
1664 ioat2_dma_memcpy_issue_pending;
1665 break;
1666 }
1667
1668 dev_err(&device->pdev->dev,
1669 "Intel(R) I/OAT DMA Engine found,"
1670 " %d channels, device version 0x%02x, driver version %s\n",
1671 device->common.chancnt, device->version, IOAT_DMA_VERSION);
1672
1673 if (!device->common.chancnt) {
1674 dev_err(&device->pdev->dev,
1675 "Intel(R) I/OAT DMA Engine problem found: "
1676 "zero channels detected\n");
1677 goto err_setup_interrupts;
1678 }
1679
1680 err = ioat_dma_setup_interrupts(device);
1681 if (err)
1682 goto err_setup_interrupts;
1683
1684 err = ioat_dma_self_test(device);
1685 if (err)
1686 goto err_self_test;
1687
1688 ioat_set_tcp_copy_break(device);
1689
1690 dma_async_device_register(&device->common);
1691
1692 if (device->version != IOAT_VER_3_0) {
1693 INIT_DELAYED_WORK(&device->work, ioat_dma_chan_watchdog);
1694 schedule_delayed_work(&device->work,
1695 WATCHDOG_DELAY);
1696 }
1697
1698 return device;
1699
1700err_self_test:
1701 ioat_dma_remove_interrupts(device);
1702err_setup_interrupts:
1703 pci_pool_destroy(device->completion_pool);
1704err_completion_pool:
1705 pci_pool_destroy(device->dma_pool);
1706err_dma_pool:
1707 kfree(device);
1708err_kzalloc:
1709 dev_err(&pdev->dev,
1710 "Intel(R) I/OAT DMA Engine initialization failed\n");
1711 return NULL;
1712}
1713
1714void ioat_dma_remove(struct ioatdma_device *device)
1715{
1716 struct dma_chan *chan, *_chan;
1717 struct ioat_dma_chan *ioat_chan;
1718
1719 if (device->version != IOAT_VER_3_0)
1720 cancel_delayed_work(&device->work);
1721
1722 ioat_dma_remove_interrupts(device);
1723
1724 dma_async_device_unregister(&device->common);
1725
1726 pci_pool_destroy(device->dma_pool);
1727 pci_pool_destroy(device->completion_pool);
1728
1729 iounmap(device->reg_base);
1730 pci_release_regions(device->pdev);
1731 pci_disable_device(device->pdev);
1732
1733 list_for_each_entry_safe(chan, _chan,
1734 &device->common.channels, device_node) {
1735 ioat_chan = to_ioat_chan(chan);
1736 list_del(&chan->device_node);
1737 kfree(ioat_chan);
1738 }
1739 kfree(device);
1740}
1741
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
deleted file mode 100644
index a52ff4bd4601..000000000000
--- a/drivers/dma/ioatdma.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59
16 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * The full GNU General Public License is included in this distribution in the
19 * file called COPYING.
20 */
21#ifndef IOATDMA_H
22#define IOATDMA_H
23
24#include <linux/dmaengine.h>
25#include "ioatdma_hw.h"
26#include <linux/init.h>
27#include <linux/dmapool.h>
28#include <linux/cache.h>
29#include <linux/pci_ids.h>
30#include <net/tcp.h>
31
32#define IOAT_DMA_VERSION "3.64"
33
34enum ioat_interrupt {
35 none = 0,
36 msix_multi_vector = 1,
37 msix_single_vector = 2,
38 msi = 3,
39 intx = 4,
40};
41
42#define IOAT_LOW_COMPLETION_MASK 0xffffffc0
43#define IOAT_DMA_DCA_ANY_CPU ~0
44#define IOAT_WATCHDOG_PERIOD (2 * HZ)
45
46
47/**
48 * struct ioatdma_device - internal representation of an IOAT device
49 * @pdev: PCI-Express device
50 * @reg_base: MMIO register space base address
51 * @dma_pool: for allocating DMA descriptors
52 * @common: embedded struct dma_device
53 * @version: version of ioatdma device
54 * @irq_mode: which style irq to use
55 * @msix_entries: irq handlers
56 * @idx: per channel data
57 */
58
59struct ioatdma_device {
60 struct pci_dev *pdev;
61 void __iomem *reg_base;
62 struct pci_pool *dma_pool;
63 struct pci_pool *completion_pool;
64 struct dma_device common;
65 u8 version;
66 enum ioat_interrupt irq_mode;
67 struct delayed_work work;
68 struct msix_entry msix_entries[4];
69 struct ioat_dma_chan *idx[4];
70};
71
72/**
73 * struct ioat_dma_chan - internal representation of a DMA channel
74 */
75struct ioat_dma_chan {
76
77 void __iomem *reg_base;
78
79 dma_cookie_t completed_cookie;
80 unsigned long last_completion;
81 unsigned long last_completion_time;
82
83 size_t xfercap; /* XFERCAP register value expanded out */
84
85 spinlock_t cleanup_lock;
86 spinlock_t desc_lock;
87 struct list_head free_desc;
88 struct list_head used_desc;
89 unsigned long watchdog_completion;
90 int watchdog_tcp_cookie;
91 u32 watchdog_last_tcp_cookie;
92 struct delayed_work work;
93
94 int pending;
95 int dmacount;
96 int desccount;
97
98 struct ioatdma_device *device;
99 struct dma_chan common;
100
101 dma_addr_t completion_addr;
102 union {
103 u64 full; /* HW completion writeback */
104 struct {
105 u32 low;
106 u32 high;
107 };
108 } *completion_virt;
109 unsigned long last_compl_desc_addr_hw;
110 struct tasklet_struct cleanup_task;
111};
112
113/* wrapper around hardware descriptor format + additional software fields */
114
115/**
116 * struct ioat_desc_sw - wrapper around hardware descriptor
117 * @hw: hardware DMA descriptor
118 * @node: this descriptor will either be on the free list,
119 * or attached to a transaction list (async_tx.tx_list)
120 * @tx_cnt: number of descriptors required to complete the transaction
121 * @async_tx: the generic software descriptor for all engines
122 */
123struct ioat_desc_sw {
124 struct ioat_dma_descriptor *hw;
125 struct list_head node;
126 int tx_cnt;
127 size_t len;
128 dma_addr_t src;
129 dma_addr_t dst;
130 struct dma_async_tx_descriptor async_tx;
131};
132
133static inline void ioat_set_tcp_copy_break(struct ioatdma_device *dev)
134{
135 #ifdef CONFIG_NET_DMA
136 switch (dev->version) {
137 case IOAT_VER_1_2:
138 sysctl_tcp_dma_copybreak = 4096;
139 break;
140 case IOAT_VER_2_0:
141 sysctl_tcp_dma_copybreak = 2048;
142 break;
143 case IOAT_VER_3_0:
144 sysctl_tcp_dma_copybreak = 262144;
145 break;
146 }
147 #endif
148}
149
150#if defined(CONFIG_INTEL_IOATDMA) || defined(CONFIG_INTEL_IOATDMA_MODULE)
151struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
152 void __iomem *iobase);
153void ioat_dma_remove(struct ioatdma_device *device);
154struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase);
155struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
156struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
157#else
158#define ioat_dma_probe(pdev, iobase) NULL
159#define ioat_dma_remove(device) do { } while (0)
160#define ioat_dca_init(pdev, iobase) NULL
161#define ioat2_dca_init(pdev, iobase) NULL
162#define ioat3_dca_init(pdev, iobase) NULL
163#endif
164
165#endif /* IOATDMA_H */
diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h
deleted file mode 100644
index afa57eef86c9..000000000000
--- a/drivers/dma/ioatdma_hw.h
+++ /dev/null
@@ -1,70 +0,0 @@
1/*
2 * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59
16 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * The full GNU General Public License is included in this distribution in the
19 * file called COPYING.
20 */
21#ifndef _IOAT_HW_H_
22#define _IOAT_HW_H_
23
24/* PCI Configuration Space Values */
25#define IOAT_PCI_VID 0x8086
26
27/* CB device ID's */
28#define IOAT_PCI_DID_5000 0x1A38
29#define IOAT_PCI_DID_CNB 0x360B
30#define IOAT_PCI_DID_SCNB 0x65FF
31#define IOAT_PCI_DID_SNB 0x402F
32
33#define IOAT_PCI_RID 0x00
34#define IOAT_PCI_SVID 0x8086
35#define IOAT_PCI_SID 0x8086
36#define IOAT_VER_1_2 0x12 /* Version 1.2 */
37#define IOAT_VER_2_0 0x20 /* Version 2.0 */
38#define IOAT_VER_3_0 0x30 /* Version 3.0 */
39
40struct ioat_dma_descriptor {
41 uint32_t size;
42 uint32_t ctl;
43 uint64_t src_addr;
44 uint64_t dst_addr;
45 uint64_t next;
46 uint64_t rsv1;
47 uint64_t rsv2;
48 uint64_t user1;
49 uint64_t user2;
50};
51
52#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN 0x00000001
53#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN 0x00000002
54#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN 0x00000004
55#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS 0x00000008
56#define IOAT_DMA_DESCRIPTOR_CTL_FRAME 0x00000010
57#define IOAT_DMA_DESCRIPTOR_NUL 0x00000020
58#define IOAT_DMA_DESCRIPTOR_CTL_SP_BRK 0x00000040
59#define IOAT_DMA_DESCRIPTOR_CTL_DP_BRK 0x00000080
60#define IOAT_DMA_DESCRIPTOR_CTL_BNDL 0x00000100
61#define IOAT_DMA_DESCRIPTOR_CTL_DCA 0x00000200
62#define IOAT_DMA_DESCRIPTOR_CTL_BUFHINT 0x00000400
63
64#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_CONTEXT 0xFF000000
65#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_DMA 0x00000000
66
67#define IOAT_DMA_DESCRIPTOR_CTL_CONTEXT_DCA 0x00000001
68#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_MASK 0xFF000000
69
70#endif
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 2f052265122f..645ca8d54ec4 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -31,6 +31,7 @@
31#include <linux/platform_device.h> 31#include <linux/platform_device.h>
32#include <linux/memory.h> 32#include <linux/memory.h>
33#include <linux/ioport.h> 33#include <linux/ioport.h>
34#include <linux/raid/pq.h>
34 35
35#include <mach/adma.h> 36#include <mach/adma.h>
36 37
@@ -57,65 +58,110 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
57 } 58 }
58} 59}
59 60
61static void
62iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
63{
64 struct dma_async_tx_descriptor *tx = &desc->async_tx;
65 struct iop_adma_desc_slot *unmap = desc->group_head;
66 struct device *dev = &iop_chan->device->pdev->dev;
67 u32 len = unmap->unmap_len;
68 enum dma_ctrl_flags flags = tx->flags;
69 u32 src_cnt;
70 dma_addr_t addr;
71 dma_addr_t dest;
72
73 src_cnt = unmap->unmap_src_cnt;
74 dest = iop_desc_get_dest_addr(unmap, iop_chan);
75 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
76 enum dma_data_direction dir;
77
78 if (src_cnt > 1) /* is xor? */
79 dir = DMA_BIDIRECTIONAL;
80 else
81 dir = DMA_FROM_DEVICE;
82
83 dma_unmap_page(dev, dest, len, dir);
84 }
85
86 if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
87 while (src_cnt--) {
88 addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt);
89 if (addr == dest)
90 continue;
91 dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
92 }
93 }
94 desc->group_head = NULL;
95}
96
97static void
98iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
99{
100 struct dma_async_tx_descriptor *tx = &desc->async_tx;
101 struct iop_adma_desc_slot *unmap = desc->group_head;
102 struct device *dev = &iop_chan->device->pdev->dev;
103 u32 len = unmap->unmap_len;
104 enum dma_ctrl_flags flags = tx->flags;
105 u32 src_cnt = unmap->unmap_src_cnt;
106 dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan);
107 dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan);
108 int i;
109
110 if (tx->flags & DMA_PREP_CONTINUE)
111 src_cnt -= 3;
112
113 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) {
114 dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL);
115 dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL);
116 }
117
118 if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
119 dma_addr_t addr;
120
121 for (i = 0; i < src_cnt; i++) {
122 addr = iop_desc_get_src_addr(unmap, iop_chan, i);
123 dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
124 }
125 if (desc->pq_check_result) {
126 dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE);
127 dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE);
128 }
129 }
130
131 desc->group_head = NULL;
132}
133
134
60static dma_cookie_t 135static dma_cookie_t
61iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, 136iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
62 struct iop_adma_chan *iop_chan, dma_cookie_t cookie) 137 struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
63{ 138{
64 BUG_ON(desc->async_tx.cookie < 0); 139 struct dma_async_tx_descriptor *tx = &desc->async_tx;
65 if (desc->async_tx.cookie > 0) { 140
66 cookie = desc->async_tx.cookie; 141 BUG_ON(tx->cookie < 0);
67 desc->async_tx.cookie = 0; 142 if (tx->cookie > 0) {
143 cookie = tx->cookie;
144 tx->cookie = 0;
68 145
69 /* call the callback (must not sleep or submit new 146 /* call the callback (must not sleep or submit new
70 * operations to this channel) 147 * operations to this channel)
71 */ 148 */
72 if (desc->async_tx.callback) 149 if (tx->callback)
73 desc->async_tx.callback( 150 tx->callback(tx->callback_param);
74 desc->async_tx.callback_param);
75 151
76 /* unmap dma addresses 152 /* unmap dma addresses
77 * (unmap_single vs unmap_page?) 153 * (unmap_single vs unmap_page?)
78 */ 154 */
79 if (desc->group_head && desc->unmap_len) { 155 if (desc->group_head && desc->unmap_len) {
80 struct iop_adma_desc_slot *unmap = desc->group_head; 156 if (iop_desc_is_pq(desc))
81 struct device *dev = 157 iop_desc_unmap_pq(iop_chan, desc);
82 &iop_chan->device->pdev->dev; 158 else
83 u32 len = unmap->unmap_len; 159 iop_desc_unmap(iop_chan, desc);
84 enum dma_ctrl_flags flags = desc->async_tx.flags;
85 u32 src_cnt;
86 dma_addr_t addr;
87 dma_addr_t dest;
88
89 src_cnt = unmap->unmap_src_cnt;
90 dest = iop_desc_get_dest_addr(unmap, iop_chan);
91 if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
92 enum dma_data_direction dir;
93
94 if (src_cnt > 1) /* is xor? */
95 dir = DMA_BIDIRECTIONAL;
96 else
97 dir = DMA_FROM_DEVICE;
98
99 dma_unmap_page(dev, dest, len, dir);
100 }
101
102 if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
103 while (src_cnt--) {
104 addr = iop_desc_get_src_addr(unmap,
105 iop_chan,
106 src_cnt);
107 if (addr == dest)
108 continue;
109 dma_unmap_page(dev, addr, len,
110 DMA_TO_DEVICE);
111 }
112 }
113 desc->group_head = NULL;
114 } 160 }
115 } 161 }
116 162
117 /* run dependent operations */ 163 /* run dependent operations */
118 dma_run_dependencies(&desc->async_tx); 164 dma_run_dependencies(tx);
119 165
120 return cookie; 166 return cookie;
121} 167}
@@ -287,7 +333,12 @@ static void iop_adma_tasklet(unsigned long data)
287{ 333{
288 struct iop_adma_chan *iop_chan = (struct iop_adma_chan *) data; 334 struct iop_adma_chan *iop_chan = (struct iop_adma_chan *) data;
289 335
290 spin_lock(&iop_chan->lock); 336 /* lockdep will flag dependency submissions as potentially
 337 * recursive locking; this is not the case, as a dependency
 338 * submission will never recurse into a channel's submit routine.
339 * There are checks in async_tx.c to prevent this.
340 */
341 spin_lock_nested(&iop_chan->lock, SINGLE_DEPTH_NESTING);
291 __iop_adma_slot_cleanup(iop_chan); 342 __iop_adma_slot_cleanup(iop_chan);
292 spin_unlock(&iop_chan->lock); 343 spin_unlock(&iop_chan->lock);
293} 344}
@@ -370,7 +421,7 @@ retry:
370 } 421 }
371 alloc_tail->group_head = alloc_start; 422 alloc_tail->group_head = alloc_start;
372 alloc_tail->async_tx.cookie = -EBUSY; 423 alloc_tail->async_tx.cookie = -EBUSY;
373 list_splice(&chain, &alloc_tail->async_tx.tx_list); 424 list_splice(&chain, &alloc_tail->tx_list);
374 iop_chan->last_used = last_used; 425 iop_chan->last_used = last_used;
375 iop_desc_clear_next_desc(alloc_start); 426 iop_desc_clear_next_desc(alloc_start);
376 iop_desc_clear_next_desc(alloc_tail); 427 iop_desc_clear_next_desc(alloc_tail);
@@ -429,7 +480,7 @@ iop_adma_tx_submit(struct dma_async_tx_descriptor *tx)
429 480
430 old_chain_tail = list_entry(iop_chan->chain.prev, 481 old_chain_tail = list_entry(iop_chan->chain.prev,
431 struct iop_adma_desc_slot, chain_node); 482 struct iop_adma_desc_slot, chain_node);
432 list_splice_init(&sw_desc->async_tx.tx_list, 483 list_splice_init(&sw_desc->tx_list,
433 &old_chain_tail->chain_node); 484 &old_chain_tail->chain_node);
434 485
435 /* fix up the hardware chain */ 486 /* fix up the hardware chain */
@@ -496,6 +547,7 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan)
496 547
497 dma_async_tx_descriptor_init(&slot->async_tx, chan); 548 dma_async_tx_descriptor_init(&slot->async_tx, chan);
498 slot->async_tx.tx_submit = iop_adma_tx_submit; 549 slot->async_tx.tx_submit = iop_adma_tx_submit;
550 INIT_LIST_HEAD(&slot->tx_list);
499 INIT_LIST_HEAD(&slot->chain_node); 551 INIT_LIST_HEAD(&slot->chain_node);
500 INIT_LIST_HEAD(&slot->slot_node); 552 INIT_LIST_HEAD(&slot->slot_node);
501 hw_desc = (char *) iop_chan->device->dma_desc_pool; 553 hw_desc = (char *) iop_chan->device->dma_desc_pool;
@@ -660,9 +712,9 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest,
660} 712}
661 713
662static struct dma_async_tx_descriptor * 714static struct dma_async_tx_descriptor *
663iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src, 715iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src,
664 unsigned int src_cnt, size_t len, u32 *result, 716 unsigned int src_cnt, size_t len, u32 *result,
665 unsigned long flags) 717 unsigned long flags)
666{ 718{
667 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); 719 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
668 struct iop_adma_desc_slot *sw_desc, *grp_start; 720 struct iop_adma_desc_slot *sw_desc, *grp_start;
@@ -696,6 +748,118 @@ iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src,
696 return sw_desc ? &sw_desc->async_tx : NULL; 748 return sw_desc ? &sw_desc->async_tx : NULL;
697} 749}
698 750
751static struct dma_async_tx_descriptor *
752iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
753 unsigned int src_cnt, const unsigned char *scf, size_t len,
754 unsigned long flags)
755{
756 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
757 struct iop_adma_desc_slot *sw_desc, *g;
758 int slot_cnt, slots_per_op;
759 int continue_srcs;
760
761 if (unlikely(!len))
762 return NULL;
763 BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
764
765 dev_dbg(iop_chan->device->common.dev,
766 "%s src_cnt: %d len: %u flags: %lx\n",
767 __func__, src_cnt, len, flags);
768
769 if (dmaf_p_disabled_continue(flags))
770 continue_srcs = 1+src_cnt;
771 else if (dmaf_continue(flags))
772 continue_srcs = 3+src_cnt;
773 else
774 continue_srcs = 0+src_cnt;
775
776 spin_lock_bh(&iop_chan->lock);
777 slot_cnt = iop_chan_pq_slot_count(len, continue_srcs, &slots_per_op);
778 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
779 if (sw_desc) {
780 int i;
781
782 g = sw_desc->group_head;
783 iop_desc_set_byte_count(g, iop_chan, len);
784
785 /* even if P is disabled its destination address (bits
786 * [3:0]) must match Q. It is ok if P points to an
 787 * invalid address, since it won't be written.
788 */
789 if (flags & DMA_PREP_PQ_DISABLE_P)
790 dst[0] = dst[1] & 0x7;
791
792 iop_desc_set_pq_addr(g, dst);
793 sw_desc->unmap_src_cnt = src_cnt;
794 sw_desc->unmap_len = len;
795 sw_desc->async_tx.flags = flags;
796 for (i = 0; i < src_cnt; i++)
797 iop_desc_set_pq_src_addr(g, i, src[i], scf[i]);
798
 799 /* if we are continuing a previous operation, factor in
 800 * the old p and q values; see the comment for dma_maxpq
 801 * in include/linux/dmaengine.h
802 */
803 if (dmaf_p_disabled_continue(flags))
804 iop_desc_set_pq_src_addr(g, i++, dst[1], 1);
805 else if (dmaf_continue(flags)) {
806 iop_desc_set_pq_src_addr(g, i++, dst[0], 0);
807 iop_desc_set_pq_src_addr(g, i++, dst[1], 1);
808 iop_desc_set_pq_src_addr(g, i++, dst[1], 0);
809 }
810 iop_desc_init_pq(g, i, flags);
811 }
812 spin_unlock_bh(&iop_chan->lock);
813
814 return sw_desc ? &sw_desc->async_tx : NULL;
815}
816
817static struct dma_async_tx_descriptor *
818iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
819 unsigned int src_cnt, const unsigned char *scf,
820 size_t len, enum sum_check_flags *pqres,
821 unsigned long flags)
822{
823 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
824 struct iop_adma_desc_slot *sw_desc, *g;
825 int slot_cnt, slots_per_op;
826
827 if (unlikely(!len))
828 return NULL;
829 BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
830
831 dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n",
832 __func__, src_cnt, len);
833
834 spin_lock_bh(&iop_chan->lock);
835 slot_cnt = iop_chan_pq_zero_sum_slot_count(len, src_cnt + 2, &slots_per_op);
836 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
837 if (sw_desc) {
838 /* for validate operations p and q are tagged onto the
839 * end of the source list
840 */
841 int pq_idx = src_cnt;
842
843 g = sw_desc->group_head;
844 iop_desc_init_pq_zero_sum(g, src_cnt+2, flags);
845 iop_desc_set_pq_zero_sum_byte_count(g, len);
846 g->pq_check_result = pqres;
847 pr_debug("\t%s: g->pq_check_result: %p\n",
848 __func__, g->pq_check_result);
849 sw_desc->unmap_src_cnt = src_cnt+2;
850 sw_desc->unmap_len = len;
851 sw_desc->async_tx.flags = flags;
852 while (src_cnt--)
853 iop_desc_set_pq_zero_sum_src_addr(g, src_cnt,
854 src[src_cnt],
855 scf[src_cnt]);
856 iop_desc_set_pq_zero_sum_addr(g, pq_idx, src);
857 }
858 spin_unlock_bh(&iop_chan->lock);
859
860 return sw_desc ? &sw_desc->async_tx : NULL;
861}
862
699static void iop_adma_free_chan_resources(struct dma_chan *chan) 863static void iop_adma_free_chan_resources(struct dma_chan *chan)
700{ 864{
701 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); 865 struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
@@ -906,7 +1070,7 @@ out:
906 1070
907#define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */ 1071#define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */
908static int __devinit 1072static int __devinit
909iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) 1073iop_adma_xor_val_self_test(struct iop_adma_device *device)
910{ 1074{
911 int i, src_idx; 1075 int i, src_idx;
912 struct page *dest; 1076 struct page *dest;
@@ -1002,7 +1166,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
1002 PAGE_SIZE, DMA_TO_DEVICE); 1166 PAGE_SIZE, DMA_TO_DEVICE);
1003 1167
1004 /* skip zero sum if the capability is not present */ 1168 /* skip zero sum if the capability is not present */
1005 if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask)) 1169 if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask))
1006 goto free_resources; 1170 goto free_resources;
1007 1171
1008 /* zero sum the sources with the destination page */ 1172
@@ -1016,10 +1180,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
1016 dma_srcs[i] = dma_map_page(dma_chan->device->dev, 1180 dma_srcs[i] = dma_map_page(dma_chan->device->dev,
1017 zero_sum_srcs[i], 0, PAGE_SIZE, 1181 zero_sum_srcs[i], 0, PAGE_SIZE,
1018 DMA_TO_DEVICE); 1182 DMA_TO_DEVICE);
1019 tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, 1183 tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs,
1020 IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, 1184 IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
1021 &zero_sum_result, 1185 &zero_sum_result,
1022 DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 1186 DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
1023 1187
1024 cookie = iop_adma_tx_submit(tx); 1188 cookie = iop_adma_tx_submit(tx);
1025 iop_adma_issue_pending(dma_chan); 1189 iop_adma_issue_pending(dma_chan);
@@ -1072,10 +1236,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
1072 dma_srcs[i] = dma_map_page(dma_chan->device->dev, 1236 dma_srcs[i] = dma_map_page(dma_chan->device->dev,
1073 zero_sum_srcs[i], 0, PAGE_SIZE, 1237 zero_sum_srcs[i], 0, PAGE_SIZE,
1074 DMA_TO_DEVICE); 1238 DMA_TO_DEVICE);
1075 tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, 1239 tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs,
1076 IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, 1240 IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
1077 &zero_sum_result, 1241 &zero_sum_result,
1078 DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 1242 DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
1079 1243
1080 cookie = iop_adma_tx_submit(tx); 1244 cookie = iop_adma_tx_submit(tx);
1081 iop_adma_issue_pending(dma_chan); 1245 iop_adma_issue_pending(dma_chan);
@@ -1105,6 +1269,170 @@ out:
1105 return err; 1269 return err;
1106} 1270}
1107 1271
1272#ifdef CONFIG_MD_RAID6_PQ
1273static int __devinit
1274iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device)
1275{
1276 /* combined sources, software pq results, and extra hw pq results */
1277 struct page *pq[IOP_ADMA_NUM_SRC_TEST+2+2];
1278 /* ptr to the extra hw pq buffers defined above */
1279 struct page **pq_hw = &pq[IOP_ADMA_NUM_SRC_TEST+2];
1280 /* address conversion buffers (dma_map / page_address) */
1281 void *pq_sw[IOP_ADMA_NUM_SRC_TEST+2];
1282 dma_addr_t pq_src[IOP_ADMA_NUM_SRC_TEST];
1283 dma_addr_t pq_dest[2];
1284
1285 int i;
1286 struct dma_async_tx_descriptor *tx;
1287 struct dma_chan *dma_chan;
1288 dma_cookie_t cookie;
1289 u32 zero_sum_result;
1290 int err = 0;
1291 struct device *dev;
1292
1293 dev_dbg(device->common.dev, "%s\n", __func__);
1294
1295 for (i = 0; i < ARRAY_SIZE(pq); i++) {
1296 pq[i] = alloc_page(GFP_KERNEL);
1297 if (!pq[i]) {
1298 while (i--)
1299 __free_page(pq[i]);
1300 return -ENOMEM;
1301 }
1302 }
1303
1304 /* Fill in src buffers */
1305 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) {
1306 pq_sw[i] = page_address(pq[i]);
1307 memset(pq_sw[i], 0x11111111 * (1<<i), PAGE_SIZE);
1308 }
1309 pq_sw[i] = page_address(pq[i]);
1310 pq_sw[i+1] = page_address(pq[i+1]);
1311
1312 dma_chan = container_of(device->common.channels.next,
1313 struct dma_chan,
1314 device_node);
1315 if (iop_adma_alloc_chan_resources(dma_chan) < 1) {
1316 err = -ENODEV;
1317 goto out;
1318 }
1319
1320 dev = dma_chan->device->dev;
1321
1322 /* initialize the dests */
1323 memset(page_address(pq_hw[0]), 0 , PAGE_SIZE);
1324 memset(page_address(pq_hw[1]), 0 , PAGE_SIZE);
1325
1326 /* test pq */
1327 pq_dest[0] = dma_map_page(dev, pq_hw[0], 0, PAGE_SIZE, DMA_FROM_DEVICE);
1328 pq_dest[1] = dma_map_page(dev, pq_hw[1], 0, PAGE_SIZE, DMA_FROM_DEVICE);
1329 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++)
1330 pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
1331 DMA_TO_DEVICE);
1332
1333 tx = iop_adma_prep_dma_pq(dma_chan, pq_dest, pq_src,
1334 IOP_ADMA_NUM_SRC_TEST, (u8 *)raid6_gfexp,
1335 PAGE_SIZE,
1336 DMA_PREP_INTERRUPT |
1337 DMA_CTRL_ACK);
1338
1339 cookie = iop_adma_tx_submit(tx);
1340 iop_adma_issue_pending(dma_chan);
1341 msleep(8);
1342
1343 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
1344 DMA_SUCCESS) {
1345 dev_err(dev, "Self-test pq timed out, disabling\n");
1346 err = -ENODEV;
1347 goto free_resources;
1348 }
1349
1350 raid6_call.gen_syndrome(IOP_ADMA_NUM_SRC_TEST+2, PAGE_SIZE, pq_sw);
1351
1352 if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST],
1353 page_address(pq_hw[0]), PAGE_SIZE) != 0) {
1354 dev_err(dev, "Self-test p failed compare, disabling\n");
1355 err = -ENODEV;
1356 goto free_resources;
1357 }
1358 if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST+1],
1359 page_address(pq_hw[1]), PAGE_SIZE) != 0) {
1360 dev_err(dev, "Self-test q failed compare, disabling\n");
1361 err = -ENODEV;
1362 goto free_resources;
1363 }
1364
1365 /* test correct zero sum using the software generated pq values */
1366 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++)
1367 pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
1368 DMA_TO_DEVICE);
1369
1370 zero_sum_result = ~0;
1371 tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST],
1372 pq_src, IOP_ADMA_NUM_SRC_TEST,
1373 raid6_gfexp, PAGE_SIZE, &zero_sum_result,
1374 DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
1375
1376 cookie = iop_adma_tx_submit(tx);
1377 iop_adma_issue_pending(dma_chan);
1378 msleep(8);
1379
1380 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
1381 DMA_SUCCESS) {
1382 dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n");
1383 err = -ENODEV;
1384 goto free_resources;
1385 }
1386
1387 if (zero_sum_result != 0) {
1388 dev_err(dev, "Self-test pq-zero-sum failed to validate: %x\n",
1389 zero_sum_result);
1390 err = -ENODEV;
1391 goto free_resources;
1392 }
1393
1394 /* test incorrect zero sum */
1395 i = IOP_ADMA_NUM_SRC_TEST;
1396 memset(pq_sw[i] + 100, 0, 100);
1397 memset(pq_sw[i+1] + 200, 0, 200);
1398 for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++)
1399 pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
1400 DMA_TO_DEVICE);
1401
1402 zero_sum_result = 0;
1403 tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST],
1404 pq_src, IOP_ADMA_NUM_SRC_TEST,
1405 raid6_gfexp, PAGE_SIZE, &zero_sum_result,
1406 DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
1407
1408 cookie = iop_adma_tx_submit(tx);
1409 iop_adma_issue_pending(dma_chan);
1410 msleep(8);
1411
1412 if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
1413 DMA_SUCCESS) {
1414 dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n");
1415 err = -ENODEV;
1416 goto free_resources;
1417 }
1418
1419 if (zero_sum_result != (SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT)) {
1420 dev_err(dev, "Self-test !pq-zero-sum failed to validate: %x\n",
1421 zero_sum_result);
1422 err = -ENODEV;
1423 goto free_resources;
1424 }
1425
1426free_resources:
1427 iop_adma_free_chan_resources(dma_chan);
1428out:
1429 i = ARRAY_SIZE(pq);
1430 while (i--)
1431 __free_page(pq[i]);
1432 return err;
1433}
1434#endif
1435
1108static int __devexit iop_adma_remove(struct platform_device *dev) 1436static int __devexit iop_adma_remove(struct platform_device *dev)
1109{ 1437{
1110 struct iop_adma_device *device = platform_get_drvdata(dev); 1438 struct iop_adma_device *device = platform_get_drvdata(dev);
@@ -1192,9 +1520,16 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
1192 dma_dev->max_xor = iop_adma_get_max_xor(); 1520 dma_dev->max_xor = iop_adma_get_max_xor();
1193 dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor; 1521 dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor;
1194 } 1522 }
1195 if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask)) 1523 if (dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask))
1196 dma_dev->device_prep_dma_zero_sum = 1524 dma_dev->device_prep_dma_xor_val =
1197 iop_adma_prep_dma_zero_sum; 1525 iop_adma_prep_dma_xor_val;
1526 if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
1527 dma_set_maxpq(dma_dev, iop_adma_get_max_pq(), 0);
1528 dma_dev->device_prep_dma_pq = iop_adma_prep_dma_pq;
1529 }
1530 if (dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask))
1531 dma_dev->device_prep_dma_pq_val =
1532 iop_adma_prep_dma_pq_val;
1198 if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) 1533 if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask))
1199 dma_dev->device_prep_dma_interrupt = 1534 dma_dev->device_prep_dma_interrupt =
1200 iop_adma_prep_dma_interrupt; 1535 iop_adma_prep_dma_interrupt;
@@ -1248,23 +1583,35 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
1248 } 1583 }
1249 1584
1250 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) || 1585 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) ||
1251 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { 1586 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
1252 ret = iop_adma_xor_zero_sum_self_test(adev); 1587 ret = iop_adma_xor_val_self_test(adev);
1253 dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); 1588 dev_dbg(&pdev->dev, "xor self test returned %d\n", ret);
1254 if (ret) 1589 if (ret)
1255 goto err_free_iop_chan; 1590 goto err_free_iop_chan;
1256 } 1591 }
1257 1592
1593 if (dma_has_cap(DMA_PQ, dma_dev->cap_mask) &&
1594 dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask)) {
1595 #ifdef CONFIG_MD_RAID6_PQ
1596 ret = iop_adma_pq_zero_sum_self_test(adev);
1597 dev_dbg(&pdev->dev, "pq self test returned %d\n", ret);
1598 #else
1599 /* can not test raid6, so do not publish capability */
1600 dma_cap_clear(DMA_PQ, dma_dev->cap_mask);
1601 dma_cap_clear(DMA_PQ_VAL, dma_dev->cap_mask);
1602 ret = 0;
1603 #endif
1604 if (ret)
1605 goto err_free_iop_chan;
1606 }
1607
1258 dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " 1608 dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: "
1259 "( %s%s%s%s%s%s%s%s%s%s)\n", 1609 "( %s%s%s%s%s%s%s)\n",
1260 dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "", 1610 dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "",
1261 dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", 1611 dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
1262 dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "",
1263 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", 1612 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
1264 dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "", 1613 dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "",
1265 dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "",
1266 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "", 1614 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "",
1267 dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "",
1268 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", 1615 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
1269 dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : ""); 1616 dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
1270 1617
@@ -1296,7 +1643,7 @@ static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan)
1296 if (sw_desc) { 1643 if (sw_desc) {
1297 grp_start = sw_desc->group_head; 1644 grp_start = sw_desc->group_head;
1298 1645
1299 list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain); 1646 list_splice_init(&sw_desc->tx_list, &iop_chan->chain);
1300 async_tx_ack(&sw_desc->async_tx); 1647 async_tx_ack(&sw_desc->async_tx);
1301 iop_desc_init_memcpy(grp_start, 0); 1648 iop_desc_init_memcpy(grp_start, 0);
1302 iop_desc_set_byte_count(grp_start, iop_chan, 0); 1649 iop_desc_set_byte_count(grp_start, iop_chan, 0);
@@ -1352,7 +1699,7 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan)
1352 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); 1699 sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
1353 if (sw_desc) { 1700 if (sw_desc) {
1354 grp_start = sw_desc->group_head; 1701 grp_start = sw_desc->group_head;
1355 list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain); 1702 list_splice_init(&sw_desc->tx_list, &iop_chan->chain);
1356 async_tx_ack(&sw_desc->async_tx); 1703 async_tx_ack(&sw_desc->async_tx);
1357 iop_desc_init_null_xor(grp_start, 2, 0); 1704 iop_desc_init_null_xor(grp_start, 2, 0);
1358 iop_desc_set_byte_count(grp_start, iop_chan, 0); 1705 iop_desc_set_byte_count(grp_start, iop_chan, 0);
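
Aside (not part of this patch): iop_adma_prep_dma_pq() above adds one or three implicit sources when DMA_PREP_CONTINUE is set, mirroring the dma_maxpq() rules in include/linux/dmaengine.h. Below is a hedged sketch of the same bookkeeping from the submitter's side; pq_total_srcs() is a hypothetical helper, not an existing API.

#include <linux/dmaengine.h>

/* Sketch: how many sources a continued P+Q operation really consumes.
 * With P disabled only the old Q is re-read; a full continuation re-reads
 * the old P, the old Q, plus one more Q term (see dma_maxpq()).
 */
static unsigned int pq_total_srcs(unsigned int src_cnt, unsigned long flags)
{
	unsigned long cont_p_off = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P;

	if ((flags & cont_p_off) == cont_p_off)
		return src_cnt + 1;	/* old Q only */
	if (flags & DMA_PREP_CONTINUE)
		return src_cnt + 3;	/* old P, old Q, and Q again */
	return src_cnt;
}
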
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
index 9f6fe46a9b87..c0a272c73682 100644
--- a/drivers/dma/iovlock.c
+++ b/drivers/dma/iovlock.c
@@ -183,6 +183,11 @@ dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
183 iov_byte_offset, 183 iov_byte_offset,
184 kdata, 184 kdata,
185 copy); 185 copy);
186 /* poll for a descriptor slot */
187 if (unlikely(dma_cookie < 0)) {
188 dma_async_issue_pending(chan);
189 continue;
190 }
186 191
187 len -= copy; 192 len -= copy;
188 iov[iovec_idx].iov_len -= copy; 193 iov[iovec_idx].iov_len -= copy;
@@ -248,6 +253,11 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
248 page, 253 page,
249 offset, 254 offset,
250 copy); 255 copy);
256 /* poll for a descriptor slot */
257 if (unlikely(dma_cookie < 0)) {
258 dma_async_issue_pending(chan);
259 continue;
260 }
251 261
252 len -= copy; 262 len -= copy;
253 iov[iovec_idx].iov_len -= copy; 263 iov[iovec_idx].iov_len -= copy;
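
Aside (not part of this patch): the two hunks above add the usual "kick the channel and retry" response to a transiently full descriptor ring. A hedged, self-contained sketch of the same pattern using the public dmaengine memcpy helper follows; copy_with_retry() is a hypothetical name.

#include <linux/dmaengine.h>

/* Sketch: if the copy cannot be queued for lack of descriptors (negative
 * cookie), push already-submitted work to hardware and try again.
 */
static dma_cookie_t copy_with_retry(struct dma_chan *chan,
				    void *dst, void *src, size_t len)
{
	dma_cookie_t cookie;

	do {
		cookie = dma_async_memcpy_buf_to_buf(chan, dst, src, len);
		if (cookie < 0)
			dma_async_issue_pending(chan);	/* drain the ring */
	} while (cookie < 0);

	return cookie;
}
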
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 3f23eabe09f2..466ab10c1ff1 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -517,7 +517,7 @@ retry:
517 } 517 }
518 alloc_tail->group_head = alloc_start; 518 alloc_tail->group_head = alloc_start;
519 alloc_tail->async_tx.cookie = -EBUSY; 519 alloc_tail->async_tx.cookie = -EBUSY;
520 list_splice(&chain, &alloc_tail->async_tx.tx_list); 520 list_splice(&chain, &alloc_tail->tx_list);
521 mv_chan->last_used = last_used; 521 mv_chan->last_used = last_used;
522 mv_desc_clear_next_desc(alloc_start); 522 mv_desc_clear_next_desc(alloc_start);
523 mv_desc_clear_next_desc(alloc_tail); 523 mv_desc_clear_next_desc(alloc_tail);
@@ -565,14 +565,14 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx)
565 cookie = mv_desc_assign_cookie(mv_chan, sw_desc); 565 cookie = mv_desc_assign_cookie(mv_chan, sw_desc);
566 566
567 if (list_empty(&mv_chan->chain)) 567 if (list_empty(&mv_chan->chain))
568 list_splice_init(&sw_desc->async_tx.tx_list, &mv_chan->chain); 568 list_splice_init(&sw_desc->tx_list, &mv_chan->chain);
569 else { 569 else {
570 new_hw_chain = 0; 570 new_hw_chain = 0;
571 571
572 old_chain_tail = list_entry(mv_chan->chain.prev, 572 old_chain_tail = list_entry(mv_chan->chain.prev,
573 struct mv_xor_desc_slot, 573 struct mv_xor_desc_slot,
574 chain_node); 574 chain_node);
575 list_splice_init(&grp_start->async_tx.tx_list, 575 list_splice_init(&grp_start->tx_list,
576 &old_chain_tail->chain_node); 576 &old_chain_tail->chain_node);
577 577
578 if (!mv_can_chain(grp_start)) 578 if (!mv_can_chain(grp_start))
@@ -632,6 +632,7 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan)
632 slot->async_tx.tx_submit = mv_xor_tx_submit; 632 slot->async_tx.tx_submit = mv_xor_tx_submit;
633 INIT_LIST_HEAD(&slot->chain_node); 633 INIT_LIST_HEAD(&slot->chain_node);
634 INIT_LIST_HEAD(&slot->slot_node); 634 INIT_LIST_HEAD(&slot->slot_node);
635 INIT_LIST_HEAD(&slot->tx_list);
635 hw_desc = (char *) mv_chan->device->dma_desc_pool; 636 hw_desc = (char *) mv_chan->device->dma_desc_pool;
636 slot->async_tx.phys = 637 slot->async_tx.phys =
637 (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE]; 638 (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE];
diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h
index 06cafe1ef521..977b592e976b 100644
--- a/drivers/dma/mv_xor.h
+++ b/drivers/dma/mv_xor.h
@@ -126,9 +126,8 @@ struct mv_xor_chan {
126 * @idx: pool index 126 * @idx: pool index
127 * @unmap_src_cnt: number of xor sources 127 * @unmap_src_cnt: number of xor sources
128 * @unmap_len: transaction bytecount 128 * @unmap_len: transaction bytecount
129 * @tx_list: list of slots that make up a multi-descriptor transaction
129 * @async_tx: support for the async_tx api 130 * @async_tx: support for the async_tx api
130 * @group_list: list of slots that make up a multi-descriptor transaction
131 * for example transfer lengths larger than the supported hw max
132 * @xor_check_result: result of zero sum 131 * @xor_check_result: result of zero sum
133 * @crc32_result: result crc calculation 132 * @crc32_result: result crc calculation
134 */ 133 */
@@ -145,6 +144,7 @@ struct mv_xor_desc_slot {
145 u16 unmap_src_cnt; 144 u16 unmap_src_cnt;
146 u32 value; 145 u32 value;
147 size_t unmap_len; 146 size_t unmap_len;
147 struct list_head tx_list;
148 struct dma_async_tx_descriptor async_tx; 148 struct dma_async_tx_descriptor async_tx;
149 union { 149 union {
150 u32 *xor_check_result; 150 u32 *xor_check_result;
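
Aside (not part of this patch): iop-adma, mv_xor and txx9dmac all gain a driver-private tx_list in this series because the generic dma_async_tx_descriptor no longer carries one. A hedged sketch of the resulting idiom; the my_desc names are hypothetical.

#include <linux/list.h>
#include <linux/dmaengine.h>

/* Sketch: child descriptors of a multi-descriptor transaction now hang off
 * the driver's own slot structure, initialised at allocation time and
 * spliced onto the channel's chain at submit time.
 */
struct my_desc {
	struct list_head node;		/* membership in a chain or free list */
	struct list_head tx_list;	/* children of this transaction */
	struct dma_async_tx_descriptor async_tx;
};

static void my_desc_init(struct my_desc *d)
{
	INIT_LIST_HEAD(&d->node);
	INIT_LIST_HEAD(&d->tx_list);
}

static void my_desc_add_child(struct my_desc *first, struct my_desc *child)
{
	/* only the first descriptor keeps the list of children */
	list_add_tail(&child->node, &first->tx_list);
}
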
diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
new file mode 100644
index 000000000000..b3b065c4e5c1
--- /dev/null
+++ b/drivers/dma/shdma.c
@@ -0,0 +1,786 @@
1/*
2 * Renesas SuperH DMA Engine support
3 *
 4 * based on drivers/dma/fsldma.c
5 *
6 * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
7 * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
8 * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved.
9 *
10 * This is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
 15 * - The SuperH DMAC has no hardware DMA chaining mode.
 16 * - The maximum DMA size is 16MB.
17 *
18 */
19
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/interrupt.h>
23#include <linux/dmaengine.h>
24#include <linux/delay.h>
25#include <linux/dma-mapping.h>
26#include <linux/dmapool.h>
27#include <linux/platform_device.h>
28#include <cpu/dma.h>
29#include <asm/dma-sh.h>
30#include "shdma.h"
31
32/* DMA descriptor control */
33#define DESC_LAST (-1)
34#define DESC_COMP (1)
35#define DESC_NCOMP (0)
36
37#define NR_DESCS_PER_CHANNEL 32
38/*
39 * Define the default configuration for dual address memory-memory transfer.
40 * The 0x400 value represents auto-request, external->external.
41 *
 42 * This driver uses 4-byte burst mode by default.
 43 * To change the mode, change the value of RS_DEFAULT,
 44 * e.g. for 1-byte burst mode use (RS_DUAL & ~TS_32).
45 */
46#define RS_DEFAULT (RS_DUAL)
47
48#define SH_DMAC_CHAN_BASE(id) (dma_base_addr[id])
49static void sh_dmae_writel(struct sh_dmae_chan *sh_dc, u32 data, u32 reg)
50{
51 ctrl_outl(data, (SH_DMAC_CHAN_BASE(sh_dc->id) + reg));
52}
53
54static u32 sh_dmae_readl(struct sh_dmae_chan *sh_dc, u32 reg)
55{
56 return ctrl_inl((SH_DMAC_CHAN_BASE(sh_dc->id) + reg));
57}
58
59static void dmae_init(struct sh_dmae_chan *sh_chan)
60{
61 u32 chcr = RS_DEFAULT; /* default is DUAL mode */
62 sh_dmae_writel(sh_chan, chcr, CHCR);
63}
64
65/*
66 * Reset DMA controller
67 *
 68 * SH7780 has two DMAOR registers
69 */
70static void sh_dmae_ctl_stop(int id)
71{
72 unsigned short dmaor = dmaor_read_reg(id);
73
74 dmaor &= ~(DMAOR_NMIF | DMAOR_AE);
75 dmaor_write_reg(id, dmaor);
76}
77
78static int sh_dmae_rst(int id)
79{
80 unsigned short dmaor;
81
82 sh_dmae_ctl_stop(id);
83 dmaor = (dmaor_read_reg(id)|DMAOR_INIT);
84
85 dmaor_write_reg(id, dmaor);
86 if ((dmaor_read_reg(id) & (DMAOR_AE | DMAOR_NMIF))) {
 87 pr_warning("dma-sh: Can't initialize DMAOR.\n");
88 return -EINVAL;
89 }
90 return 0;
91}
92
93static int dmae_is_idle(struct sh_dmae_chan *sh_chan)
94{
95 u32 chcr = sh_dmae_readl(sh_chan, CHCR);
96 if (chcr & CHCR_DE) {
97 if (!(chcr & CHCR_TE))
98 return -EBUSY; /* working */
99 }
100 return 0; /* waiting */
101}
102
103static inline unsigned int calc_xmit_shift(struct sh_dmae_chan *sh_chan)
104{
105 u32 chcr = sh_dmae_readl(sh_chan, CHCR);
106 return ts_shift[(chcr & CHCR_TS_MASK) >> CHCR_TS_SHIFT];
107}
108
109static void dmae_set_reg(struct sh_dmae_chan *sh_chan, struct sh_dmae_regs hw)
110{
111 sh_dmae_writel(sh_chan, hw.sar, SAR);
112 sh_dmae_writel(sh_chan, hw.dar, DAR);
113 sh_dmae_writel(sh_chan,
114 (hw.tcr >> calc_xmit_shift(sh_chan)), TCR);
115}
116
117static void dmae_start(struct sh_dmae_chan *sh_chan)
118{
119 u32 chcr = sh_dmae_readl(sh_chan, CHCR);
120
121 chcr |= (CHCR_DE|CHCR_IE);
122 sh_dmae_writel(sh_chan, chcr, CHCR);
123}
124
125static void dmae_halt(struct sh_dmae_chan *sh_chan)
126{
127 u32 chcr = sh_dmae_readl(sh_chan, CHCR);
128
129 chcr &= ~(CHCR_DE | CHCR_TE | CHCR_IE);
130 sh_dmae_writel(sh_chan, chcr, CHCR);
131}
132
133static int dmae_set_chcr(struct sh_dmae_chan *sh_chan, u32 val)
134{
135 int ret = dmae_is_idle(sh_chan);
 136 /* CHCR cannot be written while the DMA channel is active */
137 if (ret)
138 return ret;
139
140 sh_dmae_writel(sh_chan, val, CHCR);
141 return 0;
142}
143
144#define DMARS1_ADDR 0x04
145#define DMARS2_ADDR 0x08
146#define DMARS_SHIFT 8
147#define DMARS_CHAN_MSK 0x01
148static int dmae_set_dmars(struct sh_dmae_chan *sh_chan, u16 val)
149{
150 u32 addr;
151 int shift = 0;
152 int ret = dmae_is_idle(sh_chan);
153 if (ret)
154 return ret;
155
156 if (sh_chan->id & DMARS_CHAN_MSK)
157 shift = DMARS_SHIFT;
158
159 switch (sh_chan->id) {
160 /* DMARS0 */
161 case 0:
162 case 1:
163 addr = SH_DMARS_BASE;
164 break;
165 /* DMARS1 */
166 case 2:
167 case 3:
168 addr = (SH_DMARS_BASE + DMARS1_ADDR);
169 break;
170 /* DMARS2 */
171 case 4:
172 case 5:
173 addr = (SH_DMARS_BASE + DMARS2_ADDR);
174 break;
175 default:
176 return -EINVAL;
177 }
178
179 ctrl_outw((val << shift) |
180 (ctrl_inw(addr) & (shift ? 0xFF00 : 0x00FF)),
181 addr);
182
183 return 0;
184}
185
186static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx)
187{
188 struct sh_desc *desc = tx_to_sh_desc(tx);
189 struct sh_dmae_chan *sh_chan = to_sh_chan(tx->chan);
190 dma_cookie_t cookie;
191
192 spin_lock_bh(&sh_chan->desc_lock);
193
194 cookie = sh_chan->common.cookie;
195 cookie++;
196 if (cookie < 0)
197 cookie = 1;
198
 199 /* Only assign a cookie if the descriptor is not marked -EBUSY (last desc) */
200 if (desc->async_tx.cookie != -EBUSY)
201 desc->async_tx.cookie = cookie;
202 sh_chan->common.cookie = desc->async_tx.cookie;
203
204 list_splice_init(&desc->tx_list, sh_chan->ld_queue.prev);
205
206 spin_unlock_bh(&sh_chan->desc_lock);
207
208 return cookie;
209}
210
211static struct sh_desc *sh_dmae_get_desc(struct sh_dmae_chan *sh_chan)
212{
213 struct sh_desc *desc, *_desc, *ret = NULL;
214
215 spin_lock_bh(&sh_chan->desc_lock);
216 list_for_each_entry_safe(desc, _desc, &sh_chan->ld_free, node) {
217 if (async_tx_test_ack(&desc->async_tx)) {
218 list_del(&desc->node);
219 ret = desc;
220 break;
221 }
222 }
223 spin_unlock_bh(&sh_chan->desc_lock);
224
225 return ret;
226}
227
228static void sh_dmae_put_desc(struct sh_dmae_chan *sh_chan, struct sh_desc *desc)
229{
230 if (desc) {
231 spin_lock_bh(&sh_chan->desc_lock);
232
233 list_splice_init(&desc->tx_list, &sh_chan->ld_free);
234 list_add(&desc->node, &sh_chan->ld_free);
235
236 spin_unlock_bh(&sh_chan->desc_lock);
237 }
238}
239
240static int sh_dmae_alloc_chan_resources(struct dma_chan *chan)
241{
242 struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
243 struct sh_desc *desc;
244
245 spin_lock_bh(&sh_chan->desc_lock);
246 while (sh_chan->descs_allocated < NR_DESCS_PER_CHANNEL) {
247 spin_unlock_bh(&sh_chan->desc_lock);
248 desc = kzalloc(sizeof(struct sh_desc), GFP_KERNEL);
249 if (!desc) {
250 spin_lock_bh(&sh_chan->desc_lock);
251 break;
252 }
253 dma_async_tx_descriptor_init(&desc->async_tx,
254 &sh_chan->common);
255 desc->async_tx.tx_submit = sh_dmae_tx_submit;
256 desc->async_tx.flags = DMA_CTRL_ACK;
257 INIT_LIST_HEAD(&desc->tx_list);
258 sh_dmae_put_desc(sh_chan, desc);
259
260 spin_lock_bh(&sh_chan->desc_lock);
261 sh_chan->descs_allocated++;
262 }
263 spin_unlock_bh(&sh_chan->desc_lock);
264
265 return sh_chan->descs_allocated;
266}
267
268/*
 269 * sh_dmae_free_chan_resources - Free all resources of the channel.
270 */
271static void sh_dmae_free_chan_resources(struct dma_chan *chan)
272{
273 struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
274 struct sh_desc *desc, *_desc;
275 LIST_HEAD(list);
276
277 BUG_ON(!list_empty(&sh_chan->ld_queue));
278 spin_lock_bh(&sh_chan->desc_lock);
279
280 list_splice_init(&sh_chan->ld_free, &list);
281 sh_chan->descs_allocated = 0;
282
283 spin_unlock_bh(&sh_chan->desc_lock);
284
285 list_for_each_entry_safe(desc, _desc, &list, node)
286 kfree(desc);
287}
288
289static struct dma_async_tx_descriptor *sh_dmae_prep_memcpy(
290 struct dma_chan *chan, dma_addr_t dma_dest, dma_addr_t dma_src,
291 size_t len, unsigned long flags)
292{
293 struct sh_dmae_chan *sh_chan;
294 struct sh_desc *first = NULL, *prev = NULL, *new;
295 size_t copy_size;
296
297 if (!chan)
298 return NULL;
299
300 if (!len)
301 return NULL;
302
303 sh_chan = to_sh_chan(chan);
304
305 do {
306 /* Allocate the link descriptor from DMA pool */
307 new = sh_dmae_get_desc(sh_chan);
308 if (!new) {
309 dev_err(sh_chan->dev,
310 "No free memory for link descriptor\n");
311 goto err_get_desc;
312 }
313
314 copy_size = min(len, (size_t)SH_DMA_TCR_MAX);
315
316 new->hw.sar = dma_src;
317 new->hw.dar = dma_dest;
318 new->hw.tcr = copy_size;
319 if (!first)
320 first = new;
321
322 new->mark = DESC_NCOMP;
323 async_tx_ack(&new->async_tx);
324
325 prev = new;
326 len -= copy_size;
327 dma_src += copy_size;
328 dma_dest += copy_size;
329 /* Insert the link descriptor to the LD ring */
330 list_add_tail(&new->node, &first->tx_list);
331 } while (len);
332
333 new->async_tx.flags = flags; /* client is in control of this ack */
334 new->async_tx.cookie = -EBUSY; /* Last desc */
335
336 return &first->async_tx;
337
338err_get_desc:
339 sh_dmae_put_desc(sh_chan, first);
340 return NULL;
341
342}
343
344/*
 345 * sh_dmae_chan_ld_cleanup - Clean up link descriptors
 346 *
 347 * This function cleans up the ld_queue of a DMA channel.
348 */
349static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan)
350{
351 struct sh_desc *desc, *_desc;
352
353 spin_lock_bh(&sh_chan->desc_lock);
354 list_for_each_entry_safe(desc, _desc, &sh_chan->ld_queue, node) {
355 dma_async_tx_callback callback;
356 void *callback_param;
357
 358 /* stop at the first descriptor that has not been transferred yet */
359 if (desc->mark == DESC_NCOMP)
360 break;
361
 362 /* descriptor whose data has been sent */
363 callback = desc->async_tx.callback;
364 callback_param = desc->async_tx.callback_param;
365
366 /* Remove from ld_queue list */
367 list_splice_init(&desc->tx_list, &sh_chan->ld_free);
368
 369 dev_dbg(sh_chan->dev, "link descriptor %p will be recycled.\n",
370 desc);
371
372 list_move(&desc->node, &sh_chan->ld_free);
373 /* Run the link descriptor callback function */
374 if (callback) {
375 spin_unlock_bh(&sh_chan->desc_lock);
376 dev_dbg(sh_chan->dev, "link descriptor %p callback\n",
377 desc);
378 callback(callback_param);
379 spin_lock_bh(&sh_chan->desc_lock);
380 }
381 }
382 spin_unlock_bh(&sh_chan->desc_lock);
383}
384
385static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan)
386{
387 struct list_head *ld_node;
388 struct sh_dmae_regs hw;
389
390 /* DMA work check */
391 if (dmae_is_idle(sh_chan))
392 return;
393
 394 /* Find the first un-transferred descriptor */
395 for (ld_node = sh_chan->ld_queue.next;
396 (ld_node != &sh_chan->ld_queue)
397 && (to_sh_desc(ld_node)->mark == DESC_COMP);
398 ld_node = ld_node->next)
399 cpu_relax();
400
401 if (ld_node != &sh_chan->ld_queue) {
402 /* Get the ld start address from ld_queue */
403 hw = to_sh_desc(ld_node)->hw;
404 dmae_set_reg(sh_chan, hw);
405 dmae_start(sh_chan);
406 }
407}
408
409static void sh_dmae_memcpy_issue_pending(struct dma_chan *chan)
410{
411 struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
412 sh_chan_xfer_ld_queue(sh_chan);
413}
414
415static enum dma_status sh_dmae_is_complete(struct dma_chan *chan,
416 dma_cookie_t cookie,
417 dma_cookie_t *done,
418 dma_cookie_t *used)
419{
420 struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
421 dma_cookie_t last_used;
422 dma_cookie_t last_complete;
423
424 sh_dmae_chan_ld_cleanup(sh_chan);
425
426 last_used = chan->cookie;
427 last_complete = sh_chan->completed_cookie;
428 if (last_complete == -EBUSY)
429 last_complete = last_used;
430
431 if (done)
432 *done = last_complete;
433
434 if (used)
435 *used = last_used;
436
437 return dma_async_is_complete(cookie, last_complete, last_used);
438}
439
440static irqreturn_t sh_dmae_interrupt(int irq, void *data)
441{
442 irqreturn_t ret = IRQ_NONE;
443 struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
444 u32 chcr = sh_dmae_readl(sh_chan, CHCR);
445
446 if (chcr & CHCR_TE) {
447 /* DMA stop */
448 dmae_halt(sh_chan);
449
450 ret = IRQ_HANDLED;
451 tasklet_schedule(&sh_chan->tasklet);
452 }
453
454 return ret;
455}
456
457#if defined(CONFIG_CPU_SH4)
458static irqreturn_t sh_dmae_err(int irq, void *data)
459{
460 int err = 0;
461 struct sh_dmae_device *shdev = (struct sh_dmae_device *)data;
462
463 /* IRQ Multi */
464 if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
465 int cnt = 0;
466 switch (irq) {
467#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
468 case DMTE6_IRQ:
469 cnt++;
470#endif
471 case DMTE0_IRQ:
472 if (dmaor_read_reg(cnt) & (DMAOR_NMIF | DMAOR_AE)) {
473 disable_irq(irq);
474 return IRQ_HANDLED;
475 }
476 default:
477 return IRQ_NONE;
478 }
479 } else {
480 /* reset dma controller */
481 err = sh_dmae_rst(0);
482 if (err)
483 return err;
484 if (shdev->pdata.mode & SHDMA_DMAOR1) {
485 err = sh_dmae_rst(1);
486 if (err)
487 return err;
488 }
489 disable_irq(irq);
490 return IRQ_HANDLED;
491 }
492}
493#endif
494
495static void dmae_do_tasklet(unsigned long data)
496{
497 struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
498 struct sh_desc *desc, *_desc, *cur_desc = NULL;
499 u32 sar_buf = sh_dmae_readl(sh_chan, SAR);
500 list_for_each_entry_safe(desc, _desc,
501 &sh_chan->ld_queue, node) {
502 if ((desc->hw.sar + desc->hw.tcr) == sar_buf) {
503 cur_desc = desc;
504 break;
505 }
506 }
507
508 if (cur_desc) {
509 switch (cur_desc->async_tx.cookie) {
510 case 0: /* other desc data */
511 break;
512 case -EBUSY: /* last desc */
513 sh_chan->completed_cookie =
514 cur_desc->async_tx.cookie;
515 break;
 516 default: /* first desc (cookie > 0) */
517 sh_chan->completed_cookie =
518 cur_desc->async_tx.cookie - 1;
519 break;
520 }
521 cur_desc->mark = DESC_COMP;
522 }
523 /* Next desc */
524 sh_chan_xfer_ld_queue(sh_chan);
525 sh_dmae_chan_ld_cleanup(sh_chan);
526}
527
528static unsigned int get_dmae_irq(unsigned int id)
529{
530 unsigned int irq = 0;
531 if (id < ARRAY_SIZE(dmte_irq_map))
532 irq = dmte_irq_map[id];
533 return irq;
534}
535
536static int __devinit sh_dmae_chan_probe(struct sh_dmae_device *shdev, int id)
537{
538 int err;
539 unsigned int irq = get_dmae_irq(id);
540 unsigned long irqflags = IRQF_DISABLED;
541 struct sh_dmae_chan *new_sh_chan;
542
543 /* alloc channel */
544 new_sh_chan = kzalloc(sizeof(struct sh_dmae_chan), GFP_KERNEL);
545 if (!new_sh_chan) {
546 dev_err(shdev->common.dev, "No free memory for allocating "
547 "dma channels!\n");
548 return -ENOMEM;
549 }
550
551 new_sh_chan->dev = shdev->common.dev;
552 new_sh_chan->id = id;
553
554 /* Init DMA tasklet */
555 tasklet_init(&new_sh_chan->tasklet, dmae_do_tasklet,
556 (unsigned long)new_sh_chan);
557
558 /* Init the channel */
559 dmae_init(new_sh_chan);
560
561 spin_lock_init(&new_sh_chan->desc_lock);
562
 563 /* Init descriptor management lists */
564 INIT_LIST_HEAD(&new_sh_chan->ld_queue);
565 INIT_LIST_HEAD(&new_sh_chan->ld_free);
566
567 /* copy struct dma_device */
568 new_sh_chan->common.device = &shdev->common;
569
570 /* Add the channel to DMA device channel list */
571 list_add_tail(&new_sh_chan->common.device_node,
572 &shdev->common.channels);
573 shdev->common.chancnt++;
574
575 if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
576 irqflags = IRQF_SHARED;
577#if defined(DMTE6_IRQ)
578 if (irq >= DMTE6_IRQ)
579 irq = DMTE6_IRQ;
580 else
581#endif
582 irq = DMTE0_IRQ;
583 }
584
585 snprintf(new_sh_chan->dev_id, sizeof(new_sh_chan->dev_id),
586 "sh-dmae%d", new_sh_chan->id);
587
588 /* set up channel irq */
589 err = request_irq(irq, &sh_dmae_interrupt,
590 irqflags, new_sh_chan->dev_id, new_sh_chan);
591 if (err) {
592 dev_err(shdev->common.dev, "DMA channel %d request_irq error "
593 "with return %d\n", id, err);
594 goto err_no_irq;
595 }
596
597 /* CHCR register control function */
598 new_sh_chan->set_chcr = dmae_set_chcr;
599 /* DMARS register control function */
600 new_sh_chan->set_dmars = dmae_set_dmars;
601
602 shdev->chan[id] = new_sh_chan;
603 return 0;
604
605err_no_irq:
606 /* remove from dmaengine device node */
607 list_del(&new_sh_chan->common.device_node);
608 kfree(new_sh_chan);
609 return err;
610}
611
612static void sh_dmae_chan_remove(struct sh_dmae_device *shdev)
613{
614 int i;
615
616 for (i = shdev->common.chancnt - 1 ; i >= 0 ; i--) {
617 if (shdev->chan[i]) {
618 struct sh_dmae_chan *shchan = shdev->chan[i];
619 if (!(shdev->pdata.mode & SHDMA_MIX_IRQ))
620 free_irq(dmte_irq_map[i], shchan);
621
622 list_del(&shchan->common.device_node);
623 kfree(shchan);
624 shdev->chan[i] = NULL;
625 }
626 }
627 shdev->common.chancnt = 0;
628}
629
630static int __init sh_dmae_probe(struct platform_device *pdev)
631{
632 int err = 0, cnt, ecnt;
633 unsigned long irqflags = IRQF_DISABLED;
634#if defined(CONFIG_CPU_SH4)
635 int eirq[] = { DMAE0_IRQ,
636#if defined(DMAE1_IRQ)
637 DMAE1_IRQ
638#endif
639 };
640#endif
641 struct sh_dmae_device *shdev;
642
643 shdev = kzalloc(sizeof(struct sh_dmae_device), GFP_KERNEL);
644 if (!shdev) {
 645 dev_err(&pdev->dev, "Not enough memory\n");
646 err = -ENOMEM;
647 goto shdev_err;
648 }
649
650 /* get platform data */
651 if (!pdev->dev.platform_data)
652 goto shdev_err;
653
654 /* platform data */
655 memcpy(&shdev->pdata, pdev->dev.platform_data,
656 sizeof(struct sh_dmae_pdata));
657
658 /* reset dma controller */
659 err = sh_dmae_rst(0);
660 if (err)
661 goto rst_err;
662
663 /* SH7780/85/23 has DMAOR1 */
664 if (shdev->pdata.mode & SHDMA_DMAOR1) {
665 err = sh_dmae_rst(1);
666 if (err)
667 goto rst_err;
668 }
669
670 INIT_LIST_HEAD(&shdev->common.channels);
671
672 dma_cap_set(DMA_MEMCPY, shdev->common.cap_mask);
673 shdev->common.device_alloc_chan_resources
674 = sh_dmae_alloc_chan_resources;
675 shdev->common.device_free_chan_resources = sh_dmae_free_chan_resources;
676 shdev->common.device_prep_dma_memcpy = sh_dmae_prep_memcpy;
677 shdev->common.device_is_tx_complete = sh_dmae_is_complete;
678 shdev->common.device_issue_pending = sh_dmae_memcpy_issue_pending;
679 shdev->common.dev = &pdev->dev;
680
681#if defined(CONFIG_CPU_SH4)
682 /* Non Mix IRQ mode SH7722/SH7730 etc... */
683 if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
684 irqflags = IRQF_SHARED;
685 eirq[0] = DMTE0_IRQ;
686#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
687 eirq[1] = DMTE6_IRQ;
688#endif
689 }
690
691 for (ecnt = 0 ; ecnt < ARRAY_SIZE(eirq); ecnt++) {
692 err = request_irq(eirq[ecnt], sh_dmae_err,
693 irqflags, "DMAC Address Error", shdev);
694 if (err) {
 695 dev_err(&pdev->dev, "DMA device request_irq "
696 "error (irq %d) with return %d\n",
697 eirq[ecnt], err);
698 goto eirq_err;
699 }
700 }
701#endif /* CONFIG_CPU_SH4 */
702
703 /* Create DMA Channel */
704 for (cnt = 0 ; cnt < MAX_DMA_CHANNELS ; cnt++) {
705 err = sh_dmae_chan_probe(shdev, cnt);
706 if (err)
707 goto chan_probe_err;
708 }
709
710 platform_set_drvdata(pdev, shdev);
711 dma_async_device_register(&shdev->common);
712
713 return err;
714
715chan_probe_err:
716 sh_dmae_chan_remove(shdev);
717
718eirq_err:
719 for (ecnt-- ; ecnt >= 0; ecnt--)
720 free_irq(eirq[ecnt], shdev);
721
722rst_err:
723 kfree(shdev);
724
725shdev_err:
726 return err;
727}
728
729static int __exit sh_dmae_remove(struct platform_device *pdev)
730{
731 struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
732
733 dma_async_device_unregister(&shdev->common);
734
735 if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
736 free_irq(DMTE0_IRQ, shdev);
737#if defined(DMTE6_IRQ)
738 free_irq(DMTE6_IRQ, shdev);
739#endif
740 }
741
742 /* channel data remove */
743 sh_dmae_chan_remove(shdev);
744
745 if (!(shdev->pdata.mode & SHDMA_MIX_IRQ)) {
746 free_irq(DMAE0_IRQ, shdev);
747#if defined(DMAE1_IRQ)
748 free_irq(DMAE1_IRQ, shdev);
749#endif
750 }
751 kfree(shdev);
752
753 return 0;
754}
755
756static void sh_dmae_shutdown(struct platform_device *pdev)
757{
758 struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
759 sh_dmae_ctl_stop(0);
760 if (shdev->pdata.mode & SHDMA_DMAOR1)
761 sh_dmae_ctl_stop(1);
762}
763
764static struct platform_driver sh_dmae_driver = {
765 .remove = __exit_p(sh_dmae_remove),
766 .shutdown = sh_dmae_shutdown,
767 .driver = {
768 .name = "sh-dma-engine",
769 },
770};
771
772static int __init sh_dmae_init(void)
773{
774 return platform_driver_probe(&sh_dmae_driver, sh_dmae_probe);
775}
776module_init(sh_dmae_init);
777
778static void __exit sh_dmae_exit(void)
779{
780 platform_driver_unregister(&sh_dmae_driver);
781}
782module_exit(sh_dmae_exit);
783
784MODULE_AUTHOR("Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>");
785MODULE_DESCRIPTION("Renesas SH DMA Engine driver");
786MODULE_LICENSE("GPL");
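
Aside (not part of this patch): dmae_set_reg() above programs TCR with the byte count shifted down by calc_xmit_shift(), because TCR counts transfer units (whose size is encoded in the CHCR TS field), not bytes. A hedged sketch of that conversion in isolation; bytes_to_tcr() is a hypothetical helper, and ts_shift[] is the same table the driver indexes above.

#include <linux/types.h>
#include <cpu/dma.h>
#include <asm/dma-sh.h>

/* Sketch: convert a byte count into the unit count the hardware expects.
 * With 4-byte transfers the shift is 2, so 0x1000 bytes becomes a TCR
 * value of 0x400.
 */
static u32 bytes_to_tcr(u32 bytes, u32 chcr)
{
	unsigned int shift = ts_shift[(chcr & CHCR_TS_MASK) >> CHCR_TS_SHIFT];

	return bytes >> shift;
}
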
diff --git a/drivers/dma/shdma.h b/drivers/dma/shdma.h
new file mode 100644
index 000000000000..2b4bc15a2c0a
--- /dev/null
+++ b/drivers/dma/shdma.h
@@ -0,0 +1,64 @@
1/*
2 * Renesas SuperH DMA Engine support
3 *
4 * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
5 * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
6 *
7 * This is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 */
13#ifndef __DMA_SHDMA_H
14#define __DMA_SHDMA_H
15
16#include <linux/device.h>
17#include <linux/dmapool.h>
18#include <linux/dmaengine.h>
19
20#define SH_DMA_TCR_MAX 0x00FFFFFF /* 16MB */
21
22struct sh_dmae_regs {
23 u32 sar; /* SAR / source address */
24 u32 dar; /* DAR / destination address */
25 u32 tcr; /* TCR / transfer count */
26};
27
28struct sh_desc {
29 struct list_head tx_list;
30 struct sh_dmae_regs hw;
31 struct list_head node;
32 struct dma_async_tx_descriptor async_tx;
33 int mark;
34};
35
36struct sh_dmae_chan {
37 dma_cookie_t completed_cookie; /* The maximum cookie completed */
38 spinlock_t desc_lock; /* Descriptor operation lock */
39 struct list_head ld_queue; /* Link descriptors queue */
40 struct list_head ld_free; /* Link descriptors free */
41 struct dma_chan common; /* DMA common channel */
42 struct device *dev; /* Channel device */
43 struct tasklet_struct tasklet; /* Tasklet */
44 int descs_allocated; /* desc count */
45 int id; /* Raw id of this channel */
46 char dev_id[16]; /* unique name per DMAC of channel */
47
48 /* Set chcr */
49 int (*set_chcr)(struct sh_dmae_chan *sh_chan, u32 regs);
50 /* Set DMA resource */
51 int (*set_dmars)(struct sh_dmae_chan *sh_chan, u16 res);
52};
53
54struct sh_dmae_device {
55 struct dma_device common;
56 struct sh_dmae_chan *chan[MAX_DMA_CHANNELS];
57 struct sh_dmae_pdata pdata;
58};
59
60#define to_sh_chan(chan) container_of(chan, struct sh_dmae_chan, common)
61#define to_sh_desc(lh) container_of(lh, struct sh_desc, node)
62#define tx_to_sh_desc(tx) container_of(tx, struct sh_desc, async_tx)
63
64#endif /* __DMA_SHDMA_H */
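
Aside (not part of this patch): because the probe routine sets DMA_MEMCPY in the capability mask, a client never touches the sh_dmae_* structures directly; it asks the dmaengine core for a suitable channel. A hedged usage sketch; get_memcpy_chan() is a hypothetical helper.

#include <linux/dmaengine.h>

/* Sketch: obtain any channel advertising memcpy capability (which would
 * include the channels registered by this driver), or NULL if none.
 */
static struct dma_chan *get_memcpy_chan(void)
{
	dma_cap_mask_t mask;

	dma_cap_zero(mask);
	dma_cap_set(DMA_MEMCPY, mask);

	return dma_request_channel(mask, NULL, NULL);
}
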
diff --git a/drivers/dma/txx9dmac.c b/drivers/dma/txx9dmac.c
index 7837930146a4..fb6bb64e8861 100644
--- a/drivers/dma/txx9dmac.c
+++ b/drivers/dma/txx9dmac.c
@@ -180,9 +180,8 @@ static struct txx9dmac_desc *txx9dmac_first_queued(struct txx9dmac_chan *dc)
180 180
181static struct txx9dmac_desc *txx9dmac_last_child(struct txx9dmac_desc *desc) 181static struct txx9dmac_desc *txx9dmac_last_child(struct txx9dmac_desc *desc)
182{ 182{
183 if (!list_empty(&desc->txd.tx_list)) 183 if (!list_empty(&desc->tx_list))
184 desc = list_entry(desc->txd.tx_list.prev, 184 desc = list_entry(desc->tx_list.prev, typeof(*desc), desc_node);
185 struct txx9dmac_desc, desc_node);
186 return desc; 185 return desc;
187} 186}
188 187
@@ -197,6 +196,7 @@ static struct txx9dmac_desc *txx9dmac_desc_alloc(struct txx9dmac_chan *dc,
197 desc = kzalloc(sizeof(*desc), flags); 196 desc = kzalloc(sizeof(*desc), flags);
198 if (!desc) 197 if (!desc)
199 return NULL; 198 return NULL;
199 INIT_LIST_HEAD(&desc->tx_list);
200 dma_async_tx_descriptor_init(&desc->txd, &dc->chan); 200 dma_async_tx_descriptor_init(&desc->txd, &dc->chan);
201 desc->txd.tx_submit = txx9dmac_tx_submit; 201 desc->txd.tx_submit = txx9dmac_tx_submit;
202 /* txd.flags will be overwritten in prep funcs */ 202 /* txd.flags will be overwritten in prep funcs */
@@ -245,7 +245,7 @@ static void txx9dmac_sync_desc_for_cpu(struct txx9dmac_chan *dc,
245 struct txx9dmac_dev *ddev = dc->ddev; 245 struct txx9dmac_dev *ddev = dc->ddev;
246 struct txx9dmac_desc *child; 246 struct txx9dmac_desc *child;
247 247
248 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 248 list_for_each_entry(child, &desc->tx_list, desc_node)
249 dma_sync_single_for_cpu(chan2parent(&dc->chan), 249 dma_sync_single_for_cpu(chan2parent(&dc->chan),
250 child->txd.phys, ddev->descsize, 250 child->txd.phys, ddev->descsize,
251 DMA_TO_DEVICE); 251 DMA_TO_DEVICE);
@@ -267,11 +267,11 @@ static void txx9dmac_desc_put(struct txx9dmac_chan *dc,
267 txx9dmac_sync_desc_for_cpu(dc, desc); 267 txx9dmac_sync_desc_for_cpu(dc, desc);
268 268
269 spin_lock_bh(&dc->lock); 269 spin_lock_bh(&dc->lock);
270 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 270 list_for_each_entry(child, &desc->tx_list, desc_node)
271 dev_vdbg(chan2dev(&dc->chan), 271 dev_vdbg(chan2dev(&dc->chan),
272 "moving child desc %p to freelist\n", 272 "moving child desc %p to freelist\n",
273 child); 273 child);
274 list_splice_init(&desc->txd.tx_list, &dc->free_list); 274 list_splice_init(&desc->tx_list, &dc->free_list);
275 dev_vdbg(chan2dev(&dc->chan), "moving desc %p to freelist\n", 275 dev_vdbg(chan2dev(&dc->chan), "moving desc %p to freelist\n",
276 desc); 276 desc);
277 list_add(&desc->desc_node, &dc->free_list); 277 list_add(&desc->desc_node, &dc->free_list);
@@ -429,7 +429,7 @@ txx9dmac_descriptor_complete(struct txx9dmac_chan *dc,
429 param = txd->callback_param; 429 param = txd->callback_param;
430 430
431 txx9dmac_sync_desc_for_cpu(dc, desc); 431 txx9dmac_sync_desc_for_cpu(dc, desc);
432 list_splice_init(&txd->tx_list, &dc->free_list); 432 list_splice_init(&desc->tx_list, &dc->free_list);
433 list_move(&desc->desc_node, &dc->free_list); 433 list_move(&desc->desc_node, &dc->free_list);
434 434
435 if (!ds) { 435 if (!ds) {
@@ -571,7 +571,7 @@ static void txx9dmac_handle_error(struct txx9dmac_chan *dc, u32 csr)
571 "Bad descriptor submitted for DMA! (cookie: %d)\n", 571 "Bad descriptor submitted for DMA! (cookie: %d)\n",
572 bad_desc->txd.cookie); 572 bad_desc->txd.cookie);
573 txx9dmac_dump_desc(dc, &bad_desc->hwdesc); 573 txx9dmac_dump_desc(dc, &bad_desc->hwdesc);
574 list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) 574 list_for_each_entry(child, &bad_desc->tx_list, desc_node)
575 txx9dmac_dump_desc(dc, &child->hwdesc); 575 txx9dmac_dump_desc(dc, &child->hwdesc);
576 /* Pretend the descriptor completed successfully */ 576 /* Pretend the descriptor completed successfully */
577 txx9dmac_descriptor_complete(dc, bad_desc); 577 txx9dmac_descriptor_complete(dc, bad_desc);
@@ -613,7 +613,7 @@ static void txx9dmac_scan_descriptors(struct txx9dmac_chan *dc)
613 return; 613 return;
614 } 614 }
615 615
616 list_for_each_entry(child, &desc->txd.tx_list, desc_node) 616 list_for_each_entry(child, &desc->tx_list, desc_node)
617 if (desc_read_CHAR(dc, child) == chain) { 617 if (desc_read_CHAR(dc, child) == chain) {
618 /* Currently in progress */ 618 /* Currently in progress */
619 if (csr & TXX9_DMA_CSR_ABCHC) 619 if (csr & TXX9_DMA_CSR_ABCHC)
@@ -823,8 +823,7 @@ txx9dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
823 dma_sync_single_for_device(chan2parent(&dc->chan), 823 dma_sync_single_for_device(chan2parent(&dc->chan),
824 prev->txd.phys, ddev->descsize, 824 prev->txd.phys, ddev->descsize,
825 DMA_TO_DEVICE); 825 DMA_TO_DEVICE);
826 list_add_tail(&desc->desc_node, 826 list_add_tail(&desc->desc_node, &first->tx_list);
827 &first->txd.tx_list);
828 } 827 }
829 prev = desc; 828 prev = desc;
830 } 829 }
@@ -919,8 +918,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
919 prev->txd.phys, 918 prev->txd.phys,
920 ddev->descsize, 919 ddev->descsize,
921 DMA_TO_DEVICE); 920 DMA_TO_DEVICE);
922 list_add_tail(&desc->desc_node, 921 list_add_tail(&desc->desc_node, &first->tx_list);
923 &first->txd.tx_list);
924 } 922 }
925 prev = desc; 923 prev = desc;
926 } 924 }
diff --git a/drivers/dma/txx9dmac.h b/drivers/dma/txx9dmac.h
index c907ff01d276..365d42366b9f 100644
--- a/drivers/dma/txx9dmac.h
+++ b/drivers/dma/txx9dmac.h
@@ -231,6 +231,7 @@ struct txx9dmac_desc {
231 231
232 /* THEN values for driver housekeeping */ 232 /* THEN values for driver housekeeping */
233 struct list_head desc_node ____cacheline_aligned; 233 struct list_head desc_node ____cacheline_aligned;
234 struct list_head tx_list;
234 struct dma_async_tx_descriptor txd; 235 struct dma_async_tx_descriptor txd;
235 size_t len; 236 size_t len;
236}; 237};
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index a3ca18e2d7cf..02127e59fe8e 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -133,6 +133,13 @@ config EDAC_I3000
133 Support for error detection and correction on the Intel 133 Support for error detection and correction on the Intel
134 3000 and 3010 server chipsets. 134 3000 and 3010 server chipsets.
135 135
136config EDAC_I3200
137 tristate "Intel 3200"
138 depends on EDAC_MM_EDAC && PCI && X86 && EXPERIMENTAL
139 help
140 Support for error detection and correction on the Intel
141 3200 and 3210 server chipsets.
142
136config EDAC_X38 143config EDAC_X38
137 tristate "Intel X38" 144 tristate "Intel X38"
138 depends on EDAC_MM_EDAC && PCI && X86 145 depends on EDAC_MM_EDAC && PCI && X86
@@ -176,11 +183,11 @@ config EDAC_I5100
176 San Clemente MCH. 183 San Clemente MCH.
177 184
178config EDAC_MPC85XX 185config EDAC_MPC85XX
179 tristate "Freescale MPC85xx" 186 tristate "Freescale MPC83xx / MPC85xx"
180 depends on EDAC_MM_EDAC && FSL_SOC && MPC85xx 187 depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || MPC85xx)
181 help 188 help
182 Support for error detection and correction on the Freescale 189 Support for error detection and correction on the Freescale
183 MPC8560, MPC8540, MPC8548 190 MPC8349, MPC8560, MPC8540, MPC8548
184 191
185config EDAC_MV64X60 192config EDAC_MV64X60
186 tristate "Marvell MV64x60" 193 tristate "Marvell MV64x60"
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index cfa033ce53a7..7a473bbe8abd 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_EDAC_I82443BXGX) += i82443bxgx_edac.o
32obj-$(CONFIG_EDAC_I82875P) += i82875p_edac.o 32obj-$(CONFIG_EDAC_I82875P) += i82875p_edac.o
33obj-$(CONFIG_EDAC_I82975X) += i82975x_edac.o 33obj-$(CONFIG_EDAC_I82975X) += i82975x_edac.o
34obj-$(CONFIG_EDAC_I3000) += i3000_edac.o 34obj-$(CONFIG_EDAC_I3000) += i3000_edac.o
35obj-$(CONFIG_EDAC_I3200) += i3200_edac.o
35obj-$(CONFIG_EDAC_X38) += x38_edac.o 36obj-$(CONFIG_EDAC_X38) += x38_edac.o
36obj-$(CONFIG_EDAC_I82860) += i82860_edac.o 37obj-$(CONFIG_EDAC_I82860) += i82860_edac.o
37obj-$(CONFIG_EDAC_R82600) += r82600_edac.o 38obj-$(CONFIG_EDAC_R82600) += r82600_edac.o
@@ -49,3 +50,4 @@ obj-$(CONFIG_EDAC_CELL) += cell_edac.o
49obj-$(CONFIG_EDAC_PPC4XX) += ppc4xx_edac.o 50obj-$(CONFIG_EDAC_PPC4XX) += ppc4xx_edac.o
50obj-$(CONFIG_EDAC_AMD8111) += amd8111_edac.o 51obj-$(CONFIG_EDAC_AMD8111) += amd8111_edac.o
51obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o 52obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o
53
diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
index 8c54196b5aba..3d50274f1348 100644
--- a/drivers/edac/cpc925_edac.c
+++ b/drivers/edac/cpc925_edac.c
@@ -885,14 +885,14 @@ static int __devinit cpc925_probe(struct platform_device *pdev)
885 885
886 if (!devm_request_mem_region(&pdev->dev, 886 if (!devm_request_mem_region(&pdev->dev,
887 r->start, 887 r->start,
888 r->end - r->start + 1, 888 resource_size(r),
889 pdev->name)) { 889 pdev->name)) {
890 cpc925_printk(KERN_ERR, "Unable to request mem region\n"); 890 cpc925_printk(KERN_ERR, "Unable to request mem region\n");
891 res = -EBUSY; 891 res = -EBUSY;
892 goto err1; 892 goto err1;
893 } 893 }
894 894
895 vbase = devm_ioremap(&pdev->dev, r->start, r->end - r->start + 1); 895 vbase = devm_ioremap(&pdev->dev, r->start, resource_size(r));
896 if (!vbase) { 896 if (!vbase) {
897 cpc925_printk(KERN_ERR, "Unable to ioremap device\n"); 897 cpc925_printk(KERN_ERR, "Unable to ioremap device\n");
898 res = -ENOMEM; 898 res = -ENOMEM;
@@ -953,7 +953,7 @@ err3:
953 cpc925_mc_exit(mci); 953 cpc925_mc_exit(mci);
954 edac_mc_free(mci); 954 edac_mc_free(mci);
955err2: 955err2:
956 devm_release_mem_region(&pdev->dev, r->start, r->end-r->start+1); 956 devm_release_mem_region(&pdev->dev, r->start, resource_size(r));
957err1: 957err1:
958 devres_release_group(&pdev->dev, cpc925_probe); 958 devres_release_group(&pdev->dev, cpc925_probe);
959out: 959out:
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index b02a6a69a8f0..d5e13c94714f 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -356,7 +356,6 @@ static void complete_edac_device_list_del(struct rcu_head *head)
356 356
357 edac_dev = container_of(head, struct edac_device_ctl_info, rcu); 357 edac_dev = container_of(head, struct edac_device_ctl_info, rcu);
358 INIT_LIST_HEAD(&edac_dev->link); 358 INIT_LIST_HEAD(&edac_dev->link);
359 complete(&edac_dev->removal_complete);
360} 359}
361 360
362/* 361/*
@@ -369,10 +368,8 @@ static void del_edac_device_from_global_list(struct edac_device_ctl_info
369 *edac_device) 368 *edac_device)
370{ 369{
371 list_del_rcu(&edac_device->link); 370 list_del_rcu(&edac_device->link);
372
373 init_completion(&edac_device->removal_complete);
374 call_rcu(&edac_device->rcu, complete_edac_device_list_del); 371 call_rcu(&edac_device->rcu, complete_edac_device_list_del);
375 wait_for_completion(&edac_device->removal_complete); 372 rcu_barrier();
376} 373}
377 374
378/* 375/*
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 335b7ebdb11c..b629c41756f0 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -418,16 +418,14 @@ static void complete_mc_list_del(struct rcu_head *head)
418 418
419 mci = container_of(head, struct mem_ctl_info, rcu); 419 mci = container_of(head, struct mem_ctl_info, rcu);
420 INIT_LIST_HEAD(&mci->link); 420 INIT_LIST_HEAD(&mci->link);
421 complete(&mci->complete);
422} 421}
423 422
424static void del_mc_from_global_list(struct mem_ctl_info *mci) 423static void del_mc_from_global_list(struct mem_ctl_info *mci)
425{ 424{
426 atomic_dec(&edac_handlers); 425 atomic_dec(&edac_handlers);
427 list_del_rcu(&mci->link); 426 list_del_rcu(&mci->link);
428 init_completion(&mci->complete);
429 call_rcu(&mci->rcu, complete_mc_list_del); 427 call_rcu(&mci->rcu, complete_mc_list_del);
430 wait_for_completion(&mci->complete); 428 rcu_barrier();
431} 429}
432 430
433/** 431/**
diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c
index 30b585b1d60b..efb5d5650783 100644
--- a/drivers/edac/edac_pci.c
+++ b/drivers/edac/edac_pci.c
@@ -174,7 +174,6 @@ static void complete_edac_pci_list_del(struct rcu_head *head)
174 174
175 pci = container_of(head, struct edac_pci_ctl_info, rcu); 175 pci = container_of(head, struct edac_pci_ctl_info, rcu);
176 INIT_LIST_HEAD(&pci->link); 176 INIT_LIST_HEAD(&pci->link);
177 complete(&pci->complete);
178} 177}
179 178
180/* 179/*
@@ -185,9 +184,8 @@ static void complete_edac_pci_list_del(struct rcu_head *head)
185static void del_edac_pci_from_global_list(struct edac_pci_ctl_info *pci) 184static void del_edac_pci_from_global_list(struct edac_pci_ctl_info *pci)
186{ 185{
187 list_del_rcu(&pci->link); 186 list_del_rcu(&pci->link);
188 init_completion(&pci->complete);
189 call_rcu(&pci->rcu, complete_edac_pci_list_del); 187 call_rcu(&pci->rcu, complete_edac_pci_list_del);
190 wait_for_completion(&pci->complete); 188 rcu_barrier();
191} 189}
192 190
193#if 0 191#if 0
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
new file mode 100644
index 000000000000..fde4db91c4d2
--- /dev/null
+++ b/drivers/edac/i3200_edac.c
@@ -0,0 +1,527 @@
1/*
2 * Intel 3200/3210 Memory Controller kernel module
3 * Copyright (C) 2008-2009 Akamai Technologies, Inc.
4 * Portions by Hitoshi Mitake <h.mitake@gmail.com>.
5 *
6 * This file may be distributed under the terms of the
7 * GNU General Public License.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/pci.h>
13#include <linux/pci_ids.h>
14#include <linux/slab.h>
15#include <linux/edac.h>
16#include <linux/io.h>
17#include "edac_core.h"
18
19#define I3200_REVISION "1.1"
20
21#define EDAC_MOD_STR "i3200_edac"
22
23#define PCI_DEVICE_ID_INTEL_3200_HB 0x29f0
24
25#define I3200_RANKS 8
26#define I3200_RANKS_PER_CHANNEL 4
27#define I3200_CHANNELS 2
28
29/* Intel 3200 register addresses - device 0 function 0 - DRAM Controller */
30
31#define I3200_MCHBAR_LOW 0x48 /* MCH Memory Mapped Register BAR */
32#define I3200_MCHBAR_HIGH 0x4c
33#define I3200_MCHBAR_MASK 0xfffffc000ULL /* bits 35:14 */
34#define I3200_MMR_WINDOW_SIZE 16384
35
36#define I3200_TOM 0xa0 /* Top of Memory (16b)
37 *
38 * 15:10 reserved
39 * 9:0 total populated physical memory
40 */
41#define I3200_TOM_MASK 0x3ff /* bits 9:0 */
42#define I3200_TOM_SHIFT 26 /* 64MiB grain */
43
44#define I3200_ERRSTS 0xc8 /* Error Status Register (16b)
45 *
46 * 15 reserved
47 * 14 Isochronous TBWRR Run Behind FIFO Full
48 * (ITCV)
49 * 13 Isochronous TBWRR Run Behind FIFO Put
50 * (ITSTV)
51 * 12 reserved
52 * 11 MCH Thermal Sensor Event
53 * for SMI/SCI/SERR (GTSE)
54 * 10 reserved
55 * 9 LOCK to non-DRAM Memory Flag (LCKF)
56 * 8 reserved
57 * 7 DRAM Throttle Flag (DTF)
58 * 6:2 reserved
59 * 1 Multi-bit DRAM ECC Error Flag (DMERR)
60 * 0 Single-bit DRAM ECC Error Flag (DSERR)
61 */
62#define I3200_ERRSTS_UE 0x0002
63#define I3200_ERRSTS_CE 0x0001
64#define I3200_ERRSTS_BITS (I3200_ERRSTS_UE | I3200_ERRSTS_CE)
65
66
67/* Intel MMIO register space - device 0 function 0 - MMR space */
68
69#define I3200_C0DRB 0x200 /* Channel 0 DRAM Rank Boundary (16b x 4)
70 *
71 * 15:10 reserved
72 * 9:0 Channel 0 DRAM Rank Boundary Address
73 */
74#define I3200_C1DRB 0x600 /* Channel 1 DRAM Rank Boundary (16b x 4) */
75#define I3200_DRB_MASK 0x3ff /* bits 9:0 */
76#define I3200_DRB_SHIFT 26 /* 64MiB grain */
77
78#define I3200_C0ECCERRLOG 0x280 /* Channel 0 ECC Error Log (64b)
79 *
80 * 63:48 Error Column Address (ERRCOL)
81 * 47:32 Error Row Address (ERRROW)
82 * 31:29 Error Bank Address (ERRBANK)
83 * 28:27 Error Rank Address (ERRRANK)
84 * 26:24 reserved
85 * 23:16 Error Syndrome (ERRSYND)
86 * 15: 2 reserved
87 * 1 Multiple Bit Error Status (MERRSTS)
88 * 0 Correctable Error Status (CERRSTS)
89 */
90#define I3200_C1ECCERRLOG 0x680 /* Chan 1 ECC Error Log (64b) */
91#define I3200_ECCERRLOG_CE 0x1
92#define I3200_ECCERRLOG_UE 0x2
93#define I3200_ECCERRLOG_RANK_BITS 0x18000000
94#define I3200_ECCERRLOG_RANK_SHIFT 27
95#define I3200_ECCERRLOG_SYNDROME_BITS 0xff0000
96#define I3200_ECCERRLOG_SYNDROME_SHIFT 16
97#define I3200_CAPID0 0xe0 /* P.95 of spec for details */
98
99struct i3200_priv {
100 void __iomem *window;
101};
102
103static int nr_channels;
104
105static int how_many_channels(struct pci_dev *pdev)
106{
107 unsigned char capid0_8b; /* 8th byte of CAPID0 */
108
109 pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
110 if (capid0_8b & 0x20) { /* check DCD: Dual Channel Disable */
111 debugf0("In single channel mode.\n");
112 return 1;
113 } else {
114 debugf0("In dual channel mode.\n");
115 return 2;
116 }
117}
118
119static unsigned long eccerrlog_syndrome(u64 log)
120{
121 return (log & I3200_ECCERRLOG_SYNDROME_BITS) >>
122 I3200_ECCERRLOG_SYNDROME_SHIFT;
123}
124
125static int eccerrlog_row(int channel, u64 log)
126{
127 u64 rank = ((log & I3200_ECCERRLOG_RANK_BITS) >>
128 I3200_ECCERRLOG_RANK_SHIFT);
129 return rank | (channel * I3200_RANKS_PER_CHANNEL);
130}
131
132enum i3200_chips {
133 I3200 = 0,
134};
135
136struct i3200_dev_info {
137 const char *ctl_name;
138};
139
140struct i3200_error_info {
141 u16 errsts;
142 u16 errsts2;
143 u64 eccerrlog[I3200_CHANNELS];
144};
145
146static const struct i3200_dev_info i3200_devs[] = {
147 [I3200] = {
148 .ctl_name = "i3200"
149 },
150};
151
152static struct pci_dev *mci_pdev;
153static int i3200_registered = 1;
154
155
156static void i3200_clear_error_info(struct mem_ctl_info *mci)
157{
158 struct pci_dev *pdev;
159
160 pdev = to_pci_dev(mci->dev);
161
162 /*
163 * Clear any error bits.
164 * (Yes, we really clear bits by writing 1 to them.)
165 */
166 pci_write_bits16(pdev, I3200_ERRSTS, I3200_ERRSTS_BITS,
167 I3200_ERRSTS_BITS);
168}
169
170static void i3200_get_and_clear_error_info(struct mem_ctl_info *mci,
171 struct i3200_error_info *info)
172{
173 struct pci_dev *pdev;
174 struct i3200_priv *priv = mci->pvt_info;
175 void __iomem *window = priv->window;
176
177 pdev = to_pci_dev(mci->dev);
178
179 /*
180 * This is a mess because there is no atomic way to read all the
 181	 * registers at once, and a CE can be overwritten by a UE
 182	 * between reads.
183 */
184 pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts);
185 if (!(info->errsts & I3200_ERRSTS_BITS))
186 return;
187
188 info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
189 if (nr_channels == 2)
190 info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
191
192 pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts2);
193
194 /*
195 * If the error is the same for both reads then the first set
196 * of reads is valid. If there is a change then there is a CE
197 * with no info and the second set of reads is valid and
198 * should be UE info.
199 */
200 if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
201 info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
202 if (nr_channels == 2)
203 info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
204 }
205
206 i3200_clear_error_info(mci);
207}
208
209static void i3200_process_error_info(struct mem_ctl_info *mci,
210 struct i3200_error_info *info)
211{
212 int channel;
213 u64 log;
214
215 if (!(info->errsts & I3200_ERRSTS_BITS))
216 return;
217
218 if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
219 edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
220 info->errsts = info->errsts2;
221 }
222
223 for (channel = 0; channel < nr_channels; channel++) {
224 log = info->eccerrlog[channel];
225 if (log & I3200_ECCERRLOG_UE) {
226 edac_mc_handle_ue(mci, 0, 0,
227 eccerrlog_row(channel, log),
228 "i3200 UE");
229 } else if (log & I3200_ECCERRLOG_CE) {
230 edac_mc_handle_ce(mci, 0, 0,
231 eccerrlog_syndrome(log),
232 eccerrlog_row(channel, log), 0,
233 "i3200 CE");
234 }
235 }
236}
237
238static void i3200_check(struct mem_ctl_info *mci)
239{
240 struct i3200_error_info info;
241
242 debugf1("MC%d: %s()\n", mci->mc_idx, __func__);
243 i3200_get_and_clear_error_info(mci, &info);
244 i3200_process_error_info(mci, &info);
245}
246
247
248void __iomem *i3200_map_mchbar(struct pci_dev *pdev)
249{
250 union {
251 u64 mchbar;
252 struct {
253 u32 mchbar_low;
254 u32 mchbar_high;
255 };
256 } u;
257 void __iomem *window;
258
259 pci_read_config_dword(pdev, I3200_MCHBAR_LOW, &u.mchbar_low);
260 pci_read_config_dword(pdev, I3200_MCHBAR_HIGH, &u.mchbar_high);
261 u.mchbar &= I3200_MCHBAR_MASK;
262
263 if (u.mchbar != (resource_size_t)u.mchbar) {
264 printk(KERN_ERR
265 "i3200: mmio space beyond accessible range (0x%llx)\n",
266 (unsigned long long)u.mchbar);
267 return NULL;
268 }
269
270 window = ioremap_nocache(u.mchbar, I3200_MMR_WINDOW_SIZE);
271 if (!window)
272 printk(KERN_ERR "i3200: cannot map mmio space at 0x%llx\n",
273 (unsigned long long)u.mchbar);
274
275 return window;
276}
277
278
279static void i3200_get_drbs(void __iomem *window,
280 u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
281{
282 int i;
283
284 for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
285 drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
286 drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
287 }
288}
289
290static bool i3200_is_stacked(struct pci_dev *pdev,
291 u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
292{
293 u16 tom;
294
295 pci_read_config_word(pdev, I3200_TOM, &tom);
296 tom &= I3200_TOM_MASK;
297
298 return drbs[I3200_CHANNELS - 1][I3200_RANKS_PER_CHANNEL - 1] == tom;
299}
300
301static unsigned long drb_to_nr_pages(
302 u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL], bool stacked,
303 int channel, int rank)
304{
305 int n;
306
307 n = drbs[channel][rank];
308 if (rank > 0)
309 n -= drbs[channel][rank - 1];
310 if (stacked && (channel == 1) &&
311 drbs[channel][rank] == drbs[channel][I3200_RANKS_PER_CHANNEL - 1])
312 n -= drbs[0][I3200_RANKS_PER_CHANNEL - 1];
313
314 n <<= (I3200_DRB_SHIFT - PAGE_SHIFT);
315 return n;
316}
317
318static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
319{
320 int rc;
321 int i;
322 struct mem_ctl_info *mci = NULL;
323 unsigned long last_page;
324 u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL];
325 bool stacked;
326 void __iomem *window;
327 struct i3200_priv *priv;
328
329 debugf0("MC: %s()\n", __func__);
330
331 window = i3200_map_mchbar(pdev);
332 if (!window)
333 return -ENODEV;
334
335 i3200_get_drbs(window, drbs);
336 nr_channels = how_many_channels(pdev);
337
338 mci = edac_mc_alloc(sizeof(struct i3200_priv), I3200_RANKS,
339 nr_channels, 0);
340 if (!mci)
341 return -ENOMEM;
342
343 debugf3("MC: %s(): init mci\n", __func__);
344
345 mci->dev = &pdev->dev;
346 mci->mtype_cap = MEM_FLAG_DDR2;
347
348 mci->edac_ctl_cap = EDAC_FLAG_SECDED;
349 mci->edac_cap = EDAC_FLAG_SECDED;
350
351 mci->mod_name = EDAC_MOD_STR;
352 mci->mod_ver = I3200_REVISION;
353 mci->ctl_name = i3200_devs[dev_idx].ctl_name;
354 mci->dev_name = pci_name(pdev);
355 mci->edac_check = i3200_check;
356 mci->ctl_page_to_phys = NULL;
357 priv = mci->pvt_info;
358 priv->window = window;
359
360 stacked = i3200_is_stacked(pdev, drbs);
361
362 /*
363 * The dram rank boundary (DRB) reg values are boundary addresses
364 * for each DRAM rank with a granularity of 64MB. DRB regs are
365 * cumulative; the last one will contain the total memory
366 * contained in all ranks.
367 */
368 last_page = -1UL;
369 for (i = 0; i < mci->nr_csrows; i++) {
370 unsigned long nr_pages;
371 struct csrow_info *csrow = &mci->csrows[i];
372
373 nr_pages = drb_to_nr_pages(drbs, stacked,
374 i / I3200_RANKS_PER_CHANNEL,
375 i % I3200_RANKS_PER_CHANNEL);
376
377 if (nr_pages == 0) {
378 csrow->mtype = MEM_EMPTY;
379 continue;
380 }
381
382 csrow->first_page = last_page + 1;
383 last_page += nr_pages;
384 csrow->last_page = last_page;
385 csrow->nr_pages = nr_pages;
386
387 csrow->grain = nr_pages << PAGE_SHIFT;
388 csrow->mtype = MEM_DDR2;
389 csrow->dtype = DEV_UNKNOWN;
390 csrow->edac_mode = EDAC_UNKNOWN;
391 }
392
393 i3200_clear_error_info(mci);
394
395 rc = -ENODEV;
396 if (edac_mc_add_mc(mci)) {
397 debugf3("MC: %s(): failed edac_mc_add_mc()\n", __func__);
398 goto fail;
399 }
400
401 /* get this far and it's successful */
402 debugf3("MC: %s(): success\n", __func__);
403 return 0;
404
405fail:
406 iounmap(window);
407 if (mci)
408 edac_mc_free(mci);
409
410 return rc;
411}
412
413static int __devinit i3200_init_one(struct pci_dev *pdev,
414 const struct pci_device_id *ent)
415{
416 int rc;
417
418 debugf0("MC: %s()\n", __func__);
419
420 if (pci_enable_device(pdev) < 0)
421 return -EIO;
422
423 rc = i3200_probe1(pdev, ent->driver_data);
424 if (!mci_pdev)
425 mci_pdev = pci_dev_get(pdev);
426
427 return rc;
428}
429
430static void __devexit i3200_remove_one(struct pci_dev *pdev)
431{
432 struct mem_ctl_info *mci;
433 struct i3200_priv *priv;
434
435 debugf0("%s()\n", __func__);
436
437 mci = edac_mc_del_mc(&pdev->dev);
438 if (!mci)
439 return;
440
441 priv = mci->pvt_info;
442 iounmap(priv->window);
443
444 edac_mc_free(mci);
445}
446
447static const struct pci_device_id i3200_pci_tbl[] __devinitdata = {
448 {
449 PCI_VEND_DEV(INTEL, 3200_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
450 I3200},
451 {
452 0,
453 } /* 0 terminated list. */
454};
455
456MODULE_DEVICE_TABLE(pci, i3200_pci_tbl);
457
458static struct pci_driver i3200_driver = {
459 .name = EDAC_MOD_STR,
460 .probe = i3200_init_one,
461 .remove = __devexit_p(i3200_remove_one),
462 .id_table = i3200_pci_tbl,
463};
464
465static int __init i3200_init(void)
466{
467 int pci_rc;
468
469 debugf3("MC: %s()\n", __func__);
470
471 /* Ensure that the OPSTATE is set correctly for POLL or NMI */
472 opstate_init();
473
474 pci_rc = pci_register_driver(&i3200_driver);
475 if (pci_rc < 0)
476 goto fail0;
477
478 if (!mci_pdev) {
479 i3200_registered = 0;
480 mci_pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
481 PCI_DEVICE_ID_INTEL_3200_HB, NULL);
482 if (!mci_pdev) {
483 debugf0("i3200 pci_get_device fail\n");
484 pci_rc = -ENODEV;
485 goto fail1;
486 }
487
488 pci_rc = i3200_init_one(mci_pdev, i3200_pci_tbl);
489 if (pci_rc < 0) {
490 debugf0("i3200 init fail\n");
491 pci_rc = -ENODEV;
492 goto fail1;
493 }
494 }
495
496 return 0;
497
498fail1:
499 pci_unregister_driver(&i3200_driver);
500
501fail0:
502 if (mci_pdev)
503 pci_dev_put(mci_pdev);
504
505 return pci_rc;
506}
507
508static void __exit i3200_exit(void)
509{
510 debugf3("MC: %s()\n", __func__);
511
512 pci_unregister_driver(&i3200_driver);
513 if (!i3200_registered) {
514 i3200_remove_one(mci_pdev);
515 pci_dev_put(mci_pdev);
516 }
517}
518
519module_init(i3200_init);
520module_exit(i3200_exit);
521
522MODULE_LICENSE("GPL");
523MODULE_AUTHOR("Akamai Technologies, Inc.");
524MODULE_DESCRIPTION("MC support for Intel 3200 memory hub controllers");
525
526module_param(edac_op_state, int, 0444);
527MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index 3f2ccfc6407c..157f6504f25e 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -41,7 +41,9 @@ static u32 orig_pci_err_en;
41#endif 41#endif
42 42
43static u32 orig_l2_err_disable; 43static u32 orig_l2_err_disable;
44#ifdef CONFIG_MPC85xx
44static u32 orig_hid1[2]; 45static u32 orig_hid1[2];
46#endif
45 47
46/************************ MC SYSFS parts ***********************************/ 48/************************ MC SYSFS parts ***********************************/
47 49
@@ -646,6 +648,7 @@ static struct of_device_id mpc85xx_l2_err_of_match[] = {
646 { .compatible = "fsl,mpc8560-l2-cache-controller", }, 648 { .compatible = "fsl,mpc8560-l2-cache-controller", },
647 { .compatible = "fsl,mpc8568-l2-cache-controller", }, 649 { .compatible = "fsl,mpc8568-l2-cache-controller", },
648 { .compatible = "fsl,mpc8572-l2-cache-controller", }, 650 { .compatible = "fsl,mpc8572-l2-cache-controller", },
651 { .compatible = "fsl,p2020-l2-cache-controller", },
649 {}, 652 {},
650}; 653};
651 654
@@ -788,19 +791,20 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
788 csrow = &mci->csrows[index]; 791 csrow = &mci->csrows[index];
789 cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 + 792 cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 +
790 (index * MPC85XX_MC_CS_BNDS_OFS)); 793 (index * MPC85XX_MC_CS_BNDS_OFS));
791 start = (cs_bnds & 0xfff0000) << 4; 794
792 end = ((cs_bnds & 0xfff) << 20); 795 start = (cs_bnds & 0xffff0000) >> 16;
793 if (start) 796 end = (cs_bnds & 0x0000ffff);
794 start |= 0xfffff;
795 if (end)
796 end |= 0xfffff;
797 797
798 if (start == end) 798 if (start == end)
799 continue; /* not populated */ 799 continue; /* not populated */
800 800
801 start <<= (24 - PAGE_SHIFT);
802 end <<= (24 - PAGE_SHIFT);
803 end |= (1 << (24 - PAGE_SHIFT)) - 1;
804
801 csrow->first_page = start >> PAGE_SHIFT; 805 csrow->first_page = start >> PAGE_SHIFT;
802 csrow->last_page = end >> PAGE_SHIFT; 806 csrow->last_page = end >> PAGE_SHIFT;
803 csrow->nr_pages = csrow->last_page + 1 - csrow->first_page; 807 csrow->nr_pages = end + 1 - start;
804 csrow->grain = 8; 808 csrow->grain = 8;
805 csrow->mtype = mtype; 809 csrow->mtype = mtype;
806 csrow->dtype = DEV_UNKNOWN; 810 csrow->dtype = DEV_UNKNOWN;
@@ -984,6 +988,8 @@ static struct of_device_id mpc85xx_mc_err_of_match[] = {
984 { .compatible = "fsl,mpc8560-memory-controller", }, 988 { .compatible = "fsl,mpc8560-memory-controller", },
985 { .compatible = "fsl,mpc8568-memory-controller", }, 989 { .compatible = "fsl,mpc8568-memory-controller", },
986 { .compatible = "fsl,mpc8572-memory-controller", }, 990 { .compatible = "fsl,mpc8572-memory-controller", },
991 { .compatible = "fsl,mpc8349-memory-controller", },
992 { .compatible = "fsl,p2020-memory-controller", },
987 {}, 993 {},
988}; 994};
989 995
@@ -999,13 +1005,13 @@ static struct of_platform_driver mpc85xx_mc_err_driver = {
999 }, 1005 },
1000}; 1006};
1001 1007
1002 1008#ifdef CONFIG_MPC85xx
1003static void __init mpc85xx_mc_clear_rfxe(void *data) 1009static void __init mpc85xx_mc_clear_rfxe(void *data)
1004{ 1010{
1005 orig_hid1[smp_processor_id()] = mfspr(SPRN_HID1); 1011 orig_hid1[smp_processor_id()] = mfspr(SPRN_HID1);
1006 mtspr(SPRN_HID1, (orig_hid1[smp_processor_id()] & ~0x20000)); 1012 mtspr(SPRN_HID1, (orig_hid1[smp_processor_id()] & ~0x20000));
1007} 1013}
1008 1014#endif
1009 1015
1010static int __init mpc85xx_mc_init(void) 1016static int __init mpc85xx_mc_init(void)
1011{ 1017{
@@ -1038,26 +1044,32 @@ static int __init mpc85xx_mc_init(void)
1038 printk(KERN_WARNING EDAC_MOD_STR "PCI fails to register\n"); 1044 printk(KERN_WARNING EDAC_MOD_STR "PCI fails to register\n");
1039#endif 1045#endif
1040 1046
1047#ifdef CONFIG_MPC85xx
1041 /* 1048 /*
1042 * need to clear HID1[RFXE] to disable machine check int 1049 * need to clear HID1[RFXE] to disable machine check int
1043 * so we can catch it 1050 * so we can catch it
1044 */ 1051 */
1045 if (edac_op_state == EDAC_OPSTATE_INT) 1052 if (edac_op_state == EDAC_OPSTATE_INT)
1046 on_each_cpu(mpc85xx_mc_clear_rfxe, NULL, 0); 1053 on_each_cpu(mpc85xx_mc_clear_rfxe, NULL, 0);
1054#endif
1047 1055
1048 return 0; 1056 return 0;
1049} 1057}
1050 1058
1051module_init(mpc85xx_mc_init); 1059module_init(mpc85xx_mc_init);
1052 1060
1061#ifdef CONFIG_MPC85xx
1053static void __exit mpc85xx_mc_restore_hid1(void *data) 1062static void __exit mpc85xx_mc_restore_hid1(void *data)
1054{ 1063{
1055	 mtspr(SPRN_HID1, orig_hid1[smp_processor_id()]); 1064	 mtspr(SPRN_HID1, orig_hid1[smp_processor_id()]);
1056} 1065}
1066#endif
1057 1067
1058static void __exit mpc85xx_mc_exit(void) 1068static void __exit mpc85xx_mc_exit(void)
1059{ 1069{
1070#ifdef CONFIG_MPC85xx
1060 on_each_cpu(mpc85xx_mc_restore_hid1, NULL, 0); 1071 on_each_cpu(mpc85xx_mc_restore_hid1, NULL, 0);
1072#endif
1061#ifdef CONFIG_PCI 1073#ifdef CONFIG_PCI
1062 of_unregister_platform_driver(&mpc85xx_pci_err_driver); 1074 of_unregister_platform_driver(&mpc85xx_pci_err_driver);
1063#endif 1075#endif
diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
index 5131aaae8e03..a6b9fec13a74 100644
--- a/drivers/edac/mv64x60_edac.c
+++ b/drivers/edac/mv64x60_edac.c
@@ -90,7 +90,7 @@ static int __init mv64x60_pci_fixup(struct platform_device *pdev)
90 return -ENOENT; 90 return -ENOENT;
91 } 91 }
92 92
93 pci_serr = ioremap(r->start, r->end - r->start + 1); 93 pci_serr = ioremap(r->start, resource_size(r));
94 if (!pci_serr) 94 if (!pci_serr)
95 return -ENOMEM; 95 return -ENOMEM;
96 96
@@ -140,7 +140,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev)
140 140
141 if (!devm_request_mem_region(&pdev->dev, 141 if (!devm_request_mem_region(&pdev->dev,
142 r->start, 142 r->start,
143 r->end - r->start + 1, 143 resource_size(r),
144 pdata->name)) { 144 pdata->name)) {
145 printk(KERN_ERR "%s: Error while requesting mem region\n", 145 printk(KERN_ERR "%s: Error while requesting mem region\n",
146 __func__); 146 __func__);
@@ -150,7 +150,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev)
150 150
151 pdata->pci_vbase = devm_ioremap(&pdev->dev, 151 pdata->pci_vbase = devm_ioremap(&pdev->dev,
152 r->start, 152 r->start,
153 r->end - r->start + 1); 153 resource_size(r));
154 if (!pdata->pci_vbase) { 154 if (!pdata->pci_vbase) {
155 printk(KERN_ERR "%s: Unable to setup PCI err regs\n", __func__); 155 printk(KERN_ERR "%s: Unable to setup PCI err regs\n", __func__);
156 res = -ENOMEM; 156 res = -ENOMEM;
@@ -306,7 +306,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev)
306 306
307 if (!devm_request_mem_region(&pdev->dev, 307 if (!devm_request_mem_region(&pdev->dev,
308 r->start, 308 r->start,
309 r->end - r->start + 1, 309 resource_size(r),
310 pdata->name)) { 310 pdata->name)) {
311 printk(KERN_ERR "%s: Error while request mem region\n", 311 printk(KERN_ERR "%s: Error while request mem region\n",
312 __func__); 312 __func__);
@@ -316,7 +316,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev)
316 316
317 pdata->sram_vbase = devm_ioremap(&pdev->dev, 317 pdata->sram_vbase = devm_ioremap(&pdev->dev,
318 r->start, 318 r->start,
319 r->end - r->start + 1); 319 resource_size(r));
320 if (!pdata->sram_vbase) { 320 if (!pdata->sram_vbase) {
321 printk(KERN_ERR "%s: Unable to setup SRAM err regs\n", 321 printk(KERN_ERR "%s: Unable to setup SRAM err regs\n",
322 __func__); 322 __func__);
@@ -474,7 +474,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
474 474
475 if (!devm_request_mem_region(&pdev->dev, 475 if (!devm_request_mem_region(&pdev->dev,
476 r->start, 476 r->start,
477 r->end - r->start + 1, 477 resource_size(r),
478 pdata->name)) { 478 pdata->name)) {
479 printk(KERN_ERR "%s: Error while requesting mem region\n", 479 printk(KERN_ERR "%s: Error while requesting mem region\n",
480 __func__); 480 __func__);
@@ -484,7 +484,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
484 484
485 pdata->cpu_vbase[0] = devm_ioremap(&pdev->dev, 485 pdata->cpu_vbase[0] = devm_ioremap(&pdev->dev,
486 r->start, 486 r->start,
487 r->end - r->start + 1); 487 resource_size(r));
488 if (!pdata->cpu_vbase[0]) { 488 if (!pdata->cpu_vbase[0]) {
489 printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__); 489 printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__);
490 res = -ENOMEM; 490 res = -ENOMEM;
@@ -501,7 +501,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
501 501
502 if (!devm_request_mem_region(&pdev->dev, 502 if (!devm_request_mem_region(&pdev->dev,
503 r->start, 503 r->start,
504 r->end - r->start + 1, 504 resource_size(r),
505 pdata->name)) { 505 pdata->name)) {
506 printk(KERN_ERR "%s: Error while requesting mem region\n", 506 printk(KERN_ERR "%s: Error while requesting mem region\n",
507 __func__); 507 __func__);
@@ -511,7 +511,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
511 511
512 pdata->cpu_vbase[1] = devm_ioremap(&pdev->dev, 512 pdata->cpu_vbase[1] = devm_ioremap(&pdev->dev,
513 r->start, 513 r->start,
514 r->end - r->start + 1); 514 resource_size(r));
515 if (!pdata->cpu_vbase[1]) { 515 if (!pdata->cpu_vbase[1]) {
516 printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__); 516 printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__);
517 res = -ENOMEM; 517 res = -ENOMEM;
@@ -726,7 +726,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
726 726
727 if (!devm_request_mem_region(&pdev->dev, 727 if (!devm_request_mem_region(&pdev->dev,
728 r->start, 728 r->start,
729 r->end - r->start + 1, 729 resource_size(r),
730 pdata->name)) { 730 pdata->name)) {
731 printk(KERN_ERR "%s: Error while requesting mem region\n", 731 printk(KERN_ERR "%s: Error while requesting mem region\n",
732 __func__); 732 __func__);
@@ -736,7 +736,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
736 736
737 pdata->mc_vbase = devm_ioremap(&pdev->dev, 737 pdata->mc_vbase = devm_ioremap(&pdev->dev,
738 r->start, 738 r->start,
739 r->end - r->start + 1); 739 resource_size(r));
740 if (!pdata->mc_vbase) { 740 if (!pdata->mc_vbase) {
741 printk(KERN_ERR "%s: Unable to setup MC err regs\n", __func__); 741 printk(KERN_ERR "%s: Unable to setup MC err regs\n", __func__);
742 res = -ENOMEM; 742 res = -ENOMEM;
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
index 949c97ff57e3..1f20a042a4f5 100644
--- a/drivers/idle/i7300_idle.c
+++ b/drivers/idle/i7300_idle.c
@@ -29,8 +29,8 @@
29 29
30#include <asm/idle.h> 30#include <asm/idle.h>
31 31
32#include "../dma/ioatdma_hw.h" 32#include "../dma/ioat/hw.h"
33#include "../dma/ioatdma_registers.h" 33#include "../dma/ioat/registers.h"
34 34
35#define I7300_IDLE_DRIVER_VERSION "1.55" 35#define I7300_IDLE_DRIVER_VERSION "1.55"
36#define I7300_PRINT "i7300_idle:" 36#define I7300_PRINT "i7300_idle:"
@@ -126,9 +126,9 @@ static void i7300_idle_ioat_stop(void)
126 udelay(10); 126 udelay(10);
127 127
128 sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & 128 sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
129 IOAT_CHANSTS_DMA_TRANSFER_STATUS; 129 IOAT_CHANSTS_STATUS;
130 130
131 if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) 131 if (sts != IOAT_CHANSTS_ACTIVE)
132 break; 132 break;
133 133
134 } 134 }
@@ -160,9 +160,9 @@ static int __init i7300_idle_ioat_selftest(u8 *ctl,
160 udelay(1000); 160 udelay(1000);
161 161
162 chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & 162 chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
163 IOAT_CHANSTS_DMA_TRANSFER_STATUS; 163 IOAT_CHANSTS_STATUS;
164 164
165 if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) { 165 if (chan_sts != IOAT_CHANSTS_DONE) {
166 /* Not complete, reset the channel */ 166 /* Not complete, reset the channel */
167 writeb(IOAT_CHANCMD_RESET, 167 writeb(IOAT_CHANCMD_RESET,
168 ioat_chanbase + IOAT1_CHANCMD_OFFSET); 168 ioat_chanbase + IOAT1_CHANCMD_OFFSET);
@@ -288,9 +288,9 @@ static void __exit i7300_idle_ioat_exit(void)
288 ioat_chanbase + IOAT1_CHANCMD_OFFSET); 288 ioat_chanbase + IOAT1_CHANCMD_OFFSET);
289 289
290 chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & 290 chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
291 IOAT_CHANSTS_DMA_TRANSFER_STATUS; 291 IOAT_CHANSTS_STATUS;
292 292
293 if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { 293 if (chan_sts != IOAT_CHANSTS_ACTIVE) {
294 writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); 294 writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET);
295 break; 295 break;
296 } 296 }
@@ -298,14 +298,14 @@ static void __exit i7300_idle_ioat_exit(void)
298 } 298 }
299 299
300 chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & 300 chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
301 IOAT_CHANSTS_DMA_TRANSFER_STATUS; 301 IOAT_CHANSTS_STATUS;
302 302
303 /* 303 /*
304 * We tried to reset multiple times. If IO A/T channel is still active 304 * We tried to reset multiple times. If IO A/T channel is still active
305 * flag an error and return without cleanup. Memory leak is better 305 * flag an error and return without cleanup. Memory leak is better
306 * than random corruption in that extreme error situation. 306 * than random corruption in that extreme error situation.
307 */ 307 */
308 if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { 308 if (chan_sts == IOAT_CHANSTS_ACTIVE) {
309 printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels." 309 printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels."
310 " Not freeing resources\n"); 310 " Not freeing resources\n");
311 return; 311 return;
diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index 76d6751f89a7..02f4f8f1db6f 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -225,6 +225,7 @@ config INPUT_SGI_BTNS
225config INPUT_WINBOND_CIR 225config INPUT_WINBOND_CIR
226 tristate "Winbond IR remote control" 226 tristate "Winbond IR remote control"
227 depends on X86 && PNP 227 depends on X86 && PNP
228 select NEW_LEDS
228 select LEDS_CLASS 229 select LEDS_CLASS
229 select BITREVERSE 230 select BITREVERSE
230 help 231 help
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 020f9573fd82..2158377a1359 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
124 select MD_RAID6_PQ 124 select MD_RAID6_PQ
125 select ASYNC_MEMCPY 125 select ASYNC_MEMCPY
126 select ASYNC_XOR 126 select ASYNC_XOR
127 select ASYNC_PQ
128 select ASYNC_RAID6_RECOV
127 ---help--- 129 ---help---
128 A RAID-5 set of N drives with a capacity of C MB per drive provides 130 A RAID-5 set of N drives with a capacity of C MB per drive provides
129 the capacity of C * (N - 1) MB, and protects against a failure 131 the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
152 154
153 If unsure, say Y. 155 If unsure, say Y.
154 156
157config MULTICORE_RAID456
158 bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
159 depends on MD_RAID456
160 depends on SMP
161 depends on EXPERIMENTAL
162 ---help---
163 Enable the raid456 module to dispatch per-stripe raid operations to a
164 thread pool.
165
166 If unsure, say N.
167
155config MD_RAID6_PQ 168config MD_RAID6_PQ
156 tristate 169 tristate
157 170
171config ASYNC_RAID6_TEST
172 tristate "Self test for hardware accelerated raid6 recovery"
173 depends on MD_RAID6_PQ
174 select ASYNC_RAID6_RECOV
175 ---help---
176 This is a one-shot self test that permutes through the
177 recovery of all the possible two disk failure scenarios for a
178 N-disk array. Recovery is performed with the asynchronous
179 raid6 recovery routines, and will optionally use an offload
180 engine if one is available.
181
182 If unsure, say N.
183
158config MD_MULTIPATH 184config MD_MULTIPATH
159 tristate "Multipath I/O support" 185 tristate "Multipath I/O support"
160 depends on BLK_DEV_MD 186 depends on BLK_DEV_MD
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3319c2fec28e..6986b0059d23 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -108,6 +108,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
108 * allocated while we're using it 108 * allocated while we're using it
109 */ 109 */
110static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) 110static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
111__releases(bitmap->lock)
112__acquires(bitmap->lock)
111{ 113{
112 unsigned char *mappage; 114 unsigned char *mappage;
113 115
@@ -325,7 +327,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
325 return 0; 327 return 0;
326 328
327 bad_alignment: 329 bad_alignment:
328 rcu_read_unlock();
329 return -EINVAL; 330 return -EINVAL;
330} 331}
331 332
@@ -1207,6 +1208,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1207static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1208static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1208 sector_t offset, int *blocks, 1209 sector_t offset, int *blocks,
1209 int create) 1210 int create)
1211__releases(bitmap->lock)
1212__acquires(bitmap->lock)
1210{ 1213{
1211 /* If 'create', we might release the lock and reclaim it. 1214 /* If 'create', we might release the lock and reclaim it.
1212 * The lock must have been taken with interrupts enabled. 1215 * The lock must have been taken with interrupts enabled.
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ea4842905444..1ceceb334d5e 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -108,6 +108,9 @@ static int linear_congested(void *data, int bits)
108 linear_conf_t *conf; 108 linear_conf_t *conf;
109 int i, ret = 0; 109 int i, ret = 0;
110 110
111 if (mddev_congested(mddev, bits))
112 return 1;
113
111 rcu_read_lock(); 114 rcu_read_lock();
112 conf = rcu_dereference(mddev->private); 115 conf = rcu_dereference(mddev->private);
113 116
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6aa497e4baf8..26ba42a79129 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -262,6 +262,12 @@ static void mddev_resume(mddev_t *mddev)
262 mddev->pers->quiesce(mddev, 0); 262 mddev->pers->quiesce(mddev, 0);
263} 263}
264 264
265int mddev_congested(mddev_t *mddev, int bits)
266{
267 return mddev->suspended;
268}
269EXPORT_SYMBOL(mddev_congested);
270
265 271
266static inline mddev_t *mddev_get(mddev_t *mddev) 272static inline mddev_t *mddev_get(mddev_t *mddev)
267{ 273{
@@ -4218,7 +4224,7 @@ static int do_md_run(mddev_t * mddev)
4218 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4224 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4219 mddev->sync_thread = md_register_thread(md_do_sync, 4225 mddev->sync_thread = md_register_thread(md_do_sync,
4220 mddev, 4226 mddev,
4221 "%s_resync"); 4227 "resync");
4222 if (!mddev->sync_thread) { 4228 if (!mddev->sync_thread) {
4223 printk(KERN_ERR "%s: could not start resync" 4229 printk(KERN_ERR "%s: could not start resync"
4224 " thread...\n", 4230 " thread...\n",
@@ -4575,10 +4581,10 @@ static int get_version(void __user * arg)
4575static int get_array_info(mddev_t * mddev, void __user * arg) 4581static int get_array_info(mddev_t * mddev, void __user * arg)
4576{ 4582{
4577 mdu_array_info_t info; 4583 mdu_array_info_t info;
4578 int nr,working,active,failed,spare; 4584 int nr,working,insync,failed,spare;
4579 mdk_rdev_t *rdev; 4585 mdk_rdev_t *rdev;
4580 4586
4581 nr=working=active=failed=spare=0; 4587 nr=working=insync=failed=spare=0;
4582 list_for_each_entry(rdev, &mddev->disks, same_set) { 4588 list_for_each_entry(rdev, &mddev->disks, same_set) {
4583 nr++; 4589 nr++;
4584 if (test_bit(Faulty, &rdev->flags)) 4590 if (test_bit(Faulty, &rdev->flags))
@@ -4586,7 +4592,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4586 else { 4592 else {
4587 working++; 4593 working++;
4588 if (test_bit(In_sync, &rdev->flags)) 4594 if (test_bit(In_sync, &rdev->flags))
4589 active++; 4595 insync++;
4590 else 4596 else
4591 spare++; 4597 spare++;
4592 } 4598 }
@@ -4611,7 +4617,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4611 info.state = (1<<MD_SB_CLEAN); 4617 info.state = (1<<MD_SB_CLEAN);
4612 if (mddev->bitmap && mddev->bitmap_offset) 4618 if (mddev->bitmap && mddev->bitmap_offset)
4613 info.state = (1<<MD_SB_BITMAP_PRESENT); 4619 info.state = (1<<MD_SB_BITMAP_PRESENT);
4614 info.active_disks = active; 4620 info.active_disks = insync;
4615 info.working_disks = working; 4621 info.working_disks = working;
4616 info.failed_disks = failed; 4622 info.failed_disks = failed;
4617 info.spare_disks = spare; 4623 info.spare_disks = spare;
@@ -4721,7 +4727,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4721 if (!list_empty(&mddev->disks)) { 4727 if (!list_empty(&mddev->disks)) {
4722 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4728 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4723 mdk_rdev_t, same_set); 4729 mdk_rdev_t, same_set);
4724 int err = super_types[mddev->major_version] 4730 err = super_types[mddev->major_version]
4725 .load_super(rdev, rdev0, mddev->minor_version); 4731 .load_super(rdev, rdev0, mddev->minor_version);
4726 if (err < 0) { 4732 if (err < 0) {
4727 printk(KERN_WARNING 4733 printk(KERN_WARNING
@@ -5631,7 +5637,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5631 thread->run = run; 5637 thread->run = run;
5632 thread->mddev = mddev; 5638 thread->mddev = mddev;
5633 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5639 thread->timeout = MAX_SCHEDULE_TIMEOUT;
5634 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 5640 thread->tsk = kthread_run(md_thread, thread,
5641 "%s_%s",
5642 mdname(thread->mddev),
5643 name ?: mddev->pers->name);
5635 if (IS_ERR(thread->tsk)) { 5644 if (IS_ERR(thread->tsk)) {
5636 kfree(thread); 5645 kfree(thread);
5637 return NULL; 5646 return NULL;
@@ -6745,7 +6754,7 @@ void md_check_recovery(mddev_t *mddev)
6745 } 6754 }
6746 mddev->sync_thread = md_register_thread(md_do_sync, 6755 mddev->sync_thread = md_register_thread(md_do_sync,
6747 mddev, 6756 mddev,
6748 "%s_resync"); 6757 "resync");
6749 if (!mddev->sync_thread) { 6758 if (!mddev->sync_thread) {
6750 printk(KERN_ERR "%s: could not start resync" 6759 printk(KERN_ERR "%s: could not start resync"
6751 " thread...\n", 6760 " thread...\n",
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f55d2ff95133..f184b69ef337 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -430,6 +430,7 @@ extern void md_write_end(mddev_t *mddev);
430extern void md_done_sync(mddev_t *mddev, int blocks, int ok); 430extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
431extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 431extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
432 432
433extern int mddev_congested(mddev_t *mddev, int bits);
433extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 434extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
434 sector_t sector, int size, struct page *page); 435 sector_t sector, int size, struct page *page);
435extern void md_super_wait(mddev_t *mddev); 436extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d2d3fd54cc68..ee7646f974a0 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -150,7 +150,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
150 } 150 }
151 151
152 mp_bh = mempool_alloc(conf->pool, GFP_NOIO); 152 mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
153 memset(mp_bh, 0, sizeof(*mp_bh));
154 153
155 mp_bh->master_bio = bio; 154 mp_bh->master_bio = bio;
156 mp_bh->mddev = mddev; 155 mp_bh->mddev = mddev;
@@ -199,6 +198,9 @@ static int multipath_congested(void *data, int bits)
199 multipath_conf_t *conf = mddev->private; 198 multipath_conf_t *conf = mddev->private;
200 int i, ret = 0; 199 int i, ret = 0;
201 200
201 if (mddev_congested(mddev, bits))
202 return 1;
203
202 rcu_read_lock(); 204 rcu_read_lock();
203 for (i = 0; i < mddev->raid_disks ; i++) { 205 for (i = 0; i < mddev->raid_disks ; i++) {
204 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); 206 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -504,7 +506,7 @@ static int multipath_run (mddev_t *mddev)
504 } 506 }
505 507
506 { 508 {
507 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); 509 mddev->thread = md_register_thread(multipathd, mddev, NULL);
508 if (!mddev->thread) { 510 if (!mddev->thread) {
509 printk(KERN_ERR "multipath: couldn't allocate thread" 511 printk(KERN_ERR "multipath: couldn't allocate thread"
510 " for %s\n", mdname(mddev)); 512 " for %s\n", mdname(mddev));
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f845ed98fec9..d3a4ce06015a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -44,6 +44,9 @@ static int raid0_congested(void *data, int bits)
44 mdk_rdev_t **devlist = conf->devlist; 44 mdk_rdev_t **devlist = conf->devlist;
45 int i, ret = 0; 45 int i, ret = 0;
46 46
47 if (mddev_congested(mddev, bits))
48 return 1;
49
47 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 50 for (i = 0; i < mddev->raid_disks && !ret ; i++) {
48 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 51 struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
49 52
@@ -86,7 +89,7 @@ static void dump_zones(mddev_t *mddev)
86 89
87static int create_strip_zones(mddev_t *mddev) 90static int create_strip_zones(mddev_t *mddev)
88{ 91{
89 int i, c, j, err; 92 int i, c, err;
90 sector_t curr_zone_end, sectors; 93 sector_t curr_zone_end, sectors;
91 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; 94 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
92 struct strip_zone *zone; 95 struct strip_zone *zone;
@@ -198,6 +201,8 @@ static int create_strip_zones(mddev_t *mddev)
198 /* now do the other zones */ 201 /* now do the other zones */
199 for (i = 1; i < conf->nr_strip_zones; i++) 202 for (i = 1; i < conf->nr_strip_zones; i++)
200 { 203 {
204 int j;
205
201 zone = conf->strip_zone + i; 206 zone = conf->strip_zone + i;
202 dev = conf->devlist + i * mddev->raid_disks; 207 dev = conf->devlist + i * mddev->raid_disks;
203 208
@@ -207,7 +212,6 @@ static int create_strip_zones(mddev_t *mddev)
207 c = 0; 212 c = 0;
208 213
209 for (j=0; j<cnt; j++) { 214 for (j=0; j<cnt; j++) {
210 char b[BDEVNAME_SIZE];
211 rdev = conf->devlist[j]; 215 rdev = conf->devlist[j];
212 printk(KERN_INFO "raid0: checking %s ...", 216 printk(KERN_INFO "raid0: checking %s ...",
213 bdevname(rdev->bdev, b)); 217 bdevname(rdev->bdev, b));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ff7ed3335995..d1b9bd5fd4f6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -576,6 +576,9 @@ static int raid1_congested(void *data, int bits)
576 conf_t *conf = mddev->private; 576 conf_t *conf = mddev->private;
577 int i, ret = 0; 577 int i, ret = 0;
578 578
579 if (mddev_congested(mddev, bits))
580 return 1;
581
579 rcu_read_lock(); 582 rcu_read_lock();
580 for (i = 0; i < mddev->raid_disks; i++) { 583 for (i = 0; i < mddev->raid_disks; i++) {
581 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 584 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -851,7 +854,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
851 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 854 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
852 read_bio->bi_bdev = mirror->rdev->bdev; 855 read_bio->bi_bdev = mirror->rdev->bdev;
853 read_bio->bi_end_io = raid1_end_read_request; 856 read_bio->bi_end_io = raid1_end_read_request;
854 read_bio->bi_rw = READ | do_sync; 857 read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
855 read_bio->bi_private = r1_bio; 858 read_bio->bi_private = r1_bio;
856 859
857 generic_make_request(read_bio); 860 generic_make_request(read_bio);
@@ -943,7 +946,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
943 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 946 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
944 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 947 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
945 mbio->bi_end_io = raid1_end_write_request; 948 mbio->bi_end_io = raid1_end_write_request;
946 mbio->bi_rw = WRITE | do_barriers | do_sync; 949 mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
950 (do_sync << BIO_RW_SYNCIO);
947 mbio->bi_private = r1_bio; 951 mbio->bi_private = r1_bio;
948 952
949 if (behind_pages) { 953 if (behind_pages) {
@@ -1623,7 +1627,8 @@ static void raid1d(mddev_t *mddev)
1623 conf->mirrors[i].rdev->data_offset; 1627 conf->mirrors[i].rdev->data_offset;
1624 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1628 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1625 bio->bi_end_io = raid1_end_write_request; 1629 bio->bi_end_io = raid1_end_write_request;
1626 bio->bi_rw = WRITE | do_sync; 1630 bio->bi_rw = WRITE |
1631 (do_sync << BIO_RW_SYNCIO);
1627 bio->bi_private = r1_bio; 1632 bio->bi_private = r1_bio;
1628 r1_bio->bios[i] = bio; 1633 r1_bio->bios[i] = bio;
1629 generic_make_request(bio); 1634 generic_make_request(bio);
@@ -1672,7 +1677,7 @@ static void raid1d(mddev_t *mddev)
1672 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1677 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1673 bio->bi_bdev = rdev->bdev; 1678 bio->bi_bdev = rdev->bdev;
1674 bio->bi_end_io = raid1_end_read_request; 1679 bio->bi_end_io = raid1_end_read_request;
1675 bio->bi_rw = READ | do_sync; 1680 bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
1676 bio->bi_private = r1_bio; 1681 bio->bi_private = r1_bio;
1677 unplug = 1; 1682 unplug = 1;
1678 generic_make_request(bio); 1683 generic_make_request(bio);
@@ -2047,7 +2052,7 @@ static int run(mddev_t *mddev)
2047 conf->last_used = j; 2052 conf->last_used = j;
2048 2053
2049 2054
2050 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 2055 mddev->thread = md_register_thread(raid1d, mddev, NULL);
2051 if (!mddev->thread) { 2056 if (!mddev->thread) {
2052 printk(KERN_ERR 2057 printk(KERN_ERR
2053 "raid1: couldn't allocate thread for %s\n", 2058 "raid1: couldn't allocate thread for %s\n",
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d0a2152e064f..51c4c5c4d87a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -631,6 +631,8 @@ static int raid10_congested(void *data, int bits)
631 conf_t *conf = mddev->private; 631 conf_t *conf = mddev->private;
632 int i, ret = 0; 632 int i, ret = 0;
633 633
634 if (mddev_congested(mddev, bits))
635 return 1;
634 rcu_read_lock(); 636 rcu_read_lock();
635 for (i = 0; i < mddev->raid_disks && ret == 0; i++) { 637 for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
636 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 638 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -882,7 +884,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
882 mirror->rdev->data_offset; 884 mirror->rdev->data_offset;
883 read_bio->bi_bdev = mirror->rdev->bdev; 885 read_bio->bi_bdev = mirror->rdev->bdev;
884 read_bio->bi_end_io = raid10_end_read_request; 886 read_bio->bi_end_io = raid10_end_read_request;
885 read_bio->bi_rw = READ | do_sync; 887 read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
886 read_bio->bi_private = r10_bio; 888 read_bio->bi_private = r10_bio;
887 889
888 generic_make_request(read_bio); 890 generic_make_request(read_bio);
@@ -950,7 +952,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
950 conf->mirrors[d].rdev->data_offset; 952 conf->mirrors[d].rdev->data_offset;
951 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 953 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
952 mbio->bi_end_io = raid10_end_write_request; 954 mbio->bi_end_io = raid10_end_write_request;
953 mbio->bi_rw = WRITE | do_sync; 955 mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO);
954 mbio->bi_private = r10_bio; 956 mbio->bi_private = r10_bio;
955 957
956 atomic_inc(&r10_bio->remaining); 958 atomic_inc(&r10_bio->remaining);
@@ -1623,7 +1625,7 @@ static void raid10d(mddev_t *mddev)
1623 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr 1625 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1624 + rdev->data_offset; 1626 + rdev->data_offset;
1625 bio->bi_bdev = rdev->bdev; 1627 bio->bi_bdev = rdev->bdev;
1626 bio->bi_rw = READ | do_sync; 1628 bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
1627 bio->bi_private = r10_bio; 1629 bio->bi_private = r10_bio;
1628 bio->bi_end_io = raid10_end_read_request; 1630 bio->bi_end_io = raid10_end_read_request;
1629 unplug = 1; 1631 unplug = 1;
@@ -1773,7 +1775,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1773 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 1775 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1774 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1776 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1775 /* recovery... the complicated one */ 1777 /* recovery... the complicated one */
1776 int i, j, k; 1778 int j, k;
1777 r10_bio = NULL; 1779 r10_bio = NULL;
1778 1780
1779 for (i=0 ; i<conf->raid_disks; i++) 1781 for (i=0 ; i<conf->raid_disks; i++)
@@ -2188,7 +2190,7 @@ static int run(mddev_t *mddev)
2188 } 2190 }
2189 2191
2190 2192
2191 mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); 2193 mddev->thread = md_register_thread(raid10d, mddev, NULL);
2192 if (!mddev->thread) { 2194 if (!mddev->thread) {
2193 printk(KERN_ERR 2195 printk(KERN_ERR
2194 "raid10: couldn't allocate thread for %s\n", 2196 "raid10: couldn't allocate thread for %s\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 826eb3467357..94829804ab7f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,7 +47,9 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/async.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h>
51#include "md.h" 53#include "md.h"
52#include "raid5.h" 54#include "raid5.h"
53#include "bitmap.h" 55#include "bitmap.h"
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
499 struct page *bio_page; 501 struct page *bio_page;
500 int i; 502 int i;
501 int page_offset; 503 int page_offset;
504 struct async_submit_ctl submit;
505 enum async_tx_flags flags = 0;
502 506
503 if (bio->bi_sector >= sector) 507 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512; 508 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else 509 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512; 510 page_offset = (signed)(sector - bio->bi_sector) * -512;
511
512 if (frombio)
513 flags |= ASYNC_TX_FENCE;
514 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
515
507 bio_for_each_segment(bvl, bio, i) { 516 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len; 517 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen; 518 int clen;
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
525 bio_page = bio_iovec_idx(bio, i)->bv_page; 534 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio) 535 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset, 536 tx = async_memcpy(page, bio_page, page_offset,
528 b_offset, clen, 537 b_offset, clen, &submit);
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
531 else 538 else
532 tx = async_memcpy(bio_page, page, b_offset, 539 tx = async_memcpy(bio_page, page, b_offset,
533 page_offset, clen, 540 page_offset, clen, &submit);
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
536 } 541 }
542 /* chain the operations */
543 submit.depend_tx = tx;
544
537 if (clen < len) /* hit end of page */ 545 if (clen < len) /* hit end of page */
538 break; 546 break;
539 page_offset += len; 547 page_offset += len;
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh)
592{ 600{
593 struct dma_async_tx_descriptor *tx = NULL; 601 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf; 602 raid5_conf_t *conf = sh->raid_conf;
603 struct async_submit_ctl submit;
595 int i; 604 int i;
596 605
597 pr_debug("%s: stripe %llu\n", __func__, 606 pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh)
615 } 624 }
616 625
617 atomic_inc(&sh->count); 626 atomic_inc(&sh->count);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 627 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
619 ops_complete_biofill, sh); 628 async_trigger_callback(&submit);
620} 629}
621 630
622static void ops_complete_compute5(void *stripe_head_ref) 631static void mark_target_uptodate(struct stripe_head *sh, int target)
623{ 632{
624 struct stripe_head *sh = stripe_head_ref; 633 struct r5dev *tgt;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
627 634
628 pr_debug("%s: stripe %llu\n", __func__, 635 if (target < 0)
629 (unsigned long long)sh->sector); 636 return;
630 637
638 tgt = &sh->dev[target];
631 set_bit(R5_UPTODATE, &tgt->flags); 639 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 640 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags); 641 clear_bit(R5_Wantcompute, &tgt->flags);
642}
643
644static void ops_complete_compute(void *stripe_head_ref)
645{
646 struct stripe_head *sh = stripe_head_ref;
647
648 pr_debug("%s: stripe %llu\n", __func__,
649 (unsigned long long)sh->sector);
650
651 /* mark the computed target(s) as uptodate */
652 mark_target_uptodate(sh, sh->ops.target);
653 mark_target_uptodate(sh, sh->ops.target2);
654
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 655 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run) 656 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result; 657 sh->check_state = check_state_compute_result;
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
638 release_stripe(sh); 659 release_stripe(sh);
639} 660}
640 661
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 662/* return a pointer to the address conversion region of the scribble buffer */
663static addr_conv_t *to_addr_conv(struct stripe_head *sh,
664 struct raid5_percpu *percpu)
665{
666 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
667}
668
669static struct dma_async_tx_descriptor *
670ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
642{ 671{
643 /* kernel stack size limits the total number of disks */
644 int disks = sh->disks; 672 int disks = sh->disks;
645 struct page *xor_srcs[disks]; 673 struct page **xor_srcs = percpu->scribble;
646 int target = sh->ops.target; 674 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target]; 675 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page; 676 struct page *xor_dest = tgt->page;
649 int count = 0; 677 int count = 0;
650 struct dma_async_tx_descriptor *tx; 678 struct dma_async_tx_descriptor *tx;
679 struct async_submit_ctl submit;
651 int i; 680 int i;
652 681
653 pr_debug("%s: stripe %llu block: %d\n", 682 pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
660 689
661 atomic_inc(&sh->count); 690 atomic_inc(&sh->count);
662 691
692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
693 ops_complete_compute, sh, to_addr_conv(sh, percpu));
663 if (unlikely(count == 1)) 694 if (unlikely(count == 1))
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 695 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
665 0, NULL, ops_complete_compute5, sh);
666 else 696 else
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 697 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
670 698
671 return tx; 699 return tx;
672} 700}
673 701
702/* set_syndrome_sources - populate source buffers for gen_syndrome
703 * @srcs - (struct page *) array of size sh->disks
704 * @sh - stripe_head to parse
705 *
706 * Populates srcs in proper layout order for the stripe and returns the
707 * 'count' of sources to be used in a call to async_gen_syndrome. The P
708 * destination buffer is recorded in srcs[count] and the Q destination
709 * is recorded in srcs[count+1].
710 */
711static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
712{
713 int disks = sh->disks;
714 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
715 int d0_idx = raid6_d0(sh);
716 int count;
717 int i;
718
719 for (i = 0; i < disks; i++)
720 srcs[i] = (void *)raid6_empty_zero_page;
721
722 count = 0;
723 i = d0_idx;
724 do {
725 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
726
727 srcs[slot] = sh->dev[i].page;
728 i = raid6_next_disk(i, disks);
729 } while (i != d0_idx);
730 BUG_ON(count != syndrome_disks);
731
732 return count;
733}
734
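As a worked example of the layout described above (assuming the common non-DDF case, where syndrome_disks == disks - 2): on a 6-disk RAID-6 stripe, set_syndrome_sources() returns count == 4 with the data pages in srcs[0..3], P in srcs[4] and Q in srcs[5], which is exactly the ordering async_gen_syndrome() expects:

	/* illustrative caller pattern; 'cb' and 'cb_arg' are placeholders */
	count = set_syndrome_sources(blocks, sh);
	init_async_submit(&submit, ASYNC_TX_FENCE, tx, cb, cb_arg,
			  to_addr_conv(sh, percpu));
	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);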
735static struct dma_async_tx_descriptor *
736ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
737{
738 int disks = sh->disks;
739 struct page **blocks = percpu->scribble;
740 int target;
741 int qd_idx = sh->qd_idx;
742 struct dma_async_tx_descriptor *tx;
743 struct async_submit_ctl submit;
744 struct r5dev *tgt;
745 struct page *dest;
746 int i;
747 int count;
748
749 if (sh->ops.target < 0)
750 target = sh->ops.target2;
751 else if (sh->ops.target2 < 0)
752 target = sh->ops.target;
753 else
754 /* we should only have one valid target */
755 BUG();
756 BUG_ON(target < 0);
757 pr_debug("%s: stripe %llu block: %d\n",
758 __func__, (unsigned long long)sh->sector, target);
759
760 tgt = &sh->dev[target];
761 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
762 dest = tgt->page;
763
764 atomic_inc(&sh->count);
765
766 if (target == qd_idx) {
767 count = set_syndrome_sources(blocks, sh);
768 blocks[count] = NULL; /* regenerating p is not necessary */
769 BUG_ON(blocks[count+1] != dest); /* q should already be set */
770 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
771 ops_complete_compute, sh,
772 to_addr_conv(sh, percpu));
773 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
774 } else {
775 /* Compute any data- or p-drive using XOR */
776 count = 0;
777 for (i = disks; i-- ; ) {
778 if (i == target || i == qd_idx)
779 continue;
780 blocks[count++] = sh->dev[i].page;
781 }
782
783 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
784 NULL, ops_complete_compute, sh,
785 to_addr_conv(sh, percpu));
786 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
787 }
788
789 return tx;
790}
791
792static struct dma_async_tx_descriptor *
793ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
794{
795 int i, count, disks = sh->disks;
796 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
797 int d0_idx = raid6_d0(sh);
798 int faila = -1, failb = -1;
799 int target = sh->ops.target;
800 int target2 = sh->ops.target2;
801 struct r5dev *tgt = &sh->dev[target];
802 struct r5dev *tgt2 = &sh->dev[target2];
803 struct dma_async_tx_descriptor *tx;
804 struct page **blocks = percpu->scribble;
805 struct async_submit_ctl submit;
806
807 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
808 __func__, (unsigned long long)sh->sector, target, target2);
809 BUG_ON(target < 0 || target2 < 0);
810 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
811 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
812
813 /* we need to open-code set_syndrome_sources to handle the
814 * slot number conversion for 'faila' and 'failb'
815 */
816 for (i = 0; i < disks ; i++)
817 blocks[i] = (void *)raid6_empty_zero_page;
818 count = 0;
819 i = d0_idx;
820 do {
821 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
822
823 blocks[slot] = sh->dev[i].page;
824
825 if (i == target)
826 faila = slot;
827 if (i == target2)
828 failb = slot;
829 i = raid6_next_disk(i, disks);
830 } while (i != d0_idx);
831 BUG_ON(count != syndrome_disks);
832
833 BUG_ON(faila == failb);
834 if (failb < faila)
835 swap(faila, failb);
836 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
837 __func__, (unsigned long long)sh->sector, faila, failb);
838
839 atomic_inc(&sh->count);
840
841 if (failb == syndrome_disks+1) {
842 /* Q disk is one of the missing disks */
843 if (faila == syndrome_disks) {
844 /* Missing P+Q, just recompute */
845 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
846 ops_complete_compute, sh,
847 to_addr_conv(sh, percpu));
848 return async_gen_syndrome(blocks, 0, count+2,
849 STRIPE_SIZE, &submit);
850 } else {
851 struct page *dest;
852 int data_target;
853 int qd_idx = sh->qd_idx;
854
855 /* Missing D+Q: recompute D from P, then recompute Q */
856 if (target == qd_idx)
857 data_target = target2;
858 else
859 data_target = target;
860
861 count = 0;
862 for (i = disks; i-- ; ) {
863 if (i == data_target || i == qd_idx)
864 continue;
865 blocks[count++] = sh->dev[i].page;
866 }
867 dest = sh->dev[data_target].page;
868 init_async_submit(&submit,
869 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
870 NULL, NULL, NULL,
871 to_addr_conv(sh, percpu));
872 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
873 &submit);
874
875 count = set_syndrome_sources(blocks, sh);
876 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
877 ops_complete_compute, sh,
878 to_addr_conv(sh, percpu));
879 return async_gen_syndrome(blocks, 0, count+2,
880 STRIPE_SIZE, &submit);
881 }
882 } else {
883 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
884 ops_complete_compute, sh,
885 to_addr_conv(sh, percpu));
886 if (failb == syndrome_disks) {
887 /* We're missing D+P. */
888 return async_raid6_datap_recov(syndrome_disks+2,
889 STRIPE_SIZE, faila,
890 blocks, &submit);
891 } else {
892 /* We're missing D+D. */
893 return async_raid6_2data_recov(syndrome_disks+2,
894 STRIPE_SIZE, faila, failb,
895 blocks, &submit);
896 }
897 }
898}
899
900
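In slot terms the four cases handled above are, taking a 6-disk array as an example (syndrome_disks == 4, so P sits in slot 4 and Q in slot 5, with faila < failb after the swap): faila == 4 and failb == 5 means P+Q are missing and both are simply regenerated with async_gen_syndrome(); faila < 4 and failb == 5 means D+Q are missing, so D is first rebuilt by XOR of the surviving data plus P and Q is then regenerated; failb == 4 means D+P are missing and async_raid6_datap_recov() is used; anything else is a two-data-disk failure handled by async_raid6_2data_recov().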
674static void ops_complete_prexor(void *stripe_head_ref) 901static void ops_complete_prexor(void *stripe_head_ref)
675{ 902{
676 struct stripe_head *sh = stripe_head_ref; 903 struct stripe_head *sh = stripe_head_ref;
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
680} 907}
681 908
682static struct dma_async_tx_descriptor * 909static struct dma_async_tx_descriptor *
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 910ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
911 struct dma_async_tx_descriptor *tx)
684{ 912{
685 /* kernel stack size limits the total number of disks */
686 int disks = sh->disks; 913 int disks = sh->disks;
687 struct page *xor_srcs[disks]; 914 struct page **xor_srcs = percpu->scribble;
688 int count = 0, pd_idx = sh->pd_idx, i; 915 int count = 0, pd_idx = sh->pd_idx, i;
916 struct async_submit_ctl submit;
689 917
690 /* existing parity data subtracted */ 918 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 919 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
700 xor_srcs[count++] = dev->page; 928 xor_srcs[count++] = dev->page;
701 } 929 }
702 930
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 931 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 932 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
705 ops_complete_prexor, sh); 933 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
706 934
707 return tx; 935 return tx;
708} 936}
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
742 return tx; 970 return tx;
743} 971}
744 972
745static void ops_complete_postxor(void *stripe_head_ref) 973static void ops_complete_reconstruct(void *stripe_head_ref)
746{ 974{
747 struct stripe_head *sh = stripe_head_ref; 975 struct stripe_head *sh = stripe_head_ref;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx; 976 int disks = sh->disks;
977 int pd_idx = sh->pd_idx;
978 int qd_idx = sh->qd_idx;
979 int i;
749 980
750 pr_debug("%s: stripe %llu\n", __func__, 981 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector); 982 (unsigned long long)sh->sector);
752 983
753 for (i = disks; i--; ) { 984 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i]; 985 struct r5dev *dev = &sh->dev[i];
755 if (dev->written || i == pd_idx) 986
987 if (dev->written || i == pd_idx || i == qd_idx)
756 set_bit(R5_UPTODATE, &dev->flags); 988 set_bit(R5_UPTODATE, &dev->flags);
757 } 989 }
758 990
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
770} 1002}
771 1003
772static void 1004static void
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1005ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1006 struct dma_async_tx_descriptor *tx)
774{ 1007{
775 /* kernel stack size limits the total number of disks */
776 int disks = sh->disks; 1008 int disks = sh->disks;
777 struct page *xor_srcs[disks]; 1009 struct page **xor_srcs = percpu->scribble;
778 1010 struct async_submit_ctl submit;
779 int count = 0, pd_idx = sh->pd_idx, i; 1011 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest; 1012 struct page *xor_dest;
781 int prexor = 0; 1013 int prexor = 0;
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1041 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case 1042 * for the synchronous xor case
811 */ 1043 */
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 1044 flags = ASYNC_TX_ACK |
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1045 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814 1046
815 atomic_inc(&sh->count); 1047 atomic_inc(&sh->count);
816 1048
817 if (unlikely(count == 1)) { 1049 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 1050 to_addr_conv(sh, percpu));
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 1051 if (unlikely(count == 1))
820 flags, tx, ops_complete_postxor, sh); 1052 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
821 } else 1053 else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1054 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
823 flags, tx, ops_complete_postxor, sh); 1055}
1056
1057static void
1058ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1059 struct dma_async_tx_descriptor *tx)
1060{
1061 struct async_submit_ctl submit;
1062 struct page **blocks = percpu->scribble;
1063 int count;
1064
1065 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1066
1067 count = set_syndrome_sources(blocks, sh);
1068
1069 atomic_inc(&sh->count);
1070
1071 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1072 sh, to_addr_conv(sh, percpu));
1073 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
824} 1074}
825 1075
826static void ops_complete_check(void *stripe_head_ref) 1076static void ops_complete_check(void *stripe_head_ref)
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref)
835 release_stripe(sh); 1085 release_stripe(sh);
836} 1086}
837 1087
838static void ops_run_check(struct stripe_head *sh) 1088static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
839{ 1089{
840 /* kernel stack size limits the total number of disks */
841 int disks = sh->disks; 1090 int disks = sh->disks;
842 struct page *xor_srcs[disks]; 1091 int pd_idx = sh->pd_idx;
1092 int qd_idx = sh->qd_idx;
1093 struct page *xor_dest;
1094 struct page **xor_srcs = percpu->scribble;
843 struct dma_async_tx_descriptor *tx; 1095 struct dma_async_tx_descriptor *tx;
844 1096 struct async_submit_ctl submit;
845 int count = 0, pd_idx = sh->pd_idx, i; 1097 int count;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1098 int i;
847 1099
848 pr_debug("%s: stripe %llu\n", __func__, 1100 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector); 1101 (unsigned long long)sh->sector);
850 1102
1103 count = 0;
1104 xor_dest = sh->dev[pd_idx].page;
1105 xor_srcs[count++] = xor_dest;
851 for (i = disks; i--; ) { 1106 for (i = disks; i--; ) {
852 struct r5dev *dev = &sh->dev[i]; 1107 if (i == pd_idx || i == qd_idx)
853 if (i != pd_idx) 1108 continue;
854 xor_srcs[count++] = dev->page; 1109 xor_srcs[count++] = sh->dev[i].page;
855 } 1110 }
856 1111
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1112 init_async_submit(&submit, 0, NULL, NULL, NULL,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 1113 to_addr_conv(sh, percpu));
1114 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1115 &sh->ops.zero_sum_result, &submit);
1116
1117 atomic_inc(&sh->count);
1118 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1119 tx = async_trigger_callback(&submit);
1120}
1121
1122static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1123{
1124 struct page **srcs = percpu->scribble;
1125 struct async_submit_ctl submit;
1126 int count;
1127
1128 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1129 (unsigned long long)sh->sector, checkp);
1130
1131 count = set_syndrome_sources(srcs, sh);
1132 if (!checkp)
1133 srcs[count] = NULL;
859 1134
860 atomic_inc(&sh->count); 1135 atomic_inc(&sh->count);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 1136 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
862 ops_complete_check, sh); 1137 sh, to_addr_conv(sh, percpu));
1138 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1139 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
863} 1140}
864 1141
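The outcome of either check is reported through sh->ops.zero_sum_result; a sketch (illustrative only) of how the check_state handling in handle_parity_checks6() further down consumes it:

	/* each parity has its own result bit */
	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT)
		/* P did not verify: recompute/rewrite P */;
	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT)
		/* Q did not verify: recompute/rewrite Q */;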
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 1142static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
866{ 1143{
867 int overlap_clear = 0, i, disks = sh->disks; 1144 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL; 1145 struct dma_async_tx_descriptor *tx = NULL;
1146 raid5_conf_t *conf = sh->raid_conf;
1147 int level = conf->level;
1148 struct raid5_percpu *percpu;
1149 unsigned long cpu;
869 1150
1151 cpu = get_cpu();
1152 percpu = per_cpu_ptr(conf->percpu, cpu);
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1153 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh); 1154 ops_run_biofill(sh);
872 overlap_clear++; 1155 overlap_clear++;
873 } 1156 }
874 1157
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1158 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
876 tx = ops_run_compute5(sh); 1159 if (level < 6)
877 /* terminate the chain if postxor is not set to be run */ 1160 tx = ops_run_compute5(sh, percpu);
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1161 else {
1162 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1163 tx = ops_run_compute6_1(sh, percpu);
1164 else
1165 tx = ops_run_compute6_2(sh, percpu);
1166 }
1167 /* terminate the chain if reconstruct is not set to be run */
1168 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
879 async_tx_ack(tx); 1169 async_tx_ack(tx);
880 } 1170 }
881 1171
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1172 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
883 tx = ops_run_prexor(sh, tx); 1173 tx = ops_run_prexor(sh, percpu, tx);
884 1174
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1175 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx); 1176 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++; 1177 overlap_clear++;
888 } 1178 }
889 1179
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1180 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
891 ops_run_postxor(sh, tx); 1181 if (level < 6)
1182 ops_run_reconstruct5(sh, percpu, tx);
1183 else
1184 ops_run_reconstruct6(sh, percpu, tx);
1185 }
892 1186
893 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 1187 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
894 ops_run_check(sh); 1188 if (sh->check_state == check_state_run)
1189 ops_run_check_p(sh, percpu);
1190 else if (sh->check_state == check_state_run_q)
1191 ops_run_check_pq(sh, percpu, 0);
1192 else if (sh->check_state == check_state_run_pq)
1193 ops_run_check_pq(sh, percpu, 1);
1194 else
1195 BUG();
1196 }
895 1197
896 if (overlap_clear) 1198 if (overlap_clear)
897 for (i = disks; i--; ) { 1199 for (i = disks; i--; ) {
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
899 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1201 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap); 1202 wake_up(&sh->raid_conf->wait_for_overlap);
901 } 1203 }
1204 put_cpu();
902} 1205}
903 1206
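A minimal sketch of the per-cpu pattern introduced above: get_cpu() disables preemption, so the scribble buffer (and spare_page) of the local CPU cannot be handed to two stripes at once, and put_cpu() re-enables preemption only after every operation has been submitted:

	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	/* ... submit async operations that use percpu->scribble ... */
	put_cpu();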
904static int grow_one_stripe(raid5_conf_t *conf) 1207static int grow_one_stripe(raid5_conf_t *conf)
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
948 return 0; 1251 return 0;
949} 1252}
950 1253
1254/**
1255 * scribble_len - return the required size of the scribble region
1256 * @num - total number of disks in the array
1257 *
1258 * The size must be enough to contain:
1259 * 1/ a struct page pointer for each device in the array +2
1260 * 2/ room to convert each entry in (1) to its corresponding dma
1261 * (dma_map_page()) or page (page_address()) address.
1262 *
1263 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1264 * calculate over all devices (not just the data blocks), using zeros in place
1265 * of the P and Q blocks.
1266 */
1267static size_t scribble_len(int num)
1268{
1269 size_t len;
1270
1271 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1272
1273 return len;
1274}
1275
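Put differently, the scribble region is laid out as two back-to-back arrays, which is exactly what to_addr_conv() earlier in this patch relies on:

	/* illustrative layout for an array of 'num' devices */
	struct page **srcs = percpu->scribble;              /* num+2 page pointers     */
	addr_conv_t *addr_conv = percpu->scribble +
			sizeof(struct page *) * (num + 2);   /* num+2 addr_conv entries */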
951static int resize_stripes(raid5_conf_t *conf, int newsize) 1276static int resize_stripes(raid5_conf_t *conf, int newsize)
952{ 1277{
953 /* Make all the stripes able to hold 'newsize' devices. 1278 /* Make all the stripes able to hold 'newsize' devices.
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
976 struct stripe_head *osh, *nsh; 1301 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes); 1302 LIST_HEAD(newstripes);
978 struct disk_info *ndisks; 1303 struct disk_info *ndisks;
1304 unsigned long cpu;
979 int err; 1305 int err;
980 struct kmem_cache *sc; 1306 struct kmem_cache *sc;
981 int i; 1307 int i;
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1041 /* Step 3. 1367 /* Step 3.
1042 * At this point, we are holding all the stripes so the array 1368 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize 1369 * is completely stalled, so now is a good time to resize
1044 * conf->disks. 1370 * conf->disks and the scribble region
1045 */ 1371 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1372 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) { 1373 if (ndisks) {
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1052 } else 1378 } else
1053 err = -ENOMEM; 1379 err = -ENOMEM;
1054 1380
1381 get_online_cpus();
1382 conf->scribble_len = scribble_len(newsize);
1383 for_each_present_cpu(cpu) {
1384 struct raid5_percpu *percpu;
1385 void *scribble;
1386
1387 percpu = per_cpu_ptr(conf->percpu, cpu);
1388 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1389
1390 if (scribble) {
1391 kfree(percpu->scribble);
1392 percpu->scribble = scribble;
1393 } else {
1394 err = -ENOMEM;
1395 break;
1396 }
1397 }
1398 put_online_cpus();
1399
1055 /* Step 4, return new stripes to service */ 1400 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) { 1401 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1402 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru); 1403 list_del_init(&nsh->lru);
1404
1059 for (i=conf->raid_disks; i < newsize; i++) 1405 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) { 1406 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO); 1407 struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1594} 1940}
1595 1941
1596 1942
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1844static void 1943static void
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1944schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1846 int rcw, int expand) 1945 int rcw, int expand)
1847{ 1946{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1947 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1948 raid5_conf_t *conf = sh->raid_conf;
1949 int level = conf->level;
1849 1950
1850 if (rcw) { 1951 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and 1952 /* if we are not expanding this is a proper write request, and
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1858 } else 1959 } else
1859 sh->reconstruct_state = reconstruct_state_run; 1960 sh->reconstruct_state = reconstruct_state_run;
1860 1961
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1962 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1862 1963
1863 for (i = disks; i--; ) { 1964 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i]; 1965 struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1871 s->locked++; 1972 s->locked++;
1872 } 1973 }
1873 } 1974 }
1874 if (s->locked + 1 == disks) 1975 if (s->locked + conf->max_degraded == disks)
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1976 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1876 atomic_inc(&sh->raid_conf->pending_full_writes); 1977 atomic_inc(&conf->pending_full_writes);
1877 } else { 1978 } else {
1979 BUG_ON(level == 6);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1980 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1981 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880 1982
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1983 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1984 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1985 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1986 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1885 1987
1886 for (i = disks; i--; ) { 1988 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i]; 1989 struct r5dev *dev = &sh->dev[i];
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1899 } 2001 }
1900 } 2002 }
1901 2003
1902 /* keep the parity disk locked while asynchronous operations 2004 /* keep the parity disk(s) locked while asynchronous operations
1903 * are in flight 2005 * are in flight
1904 */ 2006 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2007 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2008 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++; 2009 s->locked++;
1908 2010
2011 if (level == 6) {
2012 int qd_idx = sh->qd_idx;
2013 struct r5dev *dev = &sh->dev[qd_idx];
2014
2015 set_bit(R5_LOCKED, &dev->flags);
2016 clear_bit(R5_UPTODATE, &dev->flags);
2017 s->locked++;
2018 }
2019
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2020 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector, 2021 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request); 2022 s->locked, s->ops_request);
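To make the new accounting concrete (an illustrative example): on a 6-disk RAID-6 array conf->max_degraded is 2, so a reconstruct-write that has locked all 4 data blocks satisfies s->locked + conf->max_degraded == disks (4 + 2 == 6) and the stripe is counted as a full write, just as s->locked + 1 == disks did for RAID-5; P and Q are then locked separately while the asynchronous operations are in flight.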
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1986 2097
1987static void end_reshape(raid5_conf_t *conf); 2098static void end_reshape(raid5_conf_t *conf);
1988 2099
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2100static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh) 2101 struct stripe_head *sh)
1998{ 2102{
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2132 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2236 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2133 set_bit(R5_Wantcompute, &dev->flags); 2237 set_bit(R5_Wantcompute, &dev->flags);
2134 sh->ops.target = disk_idx; 2238 sh->ops.target = disk_idx;
2239 sh->ops.target2 = -1;
2135 s->req_compute = 1; 2240 s->req_compute = 1;
2136 /* Careful: from this point on 'uptodate' is in the eye 2241 /* Careful: from this point on 'uptodate' is in the eye
2137 * of raid5_run_ops which services 'compute' operations 2242 * of raid_run_ops which services 'compute' operations
2138 * before writes. R5_Wantcompute flags a block that will 2243 * before writes. R5_Wantcompute flags a block that will
2139 * be R5_UPTODATE by the time it is needed for a 2244 * be R5_UPTODATE by the time it is needed for a
2140 * subsequent operation. 2245 * subsequent operation.
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
2173 set_bit(STRIPE_HANDLE, &sh->state); 2278 set_bit(STRIPE_HANDLE, &sh->state);
2174} 2279}
2175 2280
2176static void handle_stripe_fill6(struct stripe_head *sh, 2281/* fetch_block6 - checks the given member device to see if its data needs
2177 struct stripe_head_state *s, struct r6_state *r6s, 2282 * to be read or computed to satisfy a request.
2178 int disks) 2283 *
2284 * Returns 1 when no more member devices need to be checked, otherwise returns
2285 * 0 to tell the loop in handle_stripe_fill6 to continue
2286 */
2287static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2288 struct r6_state *r6s, int disk_idx, int disks)
2179{ 2289{
2180 int i; 2290 struct r5dev *dev = &sh->dev[disk_idx];
2181 for (i = disks; i--; ) { 2291 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2182 struct r5dev *dev = &sh->dev[i]; 2292 &sh->dev[r6s->failed_num[1]] };
2183 if (!test_bit(R5_LOCKED, &dev->flags) && 2293
2184 !test_bit(R5_UPTODATE, &dev->flags) && 2294 if (!test_bit(R5_LOCKED, &dev->flags) &&
2185 (dev->toread || (dev->towrite && 2295 !test_bit(R5_UPTODATE, &dev->flags) &&
2186 !test_bit(R5_OVERWRITE, &dev->flags)) || 2296 (dev->toread ||
2187 s->syncing || s->expanding || 2297 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2188 (s->failed >= 1 && 2298 s->syncing || s->expanding ||
2189 (sh->dev[r6s->failed_num[0]].toread || 2299 (s->failed >= 1 &&
2190 s->to_write)) || 2300 (fdev[0]->toread || s->to_write)) ||
2191 (s->failed >= 2 && 2301 (s->failed >= 2 &&
2192 (sh->dev[r6s->failed_num[1]].toread || 2302 (fdev[1]->toread || s->to_write)))) {
2193 s->to_write)))) { 2303 /* we would like to get this block, possibly by computing it,
2194 /* we would like to get this block, possibly 2304 * otherwise read it if the backing disk is insync
2195 * by computing it, but we might not be able to 2305 */
2306 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2307 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2308 if ((s->uptodate == disks - 1) &&
2309 (s->failed && (disk_idx == r6s->failed_num[0] ||
2310 disk_idx == r6s->failed_num[1]))) {
2311 /* have disk failed, and we're requested to fetch it;
2312 * do compute it
2196 */ 2313 */
2197 if ((s->uptodate == disks - 1) && 2314 pr_debug("Computing stripe %llu block %d\n",
2198 (s->failed && (i == r6s->failed_num[0] || 2315 (unsigned long long)sh->sector, disk_idx);
2199 i == r6s->failed_num[1]))) { 2316 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2200 pr_debug("Computing stripe %llu block %d\n", 2317 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2201 (unsigned long long)sh->sector, i); 2318 set_bit(R5_Wantcompute, &dev->flags);
2202 compute_block_1(sh, i, 0); 2319 sh->ops.target = disk_idx;
2203 s->uptodate++; 2320 sh->ops.target2 = -1; /* no 2nd target */
2204 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2321 s->req_compute = 1;
2205 /* Computing 2-failure is *very* expensive; only 2322 s->uptodate++;
2206 * do it if failed >= 2 2323 return 1;
2207 */ 2324 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2208 int other; 2325 /* Computing 2-failure is *very* expensive; only
2209 for (other = disks; other--; ) { 2326 * do it if failed >= 2
2210 if (other == i) 2327 */
2211 continue; 2328 int other;
2212 if (!test_bit(R5_UPTODATE, 2329 for (other = disks; other--; ) {
2213 &sh->dev[other].flags)) 2330 if (other == disk_idx)
2214 break; 2331 continue;
2215 } 2332 if (!test_bit(R5_UPTODATE,
2216 BUG_ON(other < 0); 2333 &sh->dev[other].flags))
2217 pr_debug("Computing stripe %llu blocks %d,%d\n", 2334 break;
2218 (unsigned long long)sh->sector,
2219 i, other);
2220 compute_block_2(sh, i, other);
2221 s->uptodate += 2;
2222 } else if (test_bit(R5_Insync, &dev->flags)) {
2223 set_bit(R5_LOCKED, &dev->flags);
2224 set_bit(R5_Wantread, &dev->flags);
2225 s->locked++;
2226 pr_debug("Reading block %d (sync=%d)\n",
2227 i, s->syncing);
2228 } 2335 }
2336 BUG_ON(other < 0);
2337 pr_debug("Computing stripe %llu blocks %d,%d\n",
2338 (unsigned long long)sh->sector,
2339 disk_idx, other);
2340 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2341 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2342 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2343 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2344 sh->ops.target = disk_idx;
2345 sh->ops.target2 = other;
2346 s->uptodate += 2;
2347 s->req_compute = 1;
2348 return 1;
2349 } else if (test_bit(R5_Insync, &dev->flags)) {
2350 set_bit(R5_LOCKED, &dev->flags);
2351 set_bit(R5_Wantread, &dev->flags);
2352 s->locked++;
2353 pr_debug("Reading block %d (sync=%d)\n",
2354 disk_idx, s->syncing);
2229 } 2355 }
2230 } 2356 }
2357
2358 return 0;
2359}
2360
2361/**
2362 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2363 */
2364static void handle_stripe_fill6(struct stripe_head *sh,
2365 struct stripe_head_state *s, struct r6_state *r6s,
2366 int disks)
2367{
2368 int i;
2369
2370 /* look for blocks to read/compute, skip this if a compute
2371 * is already in flight, or if the stripe contents are in the
2372 * midst of changing due to a write
2373 */
2374 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2375 !sh->reconstruct_state)
2376 for (i = disks; i--; )
2377 if (fetch_block6(sh, s, r6s, i, disks))
2378 break;
2231 set_bit(STRIPE_HANDLE, &sh->state); 2379 set_bit(STRIPE_HANDLE, &sh->state);
2232} 2380}
2233 2381
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2361 */ 2509 */
2362 /* since handle_stripe can be called at any time we need to handle the 2510 /* since handle_stripe can be called at any time we need to handle the
2363 * case where a compute block operation has been submitted and then a 2511 * case where a compute block operation has been submitted and then a
2364 * subsequent call wants to start a write request. raid5_run_ops only 2512 * subsequent call wants to start a write request. raid_run_ops only
2365 * handles the case where compute block and postxor are requested 2513 * handles the case where compute block and reconstruct are requested
2366 * simultaneously. If this is not the case then new writes need to be 2514 * simultaneously. If this is not the case then new writes need to be
2367 * held off until the compute completes. 2515 * held off until the compute completes.
2368 */ 2516 */
2369 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2517 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2370 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2518 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2371 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2519 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2372 schedule_reconstruction5(sh, s, rcw == 0, 0); 2520 schedule_reconstruction(sh, s, rcw == 0, 0);
2373} 2521}
2374 2522
2375static void handle_stripe_dirtying6(raid5_conf_t *conf, 2523static void handle_stripe_dirtying6(raid5_conf_t *conf,
2376 struct stripe_head *sh, struct stripe_head_state *s, 2524 struct stripe_head *sh, struct stripe_head_state *s,
2377 struct r6_state *r6s, int disks) 2525 struct r6_state *r6s, int disks)
2378{ 2526{
2379 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2527 int rcw = 0, pd_idx = sh->pd_idx, i;
2380 int qd_idx = sh->qd_idx; 2528 int qd_idx = sh->qd_idx;
2529
2530 set_bit(STRIPE_HANDLE, &sh->state);
2381 for (i = disks; i--; ) { 2531 for (i = disks; i--; ) {
2382 struct r5dev *dev = &sh->dev[i]; 2532 struct r5dev *dev = &sh->dev[i];
2383 /* Would I have to read this buffer for reconstruct_write */ 2533 /* check if we haven't enough data */
2384 if (!test_bit(R5_OVERWRITE, &dev->flags) 2534 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2385 && i != pd_idx && i != qd_idx 2535 i != pd_idx && i != qd_idx &&
2386 && (!test_bit(R5_LOCKED, &dev->flags) 2536 !test_bit(R5_LOCKED, &dev->flags) &&
2387 ) && 2537 !(test_bit(R5_UPTODATE, &dev->flags) ||
2388 !test_bit(R5_UPTODATE, &dev->flags)) { 2538 test_bit(R5_Wantcompute, &dev->flags))) {
2389 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2539 rcw++;
2390 else { 2540 if (!test_bit(R5_Insync, &dev->flags))
2391 pr_debug("raid6: must_compute: " 2541 continue; /* it's a failed drive */
2392 "disk %d flags=%#lx\n", i, dev->flags); 2542
2393 must_compute++; 2543 if (
2544 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2545 pr_debug("Read_old stripe %llu "
2546 "block %d for Reconstruct\n",
2547 (unsigned long long)sh->sector, i);
2548 set_bit(R5_LOCKED, &dev->flags);
2549 set_bit(R5_Wantread, &dev->flags);
2550 s->locked++;
2551 } else {
2552 pr_debug("Request delayed stripe %llu "
2553 "block %d for Reconstruct\n",
2554 (unsigned long long)sh->sector, i);
2555 set_bit(STRIPE_DELAYED, &sh->state);
2556 set_bit(STRIPE_HANDLE, &sh->state);
2394 } 2557 }
2395 } 2558 }
2396 } 2559 }
2397 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2398 (unsigned long long)sh->sector, rcw, must_compute);
2399 set_bit(STRIPE_HANDLE, &sh->state);
2400
2401 if (rcw > 0)
2402 /* want reconstruct write, but need to get some data */
2403 for (i = disks; i--; ) {
2404 struct r5dev *dev = &sh->dev[i];
2405 if (!test_bit(R5_OVERWRITE, &dev->flags)
2406 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2407 && !test_bit(R5_LOCKED, &dev->flags) &&
2408 !test_bit(R5_UPTODATE, &dev->flags) &&
2409 test_bit(R5_Insync, &dev->flags)) {
2410 if (
2411 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2412 pr_debug("Read_old stripe %llu "
2413 "block %d for Reconstruct\n",
2414 (unsigned long long)sh->sector, i);
2415 set_bit(R5_LOCKED, &dev->flags);
2416 set_bit(R5_Wantread, &dev->flags);
2417 s->locked++;
2418 } else {
2419 pr_debug("Request delayed stripe %llu "
2420 "block %d for Reconstruct\n",
2421 (unsigned long long)sh->sector, i);
2422 set_bit(STRIPE_DELAYED, &sh->state);
2423 set_bit(STRIPE_HANDLE, &sh->state);
2424 }
2425 }
2426 }
2427 /* now if nothing is locked, and if we have enough data, we can start a 2560 /* now if nothing is locked, and if we have enough data, we can start a
2428 * write request 2561 * write request
2429 */ 2562 */
2430 if (s->locked == 0 && rcw == 0 && 2563 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2564 s->locked == 0 && rcw == 0 &&
2431 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2565 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2432 if (must_compute > 0) { 2566 schedule_reconstruction(sh, s, 1, 0);
2433 /* We have failed blocks and need to compute them */
2434 switch (s->failed) {
2435 case 0:
2436 BUG();
2437 case 1:
2438 compute_block_1(sh, r6s->failed_num[0], 0);
2439 break;
2440 case 2:
2441 compute_block_2(sh, r6s->failed_num[0],
2442 r6s->failed_num[1]);
2443 break;
2444 default: /* This request should have been failed? */
2445 BUG();
2446 }
2447 }
2448
2449 pr_debug("Computing parity for stripe %llu\n",
2450 (unsigned long long)sh->sector);
2451 compute_parity6(sh, RECONSTRUCT_WRITE);
2452 /* now every locked buffer is ready to be written */
2453 for (i = disks; i--; )
2454 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2455 pr_debug("Writing stripe %llu block %d\n",
2456 (unsigned long long)sh->sector, i);
2457 s->locked++;
2458 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2459 }
2460 if (s->locked == disks)
2461 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2462 atomic_inc(&conf->pending_full_writes);
2463 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2464 set_bit(STRIPE_INSYNC, &sh->state);
2465
2466 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2467 atomic_dec(&conf->preread_active_stripes);
2468 if (atomic_read(&conf->preread_active_stripes) <
2469 IO_THRESHOLD)
2470 md_wakeup_thread(conf->mddev->thread);
2471 }
2472 } 2567 }
2473} 2568}
2474 2569
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2527 * we are done. Otherwise update the mismatch count and repair 2622 * we are done. Otherwise update the mismatch count and repair
2528 * parity if !MD_RECOVERY_CHECK 2623 * parity if !MD_RECOVERY_CHECK
2529 */ 2624 */
2530 if (sh->ops.zero_sum_result == 0) 2625 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2531 /* parity is correct (on disc, 2626 /* parity is correct (on disc,
2532 * not in buffer any more) 2627 * not in buffer any more)
2533 */ 2628 */
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2544 set_bit(R5_Wantcompute, 2639 set_bit(R5_Wantcompute,
2545 &sh->dev[sh->pd_idx].flags); 2640 &sh->dev[sh->pd_idx].flags);
2546 sh->ops.target = sh->pd_idx; 2641 sh->ops.target = sh->pd_idx;
2642 sh->ops.target2 = -1;
2547 s->uptodate++; 2643 s->uptodate++;
2548 } 2644 }
2549 } 2645 }
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2560 2656
2561 2657
2562static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2658static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2563 struct stripe_head_state *s, 2659 struct stripe_head_state *s,
2564 struct r6_state *r6s, struct page *tmp_page, 2660 struct r6_state *r6s, int disks)
2565 int disks)
2566{ 2661{
2567 int update_p = 0, update_q = 0;
2568 struct r5dev *dev;
2569 int pd_idx = sh->pd_idx; 2662 int pd_idx = sh->pd_idx;
2570 int qd_idx = sh->qd_idx; 2663 int qd_idx = sh->qd_idx;
2664 struct r5dev *dev;
2571 2665
2572 set_bit(STRIPE_HANDLE, &sh->state); 2666 set_bit(STRIPE_HANDLE, &sh->state);
2573 2667
2574 BUG_ON(s->failed > 2); 2668 BUG_ON(s->failed > 2);
2575 BUG_ON(s->uptodate < disks); 2669
2576 /* Want to check and possibly repair P and Q. 2670 /* Want to check and possibly repair P and Q.
2577 * However there could be one 'failed' device, in which 2671 * However there could be one 'failed' device, in which
2578 * case we can only check one of them, possibly using the 2672 * case we can only check one of them, possibly using the
2579 * other to generate missing data 2673 * other to generate missing data
2580 */ 2674 */
2581 2675
2582 /* If !tmp_page, we cannot do the calculations, 2676 switch (sh->check_state) {
2583 * but as we have set STRIPE_HANDLE, we will soon be called 2677 case check_state_idle:
2584 * by stripe_handle with a tmp_page - just wait until then. 2678 /* start a new check operation if there are < 2 failures */
2585 */
2586 if (tmp_page) {
2587 if (s->failed == r6s->q_failed) { 2679 if (s->failed == r6s->q_failed) {
2588 /* The only possible failed device holds 'Q', so it 2680 /* The only possible failed device holds Q, so it
2589 * makes sense to check P (If anything else were failed, 2681 * makes sense to check P (If anything else were failed,
2590 * we would have used P to recreate it). 2682 * we would have used P to recreate it).
2591 */ 2683 */
2592 compute_block_1(sh, pd_idx, 1); 2684 sh->check_state = check_state_run;
2593 if (!page_is_zero(sh->dev[pd_idx].page)) {
2594 compute_block_1(sh, pd_idx, 0);
2595 update_p = 1;
2596 }
2597 } 2685 }
2598 if (!r6s->q_failed && s->failed < 2) { 2686 if (!r6s->q_failed && s->failed < 2) {
2599 /* q is not failed, and we didn't use it to generate 2687 /* Q is not failed, and we didn't use it to generate
2600 * anything, so it makes sense to check it 2688 * anything, so it makes sense to check it
2601 */ 2689 */
2602 memcpy(page_address(tmp_page), 2690 if (sh->check_state == check_state_run)
2603 page_address(sh->dev[qd_idx].page), 2691 sh->check_state = check_state_run_pq;
2604 STRIPE_SIZE); 2692 else
2605 compute_parity6(sh, UPDATE_PARITY); 2693 sh->check_state = check_state_run_q;
2606 if (memcmp(page_address(tmp_page),
2607 page_address(sh->dev[qd_idx].page),
2608 STRIPE_SIZE) != 0) {
2609 clear_bit(STRIPE_INSYNC, &sh->state);
2610 update_q = 1;
2611 }
2612 } 2694 }
2613 if (update_p || update_q) { 2695
2614 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2696 /* discard potentially stale zero_sum_result */
2615 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2697 sh->ops.zero_sum_result = 0;
2616 /* don't try to repair!! */ 2698
2617 update_p = update_q = 0; 2699 if (sh->check_state == check_state_run) {
2700 /* async_xor_zero_sum destroys the contents of P */
2701 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2702 s->uptodate--;
2703 }
2704 if (sh->check_state >= check_state_run &&
2705 sh->check_state <= check_state_run_pq) {
2706 /* async_syndrome_zero_sum preserves P and Q, so
2707 * no need to mark them !uptodate here
2708 */
2709 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2710 break;
2618 } 2711 }
2619 2712
2713 /* we have 2-disk failure */
2714 BUG_ON(s->failed != 2);
2715 /* fall through */
2716 case check_state_compute_result:
2717 sh->check_state = check_state_idle;
2718
2719 /* check that a write has not made the stripe insync */
2720 if (test_bit(STRIPE_INSYNC, &sh->state))
2721 break;
2722
2620 /* now write out any block on a failed drive, 2723 /* now write out any block on a failed drive,
2621 * or P or Q if they need it 2724 * or P or Q if they were recomputed
2622 */ 2725 */
2623 2726 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2624 if (s->failed == 2) { 2727 if (s->failed == 2) {
2625 dev = &sh->dev[r6s->failed_num[1]]; 2728 dev = &sh->dev[r6s->failed_num[1]];
2626 s->locked++; 2729 s->locked++;
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2633 set_bit(R5_LOCKED, &dev->flags); 2736 set_bit(R5_LOCKED, &dev->flags);
2634 set_bit(R5_Wantwrite, &dev->flags); 2737 set_bit(R5_Wantwrite, &dev->flags);
2635 } 2738 }
2636 2739 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2637 if (update_p) {
2638 dev = &sh->dev[pd_idx]; 2740 dev = &sh->dev[pd_idx];
2639 s->locked++; 2741 s->locked++;
2640 set_bit(R5_LOCKED, &dev->flags); 2742 set_bit(R5_LOCKED, &dev->flags);
2641 set_bit(R5_Wantwrite, &dev->flags); 2743 set_bit(R5_Wantwrite, &dev->flags);
2642 } 2744 }
2643 if (update_q) { 2745 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2644 dev = &sh->dev[qd_idx]; 2746 dev = &sh->dev[qd_idx];
2645 s->locked++; 2747 s->locked++;
2646 set_bit(R5_LOCKED, &dev->flags); 2748 set_bit(R5_LOCKED, &dev->flags);
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2649 clear_bit(STRIPE_DEGRADED, &sh->state); 2751 clear_bit(STRIPE_DEGRADED, &sh->state);
2650 2752
2651 set_bit(STRIPE_INSYNC, &sh->state); 2753 set_bit(STRIPE_INSYNC, &sh->state);
2754 break;
2755 case check_state_run:
2756 case check_state_run_q:
2757 case check_state_run_pq:
2758 break; /* we will be called again upon completion */
2759 case check_state_check_result:
2760 sh->check_state = check_state_idle;
2761
2762 /* handle a successful check operation, if parity is correct
2763 * we are done. Otherwise update the mismatch count and repair
2764 * parity if !MD_RECOVERY_CHECK
2765 */
2766 if (sh->ops.zero_sum_result == 0) {
2767 /* both parities are correct */
2768 if (!s->failed)
2769 set_bit(STRIPE_INSYNC, &sh->state);
2770 else {
2771 /* in contrast to the raid5 case we can validate
2772 * parity, but still have a failure to write
2773 * back
2774 */
2775 sh->check_state = check_state_compute_result;
2776 /* Returning at this point means that we may go
2777 * off and bring p and/or q uptodate again so
2778 * we make sure to check zero_sum_result again
2779 * to verify if p or q need writeback
2780 */
2781 }
2782 } else {
2783 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2784 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2785 /* don't try to repair!! */
2786 set_bit(STRIPE_INSYNC, &sh->state);
2787 else {
2788 int *target = &sh->ops.target;
2789
2790 sh->ops.target = -1;
2791 sh->ops.target2 = -1;
2792 sh->check_state = check_state_compute_run;
2793 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2794 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2795 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2796 set_bit(R5_Wantcompute,
2797 &sh->dev[pd_idx].flags);
2798 *target = pd_idx;
2799 target = &sh->ops.target2;
2800 s->uptodate++;
2801 }
2802 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2803 set_bit(R5_Wantcompute,
2804 &sh->dev[qd_idx].flags);
2805 *target = qd_idx;
2806 s->uptodate++;
2807 }
2808 }
2809 }
2810 break;
2811 case check_state_compute_run:
2812 break;
2813 default:
2814 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2815 __func__, sh->check_state,
2816 (unsigned long long) sh->sector);
2817 BUG();
2652 } 2818 }
2653} 2819}
2654 2820
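For reference, the repair path above keys off the per-stripe zero_sum_result bitmap (enum sum_check_flags) instead of the old update_p/update_q locals. A minimal sketch of how those flags could be translated into compute targets; the helper name is illustrative only, not part of the patch:

#include <linux/async_tx.h>	/* enum sum_check_flags, SUM_CHECK_{P,Q}_RESULT */

/* Hypothetical helper mirroring the logic in handle_parity_checks6():
 * turn a parity-check result into the block indices that need to be
 * recomputed.
 */
static int parity_repair_targets(enum sum_check_flags res,
				 int pd_idx, int qd_idx,
				 int *target, int *target2)
{
	int n = 0;

	*target = *target2 = -1;
	if (res & SUM_CHECK_P_RESULT) {		/* P mismatched: rebuild P */
		*target = pd_idx;
		n++;
	}
	if (res & SUM_CHECK_Q_RESULT) {		/* Q mismatched: rebuild Q */
		if (n)
			*target2 = qd_idx;
		else
			*target = qd_idx;
		n++;
	}
	return n;				/* 0, 1 or 2 blocks to rebuild */
}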
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2666 if (i != sh->pd_idx && i != sh->qd_idx) { 2832 if (i != sh->pd_idx && i != sh->qd_idx) {
2667 int dd_idx, j; 2833 int dd_idx, j;
2668 struct stripe_head *sh2; 2834 struct stripe_head *sh2;
2835 struct async_submit_ctl submit;
2669 2836
2670 sector_t bn = compute_blocknr(sh, i, 1); 2837 sector_t bn = compute_blocknr(sh, i, 1);
2671 sector_t s = raid5_compute_sector(conf, bn, 0, 2838 sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2685 } 2852 }
2686 2853
2687 /* place all the copies on one channel */ 2854 /* place all the copies on one channel */
2855 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2688 tx = async_memcpy(sh2->dev[dd_idx].page, 2856 tx = async_memcpy(sh2->dev[dd_idx].page,
2689 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2857 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2690 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2858 &submit);
2691 2859
2692 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2860 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2693 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2861 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
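A short sketch of the async_tx calling convention used above, where dependency chaining and callbacks move into struct async_submit_ctl rather than being passed to every operation; the wrapper function is hypothetical:

#include <linux/async_tx.h>

/* Copy one page-sized block, chained after an optional dependency. */
static struct dma_async_tx_descriptor *
copy_one_block(struct page *dst, struct page *src, size_t len,
	       struct dma_async_tx_descriptor *depend_tx)
{
	struct async_submit_ctl submit;

	/* no flags, run after depend_tx, no completion callback */
	init_async_submit(&submit, 0, depend_tx, NULL, NULL, NULL);
	return async_memcpy(dst, src, 0, 0, len, &submit);
}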
@@ -2756,7 +2924,8 @@ static bool handle_stripe5(struct stripe_head *sh)
2756 rcu_read_lock(); 2924 rcu_read_lock();
2757 for (i=disks; i--; ) { 2925 for (i=disks; i--; ) {
2758 mdk_rdev_t *rdev; 2926 mdk_rdev_t *rdev;
2759 struct r5dev *dev = &sh->dev[i]; 2927
2928 dev = &sh->dev[i];
2760 clear_bit(R5_Insync, &dev->flags); 2929 clear_bit(R5_Insync, &dev->flags);
2761 2930
2762 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 2931 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
@@ -2973,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2973 /* Need to write out all blocks after computing parity */ 3142 /* Need to write out all blocks after computing parity */
2974 sh->disks = conf->raid_disks; 3143 sh->disks = conf->raid_disks;
2975 stripe_set_idx(sh->sector, conf, 0, sh); 3144 stripe_set_idx(sh->sector, conf, 0, sh);
2976 schedule_reconstruction5(sh, &s, 1, 1); 3145 schedule_reconstruction(sh, &s, 1, 1);
2977 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3146 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2978 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3147 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2979 atomic_dec(&conf->reshape_stripes); 3148 atomic_dec(&conf->reshape_stripes);
@@ -2993,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2993 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3162 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2994 3163
2995 if (s.ops_request) 3164 if (s.ops_request)
2996 raid5_run_ops(sh, s.ops_request); 3165 raid_run_ops(sh, s.ops_request);
2997 3166
2998 ops_run_io(sh, &s); 3167 ops_run_io(sh, &s);
2999 3168
@@ -3002,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh)
3002 return blocked_rdev == NULL; 3171 return blocked_rdev == NULL;
3003} 3172}
3004 3173
3005static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3174static bool handle_stripe6(struct stripe_head *sh)
3006{ 3175{
3007 raid5_conf_t *conf = sh->raid_conf; 3176 raid5_conf_t *conf = sh->raid_conf;
3008 int disks = sh->disks; 3177 int disks = sh->disks;
@@ -3014,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3014 mdk_rdev_t *blocked_rdev = NULL; 3183 mdk_rdev_t *blocked_rdev = NULL;
3015 3184
3016 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3185 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3017 "pd_idx=%d, qd_idx=%d\n", 3186 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
3017 "pd_idx=%d, qd_idx=%d\n", 3186 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
3018 (unsigned long long)sh->sector, sh->state, 3187 (unsigned long long)sh->sector, sh->state,
3019 atomic_read(&sh->count), pd_idx, qd_idx); 3188 atomic_read(&sh->count), pd_idx, qd_idx,
3189 sh->check_state, sh->reconstruct_state);
3020 memset(&s, 0, sizeof(s)); 3190 memset(&s, 0, sizeof(s));
3021 3191
3022 spin_lock(&sh->lock); 3192 spin_lock(&sh->lock);
@@ -3036,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3036 3206
3037 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3207 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3038 i, dev->flags, dev->toread, dev->towrite, dev->written); 3208 i, dev->flags, dev->toread, dev->towrite, dev->written);
3039 /* maybe we can reply to a read */ 3209 /* maybe we can reply to a read
3040 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 3210 *
3041 struct bio *rbi, *rbi2; 3211 * new wantfill requests are only permitted while
3042 pr_debug("Return read for disc %d\n", i); 3212 * ops_complete_biofill is guaranteed to be inactive
3043 spin_lock_irq(&conf->device_lock); 3213 */
3044 rbi = dev->toread; 3214 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3045 dev->toread = NULL; 3215 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3046 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 3216 set_bit(R5_Wantfill, &dev->flags);
3047 wake_up(&conf->wait_for_overlap);
3048 spin_unlock_irq(&conf->device_lock);
3049 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3050 copy_data(0, rbi, dev->page, dev->sector);
3051 rbi2 = r5_next_bio(rbi, dev->sector);
3052 spin_lock_irq(&conf->device_lock);
3053 if (!raid5_dec_bi_phys_segments(rbi)) {
3054 rbi->bi_next = return_bi;
3055 return_bi = rbi;
3056 }
3057 spin_unlock_irq(&conf->device_lock);
3058 rbi = rbi2;
3059 }
3060 }
3061 3217
3062 /* now count some things */ 3218 /* now count some things */
3063 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3219 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3064 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3220 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3221 if (test_bit(R5_Wantcompute, &dev->flags)) {
3222 s.compute++;
3223 BUG_ON(s.compute > 2);
3224 }
3065 3225
3066 3226 if (test_bit(R5_Wantfill, &dev->flags)) {
3067 if (dev->toread) 3227 s.to_fill++;
3228 } else if (dev->toread)
3068 s.to_read++; 3229 s.to_read++;
3069 if (dev->towrite) { 3230 if (dev->towrite) {
3070 s.to_write++; 3231 s.to_write++;
@@ -3105,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3105 blocked_rdev = NULL; 3266 blocked_rdev = NULL;
3106 } 3267 }
3107 3268
3269 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3270 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3271 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3272 }
3273
3108 pr_debug("locked=%d uptodate=%d to_read=%d" 3274 pr_debug("locked=%d uptodate=%d to_read=%d"
3109 " to_write=%d failed=%d failed_num=%d,%d\n", 3275 " to_write=%d failed=%d failed_num=%d,%d\n",
3110 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3276 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3145,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3145 * or to load a block that is being partially written. 3311 * or to load a block that is being partially written.
3146 */ 3312 */
3147 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3313 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3148 (s.syncing && (s.uptodate < disks)) || s.expanding) 3314 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3149 handle_stripe_fill6(sh, &s, &r6s, disks); 3315 handle_stripe_fill6(sh, &s, &r6s, disks);
3150 3316
3151 /* now to consider writing and what else, if anything should be read */ 3317 /* Now we check to see if any write operations have recently
3152 if (s.to_write) 3318 * completed
3319 */
3320 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3321 int qd_idx = sh->qd_idx;
3322
3323 sh->reconstruct_state = reconstruct_state_idle;
3324 /* All the 'written' buffers and the parity blocks are ready to
3325 * be written back to disk
3326 */
3327 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3328 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3329 for (i = disks; i--; ) {
3330 dev = &sh->dev[i];
3331 if (test_bit(R5_LOCKED, &dev->flags) &&
3332 (i == sh->pd_idx || i == qd_idx ||
3333 dev->written)) {
3334 pr_debug("Writing block %d\n", i);
3335 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3336 set_bit(R5_Wantwrite, &dev->flags);
3337 if (!test_bit(R5_Insync, &dev->flags) ||
3338 ((i == sh->pd_idx || i == qd_idx) &&
3339 s.failed == 0))
3340 set_bit(STRIPE_INSYNC, &sh->state);
3341 }
3342 }
3343 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3344 atomic_dec(&conf->preread_active_stripes);
3345 if (atomic_read(&conf->preread_active_stripes) <
3346 IO_THRESHOLD)
3347 md_wakeup_thread(conf->mddev->thread);
3348 }
3349 }
3350
3351 /* Now to consider new write requests and what else, if anything
3352 * should be read. We do not handle new writes when:
3353 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3354 * 2/ A 'check' operation is in flight, as it may clobber the parity
3355 * block.
3356 */
3357 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3153 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3358 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3154 3359
3155 /* maybe we need to check and possibly fix the parity for this stripe 3360 /* maybe we need to check and possibly fix the parity for this stripe
3156 * Any reads will already have been scheduled, so we just see if enough 3361 * Any reads will already have been scheduled, so we just see if enough
3157 * data is available 3362 * data is available. The parity check is held off while parity
3363 * dependent operations are in flight.
3158 */ 3364 */
3159 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3365 if (sh->check_state ||
3160 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3366 (s.syncing && s.locked == 0 &&
3367 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3368 !test_bit(STRIPE_INSYNC, &sh->state)))
3369 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3161 3370
3162 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3371 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3163 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3372 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3178,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3178 set_bit(R5_Wantwrite, &dev->flags); 3387 set_bit(R5_Wantwrite, &dev->flags);
3179 set_bit(R5_ReWrite, &dev->flags); 3388 set_bit(R5_ReWrite, &dev->flags);
3180 set_bit(R5_LOCKED, &dev->flags); 3389 set_bit(R5_LOCKED, &dev->flags);
3390 s.locked++;
3181 } else { 3391 } else {
3182 /* let's read it back */ 3392 /* let's read it back */
3183 set_bit(R5_Wantread, &dev->flags); 3393 set_bit(R5_Wantread, &dev->flags);
3184 set_bit(R5_LOCKED, &dev->flags); 3394 set_bit(R5_LOCKED, &dev->flags);
3395 s.locked++;
3185 } 3396 }
3186 } 3397 }
3187 } 3398 }
3188 3399
3189 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3400 /* Finish reconstruct operations initiated by the expansion process */
3401 if (sh->reconstruct_state == reconstruct_state_result) {
3402 sh->reconstruct_state = reconstruct_state_idle;
3403 clear_bit(STRIPE_EXPANDING, &sh->state);
3404 for (i = conf->raid_disks; i--; ) {
3405 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3406 set_bit(R5_LOCKED, &sh->dev[i].flags);
3407 s.locked++;
3408 }
3409 }
3410
3411 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3412 !sh->reconstruct_state) {
3190 struct stripe_head *sh2 3413 struct stripe_head *sh2
3191 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3414 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3192 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3415 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3207,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3207 /* Need to write out all blocks after computing P&Q */ 3430 /* Need to write out all blocks after computing P&Q */
3208 sh->disks = conf->raid_disks; 3431 sh->disks = conf->raid_disks;
3209 stripe_set_idx(sh->sector, conf, 0, sh); 3432 stripe_set_idx(sh->sector, conf, 0, sh);
3210 compute_parity6(sh, RECONSTRUCT_WRITE); 3433 schedule_reconstruction(sh, &s, 1, 1);
3211 for (i = conf->raid_disks ; i-- ; ) { 3434 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3212 set_bit(R5_LOCKED, &sh->dev[i].flags);
3213 s.locked++;
3214 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3215 }
3216 clear_bit(STRIPE_EXPANDING, &sh->state);
3217 } else if (s.expanded) {
3218 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3435 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3219 atomic_dec(&conf->reshape_stripes); 3436 atomic_dec(&conf->reshape_stripes);
3220 wake_up(&conf->wait_for_overlap); 3437 wake_up(&conf->wait_for_overlap);
@@ -3232,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3232 if (unlikely(blocked_rdev)) 3449 if (unlikely(blocked_rdev))
3233 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3450 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3234 3451
3452 if (s.ops_request)
3453 raid_run_ops(sh, s.ops_request);
3454
3235 ops_run_io(sh, &s); 3455 ops_run_io(sh, &s);
3236 3456
3237 return_io(return_bi); 3457 return_io(return_bi);
@@ -3240,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3240} 3460}
3241 3461
3242/* returns true if the stripe was handled */ 3462/* returns true if the stripe was handled */
3243static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3463static bool handle_stripe(struct stripe_head *sh)
3244{ 3464{
3245 if (sh->raid_conf->level == 6) 3465 if (sh->raid_conf->level == 6)
3246 return handle_stripe6(sh, tmp_page); 3466 return handle_stripe6(sh);
3247 else 3467 else
3248 return handle_stripe5(sh); 3468 return handle_stripe5(sh);
3249} 3469}
3250 3470
3251
3252
3253static void raid5_activate_delayed(raid5_conf_t *conf) 3471static void raid5_activate_delayed(raid5_conf_t *conf)
3254{ 3472{
3255 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3473 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -3331,6 +3549,9 @@ static int raid5_congested(void *data, int bits)
3331 /* No difference between reads and writes. Just check 3549 /* No difference between reads and writes. Just check
3332 * how busy the stripe_cache is 3550 * how busy the stripe_cache is
3333 */ 3551 */
3552
3553 if (mddev_congested(mddev, bits))
3554 return 1;
3334 if (conf->inactive_blocked) 3555 if (conf->inactive_blocked)
3335 return 1; 3556 return 1;
3336 if (conf->quiesce) 3557 if (conf->quiesce)
@@ -3880,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3880 INIT_LIST_HEAD(&stripes); 4101 INIT_LIST_HEAD(&stripes);
3881 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4102 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
3882 int j; 4103 int j;
3883 int skipped = 0; 4104 int skipped_disk = 0;
3884 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4105 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
3885 set_bit(STRIPE_EXPANDING, &sh->state); 4106 set_bit(STRIPE_EXPANDING, &sh->state);
3886 atomic_inc(&conf->reshape_stripes); 4107 atomic_inc(&conf->reshape_stripes);
@@ -3896,14 +4117,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3896 continue; 4117 continue;
3897 s = compute_blocknr(sh, j, 0); 4118 s = compute_blocknr(sh, j, 0);
3898 if (s < raid5_size(mddev, 0, 0)) { 4119 if (s < raid5_size(mddev, 0, 0)) {
3899 skipped = 1; 4120 skipped_disk = 1;
3900 continue; 4121 continue;
3901 } 4122 }
3902 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4123 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
3903 set_bit(R5_Expanded, &sh->dev[j].flags); 4124 set_bit(R5_Expanded, &sh->dev[j].flags);
3904 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4125 set_bit(R5_UPTODATE, &sh->dev[j].flags);
3905 } 4126 }
3906 if (!skipped) { 4127 if (!skipped_disk) {
3907 set_bit(STRIPE_EXPAND_READY, &sh->state); 4128 set_bit(STRIPE_EXPAND_READY, &sh->state);
3908 set_bit(STRIPE_HANDLE, &sh->state); 4129 set_bit(STRIPE_HANDLE, &sh->state);
3909 } 4130 }
@@ -4057,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4057 spin_unlock(&sh->lock); 4278 spin_unlock(&sh->lock);
4058 4279
4059 /* wait for any blocked device to be handled */ 4280 /* wait for any blocked device to be handled */
4060 while(unlikely(!handle_stripe(sh, NULL))) 4281 while (unlikely(!handle_stripe(sh)))
4061 ; 4282 ;
4062 release_stripe(sh); 4283 release_stripe(sh);
4063 4284
@@ -4114,7 +4335,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4114 return handled; 4335 return handled;
4115 } 4336 }
4116 4337
4117 handle_stripe(sh, NULL); 4338 handle_stripe(sh);
4118 release_stripe(sh); 4339 release_stripe(sh);
4119 handled++; 4340 handled++;
4120 } 4341 }
@@ -4128,6 +4349,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4128 return handled; 4349 return handled;
4129} 4350}
4130 4351
4352#ifdef CONFIG_MULTICORE_RAID456
4353static void __process_stripe(void *param, async_cookie_t cookie)
4354{
4355 struct stripe_head *sh = param;
4356
4357 handle_stripe(sh);
4358 release_stripe(sh);
4359}
4360
4361static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4362{
4363 async_schedule_domain(__process_stripe, sh, domain);
4364}
4365
4366static void synchronize_stripe_processing(struct list_head *domain)
4367{
4368 async_synchronize_full_domain(domain);
4369}
4370#else
4371static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4372{
4373 handle_stripe(sh);
4374 release_stripe(sh);
4375 cond_resched();
4376}
4377
4378static void synchronize_stripe_processing(struct list_head *domain)
4379{
4380}
4381#endif
4131 4382
4132 4383
4133/* 4384/*
@@ -4142,6 +4393,7 @@ static void raid5d(mddev_t *mddev)
4142 struct stripe_head *sh; 4393 struct stripe_head *sh;
4143 raid5_conf_t *conf = mddev->private; 4394 raid5_conf_t *conf = mddev->private;
4144 int handled; 4395 int handled;
4396 LIST_HEAD(raid_domain);
4145 4397
4146 pr_debug("+++ raid5d active\n"); 4398 pr_debug("+++ raid5d active\n");
4147 4399
@@ -4178,8 +4430,7 @@ static void raid5d(mddev_t *mddev)
4178 spin_unlock_irq(&conf->device_lock); 4430 spin_unlock_irq(&conf->device_lock);
4179 4431
4180 handled++; 4432 handled++;
4181 handle_stripe(sh, conf->spare_page); 4433 process_stripe(sh, &raid_domain);
4182 release_stripe(sh);
4183 4434
4184 spin_lock_irq(&conf->device_lock); 4435 spin_lock_irq(&conf->device_lock);
4185 } 4436 }
@@ -4187,6 +4438,7 @@ static void raid5d(mddev_t *mddev)
4187 4438
4188 spin_unlock_irq(&conf->device_lock); 4439 spin_unlock_irq(&conf->device_lock);
4189 4440
4441 synchronize_stripe_processing(&raid_domain);
4190 async_tx_issue_pending_all(); 4442 async_tx_issue_pending_all();
4191 unplug_slaves(mddev); 4443 unplug_slaves(mddev);
4192 4444
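The CONFIG_MULTICORE_RAID456 path above relies on the async domain API: stripes are queued into a private domain and raid5d waits for only that domain to drain before issuing pending DMA. A minimal sketch of the same pattern, assuming an arbitrary per-item work function:

#include <linux/async.h>
#include <linux/list.h>

static void demo_work(void *data, async_cookie_t cookie)
{
	/* per-item processing would go here */
}

static void run_batch(void **items, int n)
{
	LIST_HEAD(domain);		/* private synchronization domain */
	int i;

	for (i = 0; i < n; i++)
		async_schedule_domain(demo_work, items[i], &domain);

	/* wait for this batch only; unrelated async work is unaffected */
	async_synchronize_full_domain(&domain);
}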
@@ -4319,15 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4319 return sectors * (raid_disks - conf->max_degraded); 4571 return sectors * (raid_disks - conf->max_degraded);
4320} 4572}
4321 4573
4574static void raid5_free_percpu(raid5_conf_t *conf)
4575{
4576 struct raid5_percpu *percpu;
4577 unsigned long cpu;
4578
4579 if (!conf->percpu)
4580 return;
4581
4582 get_online_cpus();
4583 for_each_possible_cpu(cpu) {
4584 percpu = per_cpu_ptr(conf->percpu, cpu);
4585 safe_put_page(percpu->spare_page);
4586 kfree(percpu->scribble);
4587 }
4588#ifdef CONFIG_HOTPLUG_CPU
4589 unregister_cpu_notifier(&conf->cpu_notify);
4590#endif
4591 put_online_cpus();
4592
4593 free_percpu(conf->percpu);
4594}
4595
4322static void free_conf(raid5_conf_t *conf) 4596static void free_conf(raid5_conf_t *conf)
4323{ 4597{
4324 shrink_stripes(conf); 4598 shrink_stripes(conf);
4325 safe_put_page(conf->spare_page); 4599 raid5_free_percpu(conf);
4326 kfree(conf->disks); 4600 kfree(conf->disks);
4327 kfree(conf->stripe_hashtbl); 4601 kfree(conf->stripe_hashtbl);
4328 kfree(conf); 4602 kfree(conf);
4329} 4603}
4330 4604
4605#ifdef CONFIG_HOTPLUG_CPU
4606static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4607 void *hcpu)
4608{
4609 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4610 long cpu = (long)hcpu;
4611 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4612
4613 switch (action) {
4614 case CPU_UP_PREPARE:
4615 case CPU_UP_PREPARE_FROZEN:
4616 if (conf->level == 6 && !percpu->spare_page)
4617 percpu->spare_page = alloc_page(GFP_KERNEL);
4618 if (!percpu->scribble)
4619 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4620
4621 if (!percpu->scribble ||
4622 (conf->level == 6 && !percpu->spare_page)) {
4623 safe_put_page(percpu->spare_page);
4624 kfree(percpu->scribble);
4625 pr_err("%s: failed memory allocation for cpu%ld\n",
4626 __func__, cpu);
4627 return NOTIFY_BAD;
4628 }
4629 break;
4630 case CPU_DEAD:
4631 case CPU_DEAD_FROZEN:
4632 safe_put_page(percpu->spare_page);
4633 kfree(percpu->scribble);
4634 percpu->spare_page = NULL;
4635 percpu->scribble = NULL;
4636 break;
4637 default:
4638 break;
4639 }
4640 return NOTIFY_OK;
4641}
4642#endif
4643
4644static int raid5_alloc_percpu(raid5_conf_t *conf)
4645{
4646 unsigned long cpu;
4647 struct page *spare_page;
4648 struct raid5_percpu *allcpus;
4649 void *scribble;
4650 int err;
4651
4652 allcpus = alloc_percpu(struct raid5_percpu);
4653 if (!allcpus)
4654 return -ENOMEM;
4655 conf->percpu = allcpus;
4656
4657 get_online_cpus();
4658 err = 0;
4659 for_each_present_cpu(cpu) {
4660 if (conf->level == 6) {
4661 spare_page = alloc_page(GFP_KERNEL);
4662 if (!spare_page) {
4663 err = -ENOMEM;
4664 break;
4665 }
4666 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4667 }
4668 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4669 if (!scribble) {
4670 err = -ENOMEM;
4671 break;
4672 }
4673 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4674 }
4675#ifdef CONFIG_HOTPLUG_CPU
4676 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4677 conf->cpu_notify.priority = 0;
4678 if (err == 0)
4679 err = register_cpu_notifier(&conf->cpu_notify);
4680#endif
4681 put_online_cpus();
4682
4683 return err;
4684}
4685
4331static raid5_conf_t *setup_conf(mddev_t *mddev) 4686static raid5_conf_t *setup_conf(mddev_t *mddev)
4332{ 4687{
4333 raid5_conf_t *conf; 4688 raid5_conf_t *conf;
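A condensed sketch of the per-cpu resource pattern introduced above: allocate a per-cpu structure, populate it for the CPUs present at setup time, and leave later CPUs to the hotplug notifier. Names are illustrative and error unwinding is abbreviated:

#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/slab.h>

struct demo_percpu {
	void *scratch;			/* per-cpu scribble space */
};

static struct demo_percpu *demo_alloc(size_t scratch_len)
{
	struct demo_percpu *p;
	unsigned long cpu;

	p = alloc_percpu(struct demo_percpu);
	if (!p)
		return NULL;

	get_online_cpus();		/* keep the CPU set stable */
	for_each_present_cpu(cpu) {
		void *s = kmalloc(scratch_len, GFP_KERNEL);

		if (!s) {
			/* real code would also free earlier allocations */
			put_online_cpus();
			free_percpu(p);
			return NULL;
		}
		per_cpu_ptr(p, cpu)->scratch = s;
	}
	put_online_cpus();
	return p;
}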
@@ -4369,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4369 goto abort; 4724 goto abort;
4370 4725
4371 conf->raid_disks = mddev->raid_disks; 4726 conf->raid_disks = mddev->raid_disks;
4727 conf->scribble_len = scribble_len(conf->raid_disks);
4372 if (mddev->reshape_position == MaxSector) 4728 if (mddev->reshape_position == MaxSector)
4373 conf->previous_raid_disks = mddev->raid_disks; 4729 conf->previous_raid_disks = mddev->raid_disks;
4374 else 4730 else
@@ -4384,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4384 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4740 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4385 goto abort; 4741 goto abort;
4386 4742
4387 if (mddev->new_level == 6) { 4743 conf->level = mddev->new_level;
4388 conf->spare_page = alloc_page(GFP_KERNEL); 4744 if (raid5_alloc_percpu(conf) != 0)
4389 if (!conf->spare_page) 4745 goto abort;
4390 goto abort; 4746
4391 }
4392 spin_lock_init(&conf->device_lock); 4747 spin_lock_init(&conf->device_lock);
4393 init_waitqueue_head(&conf->wait_for_stripe); 4748 init_waitqueue_head(&conf->wait_for_stripe);
4394 init_waitqueue_head(&conf->wait_for_overlap); 4749 init_waitqueue_head(&conf->wait_for_overlap);
@@ -4447,7 +4802,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4447 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4802 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4448 memory, mdname(mddev)); 4803 memory, mdname(mddev));
4449 4804
4450 conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); 4805 conf->thread = md_register_thread(raid5d, mddev, NULL);
4451 if (!conf->thread) { 4806 if (!conf->thread) {
4452 printk(KERN_ERR 4807 printk(KERN_ERR
4453 "raid5: couldn't allocate thread for %s\n", 4808 "raid5: couldn't allocate thread for %s\n",
@@ -4613,7 +4968,7 @@ static int run(mddev_t *mddev)
4613 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4968 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4614 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4969 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4615 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4970 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4616 "%s_reshape"); 4971 "reshape");
4617 } 4972 }
4618 4973
4619 /* read-ahead size must cover two whole stripes, which is 4974 /* read-ahead size must cover two whole stripes, which is
@@ -5031,7 +5386,7 @@ static int raid5_start_reshape(mddev_t *mddev)
5031 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5386 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5032 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5387 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5033 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5388 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5034 "%s_reshape"); 5389 "reshape");
5035 if (!mddev->sync_thread) { 5390 if (!mddev->sync_thread) {
5036 mddev->recovery = 0; 5391 mddev->recovery = 0;
5037 spin_lock_irq(&conf->device_lock); 5392 spin_lock_irq(&conf->device_lock);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9459689c4ea0..2390e0e83daf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -2,6 +2,7 @@
2#define _RAID5_H 2#define _RAID5_H
3 3
4#include <linux/raid/xor.h> 4#include <linux/raid/xor.h>
5#include <linux/dmaengine.h>
5 6
6/* 7/*
7 * 8 *
@@ -175,7 +176,9 @@
175 */ 176 */
176enum check_states { 177enum check_states {
177 check_state_idle = 0, 178 check_state_idle = 0,
178 check_state_run, /* parity check */ 179 check_state_run, /* xor parity check */
180 check_state_run_q, /* q-parity check */
181 check_state_run_pq, /* pq dual parity check */
179 check_state_check_result, 182 check_state_check_result,
180 check_state_compute_run, /* parity repair */ 183 check_state_compute_run, /* parity repair */
181 check_state_compute_result, 184 check_state_compute_result,
@@ -215,8 +218,8 @@ struct stripe_head {
215 * @target - STRIPE_OP_COMPUTE_BLK target 218 * @target - STRIPE_OP_COMPUTE_BLK target
216 */ 219 */
217 struct stripe_operations { 220 struct stripe_operations {
218 int target; 221 int target, target2;
219 u32 zero_sum_result; 222 enum sum_check_flags zero_sum_result;
220 } ops; 223 } ops;
221 struct r5dev { 224 struct r5dev {
222 struct bio req; 225 struct bio req;
@@ -298,7 +301,7 @@ struct r6_state {
298#define STRIPE_OP_COMPUTE_BLK 1 301#define STRIPE_OP_COMPUTE_BLK 1
299#define STRIPE_OP_PREXOR 2 302#define STRIPE_OP_PREXOR 2
300#define STRIPE_OP_BIODRAIN 3 303#define STRIPE_OP_BIODRAIN 3
301#define STRIPE_OP_POSTXOR 4 304#define STRIPE_OP_RECONSTRUCT 4
302#define STRIPE_OP_CHECK 5 305#define STRIPE_OP_CHECK 5
303 306
304/* 307/*
@@ -385,8 +388,21 @@ struct raid5_private_data {
385 * (fresh device added). 388 * (fresh device added).
386 * Cleared when a sync completes. 389 * Cleared when a sync completes.
387 */ 390 */
388 391 /* per cpu variables */
389 struct page *spare_page; /* Used when checking P/Q in raid6 */ 392 struct raid5_percpu {
393 struct page *spare_page; /* Used when checking P/Q in raid6 */
394 void *scribble; /* space for constructing buffer
395 * lists and performing address
396 * conversions
397 */
398 } *percpu;
399 size_t scribble_len; /* size of scribble region must be
400 * associated with conf to handle
401 * cpu hotplug while reshaping
402 */
403#ifdef CONFIG_HOTPLUG_CPU
404 struct notifier_block cpu_notify;
405#endif
390 406
391 /* 407 /*
392 * Free stripes pool 408 * Free stripes pool
diff --git a/drivers/media/dvb/dvb-core/dvbdev.h b/drivers/media/dvb/dvb-core/dvbdev.h
index 895e2efca8a9..01fc70484743 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.h
+++ b/drivers/media/dvb/dvb-core/dvbdev.h
@@ -31,10 +31,9 @@
31#define DVB_MAJOR 212 31#define DVB_MAJOR 212
32 32
33#if defined(CONFIG_DVB_MAX_ADAPTERS) && CONFIG_DVB_MAX_ADAPTERS > 0 33#if defined(CONFIG_DVB_MAX_ADAPTERS) && CONFIG_DVB_MAX_ADAPTERS > 0
34#define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS 34 #define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS
35#else 35#else
36#warning invalid CONFIG_DVB_MAX_ADAPTERS value 36 #define DVB_MAX_ADAPTERS 8
37#define DVB_MAX_ADAPTERS 8
38#endif 37#endif
39 38
40#define DVB_UNSET (-1) 39#define DVB_UNSET (-1)
diff --git a/drivers/media/dvb/dvb-usb/Kconfig b/drivers/media/dvb/dvb-usb/Kconfig
index 0e4b97fba384..9744b0692417 100644
--- a/drivers/media/dvb/dvb-usb/Kconfig
+++ b/drivers/media/dvb/dvb-usb/Kconfig
@@ -75,7 +75,7 @@ config DVB_USB_DIB0700
75 select DVB_DIB3000MC if !DVB_FE_CUSTOMISE 75 select DVB_DIB3000MC if !DVB_FE_CUSTOMISE
76 select DVB_S5H1411 if !DVB_FE_CUSTOMISE 76 select DVB_S5H1411 if !DVB_FE_CUSTOMISE
77 select DVB_LGDT3305 if !DVB_FE_CUSTOMISE 77 select DVB_LGDT3305 if !DVB_FE_CUSTOMISE
78 select DVB_TUNER_DIB0070 if !DVB_FE_CUSTOMISE 78 select DVB_TUNER_DIB0070
79 select MEDIA_TUNER_MT2060 if !MEDIA_TUNER_CUSTOMISE 79 select MEDIA_TUNER_MT2060 if !MEDIA_TUNER_CUSTOMISE
80 select MEDIA_TUNER_MT2266 if !MEDIA_TUNER_CUSTOMISE 80 select MEDIA_TUNER_MT2266 if !MEDIA_TUNER_CUSTOMISE
81 select MEDIA_TUNER_XC2028 if !MEDIA_TUNER_CUSTOMISE 81 select MEDIA_TUNER_XC2028 if !MEDIA_TUNER_CUSTOMISE
diff --git a/drivers/media/video/saa7164/saa7164-api.c b/drivers/media/video/saa7164/saa7164-api.c
index bb6df1b276be..6f094a96ac81 100644
--- a/drivers/media/video/saa7164/saa7164-api.c
+++ b/drivers/media/video/saa7164/saa7164-api.c
@@ -415,7 +415,7 @@ int saa7164_api_enum_subdevs(struct saa7164_dev *dev)
415 goto out; 415 goto out;
416 } 416 }
417 417
418 if (debug & DBGLVL_API) 418 if (saa_debug & DBGLVL_API)
419 saa7164_dumphex16(dev, buf, (buflen/16)*16); 419 saa7164_dumphex16(dev, buf, (buflen/16)*16);
420 420
421 saa7164_api_dump_subdevs(dev, buf, buflen); 421 saa7164_api_dump_subdevs(dev, buf, buflen);
@@ -480,7 +480,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg,
480 480
481 dprintk(DBGLVL_API, "%s() len = %d bytes\n", __func__, len); 481 dprintk(DBGLVL_API, "%s() len = %d bytes\n", __func__, len);
482 482
483 if (debug & DBGLVL_I2C) 483 if (saa_debug & DBGLVL_I2C)
484 saa7164_dumphex16(dev, buf, 2 * 16); 484 saa7164_dumphex16(dev, buf, 2 * 16);
485 485
486 ret = saa7164_cmd_send(bus->dev, unitid, GET_CUR, 486 ret = saa7164_cmd_send(bus->dev, unitid, GET_CUR,
@@ -488,7 +488,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg,
488 if (ret != SAA_OK) 488 if (ret != SAA_OK)
489 printk(KERN_ERR "%s() error, ret(2) = 0x%x\n", __func__, ret); 489 printk(KERN_ERR "%s() error, ret(2) = 0x%x\n", __func__, ret);
490 else { 490 else {
491 if (debug & DBGLVL_I2C) 491 if (saa_debug & DBGLVL_I2C)
492 saa7164_dumphex16(dev, buf, sizeof(buf)); 492 saa7164_dumphex16(dev, buf, sizeof(buf));
493 memcpy(data, (buf + 2 * sizeof(u32) + reglen), datalen); 493 memcpy(data, (buf + 2 * sizeof(u32) + reglen), datalen);
494 } 494 }
@@ -548,7 +548,7 @@ int saa7164_api_i2c_write(struct saa7164_i2c *bus, u8 addr, u32 datalen,
548 *((u32 *)(buf + 1 * sizeof(u32))) = datalen - reglen; 548 *((u32 *)(buf + 1 * sizeof(u32))) = datalen - reglen;
549 memcpy((buf + 2 * sizeof(u32)), data, datalen); 549 memcpy((buf + 2 * sizeof(u32)), data, datalen);
550 550
551 if (debug & DBGLVL_I2C) 551 if (saa_debug & DBGLVL_I2C)
552 saa7164_dumphex16(dev, buf, sizeof(buf)); 552 saa7164_dumphex16(dev, buf, sizeof(buf));
553 553
554 ret = saa7164_cmd_send(bus->dev, unitid, SET_CUR, 554 ret = saa7164_cmd_send(bus->dev, unitid, SET_CUR,
diff --git a/drivers/media/video/saa7164/saa7164-cmd.c b/drivers/media/video/saa7164/saa7164-cmd.c
index e097f1a0969a..c45966edc0cf 100644
--- a/drivers/media/video/saa7164/saa7164-cmd.c
+++ b/drivers/media/video/saa7164/saa7164-cmd.c
@@ -250,7 +250,7 @@ int saa7164_cmd_wait(struct saa7164_dev *dev, u8 seqno)
250 unsigned long stamp; 250 unsigned long stamp;
251 int r; 251 int r;
252 252
253 if (debug >= 4) 253 if (saa_debug >= 4)
254 saa7164_bus_dump(dev); 254 saa7164_bus_dump(dev);
255 255
256 dprintk(DBGLVL_CMD, "%s(seqno=%d)\n", __func__, seqno); 256 dprintk(DBGLVL_CMD, "%s(seqno=%d)\n", __func__, seqno);
diff --git a/drivers/media/video/saa7164/saa7164-core.c b/drivers/media/video/saa7164/saa7164-core.c
index f0dbead188c8..709affc31042 100644
--- a/drivers/media/video/saa7164/saa7164-core.c
+++ b/drivers/media/video/saa7164/saa7164-core.c
@@ -45,8 +45,8 @@ MODULE_LICENSE("GPL");
45 32 bus 45 32 bus
46 */ 46 */
47 47
48unsigned int debug; 48unsigned int saa_debug;
49module_param(debug, int, 0644); 49module_param_named(debug, saa_debug, int, 0644);
50MODULE_PARM_DESC(debug, "enable debug messages"); 50MODULE_PARM_DESC(debug, "enable debug messages");
51 51
52unsigned int waitsecs = 10; 52unsigned int waitsecs = 10;
@@ -653,7 +653,7 @@ static int __devinit saa7164_initdev(struct pci_dev *pci_dev,
653 printk(KERN_ERR "%s() Unsupported board detected, " 653 printk(KERN_ERR "%s() Unsupported board detected, "
654 "registering without firmware\n", __func__); 654 "registering without firmware\n", __func__);
655 655
656 dprintk(1, "%s() parameter debug = %d\n", __func__, debug); 656 dprintk(1, "%s() parameter debug = %d\n", __func__, saa_debug);
657 dprintk(1, "%s() parameter waitsecs = %d\n", __func__, waitsecs); 657 dprintk(1, "%s() parameter waitsecs = %d\n", __func__, waitsecs);
658 658
659fail_fw: 659fail_fw:
diff --git a/drivers/media/video/saa7164/saa7164.h b/drivers/media/video/saa7164/saa7164.h
index 6753008a9c9b..42660b546f0e 100644
--- a/drivers/media/video/saa7164/saa7164.h
+++ b/drivers/media/video/saa7164/saa7164.h
@@ -375,9 +375,9 @@ extern int saa7164_buffer_dealloc(struct saa7164_tsport *port,
375 375
376/* ----------------------------------------------------------- */ 376/* ----------------------------------------------------------- */
377 377
378extern unsigned int debug; 378extern unsigned int saa_debug;
379#define dprintk(level, fmt, arg...)\ 379#define dprintk(level, fmt, arg...)\
380 do { if (debug & level)\ 380 do { if (saa_debug & level)\
381 printk(KERN_DEBUG "%s: " fmt, dev->name, ## arg);\ 381 printk(KERN_DEBUG "%s: " fmt, dev->name, ## arg);\
382 } while (0) 382 } while (0)
383 383
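The saa7164 change above renames the C symbol to saa_debug while keeping the user-visible module parameter name "debug". A minimal sketch of that technique, with an illustrative variable name:

#include <linux/module.h>
#include <linux/moduleparam.h>

static int demo_debug;					/* internal symbol */
module_param_named(debug, demo_debug, int, 0644);	/* still /sys/module/<mod>/parameters/debug */
MODULE_PARM_DESC(debug, "enable debug messages");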
diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index a5b448ea4eab..b3bf1c44d74d 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -339,9 +339,9 @@ static int h_memstick_read_dev_id(struct memstick_dev *card,
339 card->id.type = id_reg.type; 339 card->id.type = id_reg.type;
340 card->id.category = id_reg.category; 340 card->id.category = id_reg.category;
341 card->id.class = id_reg.class; 341 card->id.class = id_reg.class;
342 dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
342 } 343 }
343 complete(&card->mrq_complete); 344 complete(&card->mrq_complete);
344 dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
345 return -EAGAIN; 345 return -EAGAIN;
346 } 346 }
347} 347}
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 79689b10f937..766e21e15574 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -937,6 +937,8 @@ static int quicktest1(unsigned long arg)
937 937
938 /* Need 1K cacheline aligned that does not cross page boundary */ 938 /* Need 1K cacheline aligned that does not cross page boundary */
939 p = kmalloc(4096, 0); 939 p = kmalloc(4096, 0);
940 if (p == NULL)
941 return -ENOMEM;
940 mq = ALIGNUP(p, 1024); 942 mq = ALIGNUP(p, 1024);
941 memset(mes, 0xee, sizeof(mes)); 943 memset(mes, 0xee, sizeof(mes));
942 dw = mq; 944 dw = mq;
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
index 9cbf95bedce6..ccd4408a26c7 100644
--- a/drivers/misc/sgi-gru/gruprocfs.c
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -340,10 +340,9 @@ static struct proc_dir_entry *proc_gru __read_mostly;
340 340
341static int create_proc_file(struct proc_entry *p) 341static int create_proc_file(struct proc_entry *p)
342{ 342{
343 p->entry = create_proc_entry(p->name, p->mode, proc_gru); 343 p->entry = proc_create(p->name, p->mode, proc_gru, p->fops);
344 if (!p->entry) 344 if (!p->entry)
345 return -1; 345 return -1;
346 p->entry->proc_fops = p->fops;
347 return 0; 346 return 0;
348} 347}
349 348
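For reference, proc_create() registers the entry together with its file_operations, closing the window left by create_proc_entry() followed by a proc_fops assignment. A small sketch under assumed names:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>

static const struct file_operations demo_proc_fops = {
	.owner = THIS_MODULE,
	/* .open/.read/... as needed */
};

static int demo_proc_init(struct proc_dir_entry *parent)
{
	if (!proc_create("demo", 0444, parent, &demo_proc_fops))
		return -ENOMEM;
	return 0;
}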
diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c
index 065fa818be57..fc25586b7ee1 100644
--- a/drivers/mmc/host/atmel-mci.c
+++ b/drivers/mmc/host/atmel-mci.c
@@ -599,6 +599,7 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
599 struct scatterlist *sg; 599 struct scatterlist *sg;
600 unsigned int i; 600 unsigned int i;
601 enum dma_data_direction direction; 601 enum dma_data_direction direction;
602 unsigned int sglen;
602 603
603 /* 604 /*
604 * We don't do DMA on "complex" transfers, i.e. with 605 * We don't do DMA on "complex" transfers, i.e. with
@@ -628,11 +629,14 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
628 else 629 else
629 direction = DMA_TO_DEVICE; 630 direction = DMA_TO_DEVICE;
630 631
632 sglen = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len, direction);
633 if (sglen != data->sg_len)
634 goto unmap_exit;
631 desc = chan->device->device_prep_slave_sg(chan, 635 desc = chan->device->device_prep_slave_sg(chan,
632 data->sg, data->sg_len, direction, 636 data->sg, data->sg_len, direction,
633 DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 637 DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
634 if (!desc) 638 if (!desc)
635 return -ENOMEM; 639 goto unmap_exit;
636 640
637 host->dma.data_desc = desc; 641 host->dma.data_desc = desc;
638 desc->callback = atmci_dma_complete; 642 desc->callback = atmci_dma_complete;
@@ -643,6 +647,9 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
643 chan->device->device_issue_pending(chan); 647 chan->device->device_issue_pending(chan);
644 648
645 return 0; 649 return 0;
650unmap_exit:
651 dma_unmap_sg(&host->pdev->dev, data->sg, sglen, direction);
652 return -ENOMEM;
646} 653}
647 654
648#else /* CONFIG_MMC_ATMELMCI_DMA */ 655#else /* CONFIG_MMC_ATMELMCI_DMA */
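The atmel-mci change above adds the missing DMA mapping discipline: the scatterlist must be mapped before device_prep_slave_sg() and unmapped again on any failure. A condensed sketch of that sequence, with hypothetical naming:

#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static struct dma_async_tx_descriptor *
demo_prep_sg(struct dma_chan *chan, struct device *dev,
	     struct scatterlist *sg, unsigned int nents,
	     enum dma_data_direction dir)
{
	struct dma_async_tx_descriptor *desc;
	int mapped;

	mapped = dma_map_sg(dev, sg, nents, dir);
	if (!mapped)
		return NULL;

	desc = chan->device->device_prep_slave_sg(chan, sg, mapped, dir,
					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
	if (!desc)
		dma_unmap_sg(dev, sg, mapped, dir);	/* undo on failure */
	return desc;
}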
diff --git a/drivers/net/wireless/arlan-proc.c b/drivers/net/wireless/arlan-proc.c
index 2ab1d59870f4..a8b689635a3b 100644
--- a/drivers/net/wireless/arlan-proc.c
+++ b/drivers/net/wireless/arlan-proc.c
@@ -402,7 +402,7 @@ static int arlan_setup_card_by_book(struct net_device *dev)
402 402
403static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0"; 403static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0";
404 404
405static int arlan_sysctl_info(ctl_table * ctl, int write, struct file *filp, 405static int arlan_sysctl_info(ctl_table * ctl, int write,
406 void __user *buffer, size_t * lenp, loff_t *ppos) 406 void __user *buffer, size_t * lenp, loff_t *ppos)
407{ 407{
408 int i; 408 int i;
@@ -629,7 +629,7 @@ final:
629 *lenp = pos; 629 *lenp = pos;
630 630
631 if (!write) 631 if (!write)
632 retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); 632 retv = proc_dostring(ctl, write, buffer, lenp, ppos);
633 else 633 else
634 { 634 {
635 *lenp = 0; 635 *lenp = 0;
@@ -639,7 +639,7 @@ final:
639} 639}
640 640
641 641
642static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp, 642static int arlan_sysctl_info161719(ctl_table * ctl, int write,
643 void __user *buffer, size_t * lenp, loff_t *ppos) 643 void __user *buffer, size_t * lenp, loff_t *ppos)
644{ 644{
645 int i; 645 int i;
@@ -669,11 +669,11 @@ static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp
669 669
670final: 670final:
671 *lenp = pos; 671 *lenp = pos;
672 retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); 672 retv = proc_dostring(ctl, write, buffer, lenp, ppos);
673 return retv; 673 return retv;
674} 674}
675 675
676static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp, 676static int arlan_sysctl_infotxRing(ctl_table * ctl, int write,
677 void __user *buffer, size_t * lenp, loff_t *ppos) 677 void __user *buffer, size_t * lenp, loff_t *ppos)
678{ 678{
679 int i; 679 int i;
@@ -698,11 +698,11 @@ static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp
698 SARLBNpln(u_char, txBuffer, 0x800); 698 SARLBNpln(u_char, txBuffer, 0x800);
699final: 699final:
700 *lenp = pos; 700 *lenp = pos;
701 retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); 701 retv = proc_dostring(ctl, write, buffer, lenp, ppos);
702 return retv; 702 return retv;
703} 703}
704 704
705static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp, 705static int arlan_sysctl_inforxRing(ctl_table * ctl, int write,
706 void __user *buffer, size_t * lenp, loff_t *ppos) 706 void __user *buffer, size_t * lenp, loff_t *ppos)
707{ 707{
708 int i; 708 int i;
@@ -726,11 +726,11 @@ static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp
726 SARLBNpln(u_char, rxBuffer, 0x800); 726 SARLBNpln(u_char, rxBuffer, 0x800);
727final: 727final:
728 *lenp = pos; 728 *lenp = pos;
729 retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); 729 retv = proc_dostring(ctl, write, buffer, lenp, ppos);
730 return retv; 730 return retv;
731} 731}
732 732
733static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp, 733static int arlan_sysctl_info18(ctl_table * ctl, int write,
734 void __user *buffer, size_t * lenp, loff_t *ppos) 734 void __user *buffer, size_t * lenp, loff_t *ppos)
735{ 735{
736 int i; 736 int i;
@@ -756,7 +756,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
756 756
757final: 757final:
758 *lenp = pos; 758 *lenp = pos;
759 retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); 759 retv = proc_dostring(ctl, write, buffer, lenp, ppos);
760 return retv; 760 return retv;
761} 761}
762 762
@@ -766,7 +766,7 @@ final:
766 766
767static char conf_reset_result[200]; 767static char conf_reset_result[200];
768 768
769static int arlan_configure(ctl_table * ctl, int write, struct file *filp, 769static int arlan_configure(ctl_table * ctl, int write,
770 void __user *buffer, size_t * lenp, loff_t *ppos) 770 void __user *buffer, size_t * lenp, loff_t *ppos)
771{ 771{
772 int pos = 0; 772 int pos = 0;
@@ -788,10 +788,10 @@ static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
788 return -1; 788 return -1;
789 789
790 *lenp = pos; 790 *lenp = pos;
791 return proc_dostring(ctl, write, filp, buffer, lenp, ppos); 791 return proc_dostring(ctl, write, buffer, lenp, ppos);
792} 792}
793 793
794static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp, 794static int arlan_sysctl_reset(ctl_table * ctl, int write,
795 void __user *buffer, size_t * lenp, loff_t *ppos) 795 void __user *buffer, size_t * lenp, loff_t *ppos)
796{ 796{
797 int pos = 0; 797 int pos = 0;
@@ -811,7 +811,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
811 } else 811 } else
812 return -1; 812 return -1;
813 *lenp = pos + 3; 813 *lenp = pos + 3;
814 return proc_dostring(ctl, write, filp, buffer, lenp, ppos); 814 return proc_dostring(ctl, write, buffer, lenp, ppos);
815} 815}
816 816
817 817
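The arlan changes above (and the parport ones that follow) track the removal of the struct file * argument from sysctl proc handlers; the generic helpers such as proc_dostring() drop it as well. A minimal handler sketch in the new form:

#include <linux/sysctl.h>

static int demo_sysctl_handler(ctl_table *ctl, int write,
			       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	/* fill or consume ctl->data here, then let the generic helper
	 * move it to/from user space
	 */
	return proc_dostring(ctl, write, buffer, lenp, ppos);
}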
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 554e11f9e1ce..8eefe56f1cbe 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -31,7 +31,7 @@
31#define PARPORT_MIN_SPINTIME_VALUE 1 31#define PARPORT_MIN_SPINTIME_VALUE 1
32#define PARPORT_MAX_SPINTIME_VALUE 1000 32#define PARPORT_MAX_SPINTIME_VALUE 1000
33 33
34static int do_active_device(ctl_table *table, int write, struct file *filp, 34static int do_active_device(ctl_table *table, int write,
35 void __user *result, size_t *lenp, loff_t *ppos) 35 void __user *result, size_t *lenp, loff_t *ppos)
36{ 36{
37 struct parport *port = (struct parport *)table->extra1; 37 struct parport *port = (struct parport *)table->extra1;
@@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, struct file *filp,
68} 68}
69 69
70#ifdef CONFIG_PARPORT_1284 70#ifdef CONFIG_PARPORT_1284
71static int do_autoprobe(ctl_table *table, int write, struct file *filp, 71static int do_autoprobe(ctl_table *table, int write,
72 void __user *result, size_t *lenp, loff_t *ppos) 72 void __user *result, size_t *lenp, loff_t *ppos)
73{ 73{
74 struct parport_device_info *info = table->extra2; 74 struct parport_device_info *info = table->extra2;
@@ -111,7 +111,7 @@ static int do_autoprobe(ctl_table *table, int write, struct file *filp,
111#endif /* IEEE1284.3 support. */ 111#endif /* IEEE1284.3 support. */
112 112
113static int do_hardware_base_addr (ctl_table *table, int write, 113static int do_hardware_base_addr (ctl_table *table, int write,
114 struct file *filp, void __user *result, 114 void __user *result,
115 size_t *lenp, loff_t *ppos) 115 size_t *lenp, loff_t *ppos)
116{ 116{
117 struct parport *port = (struct parport *)table->extra1; 117 struct parport *port = (struct parport *)table->extra1;
@@ -139,7 +139,7 @@ static int do_hardware_base_addr (ctl_table *table, int write,
139} 139}
140 140
141static int do_hardware_irq (ctl_table *table, int write, 141static int do_hardware_irq (ctl_table *table, int write,
142 struct file *filp, void __user *result, 142 void __user *result,
143 size_t *lenp, loff_t *ppos) 143 size_t *lenp, loff_t *ppos)
144{ 144{
145 struct parport *port = (struct parport *)table->extra1; 145 struct parport *port = (struct parport *)table->extra1;
@@ -167,7 +167,7 @@ static int do_hardware_irq (ctl_table *table, int write,
167} 167}
168 168
169static int do_hardware_dma (ctl_table *table, int write, 169static int do_hardware_dma (ctl_table *table, int write,
170 struct file *filp, void __user *result, 170 void __user *result,
171 size_t *lenp, loff_t *ppos) 171 size_t *lenp, loff_t *ppos)
172{ 172{
173 struct parport *port = (struct parport *)table->extra1; 173 struct parport *port = (struct parport *)table->extra1;
@@ -195,7 +195,7 @@ static int do_hardware_dma (ctl_table *table, int write,
195} 195}
196 196
197static int do_hardware_modes (ctl_table *table, int write, 197static int do_hardware_modes (ctl_table *table, int write,
198 struct file *filp, void __user *result, 198 void __user *result,
199 size_t *lenp, loff_t *ppos) 199 size_t *lenp, loff_t *ppos)
200{ 200{
201 struct parport *port = (struct parport *)table->extra1; 201 struct parport *port = (struct parport *)table->extra1;
diff --git a/drivers/staging/go7007/Makefile b/drivers/staging/go7007/Makefile
index d14ea84a01f6..1301caa7495d 100644
--- a/drivers/staging/go7007/Makefile
+++ b/drivers/staging/go7007/Makefile
@@ -32,8 +32,3 @@ endif
32 32
33EXTRA_CFLAGS += -Idrivers/media/dvb/frontends 33EXTRA_CFLAGS += -Idrivers/media/dvb/frontends
34EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core 34EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core
35
36# Ubuntu 8.04 has CONFIG_SND undefined, so include lum sound/config.h too
37ifeq ($(CONFIG_SND),)
38EXTRA_CFLAGS += -include sound/config.h
39endif
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 68fa0e43b781..8c075b2416bb 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -912,6 +912,7 @@ static void sierra_release(struct usb_serial *serial)
912 } 912 }
913} 913}
914 914
915#ifdef CONFIG_PM
915static void stop_read_write_urbs(struct usb_serial *serial) 916static void stop_read_write_urbs(struct usb_serial *serial)
916{ 917{
917 int i, j; 918 int i, j;
@@ -988,6 +989,10 @@ static int sierra_resume(struct usb_serial *serial)
988 989
989 return ec ? -EIO : 0; 990 return ec ? -EIO : 0;
990} 991}
992#else
993#define sierra_suspend NULL
994#define sierra_resume NULL
995#endif
991 996
992static struct usb_serial_driver sierra_device = { 997static struct usb_serial_driver sierra_device = {
993 .driver = { 998 .driver = {
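The sierra change above compiles the suspend/resume paths only under CONFIG_PM and plugs NULL into the driver ops otherwise. A sketch of that guard pattern, with placeholder bodies:

#include <linux/usb/serial.h>

#ifdef CONFIG_PM
static int demo_suspend(struct usb_serial *serial, pm_message_t message)
{
	/* stop in-flight URBs, remember state */
	return 0;
}

static int demo_resume(struct usb_serial *serial)
{
	/* resubmit URBs */
	return 0;
}
#else
#define demo_suspend NULL
#define demo_resume NULL
#endif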
diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c
index ba3d71f5c7d0..9554ad5f9af7 100644
--- a/drivers/vlynq/vlynq.c
+++ b/drivers/vlynq/vlynq.c
@@ -702,7 +702,7 @@ static int vlynq_probe(struct platform_device *pdev)
702 dev->mem_start = mem_res->start; 702 dev->mem_start = mem_res->start;
703 dev->mem_end = mem_res->end; 703 dev->mem_end = mem_res->end;
704 704
705 len = regs_res->end - regs_res->start; 705 len = resource_size(regs_res);
706 if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) { 706 if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) {
707 printk(KERN_ERR "%s: Can't request vlynq registers\n", 707 printk(KERN_ERR "%s: Can't request vlynq registers\n",
708 dev_name(&dev->dev)); 708 dev_name(&dev->dev));
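Worth noting for the vlynq hunk: resource_size() is end - start + 1, so switching to it also fixes the off-by-one in the open-coded "end - start" length. A one-line sketch:

#include <linux/ioport.h>

static resource_size_t demo_len(const struct resource *res)
{
	return resource_size(res);	/* == res->end - res->start + 1 */
}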
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d132..3f57ce4bee5d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
19adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, 19adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
20 int create) 20 int create)
21{ 21{
22 if (block < 0)
23 goto abort_negative;
24
25 if (!create) { 22 if (!create) {
26 if (block >= inode->i_blocks) 23 if (block >= inode->i_blocks)
27 goto abort_toobig; 24 goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
34 /* don't support allocation of blocks yet */ 31 /* don't support allocation of blocks yet */
35 return -EIO; 32 return -EIO;
36 33
37abort_negative:
38 adfs_error(inode->i_sb, "block %d < 0", block);
39 return -EIO;
40
41abort_toobig: 34abort_toobig:
42 return 0; 35 return 0;
43} 36}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 442d94fe255c..b9b3bb51b1e4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1711,42 +1711,52 @@ struct elf_note_info {
1711 int numnote; 1711 int numnote;
1712}; 1712};
1713 1713
1714static int fill_note_info(struct elfhdr *elf, int phdrs, 1714static int elf_note_info_init(struct elf_note_info *info)
1715 struct elf_note_info *info,
1716 long signr, struct pt_regs *regs)
1717{ 1715{
1718#define NUM_NOTES 6 1716 memset(info, 0, sizeof(*info));
1719 struct list_head *t;
1720
1721 info->notes = NULL;
1722 info->prstatus = NULL;
1723 info->psinfo = NULL;
1724 info->fpu = NULL;
1725#ifdef ELF_CORE_COPY_XFPREGS
1726 info->xfpu = NULL;
1727#endif
1728 INIT_LIST_HEAD(&info->thread_list); 1717 INIT_LIST_HEAD(&info->thread_list);
1729 1718
1730 info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), 1719 /* Allocate space for six ELF notes */
1731 GFP_KERNEL); 1720 info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
1732 if (!info->notes) 1721 if (!info->notes)
1733 return 0; 1722 return 0;
1734 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); 1723 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
1735 if (!info->psinfo) 1724 if (!info->psinfo)
1736 return 0; 1725 goto notes_free;
1737 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); 1726 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
1738 if (!info->prstatus) 1727 if (!info->prstatus)
1739 return 0; 1728 goto psinfo_free;
1740 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); 1729 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
1741 if (!info->fpu) 1730 if (!info->fpu)
1742 return 0; 1731 goto prstatus_free;
1743#ifdef ELF_CORE_COPY_XFPREGS 1732#ifdef ELF_CORE_COPY_XFPREGS
1744 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); 1733 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
1745 if (!info->xfpu) 1734 if (!info->xfpu)
1746 return 0; 1735 goto fpu_free;
1736#endif
1737 return 1;
1738#ifdef ELF_CORE_COPY_XFPREGS
1739 fpu_free:
1740 kfree(info->fpu);
1747#endif 1741#endif
1742 prstatus_free:
1743 kfree(info->prstatus);
1744 psinfo_free:
1745 kfree(info->psinfo);
1746 notes_free:
1747 kfree(info->notes);
1748 return 0;
1749}
1750
1751static int fill_note_info(struct elfhdr *elf, int phdrs,
1752 struct elf_note_info *info,
1753 long signr, struct pt_regs *regs)
1754{
1755 struct list_head *t;
1756
1757 if (!elf_note_info_init(info))
1758 return 0;
1748 1759
1749 info->thread_status_size = 0;
1750 if (signr) { 1760 if (signr) {
1751 struct core_thread *ct; 1761 struct core_thread *ct;
1752 struct elf_thread_status *ets; 1762 struct elf_thread_status *ets;
@@ -1806,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1806#endif 1816#endif
1807 1817
1808 return 1; 1818 return 1;
1809
1810#undef NUM_NOTES
1811} 1819}
1812 1820
1813static size_t get_note_info_size(struct elf_note_info *info) 1821static size_t get_note_info_size(struct elf_note_info *info)
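The elf_note_info_init() refactor above replaces "return 0 and leak" with goto-based unwinding: each label frees everything allocated before the failure point. A small sketch of the same style, returning 1 on success and 0 on failure as the code above does (structure and names are illustrative):

#include <linux/slab.h>

struct demo_bufs {
	void *a, *b, *c;
};

static int demo_bufs_init(struct demo_bufs *d)
{
	d->a = kmalloc(64, GFP_KERNEL);
	if (!d->a)
		return 0;
	d->b = kmalloc(64, GFP_KERNEL);
	if (!d->b)
		goto a_free;
	d->c = kmalloc(64, GFP_KERNEL);
	if (!d->c)
		goto b_free;
	return 1;

b_free:
	kfree(d->b);
a_free:
	kfree(d->a);
	return 0;
}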
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 76285471073e..38502c67987c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
283 } 283 }
284 284
285 stack_size = exec_params.stack_size; 285 stack_size = exec_params.stack_size;
286 if (stack_size < interp_params.stack_size)
287 stack_size = interp_params.stack_size;
288
289 if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) 286 if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
290 executable_stack = EXSTACK_ENABLE_X; 287 executable_stack = EXSTACK_ENABLE_X;
291 else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) 288 else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
292 executable_stack = EXSTACK_DISABLE_X; 289 executable_stack = EXSTACK_DISABLE_X;
293 else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
294 executable_stack = EXSTACK_ENABLE_X;
295 else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
296 executable_stack = EXSTACK_DISABLE_X;
297 else 290 else
298 executable_stack = EXSTACK_DEFAULT; 291 executable_stack = EXSTACK_DEFAULT;
299 292
293 if (stack_size == 0) {
294 stack_size = interp_params.stack_size;
295 if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
296 executable_stack = EXSTACK_ENABLE_X;
297 else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
298 executable_stack = EXSTACK_DISABLE_X;
299 else
300 executable_stack = EXSTACK_DEFAULT;
301 }
302
300 retval = -ENOEXEC; 303 retval = -ENOEXEC;
301 if (stack_size == 0) 304 if (stack_size == 0)
302 goto error; 305 goto error;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6e..a2796651e756 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
278 ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); 278 ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
279 if (ret <= 0) 279 if (ret <= 0)
280 break; 280 break;
281 if (ret >= (unsigned long) -4096)
282 break;
283 len -= ret; 281 len -= ret;
284 282
285 strm.next_in = buf; 283 strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
335 "(%d != %d)", (unsigned) r, curid, id); 333 "(%d != %d)", (unsigned) r, curid, id);
336 goto failed; 334 goto failed;
337 } else if ( ! p->lib_list[id].loaded && 335 } else if ( ! p->lib_list[id].loaded &&
338 load_flat_shared_library(id, p) > (unsigned long) -4096) { 336 IS_ERR_VALUE(load_flat_shared_library(id, p))) {
339 printk("BINFMT_FLAT: failed to load library %d", id); 337 printk("BINFMT_FLAT: failed to load library %d", id);
340 goto failed; 338 goto failed;
341 } 339 }
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
545 textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, 543 textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
546 MAP_PRIVATE|MAP_EXECUTABLE, 0); 544 MAP_PRIVATE|MAP_EXECUTABLE, 0);
547 up_write(&current->mm->mmap_sem); 545 up_write(&current->mm->mmap_sem);
548 if (!textpos || textpos >= (unsigned long) -4096) { 546 if (!textpos || IS_ERR_VALUE(textpos)) {
549 if (!textpos) 547 if (!textpos)
550 textpos = (unsigned long) -ENOMEM; 548 textpos = (unsigned long) -ENOMEM;
551 printk("Unable to mmap process text, errno %d\n", (int)-textpos); 549 printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
560 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 558 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
561 up_write(&current->mm->mmap_sem); 559 up_write(&current->mm->mmap_sem);
562 560
563 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 561 if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
564 if (!realdatastart) 562 if (!realdatastart)
565 realdatastart = (unsigned long) -ENOMEM; 563 realdatastart = (unsigned long) -ENOMEM;
566 printk("Unable to allocate RAM for process data, errno %d\n", 564 printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
587 result = bprm->file->f_op->read(bprm->file, (char *) datapos, 585 result = bprm->file->f_op->read(bprm->file, (char *) datapos,
588 data_len + (relocs * sizeof(unsigned long)), &fpos); 586 data_len + (relocs * sizeof(unsigned long)), &fpos);
589 } 587 }
590 if (result >= (unsigned long)-4096) { 588 if (IS_ERR_VALUE(result)) {
591 printk("Unable to read data+bss, errno %d\n", (int)-result); 589 printk("Unable to read data+bss, errno %d\n", (int)-result);
592 do_munmap(current->mm, textpos, text_len); 590 do_munmap(current->mm, textpos, text_len);
593 do_munmap(current->mm, realdatastart, data_len + extra); 591 do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
607 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 605 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
608 up_write(&current->mm->mmap_sem); 606 up_write(&current->mm->mmap_sem);
609 607
610 if (!textpos || textpos >= (unsigned long) -4096) { 608 if (!textpos || IS_ERR_VALUE(textpos)) {
611 if (!textpos) 609 if (!textpos)
612 textpos = (unsigned long) -ENOMEM; 610 textpos = (unsigned long) -ENOMEM;
613 printk("Unable to allocate RAM for process text/data, errno %d\n", 611 printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
641 fpos = 0; 639 fpos = 0;
642 result = bprm->file->f_op->read(bprm->file, 640 result = bprm->file->f_op->read(bprm->file,
643 (char *) textpos, text_len, &fpos); 641 (char *) textpos, text_len, &fpos);
644 if (result < (unsigned long) -4096) 642 if (!IS_ERR_VALUE(result))
645 result = decompress_exec(bprm, text_len, (char *) datapos, 643 result = decompress_exec(bprm, text_len, (char *) datapos,
646 data_len + (relocs * sizeof(unsigned long)), 0); 644 data_len + (relocs * sizeof(unsigned long)), 0);
647 } 645 }
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
651 fpos = 0; 649 fpos = 0;
652 result = bprm->file->f_op->read(bprm->file, 650 result = bprm->file->f_op->read(bprm->file,
653 (char *) textpos, text_len, &fpos); 651 (char *) textpos, text_len, &fpos);
654 if (result < (unsigned long) -4096) { 652 if (!IS_ERR_VALUE(result)) {
655 fpos = ntohl(hdr->data_start); 653 fpos = ntohl(hdr->data_start);
656 result = bprm->file->f_op->read(bprm->file, (char *) datapos, 654 result = bprm->file->f_op->read(bprm->file, (char *) datapos,
657 data_len + (relocs * sizeof(unsigned long)), &fpos); 655 data_len + (relocs * sizeof(unsigned long)), &fpos);
658 } 656 }
659 } 657 }
660 if (result >= (unsigned long)-4096) { 658 if (IS_ERR_VALUE(result)) {
661 printk("Unable to read code+data+bss, errno %d\n",(int)-result); 659 printk("Unable to read code+data+bss, errno %d\n",(int)-result);
662 do_munmap(current->mm, textpos, text_len + data_len + extra + 660 do_munmap(current->mm, textpos, text_len + data_len + extra +
663 MAX_SHARED_LIBS * sizeof(unsigned long)); 661 MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
835 833
836 res = prepare_binprm(&bprm); 834 res = prepare_binprm(&bprm);
837 835
838 if (res <= (unsigned long)-4096) 836 if (!IS_ERR_VALUE(res))
839 res = load_flat_file(&bprm, libs, id, NULL); 837 res = load_flat_file(&bprm, libs, id, NULL);
840 838
841 abort_creds(bprm.cred); 839 abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
880 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ 878 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */
881 879
882 res = load_flat_file(bprm, &libinfo, 0, &stack_len); 880 res = load_flat_file(bprm, &libinfo, 0, &stack_len);
883 if (res > (unsigned long)-4096) 881 if (IS_ERR_VALUE(res))
884 return res; 882 return res;
885 883
886 /* Update data segment pointers for all libraries */ 884 /* Update data segment pointers for all libraries */
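
    Annotation (not part of the patch): the binfmt_flat conversion above replaces every open-coded ">= (unsigned long) -4096" test with IS_ERR_VALUE(), the helper from include/linux/err.h that checks whether an unsigned value actually encodes a negative errno in the top page of the address range (it compares against -MAX_ERRNO, i.e. -4095, so the boundary is slightly tighter than the old literal). A minimal userspace sketch of the idiom, with the address and errno values chosen purely for illustration:

    #include <stdio.h>

    #define MAX_ERRNO       4095
    #define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

    int main(void)
    {
            unsigned long addr = 0x40000000UL;       /* a plausible mmap result */
            unsigned long err  = (unsigned long)-12; /* -ENOMEM folded into the return value */

            printf("addr is error: %d\n", IS_ERR_VALUE(addr)); /* 0: a real address */
            printf("err  is error: %d\n", IS_ERR_VALUE(err));  /* 1: encodes errno 12 */
            return 0;
    }

    do_mmap() and the ->read() hooks used in load_flat_file() return exactly this kind of address-or-errno value, which is why the loader can branch on IS_ERR_VALUE() directly.
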
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9096fd0ca3ca..d154a3f365d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5269,6 +5269,7 @@ static const struct address_space_operations btrfs_aops = {
5269 .invalidatepage = btrfs_invalidatepage, 5269 .invalidatepage = btrfs_invalidatepage,
5270 .releasepage = btrfs_releasepage, 5270 .releasepage = btrfs_releasepage,
5271 .set_page_dirty = btrfs_set_page_dirty, 5271 .set_page_dirty = btrfs_set_page_dirty,
5272 .error_remove_page = generic_error_remove_page,
5272}; 5273};
5273 5274
5274static const struct address_space_operations btrfs_symlink_aops = { 5275static const struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cbc57f932d2..d6db933df2b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -264,7 +264,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
264{ 264{
265 struct char_device_struct *cd; 265 struct char_device_struct *cd;
266 struct cdev *cdev; 266 struct cdev *cdev;
267 char *s;
268 int err = -ENOMEM; 267 int err = -ENOMEM;
269 268
270 cd = __register_chrdev_region(major, baseminor, count, name); 269 cd = __register_chrdev_region(major, baseminor, count, name);
@@ -278,8 +277,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
278 cdev->owner = fops->owner; 277 cdev->owner = fops->owner;
279 cdev->ops = fops; 278 cdev->ops = fops;
280 kobject_set_name(&cdev->kobj, "%s", name); 279 kobject_set_name(&cdev->kobj, "%s", name);
281 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
282 *s = '!';
283 280
284 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); 281 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
285 if (err) 282 if (err)
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
2#define _CODA_INT_ 2#define _CODA_INT_
3 3
4struct dentry; 4struct dentry;
5struct file;
5 6
6extern struct file_system_type coda_fs_type; 7extern struct file_system_type coda_fs_type;
7extern unsigned long coda_timeout; 8extern unsigned long coda_timeout;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
63} 63}
64 64
65int drop_caches_sysctl_handler(ctl_table *table, int write, 65int drop_caches_sysctl_handler(ctl_table *table, int write,
66 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 66 void __user *buffer, size_t *length, loff_t *ppos)
67{ 67{
68 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 68 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 69 if (write) {
70 if (sysctl_drop_caches & 1) 70 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 71 drop_pagecache();
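
    Annotation (not part of the patch): this hunk is one instance of the tree-wide sysctl cleanup visible throughout the rest of this diff (fs/file_table.c, fs/proc/proc_sysctl.c and fs/xfs below): the unused struct file * argument is dropped from every ->proc_handler. A hedged kernel-style sketch of a handler written against the new prototype; the my_counter name, the refresh logic and the table entry are invented for illustration:

    #include <linux/sysctl.h>

    static int my_counter;

    /* new-style handler: (table, write, buffer, lenp, ppos), no struct file * */
    static int my_counter_handler(ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp, loff_t *ppos)
    {
            if (!write)
                    my_counter = 42;        /* placeholder: recompute before it is read */
            return proc_dointvec(table, write, buffer, lenp, ppos);
    }

    static ctl_table my_sysctl_table[] = {
            {
                    .procname     = "my_counter",
                    .data         = &my_counter,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = my_counter_handler,
            },
            { }
    };
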
diff --git a/fs/exec.c b/fs/exec.c
index 5c833c18d0d4..d49be6bc1793 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
55#include <linux/kmod.h> 55#include <linux/kmod.h>
56#include <linux/fsnotify.h> 56#include <linux/fsnotify.h>
57#include <linux/fs_struct.h> 57#include <linux/fs_struct.h>
58#include <linux/pipe_fs_i.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 61#include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
63 64
64int core_uses_pid; 65int core_uses_pid;
65char core_pattern[CORENAME_MAX_SIZE] = "core"; 66char core_pattern[CORENAME_MAX_SIZE] = "core";
67unsigned int core_pipe_limit;
66int suid_dumpable = 0; 68int suid_dumpable = 0;
67 69
68/* The maximal length of core_pattern is also specified in sysctl.c */ 70/* The maximal length of core_pattern is also specified in sysctl.c */
@@ -1393,18 +1395,16 @@ out_ret:
1393 return retval; 1395 return retval;
1394} 1396}
1395 1397
1396int set_binfmt(struct linux_binfmt *new) 1398void set_binfmt(struct linux_binfmt *new)
1397{ 1399{
1398 struct linux_binfmt *old = current->binfmt; 1400 struct mm_struct *mm = current->mm;
1399 1401
1400 if (new) { 1402 if (mm->binfmt)
1401 if (!try_module_get(new->module)) 1403 module_put(mm->binfmt->module);
1402 return -1; 1404
1403 } 1405 mm->binfmt = new;
1404 current->binfmt = new; 1406 if (new)
1405 if (old) 1407 __module_get(new->module);
1406 module_put(old->module);
1407 return 0;
1408} 1408}
1409 1409
1410EXPORT_SYMBOL(set_binfmt); 1410EXPORT_SYMBOL(set_binfmt);
@@ -1728,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
1728 return (ret >= 2) ? 2 : ret; 1728 return (ret >= 2) ? 2 : ret;
1729} 1729}
1730 1730
1731static void wait_for_dump_helpers(struct file *file)
1732{
1733 struct pipe_inode_info *pipe;
1734
1735 pipe = file->f_path.dentry->d_inode->i_pipe;
1736
1737 pipe_lock(pipe);
1738 pipe->readers++;
1739 pipe->writers--;
1740
1741 while ((pipe->readers > 1) && (!signal_pending(current))) {
1742 wake_up_interruptible_sync(&pipe->wait);
1743 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1744 pipe_wait(pipe);
1745 }
1746
1747 pipe->readers--;
1748 pipe->writers++;
1749 pipe_unlock(pipe);
1750
1751}
1752
1753
1731void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1754void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1732{ 1755{
1733 struct core_state core_state; 1756 struct core_state core_state;
@@ -1744,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1744 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; 1767 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1745 char **helper_argv = NULL; 1768 char **helper_argv = NULL;
1746 int helper_argc = 0; 1769 int helper_argc = 0;
1747 char *delimit; 1770 int dump_count = 0;
1771 static atomic_t core_dump_count = ATOMIC_INIT(0);
1748 1772
1749 audit_core_dumps(signr); 1773 audit_core_dumps(signr);
1750 1774
1751 binfmt = current->binfmt; 1775 binfmt = mm->binfmt;
1752 if (!binfmt || !binfmt->core_dump) 1776 if (!binfmt || !binfmt->core_dump)
1753 goto fail; 1777 goto fail;
1754 1778
@@ -1799,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1799 lock_kernel(); 1823 lock_kernel();
1800 ispipe = format_corename(corename, signr); 1824 ispipe = format_corename(corename, signr);
1801 unlock_kernel(); 1825 unlock_kernel();
1802 /* 1826
1803 * Don't bother to check the RLIMIT_CORE value if core_pattern points
1804 * to a pipe. Since we're not writing directly to the filesystem
1805 * RLIMIT_CORE doesn't really apply, as no actual core file will be
1806 * created unless the pipe reader choses to write out the core file
1807 * at which point file size limits and permissions will be imposed
1808 * as it does with any other process
1809 */
1810 if ((!ispipe) && (core_limit < binfmt->min_coredump)) 1827 if ((!ispipe) && (core_limit < binfmt->min_coredump))
1811 goto fail_unlock; 1828 goto fail_unlock;
1812 1829
1813 if (ispipe) { 1830 if (ispipe) {
1831 if (core_limit == 0) {
1832 /*
1833 * Normally core limits are irrelevant to pipes, since
1834 * we're not writing to the file system, but we use
1835 * core_limit of 0 here as a special value. Any
1836 * non-zero limit gets set to RLIM_INFINITY below, but
1837 * a limit of 0 skips the dump. This is a consistent
1838 * way to catch recursive crashes. We can still crash
1839 * if the core_pattern binary sets RLIM_CORE = !0
1840 * but it runs as root, and can do lots of stupid things
1841 * Note that we use task_tgid_vnr here to grab the pid
1842 * of the process group leader. That way we get the
1843 * right pid if a thread in a multi-threaded
1844 * core_pattern process dies.
1845 */
1846 printk(KERN_WARNING
1847 "Process %d(%s) has RLIMIT_CORE set to 0\n",
1848 task_tgid_vnr(current), current->comm);
1849 printk(KERN_WARNING "Aborting core\n");
1850 goto fail_unlock;
1851 }
1852
1853 dump_count = atomic_inc_return(&core_dump_count);
1854 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
1855 printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
1856 task_tgid_vnr(current), current->comm);
1857 printk(KERN_WARNING "Skipping core dump\n");
1858 goto fail_dropcount;
1859 }
1860
1814 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1861 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1815 if (!helper_argv) { 1862 if (!helper_argv) {
1816 printk(KERN_WARNING "%s failed to allocate memory\n", 1863 printk(KERN_WARNING "%s failed to allocate memory\n",
1817 __func__); 1864 __func__);
1818 goto fail_unlock; 1865 goto fail_dropcount;
1819 }
1820 /* Terminate the string before the first option */
1821 delimit = strchr(corename, ' ');
1822 if (delimit)
1823 *delimit = '\0';
1824 delimit = strrchr(helper_argv[0], '/');
1825 if (delimit)
1826 delimit++;
1827 else
1828 delimit = helper_argv[0];
1829 if (!strcmp(delimit, current->comm)) {
1830 printk(KERN_NOTICE "Recursive core dump detected, "
1831 "aborting\n");
1832 goto fail_unlock;
1833 } 1866 }
1834 1867
1835 core_limit = RLIM_INFINITY; 1868 core_limit = RLIM_INFINITY;
1836 1869
1837 /* SIGPIPE can happen, but it's just never processed */ 1870 /* SIGPIPE can happen, but it's just never processed */
1838 if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, 1871 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1839 &file)) { 1872 &file)) {
1840 printk(KERN_INFO "Core dump to %s pipe failed\n", 1873 printk(KERN_INFO "Core dump to %s pipe failed\n",
1841 corename); 1874 corename);
1842 goto fail_unlock; 1875 goto fail_dropcount;
1843 } 1876 }
1844 } else 1877 } else
1845 file = filp_open(corename, 1878 file = filp_open(corename,
1846 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1879 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1847 0600); 1880 0600);
1848 if (IS_ERR(file)) 1881 if (IS_ERR(file))
1849 goto fail_unlock; 1882 goto fail_dropcount;
1850 inode = file->f_path.dentry->d_inode; 1883 inode = file->f_path.dentry->d_inode;
1851 if (inode->i_nlink > 1) 1884 if (inode->i_nlink > 1)
1852 goto close_fail; /* multiple links - don't dump */ 1885 goto close_fail; /* multiple links - don't dump */
@@ -1875,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1875 if (retval) 1908 if (retval)
1876 current->signal->group_exit_code |= 0x80; 1909 current->signal->group_exit_code |= 0x80;
1877close_fail: 1910close_fail:
1911 if (ispipe && core_pipe_limit)
1912 wait_for_dump_helpers(file);
1878 filp_close(file, NULL); 1913 filp_close(file, NULL);
1914fail_dropcount:
1915 if (dump_count)
1916 atomic_dec(&core_dump_count);
1879fail_unlock: 1917fail_unlock:
1880 if (helper_argv) 1918 if (helper_argv)
1881 argv_free(helper_argv); 1919 argv_free(helper_argv);
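
    Annotation (not part of the patch): two userspace-visible behaviours fall out of the exec.c changes above: a piped core_pattern is now skipped when RLIMIT_CORE is 0 (the recursion guard), and the new core_pipe_limit sysctl both caps the number of concurrent dump helpers and, when non-zero, parks the crashing process in wait_for_dump_helpers() until the helper closes its end of the pipe. A hedged sketch of such a helper, reading the core image from stdin; the paths, argument layout and core_pattern line are illustrative only:

    /* Registered (for example) with:
     *   echo "|/usr/local/bin/core-catcher %p %e" > /proc/sys/kernel/core_pattern
     *   echo 4 > /proc/sys/kernel/core_pipe_limit
     */
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            char path[256], buf[4096];
            size_t n;
            FILE *out;

            /* %p and %e were expanded by format_corename() before the helper was spawned */
            snprintf(path, sizeof(path), "/var/crash/core.%s.%s",
                     argc > 2 ? argv[2] : "unknown", argc > 1 ? argv[1] : "0");
            out = fopen(path, "w");
            if (!out)
                    return 1;

            /* the dump arrives on stdin; while it stays open, the dumping task
             * waits in wait_for_dump_helpers() (when core_pipe_limit != 0) */
            while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)
                    fwrite(buf, 1, n, out);

            fclose(out);
            return 0;
    }
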
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1c1638f873a4..ade634076d0a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
819 .writepages = ext2_writepages, 819 .writepages = ext2_writepages,
820 .migratepage = buffer_migrate_page, 820 .migratepage = buffer_migrate_page,
821 .is_partially_uptodate = block_is_partially_uptodate, 821 .is_partially_uptodate = block_is_partially_uptodate,
822 .error_remove_page = generic_error_remove_page,
822}; 823};
823 824
824const struct address_space_operations ext2_aops_xip = { 825const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
837 .direct_IO = ext2_direct_IO, 838 .direct_IO = ext2_direct_IO,
838 .writepages = ext2_writepages, 839 .writepages = ext2_writepages,
839 .migratepage = buffer_migrate_page, 840 .migratepage = buffer_migrate_page,
841 .error_remove_page = generic_error_remove_page,
840}; 842};
841 843
842/* 844/*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index cd098a7b77fc..acf1b1423327 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
1830 .direct_IO = ext3_direct_IO, 1830 .direct_IO = ext3_direct_IO,
1831 .migratepage = buffer_migrate_page, 1831 .migratepage = buffer_migrate_page,
1832 .is_partially_uptodate = block_is_partially_uptodate, 1832 .is_partially_uptodate = block_is_partially_uptodate,
1833 .error_remove_page = generic_error_remove_page,
1833}; 1834};
1834 1835
1835static const struct address_space_operations ext3_writeback_aops = { 1836static const struct address_space_operations ext3_writeback_aops = {
@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
1845 .direct_IO = ext3_direct_IO, 1846 .direct_IO = ext3_direct_IO,
1846 .migratepage = buffer_migrate_page, 1847 .migratepage = buffer_migrate_page,
1847 .is_partially_uptodate = block_is_partially_uptodate, 1848 .is_partially_uptodate = block_is_partially_uptodate,
1849 .error_remove_page = generic_error_remove_page,
1848}; 1850};
1849 1851
1850static const struct address_space_operations ext3_journalled_aops = { 1852static const struct address_space_operations ext3_journalled_aops = {
@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
1859 .invalidatepage = ext3_invalidatepage, 1861 .invalidatepage = ext3_invalidatepage,
1860 .releasepage = ext3_releasepage, 1862 .releasepage = ext3_releasepage,
1861 .is_partially_uptodate = block_is_partially_uptodate, 1863 .is_partially_uptodate = block_is_partially_uptodate,
1864 .error_remove_page = generic_error_remove_page,
1862}; 1865};
1863 1866
1864void ext3_set_aops(struct inode *inode) 1867void ext3_set_aops(struct inode *inode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a798737e305..064746fad581 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3386,6 +3386,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3386 .direct_IO = ext4_direct_IO, 3386 .direct_IO = ext4_direct_IO,
3387 .migratepage = buffer_migrate_page, 3387 .migratepage = buffer_migrate_page,
3388 .is_partially_uptodate = block_is_partially_uptodate, 3388 .is_partially_uptodate = block_is_partially_uptodate,
3389 .error_remove_page = generic_error_remove_page,
3389}; 3390};
3390 3391
3391static const struct address_space_operations ext4_writeback_aops = { 3392static const struct address_space_operations ext4_writeback_aops = {
@@ -3401,6 +3402,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3401 .direct_IO = ext4_direct_IO, 3402 .direct_IO = ext4_direct_IO,
3402 .migratepage = buffer_migrate_page, 3403 .migratepage = buffer_migrate_page,
3403 .is_partially_uptodate = block_is_partially_uptodate, 3404 .is_partially_uptodate = block_is_partially_uptodate,
3405 .error_remove_page = generic_error_remove_page,
3404}; 3406};
3405 3407
3406static const struct address_space_operations ext4_journalled_aops = { 3408static const struct address_space_operations ext4_journalled_aops = {
@@ -3415,6 +3417,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3415 .invalidatepage = ext4_invalidatepage, 3417 .invalidatepage = ext4_invalidatepage,
3416 .releasepage = ext4_releasepage, 3418 .releasepage = ext4_releasepage,
3417 .is_partially_uptodate = block_is_partially_uptodate, 3419 .is_partially_uptodate = block_is_partially_uptodate,
3420 .error_remove_page = generic_error_remove_page,
3418}; 3421};
3419 3422
3420static const struct address_space_operations ext4_da_aops = { 3423static const struct address_space_operations ext4_da_aops = {
@@ -3431,6 +3434,7 @@ static const struct address_space_operations ext4_da_aops = {
3431 .direct_IO = ext4_direct_IO, 3434 .direct_IO = ext4_direct_IO,
3432 .migratepage = buffer_migrate_page, 3435 .migratepage = buffer_migrate_page,
3433 .is_partially_uptodate = block_is_partially_uptodate, 3436 .is_partially_uptodate = block_is_partially_uptodate,
3437 .error_remove_page = generic_error_remove_page,
3434}; 3438};
3435 3439
3436void ext4_set_aops(struct inode *inode) 3440void ext4_set_aops(struct inode *inode)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db97..fc089f2f7f56 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
263 return pid; 263 return pid;
264} 264}
265 265
266static int f_setown_ex(struct file *filp, unsigned long arg)
267{
268 struct f_owner_ex * __user owner_p = (void * __user)arg;
269 struct f_owner_ex owner;
270 struct pid *pid;
271 int type;
272 int ret;
273
274 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret)
276 return ret;
277
278 switch (owner.type) {
279 case F_OWNER_TID:
280 type = PIDTYPE_MAX;
281 break;
282
283 case F_OWNER_PID:
284 type = PIDTYPE_PID;
285 break;
286
287 case F_OWNER_GID:
288 type = PIDTYPE_PGID;
289 break;
290
291 default:
292 return -EINVAL;
293 }
294
295 rcu_read_lock();
296 pid = find_vpid(owner.pid);
297 if (owner.pid && !pid)
298 ret = -ESRCH;
299 else
300 ret = __f_setown(filp, pid, type, 1);
301 rcu_read_unlock();
302
303 return ret;
304}
305
306static int f_getown_ex(struct file *filp, unsigned long arg)
307{
308 struct f_owner_ex * __user owner_p = (void * __user)arg;
309 struct f_owner_ex owner;
310 int ret = 0;
311
312 read_lock(&filp->f_owner.lock);
313 owner.pid = pid_vnr(filp->f_owner.pid);
314 switch (filp->f_owner.pid_type) {
315 case PIDTYPE_MAX:
316 owner.type = F_OWNER_TID;
317 break;
318
319 case PIDTYPE_PID:
320 owner.type = F_OWNER_PID;
321 break;
322
323 case PIDTYPE_PGID:
324 owner.type = F_OWNER_GID;
325 break;
326
327 default:
328 WARN_ON(1);
329 ret = -EINVAL;
330 break;
331 }
332 read_unlock(&filp->f_owner.lock);
333
334 if (!ret)
335 ret = copy_to_user(owner_p, &owner, sizeof(owner));
336 return ret;
337}
338
266static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, 339static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
267 struct file *filp) 340 struct file *filp)
268{ 341{
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
313 case F_SETOWN: 386 case F_SETOWN:
314 err = f_setown(filp, arg, 1); 387 err = f_setown(filp, arg, 1);
315 break; 388 break;
389 case F_GETOWN_EX:
390 err = f_getown_ex(filp, arg);
391 break;
392 case F_SETOWN_EX:
393 err = f_setown_ex(filp, arg);
394 break;
316 case F_GETSIG: 395 case F_GETSIG:
317 err = filp->f_owner.signum; 396 err = filp->f_owner.signum;
318 break; 397 break;
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
428 507
429static void send_sigio_to_task(struct task_struct *p, 508static void send_sigio_to_task(struct task_struct *p,
430 struct fown_struct *fown, 509 struct fown_struct *fown,
431 int fd, 510 int fd, int reason, int group)
432 int reason)
433{ 511{
434 /* 512 /*
435 * F_SETSIG can change ->signum lockless in parallel, make 513 * F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
461 else 539 else
462 si.si_band = band_table[reason - POLL_IN]; 540 si.si_band = band_table[reason - POLL_IN];
463 si.si_fd = fd; 541 si.si_fd = fd;
464 if (!group_send_sig_info(signum, &si, p)) 542 if (!do_send_sig_info(signum, &si, p, group))
465 break; 543 break;
466 /* fall-through: fall back on the old plain SIGIO signal */ 544 /* fall-through: fall back on the old plain SIGIO signal */
467 case 0: 545 case 0:
468 group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); 546 do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
469 } 547 }
470} 548}
471 549
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
474 struct task_struct *p; 552 struct task_struct *p;
475 enum pid_type type; 553 enum pid_type type;
476 struct pid *pid; 554 struct pid *pid;
555 int group = 1;
477 556
478 read_lock(&fown->lock); 557 read_lock(&fown->lock);
558
479 type = fown->pid_type; 559 type = fown->pid_type;
560 if (type == PIDTYPE_MAX) {
561 group = 0;
562 type = PIDTYPE_PID;
563 }
564
480 pid = fown->pid; 565 pid = fown->pid;
481 if (!pid) 566 if (!pid)
482 goto out_unlock_fown; 567 goto out_unlock_fown;
483 568
484 read_lock(&tasklist_lock); 569 read_lock(&tasklist_lock);
485 do_each_pid_task(pid, type, p) { 570 do_each_pid_task(pid, type, p) {
486 send_sigio_to_task(p, fown, fd, band); 571 send_sigio_to_task(p, fown, fd, band, group);
487 } while_each_pid_task(pid, type, p); 572 } while_each_pid_task(pid, type, p);
488 read_unlock(&tasklist_lock); 573 read_unlock(&tasklist_lock);
489 out_unlock_fown: 574 out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
491} 576}
492 577
493static void send_sigurg_to_task(struct task_struct *p, 578static void send_sigurg_to_task(struct task_struct *p,
494 struct fown_struct *fown) 579 struct fown_struct *fown, int group)
495{ 580{
496 if (sigio_perm(p, fown, SIGURG)) 581 if (sigio_perm(p, fown, SIGURG))
497 group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); 582 do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
498} 583}
499 584
500int send_sigurg(struct fown_struct *fown) 585int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
502 struct task_struct *p; 587 struct task_struct *p;
503 enum pid_type type; 588 enum pid_type type;
504 struct pid *pid; 589 struct pid *pid;
590 int group = 1;
505 int ret = 0; 591 int ret = 0;
506 592
507 read_lock(&fown->lock); 593 read_lock(&fown->lock);
594
508 type = fown->pid_type; 595 type = fown->pid_type;
596 if (type == PIDTYPE_MAX) {
597 group = 0;
598 type = PIDTYPE_PID;
599 }
600
509 pid = fown->pid; 601 pid = fown->pid;
510 if (!pid) 602 if (!pid)
511 goto out_unlock_fown; 603 goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
514 606
515 read_lock(&tasklist_lock); 607 read_lock(&tasklist_lock);
516 do_each_pid_task(pid, type, p) { 608 do_each_pid_task(pid, type, p) {
517 send_sigurg_to_task(p, fown); 609 send_sigurg_to_task(p, fown, group);
518 } while_each_pid_task(pid, type, p); 610 } while_each_pid_task(pid, type, p);
519 read_unlock(&tasklist_lock); 611 read_unlock(&tasklist_lock);
520 out_unlock_fown: 612 out_unlock_fown:
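
    Annotation (not part of the patch): the F_SETOWN_EX/F_GETOWN_EX handling added above (the constants and struct f_owner_ex appear in the asm-generic/fcntl.h hunk further down) lets SIGIO be targeted at a single thread, a process, or a process group instead of the signed-pid convention of plain F_SETOWN; internally PIDTYPE_MAX acts as the "one thread only" marker, which is why send_sigio()/send_sigurg() grow the group parameter. A hedged userspace sketch; the fallback defines are only for toolchains whose headers predate this interface, and stdin is used purely as a convenient descriptor:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/types.h>

    #ifndef F_SETOWN_EX
    #define F_SETOWN_EX 12
    #define F_GETOWN_EX 13
    #define F_OWNER_TID 0
    #define F_OWNER_PID 1
    #define F_OWNER_GID 2
    struct f_owner_ex { int type; pid_t pid; };
    #endif

    int main(void)
    {
            int fd = STDIN_FILENO;
            struct f_owner_ex owner = {
                    .type = F_OWNER_TID,                   /* deliver SIGIO to this thread only */
                    .pid  = (pid_t)syscall(SYS_gettid),
            };

            if (fcntl(fd, F_SETOWN_EX, &owner) == -1) {
                    perror("F_SETOWN_EX");
                    return 1;
            }
            fcntl(fd, F_SETSIG, SIGIO);
            fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);

            if (fcntl(fd, F_GETOWN_EX, &owner) == 0)
                    printf("owner: type=%d pid=%d\n", owner.type, (int)owner.pid);
            return 0;
    }
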
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
74 * Handle nr_files sysctl 74 * Handle nr_files sysctl
75 */ 75 */
76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
77int proc_nr_files(ctl_table *table, int write, struct file *filp, 77int proc_nr_files(ctl_table *table, int write,
78 void __user *buffer, size_t *lenp, loff_t *ppos) 78 void __user *buffer, size_t *lenp, loff_t *ppos)
79{ 79{
80 files_stat.nr_files = get_nr_files(); 80 files_stat.nr_files = get_nr_files();
81 return proc_dointvec(table, write, filp, buffer, lenp, ppos); 81 return proc_dointvec(table, write, buffer, lenp, ppos);
82} 82}
83#else 83#else
84int proc_nr_files(ctl_table *table, int write, struct file *filp, 84int proc_nr_files(ctl_table *table, int write,
85 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
86{ 86{
87 return -ENOSYS; 87 return -ENOSYS;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc0..694b5d48f036 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
1135 .direct_IO = gfs2_direct_IO, 1135 .direct_IO = gfs2_direct_IO,
1136 .migratepage = buffer_migrate_page, 1136 .migratepage = buffer_migrate_page,
1137 .is_partially_uptodate = block_is_partially_uptodate, 1137 .is_partially_uptodate = block_is_partially_uptodate,
1138 .error_remove_page = generic_error_remove_page,
1138}; 1139};
1139 1140
1140static const struct address_space_operations gfs2_ordered_aops = { 1141static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
1151 .direct_IO = gfs2_direct_IO, 1152 .direct_IO = gfs2_direct_IO,
1152 .migratepage = buffer_migrate_page, 1153 .migratepage = buffer_migrate_page,
1153 .is_partially_uptodate = block_is_partially_uptodate, 1154 .is_partially_uptodate = block_is_partially_uptodate,
1155 .error_remove_page = generic_error_remove_page,
1154}; 1156};
1155 1157
1156static const struct address_space_operations gfs2_jdata_aops = { 1158static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
1166 .invalidatepage = gfs2_invalidatepage, 1168 .invalidatepage = gfs2_invalidatepage,
1167 .releasepage = gfs2_releasepage, 1169 .releasepage = gfs2_releasepage,
1168 .is_partially_uptodate = block_is_partially_uptodate, 1170 .is_partially_uptodate = block_is_partially_uptodate,
1171 .error_remove_page = generic_error_remove_page,
1169}; 1172};
1170 1173
1171void gfs2_set_aops(struct inode *inode) 1174void gfs2_set_aops(struct inode *inode)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index eba6d552d9c9..133335479c24 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -936,15 +936,9 @@ static struct file_system_type hugetlbfs_fs_type = {
936 936
937static struct vfsmount *hugetlbfs_vfsmount; 937static struct vfsmount *hugetlbfs_vfsmount;
938 938
939static int can_do_hugetlb_shm(int creat_flags) 939static int can_do_hugetlb_shm(void)
940{ 940{
941 if (creat_flags != HUGETLB_SHMFS_INODE) 941 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
942 return 0;
943 if (capable(CAP_IPC_LOCK))
944 return 1;
945 if (in_group_p(sysctl_hugetlb_shm_group))
946 return 1;
947 return 0;
948} 942}
949 943
950struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 944struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
@@ -960,7 +954,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
960 if (!hugetlbfs_vfsmount) 954 if (!hugetlbfs_vfsmount)
961 return ERR_PTR(-ENOENT); 955 return ERR_PTR(-ENOENT);
962 956
963 if (!can_do_hugetlb_shm(creat_flags)) { 957 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
964 *user = current_user(); 958 *user = current_user();
965 if (user_shm_lock(size, *user)) { 959 if (user_shm_lock(size, *user)) {
966 WARN_ONCE(1, 960 WARN_ONCE(1,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5021b75d2d1e..86d6b4db1096 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
525 .direct_IO = nfs_direct_IO, 525 .direct_IO = nfs_direct_IO,
526 .migratepage = nfs_migrate_page, 526 .migratepage = nfs_migrate_page,
527 .launder_page = nfs_launder_page, 527 .launder_page = nfs_launder_page,
528 .error_remove_page = generic_error_remove_page,
528}; 529};
529 530
530/* 531/*
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..cfce53cb65d7 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
1550 .migratepage = buffer_migrate_page, /* Move a page cache page from 1550 .migratepage = buffer_migrate_page, /* Move a page cache page from
1551 one physical page to an 1551 one physical page to an
1552 other. */ 1552 other. */
1553 .error_remove_page = generic_error_remove_page,
1553}; 1554};
1554 1555
1555/** 1556/**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
1569 .migratepage = buffer_migrate_page, /* Move a page cache page from 1570 .migratepage = buffer_migrate_page, /* Move a page cache page from
1570 one physical page to an 1571 one physical page to an
1571 other. */ 1572 other. */
1573 .error_remove_page = generic_error_remove_page,
1572}; 1574};
1573 1575
1574#ifdef NTFS_RW 1576#ifdef NTFS_RW
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 72e76062a900..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2022,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
2022 .releasepage = ocfs2_releasepage, 2022 .releasepage = ocfs2_releasepage,
2023 .migratepage = buffer_migrate_page, 2023 .migratepage = buffer_migrate_page,
2024 .is_partially_uptodate = block_is_partially_uptodate, 2024 .is_partially_uptodate = block_is_partially_uptodate,
2025 .error_remove_page = generic_error_remove_page,
2025}; 2026};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 171e052c07b3..c7bff4f603ff 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -97,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
97 "Committed_AS: %8lu kB\n" 97 "Committed_AS: %8lu kB\n"
98 "VmallocTotal: %8lu kB\n" 98 "VmallocTotal: %8lu kB\n"
99 "VmallocUsed: %8lu kB\n" 99 "VmallocUsed: %8lu kB\n"
100 "VmallocChunk: %8lu kB\n", 100 "VmallocChunk: %8lu kB\n"
101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %8lu kB\n"
103#endif
104 ,
101 K(i.totalram), 105 K(i.totalram),
102 K(i.freeram), 106 K(i.freeram),
103 K(i.bufferram), 107 K(i.bufferram),
@@ -144,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
144 (unsigned long)VMALLOC_TOTAL >> 10, 148 (unsigned long)VMALLOC_TOTAL >> 10,
145 vmi.used >> 10, 149 vmi.used >> 10,
146 vmi.largest_chunk >> 10 150 vmi.largest_chunk >> 10
151#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif
147 ); 154 );
148 155
149 hugetlb_report_meminfo(m); 156 hugetlb_report_meminfo(m);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
153 153
154 /* careful: calling conventions are nasty here */ 154 /* careful: calling conventions are nasty here */
155 res = count; 155 res = count;
156 error = table->proc_handler(table, write, filp, buf, &res, ppos); 156 error = table->proc_handler(table, write, buf, &res, ppos);
157 if (!error) 157 if (!error)
158 error = res; 158 error = res;
159out: 159out:
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 47f132df0c3f..c117fa80d1e9 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; 528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
529 529
530 root = romfs_iget(sb, pos); 530 root = romfs_iget(sb, pos);
531 if (!root) 531 if (IS_ERR(root))
532 goto error; 532 goto error;
533 533
534 sb->s_root = d_alloc_root(root); 534 sb->s_root = d_alloc_root(root);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31db..381854461b28 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1635,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
1635 .direct_IO = xfs_vm_direct_IO, 1635 .direct_IO = xfs_vm_direct_IO,
1636 .migratepage = buffer_migrate_page, 1636 .migratepage = buffer_migrate_page,
1637 .is_partially_uptodate = block_is_partially_uptodate, 1637 .is_partially_uptodate = block_is_partially_uptodate,
1638 .error_remove_page = generic_error_remove_page,
1638}; 1639};
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
26xfs_stats_clear_proc_handler( 26xfs_stats_clear_proc_handler(
27 ctl_table *ctl, 27 ctl_table *ctl,
28 int write, 28 int write,
29 struct file *filp,
30 void __user *buffer, 29 void __user *buffer,
31 size_t *lenp, 30 size_t *lenp,
32 loff_t *ppos) 31 loff_t *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
34 int c, ret, *valp = ctl->data; 33 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 34 __uint32_t vn_active;
36 35
37 ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); 36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 37
39 if (!ret && write && *valp) { 38 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 39 printk("XFS Clearing xfsstats\n");
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 4d3e48373e74..0c3dd8603927 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -73,6 +73,19 @@
73#define F_SETSIG 10 /* for sockets. */ 73#define F_SETSIG 10 /* for sockets. */
74#define F_GETSIG 11 /* for sockets. */ 74#define F_GETSIG 11 /* for sockets. */
75#endif 75#endif
76#ifndef F_SETOWN_EX
77#define F_SETOWN_EX 12
78#define F_GETOWN_EX 13
79#endif
80
81#define F_OWNER_TID 0
82#define F_OWNER_PID 1
83#define F_OWNER_GID 2
84
85struct f_owner_ex {
86 int type;
87 pid_t pid;
88};
76 89
77/* for F_[GET|SET]FL */ 90/* for F_[GET|SET]FL */
78#define FD_CLOEXEC 1 /* actually anything with low bit set goes */ 91#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index dd63bd38864b..5ee13b2fd223 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -34,6 +34,7 @@
34#define MADV_REMOVE 9 /* remove these pages & resources */ 34#define MADV_REMOVE 9 /* remove these pages & resources */
35#define MADV_DONTFORK 10 /* don't inherit across fork */ 35#define MADV_DONTFORK 10 /* don't inherit across fork */
36#define MADV_DOFORK 11 /* do inherit across fork */ 36#define MADV_DOFORK 11 /* do inherit across fork */
37#define MADV_HWPOISON 100 /* poison a page for testing */
37 38
38#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 39#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
39#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 40#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index c840719a8c59..942d30b5aab1 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -82,6 +82,7 @@ typedef struct siginfo {
82#ifdef __ARCH_SI_TRAPNO 82#ifdef __ARCH_SI_TRAPNO
83 int _trapno; /* TRAP # which caused the signal */ 83 int _trapno; /* TRAP # which caused the signal */
84#endif 84#endif
85 short _addr_lsb; /* LSB of the reported address */
85 } _sigfault; 86 } _sigfault;
86 87
87 /* SIGPOLL */ 88 /* SIGPOLL */
@@ -112,6 +113,7 @@ typedef struct siginfo {
112#ifdef __ARCH_SI_TRAPNO 113#ifdef __ARCH_SI_TRAPNO
113#define si_trapno _sifields._sigfault._trapno 114#define si_trapno _sifields._sigfault._trapno
114#endif 115#endif
116#define si_addr_lsb _sifields._sigfault._addr_lsb
115#define si_band _sifields._sigpoll._band 117#define si_band _sifields._sigpoll._band
116#define si_fd _sifields._sigpoll._fd 118#define si_fd _sifields._sigpoll._fd
117 119
@@ -192,7 +194,11 @@ typedef struct siginfo {
192#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */ 194#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */
193#define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */ 195#define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */
194#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */ 196#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */
195#define NSIGBUS 3 197/* hardware memory error consumed on a machine check: action required */
198#define BUS_MCEERR_AR (__SI_FAULT|4)
199/* hardware memory error detected in process but not consumed: action optional*/
200#define BUS_MCEERR_AO (__SI_FAULT|5)
201#define NSIGBUS 5
196 202
197/* 203/*
198 * SIGTRAP si_codes 204 * SIGTRAP si_codes
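
    Annotation (not part of the patch): BUS_MCEERR_AR/BUS_MCEERR_AO and si_addr_lsb are the userspace face of the hwpoison work elsewhere in this series: AO is an advisory SIGBUS sent to processes that merely map a corrupted page, AR means the poisoned data was actually consumed, and si_addr_lsb carries the log2 size of the affected mapping (normally the page shift). A hedged sketch of a handler, assuming libc headers new enough to expose si_addr_lsb; the local fallback defines mirror the values added above, and the recovery action is left as a comment:

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef BUS_MCEERR_AR
    #define BUS_MCEERR_AR 4
    #define BUS_MCEERR_AO 5
    #endif

    static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
            (void)sig; (void)ctx;
            if (si->si_code == BUS_MCEERR_AO || si->si_code == BUS_MCEERR_AR)
                    /* fprintf is not async-signal-safe; acceptable for a demo only */
                    fprintf(stderr, "hwpoison at %p, lsb %d (%s)\n",
                            si->si_addr, si->si_addr_lsb,
                            si->si_code == BUS_MCEERR_AR ? "action required"
                                                         : "action optional");
            /* a real handler would drop or re-fetch the affected data here */
            _exit(1);
    }

    int main(void)
    {
            struct sigaction sa;

            sa.sa_sigaction = sigbus_handler;
            sa.sa_flags = SA_SIGINFO;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGBUS, &sa, NULL);

            pause();        /* poison could be injected via madvise(MADV_HWPOISON) for testing */
            return 0;
    }
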
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 5fc2ef8d97fa..a1c486a88e88 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -58,25 +58,60 @@ struct dma_chan_ref {
58 * array. 58 * array.
59 * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a 59 * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
60 * dependency chain 60 * dependency chain
61 * @ASYNC_TX_DEP_ACK: ack the dependency descriptor. Useful for chaining. 61 * @ASYNC_TX_FENCE: specify that the next operation in the dependency
62 * chain uses this operation's result as an input
62 */ 63 */
63enum async_tx_flags { 64enum async_tx_flags {
64 ASYNC_TX_XOR_ZERO_DST = (1 << 0), 65 ASYNC_TX_XOR_ZERO_DST = (1 << 0),
65 ASYNC_TX_XOR_DROP_DST = (1 << 1), 66 ASYNC_TX_XOR_DROP_DST = (1 << 1),
66 ASYNC_TX_ACK = (1 << 3), 67 ASYNC_TX_ACK = (1 << 2),
67 ASYNC_TX_DEP_ACK = (1 << 4), 68 ASYNC_TX_FENCE = (1 << 3),
69};
70
71/**
72 * struct async_submit_ctl - async_tx submission/completion modifiers
73 * @flags: submission modifiers
74 * @depend_tx: parent dependency of the current operation being submitted
75 * @cb_fn: callback routine to run at operation completion
76 * @cb_param: parameter for the callback routine
77 * @scribble: caller provided space for dma/page address conversions
78 */
79struct async_submit_ctl {
80 enum async_tx_flags flags;
81 struct dma_async_tx_descriptor *depend_tx;
82 dma_async_tx_callback cb_fn;
83 void *cb_param;
84 void *scribble;
68}; 85};
69 86
70#ifdef CONFIG_DMA_ENGINE 87#ifdef CONFIG_DMA_ENGINE
71#define async_tx_issue_pending_all dma_issue_pending_all 88#define async_tx_issue_pending_all dma_issue_pending_all
89
90/**
91 * async_tx_issue_pending - send pending descriptor to the hardware channel
92 * @tx: descriptor handle to retrieve hardware context
93 *
94 * Note: any dependent operations will have already been issued by
95 * async_tx_channel_switch, or (in the case of no channel switch) will
96 * be already pending on this channel.
97 */
98static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
99{
100 if (likely(tx)) {
101 struct dma_chan *chan = tx->chan;
102 struct dma_device *dma = chan->device;
103
104 dma->device_issue_pending(chan);
105 }
106}
72#ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL 107#ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL
73#include <asm/async_tx.h> 108#include <asm/async_tx.h>
74#else 109#else
75#define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \ 110#define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \
76 __async_tx_find_channel(dep, type) 111 __async_tx_find_channel(dep, type)
77struct dma_chan * 112struct dma_chan *
78__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, 113__async_tx_find_channel(struct async_submit_ctl *submit,
79 enum dma_transaction_type tx_type); 114 enum dma_transaction_type tx_type);
80#endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */ 115#endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */
81#else 116#else
82static inline void async_tx_issue_pending_all(void) 117static inline void async_tx_issue_pending_all(void)
@@ -84,10 +119,16 @@ static inline void async_tx_issue_pending_all(void)
84 do { } while (0); 119 do { } while (0);
85} 120}
86 121
122static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
123{
124 do { } while (0);
125}
126
87static inline struct dma_chan * 127static inline struct dma_chan *
88async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, 128async_tx_find_channel(struct async_submit_ctl *submit,
89 enum dma_transaction_type tx_type, struct page **dst, int dst_count, 129 enum dma_transaction_type tx_type, struct page **dst,
90 struct page **src, int src_count, size_t len) 130 int dst_count, struct page **src, int src_count,
131 size_t len)
91{ 132{
92 return NULL; 133 return NULL;
93} 134}
@@ -99,46 +140,70 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
99 * @cb_fn_param: parameter to pass to the callback routine 140 * @cb_fn_param: parameter to pass to the callback routine
100 */ 141 */
101static inline void 142static inline void
102async_tx_sync_epilog(dma_async_tx_callback cb_fn, void *cb_fn_param) 143async_tx_sync_epilog(struct async_submit_ctl *submit)
103{ 144{
104 if (cb_fn) 145 if (submit->cb_fn)
105 cb_fn(cb_fn_param); 146 submit->cb_fn(submit->cb_param);
106} 147}
107 148
108void 149typedef union {
109async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, 150 unsigned long addr;
110 enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, 151 struct page *page;
111 dma_async_tx_callback cb_fn, void *cb_fn_param); 152 dma_addr_t dma;
153} addr_conv_t;
154
155static inline void
156init_async_submit(struct async_submit_ctl *args, enum async_tx_flags flags,
157 struct dma_async_tx_descriptor *tx,
158 dma_async_tx_callback cb_fn, void *cb_param,
159 addr_conv_t *scribble)
160{
161 args->flags = flags;
162 args->depend_tx = tx;
163 args->cb_fn = cb_fn;
164 args->cb_param = cb_param;
165 args->scribble = scribble;
166}
167
168void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
169 struct async_submit_ctl *submit);
112 170
113struct dma_async_tx_descriptor * 171struct dma_async_tx_descriptor *
114async_xor(struct page *dest, struct page **src_list, unsigned int offset, 172async_xor(struct page *dest, struct page **src_list, unsigned int offset,
115 int src_cnt, size_t len, enum async_tx_flags flags, 173 int src_cnt, size_t len, struct async_submit_ctl *submit);
116 struct dma_async_tx_descriptor *depend_tx,
117 dma_async_tx_callback cb_fn, void *cb_fn_param);
118 174
119struct dma_async_tx_descriptor * 175struct dma_async_tx_descriptor *
120async_xor_zero_sum(struct page *dest, struct page **src_list, 176async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
121 unsigned int offset, int src_cnt, size_t len, 177 int src_cnt, size_t len, enum sum_check_flags *result,
122 u32 *result, enum async_tx_flags flags, 178 struct async_submit_ctl *submit);
123 struct dma_async_tx_descriptor *depend_tx,
124 dma_async_tx_callback cb_fn, void *cb_fn_param);
125 179
126struct dma_async_tx_descriptor * 180struct dma_async_tx_descriptor *
127async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, 181async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
128 unsigned int src_offset, size_t len, enum async_tx_flags flags, 182 unsigned int src_offset, size_t len,
129 struct dma_async_tx_descriptor *depend_tx, 183 struct async_submit_ctl *submit);
130 dma_async_tx_callback cb_fn, void *cb_fn_param);
131 184
132struct dma_async_tx_descriptor * 185struct dma_async_tx_descriptor *
133async_memset(struct page *dest, int val, unsigned int offset, 186async_memset(struct page *dest, int val, unsigned int offset,
134 size_t len, enum async_tx_flags flags, 187 size_t len, struct async_submit_ctl *submit);
135 struct dma_async_tx_descriptor *depend_tx, 188
136 dma_async_tx_callback cb_fn, void *cb_fn_param); 189struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
190
191struct dma_async_tx_descriptor *
192async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
193 size_t len, struct async_submit_ctl *submit);
194
195struct dma_async_tx_descriptor *
196async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt,
197 size_t len, enum sum_check_flags *pqres, struct page *spare,
198 struct async_submit_ctl *submit);
199
200struct dma_async_tx_descriptor *
201async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb,
202 struct page **ptrs, struct async_submit_ctl *submit);
137 203
138struct dma_async_tx_descriptor * 204struct dma_async_tx_descriptor *
139async_trigger_callback(enum async_tx_flags flags, 205async_raid6_datap_recov(int src_num, size_t bytes, int faila,
140 struct dma_async_tx_descriptor *depend_tx, 206 struct page **ptrs, struct async_submit_ctl *submit);
141 dma_async_tx_callback cb_fn, void *cb_fn_param);
142 207
143void async_tx_quiesce(struct dma_async_tx_descriptor **tx); 208void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
144#endif /* _ASYNC_TX_H_ */ 209#endif /* _ASYNC_TX_H_ */
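
    Annotation (not part of the patch): the practical effect of this async_tx.h rework is that the (flags, depend_tx, cb_fn, cb_param) tail shared by every async_* routine is folded into struct async_submit_ctl, filled in with init_async_submit(); ASYNC_TX_DEP_ACK disappears and ordering against a dependent operation is expressed with ASYNC_TX_FENCE instead. A hedged kernel-style sketch of a caller on the new API; the function, buffer names and completion callback are invented, and the commented-out line shows the equivalent pre-patch call:

    #include <linux/async_tx.h>
    #include <linux/completion.h>

    static void my_xor_done(void *param)
    {
            complete(param);        /* wake whoever queued the parity update */
    }

    static struct dma_async_tx_descriptor *
    start_parity_update(struct page *dest, struct page **srcs, int src_cnt,
                        size_t len, struct completion *done, addr_conv_t *scribble)
    {
            struct async_submit_ctl submit;

            /* old API: async_xor(dest, srcs, 0, src_cnt, len, ASYNC_TX_ACK,
             *                    NULL, my_xor_done, done);                   */
            init_async_submit(&submit, ASYNC_TX_ACK | ASYNC_TX_FENCE, NULL,
                              my_xor_done, done, scribble);
            return async_xor(dest, srcs, 0, src_cnt, len, &submit);
    }
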
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 2046b5b8af48..aece486ac734 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -120,7 +120,7 @@ extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
120extern int prepare_bprm_creds(struct linux_binprm *bprm); 120extern int prepare_bprm_creds(struct linux_binprm *bprm);
121extern void install_exec_creds(struct linux_binprm *bprm); 121extern void install_exec_creds(struct linux_binprm *bprm);
122extern void do_coredump(long signr, int exit_code, struct pt_regs *regs); 122extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
123extern int set_binfmt(struct linux_binfmt *new); 123extern void set_binfmt(struct linux_binfmt *new);
124extern void free_bprm(struct linux_binprm *); 124extern void free_bprm(struct linux_binprm *);
125 125
126#endif /* __KERNEL__ */ 126#endif /* __KERNEL__ */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 90bba9e62286..b62bb9294d0c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,6 +141,38 @@ enum {
141 CGRP_WAIT_ON_RMDIR, 141 CGRP_WAIT_ON_RMDIR,
142}; 142};
143 143
144/* which pidlist file are we talking about? */
145enum cgroup_filetype {
146 CGROUP_FILE_PROCS,
147 CGROUP_FILE_TASKS,
148};
149
150/*
151 * A pidlist is a list of pids that virtually represents the contents of one
152 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
153 * a pair (one each for procs, tasks) for each pid namespace that's relevant
154 * to the cgroup.
155 */
156struct cgroup_pidlist {
157 /*
158 * used to find which pidlist is wanted. doesn't change as long as
159 * this particular list stays in the list.
160 */
161 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
162 /* array of xids */
163 pid_t *list;
164 /* how many elements the above list has */
165 int length;
166 /* how many files are using the current array */
167 int use_count;
168 /* each of these stored in a list by its cgroup */
169 struct list_head links;
170 /* pointer to the cgroup we belong to, for list removal purposes */
171 struct cgroup *owner;
172 /* protects the other fields */
173 struct rw_semaphore mutex;
174};
175
144struct cgroup { 176struct cgroup {
145 unsigned long flags; /* "unsigned long" so bitops work */ 177 unsigned long flags; /* "unsigned long" so bitops work */
146 178
@@ -179,11 +211,12 @@ struct cgroup {
179 */ 211 */
180 struct list_head release_list; 212 struct list_head release_list;
181 213
182 /* pids_mutex protects pids_list and cached pid arrays. */ 214 /*
183 struct rw_semaphore pids_mutex; 215 * list of pidlists, up to two for each namespace (one for procs, one
184 216 * for tasks); created on demand.
185 /* Linked list of struct cgroup_pids */ 217 */
186 struct list_head pids_list; 218 struct list_head pidlists;
219 struct mutex pidlist_mutex;
187 220
188 /* For RCU-protected deletion */ 221 /* For RCU-protected deletion */
189 struct rcu_head rcu_head; 222 struct rcu_head rcu_head;
@@ -227,6 +260,9 @@ struct css_set {
227 * during subsystem registration (at boot time). 260 * during subsystem registration (at boot time).
228 */ 261 */
229 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 262 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
263
264 /* For RCU-protected deletion */
265 struct rcu_head rcu_head;
230}; 266};
231 267
232/* 268/*
@@ -389,10 +425,11 @@ struct cgroup_subsys {
389 struct cgroup *cgrp); 425 struct cgroup *cgrp);
390 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 426 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
391 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 427 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
392 int (*can_attach)(struct cgroup_subsys *ss, 428 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
393 struct cgroup *cgrp, struct task_struct *tsk); 429 struct task_struct *tsk, bool threadgroup);
394 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 430 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
395 struct cgroup *old_cgrp, struct task_struct *tsk); 431 struct cgroup *old_cgrp, struct task_struct *tsk,
432 bool threadgroup);
396 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 433 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
397 void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); 434 void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
398 int (*populate)(struct cgroup_subsys *ss, 435 int (*populate)(struct cgroup_subsys *ss,
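
    Annotation (not part of the patch): the signature change to can_attach()/attach() is the hook side of the new "cgroup.procs" interface documented in the Documentation/cgroups update belonging to this series: the extra bool threadgroup tells a controller whether one task or a whole thread group is being moved. A hedged sketch of how a subsystem might use it; the demo_ names and the per-task policy check are invented for illustration:

    #include <linux/cgroup.h>
    #include <linux/sched.h>
    #include <linux/rcupdate.h>

    static bool demo_task_allowed(struct cgroup *cgrp, struct task_struct *t)
    {
            return true;    /* placeholder policy check, invented for this sketch */
    }

    static int demo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                               struct task_struct *tsk, bool threadgroup)
    {
            if (!demo_task_allowed(cgrp, tsk))
                    return -EPERM;

            if (threadgroup) {
                    struct task_struct *t;

                    /* a write to "cgroup.procs": every thread must pass the check */
                    rcu_read_lock();
                    list_for_each_entry_rcu(t, &tsk->thread_group, thread_group) {
                            if (!demo_task_allowed(cgrp, t)) {
                                    rcu_read_unlock();
                                    return -EPERM;
                            }
                    }
                    rcu_read_unlock();
            }
            return 0;
    }
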
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 7f627775c947..ddb7a97c78c2 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -27,8 +27,8 @@
27 * 27 *
28 * configfs Copyright (C) 2005 Oracle. All rights reserved. 28 * configfs Copyright (C) 2005 Oracle. All rights reserved.
29 * 29 *
30 * Please read Documentation/filesystems/configfs.txt before using the 30 * Please read Documentation/filesystems/configfs/configfs.txt before using
31 * configfs interface, ESPECIALLY the parts about reference counts and 31 * the configfs interface, ESPECIALLY the parts about reference counts and
32 * item destructors. 32 * item destructors.
33 */ 33 */
34 34
diff --git a/include/linux/dca.h b/include/linux/dca.h
index 9c20c7e87d0a..d27a7a05718d 100644
--- a/include/linux/dca.h
+++ b/include/linux/dca.h
@@ -20,6 +20,9 @@
20 */ 20 */
21#ifndef DCA_H 21#ifndef DCA_H
22#define DCA_H 22#define DCA_H
23
24#include <linux/pci.h>
25
23/* DCA Provider API */ 26/* DCA Provider API */
24 27
25/* DCA Notifier Interface */ 28/* DCA Notifier Interface */
@@ -36,6 +39,12 @@ struct dca_provider {
36 int id; 39 int id;
37}; 40};
38 41
42struct dca_domain {
43 struct list_head node;
44 struct list_head dca_providers;
45 struct pci_bus *pci_rc;
46};
47
39struct dca_ops { 48struct dca_ops {
40 int (*add_requester) (struct dca_provider *, struct device *); 49 int (*add_requester) (struct dca_provider *, struct device *);
41 int (*remove_requester) (struct dca_provider *, struct device *); 50 int (*remove_requester) (struct dca_provider *, struct device *);
@@ -47,7 +56,7 @@ struct dca_ops {
47struct dca_provider *alloc_dca_provider(struct dca_ops *ops, int priv_size); 56struct dca_provider *alloc_dca_provider(struct dca_ops *ops, int priv_size);
48void free_dca_provider(struct dca_provider *dca); 57void free_dca_provider(struct dca_provider *dca);
49int register_dca_provider(struct dca_provider *dca, struct device *dev); 58int register_dca_provider(struct dca_provider *dca, struct device *dev);
50void unregister_dca_provider(struct dca_provider *dca); 59void unregister_dca_provider(struct dca_provider *dca, struct device *dev);
51 60
52static inline void *dca_priv(struct dca_provider *dca) 61static inline void *dca_priv(struct dca_provider *dca)
53{ 62{
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index eb5c2ba2f81a..fc1b930f246c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -9,7 +9,7 @@
9 * 2 as published by the Free Software Foundation. 9 * 2 as published by the Free Software Foundation.
10 * 10 *
11 * debugfs is for people to use instead of /proc or /sys. 11 * debugfs is for people to use instead of /proc or /sys.
12 * See Documentation/DocBook/kernel-api for more details. 12 * See Documentation/DocBook/filesystems for more details.
13 */ 13 */
14 14
15#ifndef _DEBUGFS_H_ 15#ifndef _DEBUGFS_H_
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ffefba81c818..2b9f2ac7ed60 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -48,19 +48,20 @@ enum dma_status {
48 48
49/** 49/**
50 * enum dma_transaction_type - DMA transaction types/indexes 50 * enum dma_transaction_type - DMA transaction types/indexes
51 *
52 * Note: The DMA_ASYNC_TX capability is not to be set by drivers. It is
53 * automatically set as dma devices are registered.
51 */ 54 */
52enum dma_transaction_type { 55enum dma_transaction_type {
53 DMA_MEMCPY, 56 DMA_MEMCPY,
54 DMA_XOR, 57 DMA_XOR,
55 DMA_PQ_XOR, 58 DMA_PQ,
56 DMA_DUAL_XOR, 59 DMA_XOR_VAL,
57 DMA_PQ_UPDATE, 60 DMA_PQ_VAL,
58 DMA_ZERO_SUM,
59 DMA_PQ_ZERO_SUM,
60 DMA_MEMSET, 61 DMA_MEMSET,
61 DMA_MEMCPY_CRC32C,
62 DMA_INTERRUPT, 62 DMA_INTERRUPT,
63 DMA_PRIVATE, 63 DMA_PRIVATE,
64 DMA_ASYNC_TX,
64 DMA_SLAVE, 65 DMA_SLAVE,
65}; 66};
66 67
@@ -70,18 +71,25 @@ enum dma_transaction_type {
70 71
71/** 72/**
72 * enum dma_ctrl_flags - DMA flags to augment operation preparation, 73 * enum dma_ctrl_flags - DMA flags to augment operation preparation,
73 * control completion, and communicate status. 74 * control completion, and communicate status.
74 * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of 75 * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of
75 * this transaction 76 * this transaction
76 * @DMA_CTRL_ACK - the descriptor cannot be reused until the client 77 * @DMA_CTRL_ACK - the descriptor cannot be reused until the client
77 * acknowledges receipt, i.e. has had a chance to establish any 78 * acknowledges receipt, i.e. has had a chance to establish any dependency
78 * dependency chains 79 * chains
79 * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) 80 * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s)
80 * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) 81 * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s)
81 * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single 82 * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single
82 * (if not set, do the source dma-unmapping as page) 83 * (if not set, do the source dma-unmapping as page)
83 * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single 84 * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single
84 * (if not set, do the destination dma-unmapping as page) 85 * (if not set, do the destination dma-unmapping as page)
86 * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q
87 * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P
88 * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
89 * sources that were the result of a previous operation, in the case of a PQ
90 * operation it continues the calculation with new sources
91 * @DMA_PREP_FENCE - tell the driver that subsequent operations depend
92 * on the result of this operation
85 */ 93 */
86enum dma_ctrl_flags { 94enum dma_ctrl_flags {
87 DMA_PREP_INTERRUPT = (1 << 0), 95 DMA_PREP_INTERRUPT = (1 << 0),
@@ -90,9 +98,32 @@ enum dma_ctrl_flags {
90 DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), 98 DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
91 DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4), 99 DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4),
92 DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5), 100 DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5),
101 DMA_PREP_PQ_DISABLE_P = (1 << 6),
102 DMA_PREP_PQ_DISABLE_Q = (1 << 7),
103 DMA_PREP_CONTINUE = (1 << 8),
104 DMA_PREP_FENCE = (1 << 9),
93}; 105};
94 106
95/** 107/**
 108 * enum sum_check_bits - bit positions of sum_check_flags
109 */
110enum sum_check_bits {
111 SUM_CHECK_P = 0,
112 SUM_CHECK_Q = 1,
113};
114
115/**
 116 * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations
117 * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise
118 * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise
119 */
120enum sum_check_flags {
121 SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P),
122 SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q),
123};
124
125
126/**
96 * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t. 127 * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
97 * See linux/cpumask.h 128 * See linux/cpumask.h
98 */ 129 */
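
[Editor's note - not part of the patch] The sum_check_flags bits added in this hunk carry the result of a validate (zero_sum) operation back to the client. A minimal sketch of reading them, assuming the descriptor was prepared with a pointer to pqres and has already completed (the surrounding driver/client code is hypothetical):

    enum sum_check_flags pqres = 0;

    /* ...device_prep_dma_pq_val() was called with &pqres and the
     * transaction has completed by this point... */
    if (pqres & SUM_CHECK_P_RESULT)
            pr_err("P (xor) parity mismatch\n");
    if (pqres & SUM_CHECK_Q_RESULT)
            pr_err("Q (reed-solomon) syndrome mismatch\n");
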
@@ -180,8 +211,6 @@ typedef void (*dma_async_tx_callback)(void *dma_async_param);
180 * @flags: flags to augment operation preparation, control completion, and 211 * @flags: flags to augment operation preparation, control completion, and
181 * communicate status 212 * communicate status
182 * @phys: physical address of the descriptor 213 * @phys: physical address of the descriptor
183 * @tx_list: driver common field for operations that require multiple
184 * descriptors
185 * @chan: target channel for this operation 214 * @chan: target channel for this operation
186 * @tx_submit: set the prepared descriptor(s) to be executed by the engine 215 * @tx_submit: set the prepared descriptor(s) to be executed by the engine
187 * @callback: routine to call after this operation is complete 216 * @callback: routine to call after this operation is complete
@@ -195,7 +224,6 @@ struct dma_async_tx_descriptor {
195 dma_cookie_t cookie; 224 dma_cookie_t cookie;
196 enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */ 225 enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */
197 dma_addr_t phys; 226 dma_addr_t phys;
198 struct list_head tx_list;
199 struct dma_chan *chan; 227 struct dma_chan *chan;
200 dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx); 228 dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
201 dma_async_tx_callback callback; 229 dma_async_tx_callback callback;
@@ -213,6 +241,11 @@ struct dma_async_tx_descriptor {
213 * @global_node: list_head for global dma_device_list 241 * @global_node: list_head for global dma_device_list
214 * @cap_mask: one or more dma_capability flags 242 * @cap_mask: one or more dma_capability flags
215 * @max_xor: maximum number of xor sources, 0 if no capability 243 * @max_xor: maximum number of xor sources, 0 if no capability
244 * @max_pq: maximum number of PQ sources and PQ-continue capability
245 * @copy_align: alignment shift for memcpy operations
246 * @xor_align: alignment shift for xor operations
247 * @pq_align: alignment shift for pq operations
248 * @fill_align: alignment shift for memset operations
216 * @dev_id: unique device ID 249 * @dev_id: unique device ID
217 * @dev: struct device reference for dma mapping api 250 * @dev: struct device reference for dma mapping api
218 * @device_alloc_chan_resources: allocate resources and return the 251 * @device_alloc_chan_resources: allocate resources and return the
@@ -220,7 +253,9 @@ struct dma_async_tx_descriptor {
220 * @device_free_chan_resources: release DMA channel's resources 253 * @device_free_chan_resources: release DMA channel's resources
221 * @device_prep_dma_memcpy: prepares a memcpy operation 254 * @device_prep_dma_memcpy: prepares a memcpy operation
222 * @device_prep_dma_xor: prepares a xor operation 255 * @device_prep_dma_xor: prepares a xor operation
223 * @device_prep_dma_zero_sum: prepares a zero_sum operation 256 * @device_prep_dma_xor_val: prepares a xor validation operation
257 * @device_prep_dma_pq: prepares a pq operation
 258 * @device_prep_dma_pq_val: prepares a pq validation operation
224 * @device_prep_dma_memset: prepares a memset operation 259 * @device_prep_dma_memset: prepares a memset operation
225 * @device_prep_dma_interrupt: prepares an end of chain interrupt operation 260 * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
226 * @device_prep_slave_sg: prepares a slave dma operation 261 * @device_prep_slave_sg: prepares a slave dma operation
@@ -235,7 +270,13 @@ struct dma_device {
235 struct list_head channels; 270 struct list_head channels;
236 struct list_head global_node; 271 struct list_head global_node;
237 dma_cap_mask_t cap_mask; 272 dma_cap_mask_t cap_mask;
238 int max_xor; 273 unsigned short max_xor;
274 unsigned short max_pq;
275 u8 copy_align;
276 u8 xor_align;
277 u8 pq_align;
278 u8 fill_align;
279 #define DMA_HAS_PQ_CONTINUE (1 << 15)
239 280
240 int dev_id; 281 int dev_id;
241 struct device *dev; 282 struct device *dev;
@@ -249,9 +290,17 @@ struct dma_device {
249 struct dma_async_tx_descriptor *(*device_prep_dma_xor)( 290 struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
250 struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, 291 struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
251 unsigned int src_cnt, size_t len, unsigned long flags); 292 unsigned int src_cnt, size_t len, unsigned long flags);
252 struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)( 293 struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)(
253 struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, 294 struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
254 size_t len, u32 *result, unsigned long flags); 295 size_t len, enum sum_check_flags *result, unsigned long flags);
296 struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
297 struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
298 unsigned int src_cnt, const unsigned char *scf,
299 size_t len, unsigned long flags);
300 struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)(
301 struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
302 unsigned int src_cnt, const unsigned char *scf, size_t len,
303 enum sum_check_flags *pqres, unsigned long flags);
255 struct dma_async_tx_descriptor *(*device_prep_dma_memset)( 304 struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
256 struct dma_chan *chan, dma_addr_t dest, int value, size_t len, 305 struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
257 unsigned long flags); 306 unsigned long flags);
@@ -270,6 +319,96 @@ struct dma_device {
270 void (*device_issue_pending)(struct dma_chan *chan); 319 void (*device_issue_pending)(struct dma_chan *chan);
271}; 320};
272 321
322static inline bool dmaengine_check_align(u8 align, size_t off1, size_t off2, size_t len)
323{
324 size_t mask;
325
326 if (!align)
327 return true;
328 mask = (1 << align) - 1;
329 if (mask & (off1 | off2 | len))
330 return false;
331 return true;
332}
333
334static inline bool is_dma_copy_aligned(struct dma_device *dev, size_t off1,
335 size_t off2, size_t len)
336{
337 return dmaengine_check_align(dev->copy_align, off1, off2, len);
338}
339
340static inline bool is_dma_xor_aligned(struct dma_device *dev, size_t off1,
341 size_t off2, size_t len)
342{
343 return dmaengine_check_align(dev->xor_align, off1, off2, len);
344}
345
346static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1,
347 size_t off2, size_t len)
348{
349 return dmaengine_check_align(dev->pq_align, off1, off2, len);
350}
351
352static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1,
353 size_t off2, size_t len)
354{
355 return dmaengine_check_align(dev->fill_align, off1, off2, len);
356}
357
358static inline void
359dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
360{
361 dma->max_pq = maxpq;
362 if (has_pq_continue)
363 dma->max_pq |= DMA_HAS_PQ_CONTINUE;
364}
365
366static inline bool dmaf_continue(enum dma_ctrl_flags flags)
367{
368 return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE;
369}
370
371static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags)
372{
373 enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P;
374
375 return (flags & mask) == mask;
376}
377
378static inline bool dma_dev_has_pq_continue(struct dma_device *dma)
379{
380 return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE;
381}
382
383static unsigned short dma_dev_to_maxpq(struct dma_device *dma)
384{
385 return dma->max_pq & ~DMA_HAS_PQ_CONTINUE;
386}
387
388/* dma_maxpq - reduce maxpq in the face of continued operations
389 * @dma - dma device with PQ capability
390 * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set
391 *
392 * When an engine does not support native continuation we need 3 extra
393 * source slots to reuse P and Q with the following coefficients:
394 * 1/ {00} * P : remove P from Q', but use it as a source for P'
395 * 2/ {01} * Q : use Q to continue Q' calculation
396 * 3/ {00} * Q : subtract Q from P' to cancel (2)
397 *
398 * In the case where P is disabled we only need 1 extra source:
399 * 1/ {01} * Q : use Q to continue Q' calculation
400 */
401static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
402{
403 if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags))
404 return dma_dev_to_maxpq(dma);
405 else if (dmaf_p_disabled_continue(flags))
406 return dma_dev_to_maxpq(dma) - 1;
407 else if (dmaf_continue(flags))
408 return dma_dev_to_maxpq(dma) - 3;
409 BUG();
410}
411
273/* --- public DMA engine API --- */ 412/* --- public DMA engine API --- */
274 413
275#ifdef CONFIG_DMA_ENGINE 414#ifdef CONFIG_DMA_ENGINE
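
[Editor's note - not part of the patch] A sketch of how the alignment and max_pq helpers added above behave; the device pointer and the values are hypothetical, and dma_set_maxpq() would normally be called by the driver before registration:

    struct dma_device *dma = get_some_dma_device();     /* hypothetical helper */

    /* xor_align is a shift: 2 means offsets and length must be
     * multiples of 4 bytes */
    dma->xor_align = 2;
    is_dma_xor_aligned(dma, 0, 8, 64);      /* true  */
    is_dma_xor_aligned(dma, 2, 0, 64);      /* false */

    /* 8 PQ sources, no native continuation support */
    dma_set_maxpq(dma, 8, 0);
    dma_maxpq(dma, 0);                                          /* 8 */
    dma_maxpq(dma, DMA_PREP_CONTINUE);                          /* 8 - 3 = 5 */
    dma_maxpq(dma, DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P);  /* 8 - 1 = 7 */
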
@@ -299,7 +438,11 @@ static inline void net_dmaengine_put(void)
299#ifdef CONFIG_ASYNC_TX_DMA 438#ifdef CONFIG_ASYNC_TX_DMA
300#define async_dmaengine_get() dmaengine_get() 439#define async_dmaengine_get() dmaengine_get()
301#define async_dmaengine_put() dmaengine_put() 440#define async_dmaengine_put() dmaengine_put()
441#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
442#define async_dma_find_channel(type) dma_find_channel(DMA_ASYNC_TX)
443#else
302#define async_dma_find_channel(type) dma_find_channel(type) 444#define async_dma_find_channel(type) dma_find_channel(type)
445#endif /* CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH */
303#else 446#else
304static inline void async_dmaengine_get(void) 447static inline void async_dmaengine_get(void)
305{ 448{
@@ -312,7 +455,7 @@ async_dma_find_channel(enum dma_transaction_type type)
312{ 455{
313 return NULL; 456 return NULL;
314} 457}
315#endif 458#endif /* CONFIG_ASYNC_TX_DMA */
316 459
317dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, 460dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
318 void *dest, void *src, size_t len); 461 void *dest, void *src, size_t len);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 51803528b095..78e95b8b66d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -595,6 +595,7 @@ struct address_space_operations {
595 int (*launder_page) (struct page *); 595 int (*launder_page) (struct page *);
596 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 596 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
597 unsigned long); 597 unsigned long);
598 int (*error_remove_page)(struct address_space *, struct page *);
598}; 599};
599 600
600/* 601/*
@@ -2467,7 +2468,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
2467 size_t len, loff_t *ppos); 2468 size_t len, loff_t *ppos);
2468 2469
2469struct ctl_table; 2470struct ctl_table;
2470int proc_nr_files(struct ctl_table *table, int write, struct file *filp, 2471int proc_nr_files(struct ctl_table *table, int write,
2471 void __user *buffer, size_t *lenp, loff_t *ppos); 2472 void __user *buffer, size_t *lenp, loff_t *ppos);
2472 2473
2473int __init get_filesystem_list(char *buf); 2474int __init get_filesystem_list(char *buf);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3c0924a18daf..cd3d2abaf30a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -19,7 +19,7 @@
19extern int ftrace_enabled; 19extern int ftrace_enabled;
20extern int 20extern int
21ftrace_enable_sysctl(struct ctl_table *table, int write, 21ftrace_enable_sysctl(struct ctl_table *table, int write,
22 struct file *filp, void __user *buffer, size_t *lenp, 22 void __user *buffer, size_t *lenp,
23 loff_t *ppos); 23 loff_t *ppos);
24 24
25typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); 25typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
@@ -94,7 +94,7 @@ static inline void ftrace_start(void) { }
94extern int stack_tracer_enabled; 94extern int stack_tracer_enabled;
95int 95int
96stack_trace_sysctl(struct ctl_table *table, int write, 96stack_trace_sysctl(struct ctl_table *table, int write,
97 struct file *file, void __user *buffer, size_t *lenp, 97 void __user *buffer, size_t *lenp,
98 loff_t *ppos); 98 loff_t *ppos);
99#endif 99#endif
100 100
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 34956c8fdebf..8ec17997d94f 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,11 +4,6 @@
4#include <linux/compiler.h> 4#include <linux/compiler.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7struct inode;
8struct mm_struct;
9struct task_struct;
10union ktime;
11
12/* Second argument to futex syscall */ 7/* Second argument to futex syscall */
13 8
14 9
@@ -129,6 +124,11 @@ struct robust_list_head {
129#define FUTEX_BITSET_MATCH_ANY 0xffffffff 124#define FUTEX_BITSET_MATCH_ANY 0xffffffff
130 125
131#ifdef __KERNEL__ 126#ifdef __KERNEL__
127struct inode;
128struct mm_struct;
129struct task_struct;
130union ktime;
131
132long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, 132long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
133 u32 __user *uaddr2, u32 val2, u32 val3); 133 u32 __user *uaddr2, u32 val2, u32 val3);
134 134
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 176e7ee73eff..11ab19ac6b3d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,9 +20,9 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
20} 20}
21 21
22void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 22void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
23int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); 23int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
24int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); 24int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
25int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); 25int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
26int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); 26int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
27int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, 27int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
28 struct page **, struct vm_area_struct **, 28 struct page **, struct vm_area_struct **,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e46a0734ab6e..bf9213b2db8f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -118,6 +118,9 @@ static inline bool mem_cgroup_disabled(void)
118 118
119extern bool mem_cgroup_oom_called(struct task_struct *task); 119extern bool mem_cgroup_oom_called(struct task_struct *task);
120void mem_cgroup_update_mapped_file_stat(struct page *page, int val); 120void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
121unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
122 gfp_t gfp_mask, int nid,
123 int zid);
121#else /* CONFIG_CGROUP_MEM_RES_CTLR */ 124#else /* CONFIG_CGROUP_MEM_RES_CTLR */
122struct mem_cgroup; 125struct mem_cgroup;
123 126
@@ -276,6 +279,13 @@ static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
276{ 279{
277} 280}
278 281
282static inline
283unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
284 gfp_t gfp_mask, int nid, int zid)
285{
286 return 0;
287}
288
279#endif /* CONFIG_CGROUP_MEM_CONT */ 289#endif /* CONFIG_CGROUP_MEM_CONT */
280 290
281#endif /* _LINUX_MEMCONTROL_H */ 291#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6eae5e3144b..6953a5a53e44 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -695,11 +695,12 @@ static inline int page_mapped(struct page *page)
695#define VM_FAULT_SIGBUS 0x0002 695#define VM_FAULT_SIGBUS 0x0002
696#define VM_FAULT_MAJOR 0x0004 696#define VM_FAULT_MAJOR 0x0004
697#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ 697#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
698#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
698 699
699#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 700#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
700#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 701#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
701 702
702#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) 703#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
703 704
704/* 705/*
705 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 706 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
@@ -794,6 +795,11 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
794extern int vmtruncate(struct inode * inode, loff_t offset); 795extern int vmtruncate(struct inode * inode, loff_t offset);
795extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); 796extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
796 797
798int truncate_inode_page(struct address_space *mapping, struct page *page);
799int generic_error_remove_page(struct address_space *mapping, struct page *page);
800
801int invalidate_inode_page(struct page *page);
802
797#ifdef CONFIG_MMU 803#ifdef CONFIG_MMU
798extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 804extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
799 unsigned long address, unsigned int flags); 805 unsigned long address, unsigned int flags);
@@ -1279,7 +1285,7 @@ int in_gate_area_no_task(unsigned long addr);
1279#define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) 1285#define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
1280#endif /* __HAVE_ARCH_GATE_AREA */ 1286#endif /* __HAVE_ARCH_GATE_AREA */
1281 1287
1282int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, 1288int drop_caches_sysctl_handler(struct ctl_table *, int,
1283 void __user *, size_t *, loff_t *); 1289 void __user *, size_t *, loff_t *);
1284unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 1290unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
1285 unsigned long lru_pages); 1291 unsigned long lru_pages);
@@ -1308,5 +1314,12 @@ void vmemmap_populate_print_last(void);
1308extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, 1314extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
1309 size_t size); 1315 size_t size);
1310extern void refund_locked_memory(struct mm_struct *mm, size_t size); 1316extern void refund_locked_memory(struct mm_struct *mm, size_t size);
1317
1318extern void memory_failure(unsigned long pfn, int trapno);
1319extern int __memory_failure(unsigned long pfn, int trapno, int ref);
1320extern int sysctl_memory_failure_early_kill;
1321extern int sysctl_memory_failure_recovery;
1322extern atomic_long_t mce_bad_pages;
1323
1311#endif /* __KERNEL__ */ 1324#endif /* __KERNEL__ */
1312#endif /* _LINUX_MM_H */ 1325#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0042090a4d70..21d6aa45206a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,6 +240,8 @@ struct mm_struct {
240 240
241 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 241 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
242 242
243 struct linux_binfmt *binfmt;
244
243 cpumask_t cpu_vm_mask; 245 cpumask_t cpu_vm_mask;
244 246
245 /* Architecture-specific MM context */ 247 /* Architecture-specific MM context */
@@ -259,11 +261,10 @@ struct mm_struct {
259 unsigned long flags; /* Must use atomic bitops to access the bits */ 261 unsigned long flags; /* Must use atomic bitops to access the bits */
260 262
261 struct core_state *core_state; /* coredumping support */ 263 struct core_state *core_state; /* coredumping support */
262 264#ifdef CONFIG_AIO
263 /* aio bits */
264 spinlock_t ioctx_lock; 265 spinlock_t ioctx_lock;
265 struct hlist_head ioctx_list; 266 struct hlist_head ioctx_list;
266 267#endif
267#ifdef CONFIG_MM_OWNER 268#ifdef CONFIG_MM_OWNER
268 /* 269 /*
269 * "owner" points to a task that is regarded as the canonical 270 * "owner" points to a task that is regarded as the canonical
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 652ef01be582..6f7561730d88 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -755,21 +755,20 @@ static inline int is_dma(struct zone *zone)
755 755
756/* These two functions are used to setup the per zone pages min values */ 756/* These two functions are used to setup the per zone pages min values */
757struct ctl_table; 757struct ctl_table;
758struct file; 758int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
759int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
760 void __user *, size_t *, loff_t *); 759 void __user *, size_t *, loff_t *);
761extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; 760extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
762int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, 761int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
763 void __user *, size_t *, loff_t *); 762 void __user *, size_t *, loff_t *);
764int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, 763int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
765 void __user *, size_t *, loff_t *); 764 void __user *, size_t *, loff_t *);
766int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, 765int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
767 struct file *, void __user *, size_t *, loff_t *); 766 void __user *, size_t *, loff_t *);
768int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, 767int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
769 struct file *, void __user *, size_t *, loff_t *); 768 void __user *, size_t *, loff_t *);
770 769
771extern int numa_zonelist_order_handler(struct ctl_table *, int, 770extern int numa_zonelist_order_handler(struct ctl_table *, int,
772 struct file *, void __user *, size_t *, loff_t *); 771 void __user *, size_t *, loff_t *);
773extern char numa_zonelist_order[]; 772extern char numa_zonelist_order[];
774#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ 773#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */
775 774
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 13de789f0a5c..6b202b173955 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -51,6 +51,9 @@
51 * PG_buddy is set to indicate that the page is free and in the buddy system 51 * PG_buddy is set to indicate that the page is free and in the buddy system
52 * (see mm/page_alloc.c). 52 * (see mm/page_alloc.c).
53 * 53 *
54 * PG_hwpoison indicates that a page got corrupted in hardware and contains
55 * data with incorrect ECC bits that triggered a machine check. Accessing is
56 * not safe since it may cause another machine check. Don't touch!
54 */ 57 */
55 58
56/* 59/*
@@ -102,6 +105,9 @@ enum pageflags {
102#ifdef CONFIG_ARCH_USES_PG_UNCACHED 105#ifdef CONFIG_ARCH_USES_PG_UNCACHED
103 PG_uncached, /* Page has been mapped as uncached */ 106 PG_uncached, /* Page has been mapped as uncached */
104#endif 107#endif
108#ifdef CONFIG_MEMORY_FAILURE
109 PG_hwpoison, /* hardware poisoned page. Don't touch */
110#endif
105 __NR_PAGEFLAGS, 111 __NR_PAGEFLAGS,
106 112
107 /* Filesystems */ 113 /* Filesystems */
@@ -269,6 +275,15 @@ PAGEFLAG(Uncached, uncached)
269PAGEFLAG_FALSE(Uncached) 275PAGEFLAG_FALSE(Uncached)
270#endif 276#endif
271 277
278#ifdef CONFIG_MEMORY_FAILURE
279PAGEFLAG(HWPoison, hwpoison)
280TESTSETFLAG(HWPoison, hwpoison)
281#define __PG_HWPOISON (1UL << PG_hwpoison)
282#else
283PAGEFLAG_FALSE(HWPoison)
284#define __PG_HWPOISON 0
285#endif
286
272static inline int PageUptodate(struct page *page) 287static inline int PageUptodate(struct page *page)
273{ 288{
274 int ret = test_bit(PG_uptodate, &(page)->flags); 289 int ret = test_bit(PG_uptodate, &(page)->flags);
@@ -393,7 +408,7 @@ static inline void __ClearPageTail(struct page *page)
393 1 << PG_private | 1 << PG_private_2 | \ 408 1 << PG_private | 1 << PG_private_2 | \
394 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 409 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \
395 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 410 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
396 1 << PG_unevictable | __PG_MLOCKED) 411 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
397 412
398/* 413/*
399 * Flags checked when a page is prepped for return by the page allocator. 414 * Flags checked when a page is prepped for return by the page allocator.
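
[Editor's note - not part of the patch] PAGEFLAG(HWPoison, hwpoison) and TESTSETFLAG(HWPoison, hwpoison) expand to the usual page-flag accessors; a sketch of how a poisoning path might use them (the surrounding handler is hypothetical, and with CONFIG_MEMORY_FAILURE off PAGEFLAG_FALSE makes PageHWPoison() compile away to 0):

    if (TestSetPageHWPoison(p))
            return 0;                        /* already marked poisoned */
    atomic_long_inc(&mce_bad_pages);         /* counter declared in linux/mm.h above */
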
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index ada779f24178..4b938d4f3ac2 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -38,6 +38,7 @@ enum {
38 PCG_LOCK, /* page cgroup is locked */ 38 PCG_LOCK, /* page cgroup is locked */
39 PCG_CACHE, /* charged as cache */ 39 PCG_CACHE, /* charged as cache */
40 PCG_USED, /* this object is in use. */ 40 PCG_USED, /* this object is in use. */
41 PCG_ACCT_LRU, /* page has been accounted for */
41}; 42};
42 43
43#define TESTPCGFLAG(uname, lname) \ 44#define TESTPCGFLAG(uname, lname) \
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
52static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ 53static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
53 { clear_bit(PCG_##lname, &pc->flags); } 54 { clear_bit(PCG_##lname, &pc->flags); }
54 55
56#define TESTCLEARPCGFLAG(uname, lname) \
57static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
58 { return test_and_clear_bit(PCG_##lname, &pc->flags); }
59
55/* Cache flag is set only once (at allocation) */ 60/* Cache flag is set only once (at allocation) */
56TESTPCGFLAG(Cache, CACHE) 61TESTPCGFLAG(Cache, CACHE)
62CLEARPCGFLAG(Cache, CACHE)
63SETPCGFLAG(Cache, CACHE)
57 64
58TESTPCGFLAG(Used, USED) 65TESTPCGFLAG(Used, USED)
59CLEARPCGFLAG(Used, USED) 66CLEARPCGFLAG(Used, USED)
67SETPCGFLAG(Used, USED)
68
69SETPCGFLAG(AcctLRU, ACCT_LRU)
70CLEARPCGFLAG(AcctLRU, ACCT_LRU)
71TESTPCGFLAG(AcctLRU, ACCT_LRU)
72TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
60 73
61static inline int page_cgroup_nid(struct page_cgroup *pc) 74static inline int page_cgroup_nid(struct page_cgroup *pc)
62{ 75{
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 7803565aa877..da1fda8623e0 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2527,6 +2527,16 @@
2527#define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e 2527#define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e
2528#define PCI_DEVICE_ID_INTEL_IOAT_CNB 0x360b 2528#define PCI_DEVICE_ID_INTEL_IOAT_CNB 0x360b
2529#define PCI_DEVICE_ID_INTEL_FBD_CNB 0x360c 2529#define PCI_DEVICE_ID_INTEL_FBD_CNB 0x360c
2530#define PCI_DEVICE_ID_INTEL_IOAT_JSF0 0x3710
2531#define PCI_DEVICE_ID_INTEL_IOAT_JSF1 0x3711
2532#define PCI_DEVICE_ID_INTEL_IOAT_JSF2 0x3712
2533#define PCI_DEVICE_ID_INTEL_IOAT_JSF3 0x3713
2534#define PCI_DEVICE_ID_INTEL_IOAT_JSF4 0x3714
2535#define PCI_DEVICE_ID_INTEL_IOAT_JSF5 0x3715
2536#define PCI_DEVICE_ID_INTEL_IOAT_JSF6 0x3716
2537#define PCI_DEVICE_ID_INTEL_IOAT_JSF7 0x3717
2538#define PCI_DEVICE_ID_INTEL_IOAT_JSF8 0x3718
2539#define PCI_DEVICE_ID_INTEL_IOAT_JSF9 0x3719
2530#define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14 2540#define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14
2531#define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16 2541#define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16
2532#define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18 2542#define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 07bff666e65b..931150566ade 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,4 +88,6 @@
88#define PR_TASK_PERF_EVENTS_DISABLE 31 88#define PR_TASK_PERF_EVENTS_DISABLE 31
89#define PR_TASK_PERF_EVENTS_ENABLE 32 89#define PR_TASK_PERF_EVENTS_ENABLE 32
90 90
91#define PR_MCE_KILL 33
92
91#endif /* _LINUX_PRCTL_H */ 93#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 953fc055e875..14a86bc7102b 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -140,7 +140,7 @@ struct rchan_callbacks
140 * cause relay_open() to create a single global buffer rather 140 * cause relay_open() to create a single global buffer rather
141 * than the default set of per-cpu buffers. 141 * than the default set of per-cpu buffers.
142 * 142 *
143 * See Documentation/filesystems/relayfs.txt for more info. 143 * See Documentation/filesystems/relay.txt for more info.
144 */ 144 */
145 struct dentry *(*create_buf_file)(const char *filename, 145 struct dentry *(*create_buf_file)(const char *filename,
146 struct dentry *parent, 146 struct dentry *parent,
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 511f42fc6816..731af71cddc9 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -35,6 +35,10 @@ struct res_counter {
35 */ 35 */
36 unsigned long long limit; 36 unsigned long long limit;
37 /* 37 /*
 38 * the limit that usage can exceed
39 */
40 unsigned long long soft_limit;
41 /*
38 * the number of unsuccessful attempts to consume the resource 42 * the number of unsuccessful attempts to consume the resource
39 */ 43 */
40 unsigned long long failcnt; 44 unsigned long long failcnt;
@@ -87,6 +91,7 @@ enum {
87 RES_MAX_USAGE, 91 RES_MAX_USAGE,
88 RES_LIMIT, 92 RES_LIMIT,
89 RES_FAILCNT, 93 RES_FAILCNT,
94 RES_SOFT_LIMIT,
90}; 95};
91 96
92/* 97/*
@@ -109,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
109int __must_check res_counter_charge_locked(struct res_counter *counter, 114int __must_check res_counter_charge_locked(struct res_counter *counter,
110 unsigned long val); 115 unsigned long val);
111int __must_check res_counter_charge(struct res_counter *counter, 116int __must_check res_counter_charge(struct res_counter *counter,
112 unsigned long val, struct res_counter **limit_fail_at); 117 unsigned long val, struct res_counter **limit_fail_at,
118 struct res_counter **soft_limit_at);
113 119
114/* 120/*
115 * uncharge - tell that some portion of the resource is released 121 * uncharge - tell that some portion of the resource is released
@@ -122,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
122 */ 128 */
123 129
124void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); 130void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
125void res_counter_uncharge(struct res_counter *counter, unsigned long val); 131void res_counter_uncharge(struct res_counter *counter, unsigned long val,
132 bool *was_soft_limit_excess);
126 133
127static inline bool res_counter_limit_check_locked(struct res_counter *cnt) 134static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
128{ 135{
@@ -132,6 +139,36 @@ static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
132 return false; 139 return false;
133} 140}
134 141
142static inline bool res_counter_soft_limit_check_locked(struct res_counter *cnt)
143{
144 if (cnt->usage < cnt->soft_limit)
145 return true;
146
147 return false;
148}
149
150/**
151 * Get the difference between the usage and the soft limit
152 * @cnt: The counter
153 *
154 * Returns 0 if usage is less than or equal to soft limit
155 * The difference between usage and soft limit, otherwise.
156 */
157static inline unsigned long long
158res_counter_soft_limit_excess(struct res_counter *cnt)
159{
160 unsigned long long excess;
161 unsigned long flags;
162
163 spin_lock_irqsave(&cnt->lock, flags);
164 if (cnt->usage <= cnt->soft_limit)
165 excess = 0;
166 else
167 excess = cnt->usage - cnt->soft_limit;
168 spin_unlock_irqrestore(&cnt->lock, flags);
169 return excess;
170}
171
135/* 172/*
 136 * Helper function to detect if the cgroup is within its limit or 173
137 * not. It's currently called from cgroup_rss_prepare() 174 * not. It's currently called from cgroup_rss_prepare()
@@ -147,6 +184,17 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt)
147 return ret; 184 return ret;
148} 185}
149 186
187static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt)
188{
189 bool ret;
190 unsigned long flags;
191
192 spin_lock_irqsave(&cnt->lock, flags);
193 ret = res_counter_soft_limit_check_locked(cnt);
194 spin_unlock_irqrestore(&cnt->lock, flags);
195 return ret;
196}
197
150static inline void res_counter_reset_max(struct res_counter *cnt) 198static inline void res_counter_reset_max(struct res_counter *cnt)
151{ 199{
152 unsigned long flags; 200 unsigned long flags;
@@ -180,4 +228,16 @@ static inline int res_counter_set_limit(struct res_counter *cnt,
180 return ret; 228 return ret;
181} 229}
182 230
231static inline int
232res_counter_set_soft_limit(struct res_counter *cnt,
233 unsigned long long soft_limit)
234{
235 unsigned long flags;
236
237 spin_lock_irqsave(&cnt->lock, flags);
238 cnt->soft_limit = soft_limit;
239 spin_unlock_irqrestore(&cnt->lock, flags);
240 return 0;
241}
242
183#endif 243#endif
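
[Editor's note - not part of the patch] A minimal sketch of the new soft-limit helpers; the counter, the 512MB value and the reporting are illustrative only:

    struct res_counter cnt;

    res_counter_init(&cnt, NULL);
    res_counter_set_soft_limit(&cnt, 512ULL << 20);

    /* after some charges have been applied elsewhere... */
    if (!res_counter_check_under_soft_limit(&cnt))
            printk(KERN_INFO "over soft limit by %llu bytes\n",
                   res_counter_soft_limit_excess(&cnt));
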
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 477841d29fce..cb0ba7032609 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -81,7 +81,19 @@ static inline void page_dup_rmap(struct page *page)
81 */ 81 */
82int page_referenced(struct page *, int is_locked, 82int page_referenced(struct page *, int is_locked,
83 struct mem_cgroup *cnt, unsigned long *vm_flags); 83 struct mem_cgroup *cnt, unsigned long *vm_flags);
84int try_to_unmap(struct page *, int ignore_refs); 84enum ttu_flags {
85 TTU_UNMAP = 0, /* unmap mode */
86 TTU_MIGRATION = 1, /* migration mode */
87 TTU_MUNLOCK = 2, /* munlock mode */
88 TTU_ACTION_MASK = 0xff,
89
90 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
91 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
92 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
93};
94#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
95
96int try_to_unmap(struct page *, enum ttu_flags flags);
85 97
86/* 98/*
87 * Called from mm/filemap_xip.c to unmap empty zero page 99 * Called from mm/filemap_xip.c to unmap empty zero page
@@ -108,6 +120,13 @@ int page_mkclean(struct page *);
108 */ 120 */
109int try_to_munlock(struct page *); 121int try_to_munlock(struct page *);
110 122
123/*
124 * Called by memory-failure.c to kill processes.
125 */
126struct anon_vma *page_lock_anon_vma(struct page *page);
127void page_unlock_anon_vma(struct anon_vma *anon_vma);
128int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
129
111#else /* !CONFIG_MMU */ 130#else /* !CONFIG_MMU */
112 131
113#define anon_vma_init() do {} while (0) 132#define anon_vma_init() do {} while (0)
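
[Editor's note - not part of the patch] With the enum above, an unmap action and its modifier bits travel in one flags word and TTU_ACTION() recovers the mode; a short sketch (the page is assumed to be locked by the caller):

    enum ttu_flags flags = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_HWPOISON;
    int ret;

    if (TTU_ACTION(flags) == TTU_UNMAP)
            ret = try_to_unmap(page, flags);   /* returns a SWAP_* status */
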
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 848d1f20086e..75e6e60bf583 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -309,7 +309,7 @@ extern void softlockup_tick(void);
309extern void touch_softlockup_watchdog(void); 309extern void touch_softlockup_watchdog(void);
310extern void touch_all_softlockup_watchdogs(void); 310extern void touch_all_softlockup_watchdogs(void);
311extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 311extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
312 struct file *filp, void __user *buffer, 312 void __user *buffer,
313 size_t *lenp, loff_t *ppos); 313 size_t *lenp, loff_t *ppos);
314extern unsigned int softlockup_panic; 314extern unsigned int softlockup_panic;
315extern int softlockup_thresh; 315extern int softlockup_thresh;
@@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count;
331extern unsigned long sysctl_hung_task_timeout_secs; 331extern unsigned long sysctl_hung_task_timeout_secs;
332extern unsigned long sysctl_hung_task_warnings; 332extern unsigned long sysctl_hung_task_warnings;
333extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 333extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
334 struct file *filp, void __user *buffer, 334 void __user *buffer,
335 size_t *lenp, loff_t *ppos); 335 size_t *lenp, loff_t *ppos);
336#endif 336#endif
337 337
@@ -1271,7 +1271,6 @@ struct task_struct {
1271 struct mm_struct *mm, *active_mm; 1271 struct mm_struct *mm, *active_mm;
1272 1272
1273/* task state */ 1273/* task state */
1274 struct linux_binfmt *binfmt;
1275 int exit_state; 1274 int exit_state;
1276 int exit_code, exit_signal; 1275 int exit_code, exit_signal;
1277 int pdeath_signal; /* The signal sent when the parent dies */ 1276 int pdeath_signal; /* The signal sent when the parent dies */
@@ -1735,6 +1734,7 @@ extern cputime_t task_gtime(struct task_struct *p);
1735#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1734#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1736#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 1735#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1737#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1736#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1737#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
1738#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1738#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1739#define PF_DUMPCORE 0x00000200 /* dumped core */ 1739#define PF_DUMPCORE 0x00000200 /* dumped core */
1740#define PF_SIGNALED 0x00000400 /* killed by a signal */ 1740#define PF_SIGNALED 0x00000400 /* killed by a signal */
@@ -1754,6 +1754,7 @@ extern cputime_t task_gtime(struct task_struct *p);
1754#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1754#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1755#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1755#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1756#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ 1756#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1757#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1757#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1758#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1758#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1759#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1759#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1760#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
@@ -1906,7 +1907,7 @@ extern unsigned int sysctl_sched_time_avg;
1906extern unsigned int sysctl_timer_migration; 1907extern unsigned int sysctl_timer_migration;
1907 1908
1908int sched_nr_latency_handler(struct ctl_table *table, int write, 1909int sched_nr_latency_handler(struct ctl_table *table, int write,
1909 struct file *file, void __user *buffer, size_t *length, 1910 void __user *buffer, size_t *length,
1910 loff_t *ppos); 1911 loff_t *ppos);
1911#endif 1912#endif
1912#ifdef CONFIG_SCHED_DEBUG 1913#ifdef CONFIG_SCHED_DEBUG
@@ -1924,7 +1925,7 @@ extern unsigned int sysctl_sched_rt_period;
1924extern int sysctl_sched_rt_runtime; 1925extern int sysctl_sched_rt_runtime;
1925 1926
1926int sched_rt_handler(struct ctl_table *table, int write, 1927int sched_rt_handler(struct ctl_table *table, int write,
1927 struct file *filp, void __user *buffer, size_t *lenp, 1928 void __user *buffer, size_t *lenp,
1928 loff_t *ppos); 1929 loff_t *ppos);
1929 1930
1930extern unsigned int sysctl_sched_compat_yield; 1931extern unsigned int sysctl_sched_compat_yield;
@@ -2059,6 +2060,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
2059extern int kill_pid(struct pid *pid, int sig, int priv); 2060extern int kill_pid(struct pid *pid, int sig, int priv);
2060extern int kill_proc_info(int, struct siginfo *, pid_t); 2061extern int kill_proc_info(int, struct siginfo *, pid_t);
2061extern int do_notify_parent(struct task_struct *, int); 2062extern int do_notify_parent(struct task_struct *, int);
2063extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
2062extern void force_sig(int, struct task_struct *); 2064extern void force_sig(int, struct task_struct *);
2063extern void force_sig_specific(int, struct task_struct *); 2065extern void force_sig_specific(int, struct task_struct *);
2064extern int send_sig(int, struct task_struct *, int); 2066extern int send_sig(int, struct task_struct *, int);
@@ -2336,7 +2338,10 @@ static inline int signal_pending(struct task_struct *p)
2336 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 2338 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
2337} 2339}
2338 2340
2339extern int __fatal_signal_pending(struct task_struct *p); 2341static inline int __fatal_signal_pending(struct task_struct *p)
2342{
2343 return unlikely(sigismember(&p->pending.signal, SIGKILL));
2344}
2340 2345
2341static inline int fatal_signal_pending(struct task_struct *p) 2346static inline int fatal_signal_pending(struct task_struct *p)
2342{ 2347{
diff --git a/include/linux/security.h b/include/linux/security.h
index d050b66ab9ef..239e40d0450b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -133,7 +133,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
133 return PAGE_ALIGN(mmap_min_addr); 133 return PAGE_ALIGN(mmap_min_addr);
134 return hint; 134 return hint;
135} 135}
136extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, 136extern int mmap_min_addr_handler(struct ctl_table *table, int write,
137 void __user *buffer, size_t *lenp, loff_t *ppos); 137 void __user *buffer, size_t *lenp, loff_t *ppos);
138 138
139#ifdef CONFIG_SECURITY 139#ifdef CONFIG_SECURITY
diff --git a/include/linux/signal.h b/include/linux/signal.h
index c7552836bd95..ab9272cc270c 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -233,6 +233,8 @@ static inline int valid_signal(unsigned long sig)
233} 233}
234 234
235extern int next_signal(struct sigpending *pending, sigset_t *mask); 235extern int next_signal(struct sigpending *pending, sigset_t *mask);
236extern int do_send_sig_info(int sig, struct siginfo *info,
237 struct task_struct *p, bool group);
236extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); 238extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
237extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); 239extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
238extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, 240extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6c990e658f4e..4ec90019c1a4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -34,16 +34,38 @@ static inline int current_is_kswapd(void)
34 * the type/offset into the pte as 5/27 as well. 34 * the type/offset into the pte as 5/27 as well.
35 */ 35 */
36#define MAX_SWAPFILES_SHIFT 5 36#define MAX_SWAPFILES_SHIFT 5
37#ifndef CONFIG_MIGRATION 37
38#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) 38/*
 39 * Use some of the swap file numbers for other purposes. This
40 * is a convenient way to hook into the VM to trigger special
41 * actions on faults.
42 */
43
44/*
45 * NUMA node memory migration support
46 */
47#ifdef CONFIG_MIGRATION
48#define SWP_MIGRATION_NUM 2
49#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
50#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
39#else 51#else
40/* Use last two entries for page migration swap entries */ 52#define SWP_MIGRATION_NUM 0
41#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2)
42#define SWP_MIGRATION_READ MAX_SWAPFILES
43#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1)
44#endif 53#endif
45 54
46/* 55/*
56 * Handling of hardware poisoned pages with memory corruption.
57 */
58#ifdef CONFIG_MEMORY_FAILURE
59#define SWP_HWPOISON_NUM 1
60#define SWP_HWPOISON MAX_SWAPFILES
61#else
62#define SWP_HWPOISON_NUM 0
63#endif
64
65#define MAX_SWAPFILES \
66 ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
67
68/*
47 * Magic header for a swap area. The first part of the union is 69 * Magic header for a swap area. The first part of the union is
48 * what the swap magic looks like for the old (limited to 128MB) 70 * what the swap magic looks like for the old (limited to 128MB)
49 * swap area format, the second part of the union adds - in the 71 * swap area format, the second part of the union adds - in the
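
[Editor's note - not part of the patch] Worked out for the common case of MAX_SWAPFILES_SHIFT = 5 with both CONFIG_MIGRATION and CONFIG_MEMORY_FAILURE enabled:

    MAX_SWAPFILES       = 32 - 2 - 1 = 29   /* real swap types are 0..28 */
    SWP_HWPOISON        = 29
    SWP_MIGRATION_READ  = 29 + 1     = 30
    SWP_MIGRATION_WRITE = 29 + 1 + 1 = 31

so non_swap_entry() in the swapops.h hunk below simply tests swp_type(entry) >= MAX_SWAPFILES.
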
@@ -217,6 +239,11 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
217extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, 239extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
218 gfp_t gfp_mask, bool noswap, 240 gfp_t gfp_mask, bool noswap,
219 unsigned int swappiness); 241 unsigned int swappiness);
242extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
243 gfp_t gfp_mask, bool noswap,
244 unsigned int swappiness,
245 struct zone *zone,
246 int nid);
220extern int __isolate_lru_page(struct page *page, int mode, int file); 247extern int __isolate_lru_page(struct page *page, int mode, int file);
221extern unsigned long shrink_all_memory(unsigned long nr_pages); 248extern unsigned long shrink_all_memory(unsigned long nr_pages);
222extern int vm_swappiness; 249extern int vm_swappiness;
@@ -240,7 +267,7 @@ extern int page_evictable(struct page *page, struct vm_area_struct *vma);
240extern void scan_mapping_unevictable_pages(struct address_space *); 267extern void scan_mapping_unevictable_pages(struct address_space *);
241 268
242extern unsigned long scan_unevictable_pages; 269extern unsigned long scan_unevictable_pages;
243extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, 270extern int scan_unevictable_handler(struct ctl_table *, int,
244 void __user *, size_t *, loff_t *); 271 void __user *, size_t *, loff_t *);
245extern int scan_unevictable_register_node(struct node *node); 272extern int scan_unevictable_register_node(struct node *node);
246extern void scan_unevictable_unregister_node(struct node *node); 273extern void scan_unevictable_unregister_node(struct node *node);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 6ec39ab27b4b..cd42e30b7c6e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -131,3 +131,41 @@ static inline int is_write_migration_entry(swp_entry_t entry)
131 131
132#endif 132#endif
133 133
134#ifdef CONFIG_MEMORY_FAILURE
135/*
136 * Support for hardware poisoned pages
137 */
138static inline swp_entry_t make_hwpoison_entry(struct page *page)
139{
140 BUG_ON(!PageLocked(page));
141 return swp_entry(SWP_HWPOISON, page_to_pfn(page));
142}
143
144static inline int is_hwpoison_entry(swp_entry_t entry)
145{
146 return swp_type(entry) == SWP_HWPOISON;
147}
148#else
149
150static inline swp_entry_t make_hwpoison_entry(struct page *page)
151{
152 return swp_entry(0, 0);
153}
154
155static inline int is_hwpoison_entry(swp_entry_t swp)
156{
157 return 0;
158}
159#endif
160
161#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
162static inline int non_swap_entry(swp_entry_t entry)
163{
164 return swp_type(entry) >= MAX_SWAPFILES;
165}
166#else
167static inline int non_swap_entry(swp_entry_t entry)
168{
169 return 0;
170}
171#endif
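
[Editor's note - not part of the patch] A sketch of how a fault path might classify a non-present pte with the new helpers; extracting the entry from the pte and the responses shown are hypothetical here:

    swp_entry_t entry = pte_to_swp_entry(pte);

    if (non_swap_entry(entry)) {
            if (is_migration_entry(entry))
                    /* wait for the migration to finish and retry */;
            else if (is_hwpoison_entry(entry))
                    return VM_FAULT_HWPOISON;   /* new bit added in linux/mm.h above */
    }
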
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e76d3b22a466..1e4743ee6831 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -29,7 +29,6 @@
29#include <linux/types.h> 29#include <linux/types.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31 31
32struct file;
33struct completion; 32struct completion;
34 33
35#define CTL_MAXNAME 10 /* how many path components do we allow in a 34#define CTL_MAXNAME 10 /* how many path components do we allow in a
@@ -977,25 +976,25 @@ typedef int ctl_handler (struct ctl_table *table,
977 void __user *oldval, size_t __user *oldlenp, 976 void __user *oldval, size_t __user *oldlenp,
978 void __user *newval, size_t newlen); 977 void __user *newval, size_t newlen);
979 978
980typedef int proc_handler (struct ctl_table *ctl, int write, struct file * filp, 979typedef int proc_handler (struct ctl_table *ctl, int write,
981 void __user *buffer, size_t *lenp, loff_t *ppos); 980 void __user *buffer, size_t *lenp, loff_t *ppos);
982 981
983extern int proc_dostring(struct ctl_table *, int, struct file *, 982extern int proc_dostring(struct ctl_table *, int,
984 void __user *, size_t *, loff_t *); 983 void __user *, size_t *, loff_t *);
985extern int proc_dointvec(struct ctl_table *, int, struct file *, 984extern int proc_dointvec(struct ctl_table *, int,
986 void __user *, size_t *, loff_t *); 985 void __user *, size_t *, loff_t *);
987extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *, 986extern int proc_dointvec_minmax(struct ctl_table *, int,
988 void __user *, size_t *, loff_t *); 987 void __user *, size_t *, loff_t *);
989extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *, 988extern int proc_dointvec_jiffies(struct ctl_table *, int,
990 void __user *, size_t *, loff_t *); 989 void __user *, size_t *, loff_t *);
991extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *, 990extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
992 void __user *, size_t *, loff_t *); 991 void __user *, size_t *, loff_t *);
993extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *, 992extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
994 void __user *, size_t *, loff_t *); 993 void __user *, size_t *, loff_t *);
995extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *, 994extern int proc_doulongvec_minmax(struct ctl_table *, int,
996 void __user *, size_t *, loff_t *); 995 void __user *, size_t *, loff_t *);
997extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, 996extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
998 struct file *, void __user *, size_t *, loff_t *); 997 void __user *, size_t *, loff_t *);
999 998
1000extern int do_sysctl (int __user *name, int nlen, 999extern int do_sysctl (int __user *name, int nlen,
1001 void __user *oldval, size_t __user *oldlenp, 1000 void __user *oldval, size_t __user *oldlenp,
diff --git a/include/linux/time.h b/include/linux/time.h
index 56787c093345..fe04e5ef6a59 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -155,6 +155,34 @@ extern void timekeeping_leap_insert(int leapsecond);
155struct tms; 155struct tms;
156extern void do_sys_times(struct tms *); 156extern void do_sys_times(struct tms *);
157 157
158/*
159 * Similar to the struct tm in userspace <time.h>, but it needs to be here so
160 * that the kernel source is self contained.
161 */
162struct tm {
163 /*
164 * the number of seconds after the minute, normally in the range
165 * 0 to 59, but can be up to 60 to allow for leap seconds
166 */
167 int tm_sec;
168 /* the number of minutes after the hour, in the range 0 to 59*/
169 int tm_min;
170 /* the number of hours past midnight, in the range 0 to 23 */
171 int tm_hour;
172 /* the day of the month, in the range 1 to 31 */
173 int tm_mday;
174 /* the number of months since January, in the range 0 to 11 */
175 int tm_mon;
176 /* the number of years since 1900 */
177 long tm_year;
178 /* the number of days since Sunday, in the range 0 to 6 */
179 int tm_wday;
180 /* the number of days since January 1, in the range 0 to 365 */
181 int tm_yday;
182};
183
184void time_to_tm(time_t totalsecs, int offset, struct tm *result);
185
158/** 186/**
159 * timespec_to_ns - Convert timespec to nanoseconds 187 * timespec_to_ns - Convert timespec to nanoseconds
160 * @ts: pointer to the timespec variable to be converted 188 * @ts: pointer to the timespec variable to be converted
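
[Editor's note - not part of the patch] A sketch of the new time_to_tm() helper; get_seconds() is assumed to be available, and the offset argument is seconds east of UTC (0 here):

    struct tm tm;

    time_to_tm(get_seconds(), 0, &tm);
    printk(KERN_INFO "%04ld-%02d-%02d %02d:%02d:%02d UTC\n",
           tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
           tm.tm_hour, tm.tm_min, tm.tm_sec);
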
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 17ba82efa483..1eb44a924e56 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Tracing hooks 2 * Tracing hooks
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This copyrighted material is made available to anyone wishing to use, 6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions 7 * modify, copy, or redistribute it subject to the terms and conditions
@@ -463,22 +463,38 @@ static inline int tracehook_get_signal(struct task_struct *task,
463 463
464/** 464/**
465 * tracehook_notify_jctl - report about job control stop/continue 465 * tracehook_notify_jctl - report about job control stop/continue
466 * @notify: nonzero if this is the last thread in the group to stop 466 * @notify: zero, %CLD_STOPPED or %CLD_CONTINUED
467 * @why: %CLD_STOPPED or %CLD_CONTINUED 467 * @why: %CLD_STOPPED or %CLD_CONTINUED
468 * 468 *
469 * This is called when we might call do_notify_parent_cldstop(). 469 * This is called when we might call do_notify_parent_cldstop().
470 * It's called when about to stop for job control; we are already in
471 * %TASK_STOPPED state, about to call schedule(). It's also called when
472 * a delayed %CLD_STOPPED or %CLD_CONTINUED report is ready to be made.
473 * 470 *
474 * Return nonzero to generate a %SIGCHLD with @why, which is 471 * @notify is zero if we would not ordinarily send a %SIGCHLD,
475 * normal if @notify is nonzero. 472 * or is the %CLD_STOPPED or %CLD_CONTINUED .si_code for %SIGCHLD.
476 * 473 *
477 * Called with no locks held. 474 * @why is %CLD_STOPPED when about to stop for job control;
475 * we are already in %TASK_STOPPED state, about to call schedule().
476 * It might also be that we have just exited (check %PF_EXITING),
477 * but need to report that a group-wide stop is complete.
478 *
479 * @why is %CLD_CONTINUED when waking up after job control stop and
480 * ready to make a delayed @notify report.
481 *
482 * Return the %CLD_* value for %SIGCHLD, or zero to generate no signal.
483 *
484 * Called with the siglock held.
478 */ 485 */
479static inline int tracehook_notify_jctl(int notify, int why) 486static inline int tracehook_notify_jctl(int notify, int why)
480{ 487{
481 return notify || (current->ptrace & PT_PTRACED); 488 return notify ?: (current->ptrace & PT_PTRACED) ? why : 0;
489}
490
491/**
492 * tracehook_finish_jctl - report about return from job control stop
493 *
494 * This is called by do_signal_stop() after wakeup.
495 */
496static inline void tracehook_finish_jctl(void)
497{
482} 498}
483 499
484#define DEATH_REAP -1 500#define DEATH_REAP -1
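The rewritten tracehook_notify_jctl() uses the GNU ?: extension; an equivalent long-hand form of the same logic, shown here only for clarity, is:

static inline int tracehook_notify_jctl(int notify, int why)
{
	if (notify)
		return notify;	/* a CLD_* code was already chosen */
	if (current->ptrace & PT_PTRACED)
		return why;	/* ptraced tasks still report stop/continue */
	return 0;		/* no SIGCHLD */
}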
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 63a3f7a80580..660a9de96f81 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -4,7 +4,7 @@
4/* 4/*
5 * Kernel Tracepoint API. 5 * Kernel Tracepoint API.
6 * 6 *
7 * See Documentation/tracepoint.txt. 7 * See Documentation/trace/tracepoints.txt.
8 * 8 *
9 * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> 9 * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
10 * 10 *
diff --git a/include/linux/unaligned/be_byteshift.h b/include/linux/unaligned/be_byteshift.h
index 46dd12c5709e..9356b24223ac 100644
--- a/include/linux/unaligned/be_byteshift.h
+++ b/include/linux/unaligned/be_byteshift.h
@@ -1,7 +1,7 @@
1#ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H 1#ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H
2#define _LINUX_UNALIGNED_BE_BYTESHIFT_H 2#define _LINUX_UNALIGNED_BE_BYTESHIFT_H
3 3
4#include <linux/kernel.h> 4#include <linux/types.h>
5 5
6static inline u16 __get_unaligned_be16(const u8 *p) 6static inline u16 __get_unaligned_be16(const u8 *p)
7{ 7{
diff --git a/include/linux/unaligned/le_byteshift.h b/include/linux/unaligned/le_byteshift.h
index 59777e951baf..be376fb79b64 100644
--- a/include/linux/unaligned/le_byteshift.h
+++ b/include/linux/unaligned/le_byteshift.h
@@ -1,7 +1,7 @@
1#ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H 1#ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H
2#define _LINUX_UNALIGNED_LE_BYTESHIFT_H 2#define _LINUX_UNALIGNED_LE_BYTESHIFT_H
3 3
4#include <linux/kernel.h> 4#include <linux/types.h>
5 5
6static inline u16 __get_unaligned_le16(const u8 *p) 6static inline u16 __get_unaligned_le16(const u8 *p)
7{ 7{
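Both byteshift headers assemble values one byte at a time, so they work even when the pointer is not naturally aligned. A small illustrative sketch (the field offset is made up; real callers normally include <asm/unaligned.h> rather than this header directly):

static u16 read_le16_field(const u8 *buf)
{
	/* equivalent to buf[4] | (buf[5] << 8), regardless of alignment */
	return get_unaligned_le16(buf + 4);
}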
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 75cf58666ff9..66ebddcff664 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -110,21 +110,20 @@ extern int laptop_mode;
110extern unsigned long determine_dirtyable_memory(void); 110extern unsigned long determine_dirtyable_memory(void);
111 111
112extern int dirty_background_ratio_handler(struct ctl_table *table, int write, 112extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
113 struct file *filp, void __user *buffer, size_t *lenp, 113 void __user *buffer, size_t *lenp,
114 loff_t *ppos); 114 loff_t *ppos);
115extern int dirty_background_bytes_handler(struct ctl_table *table, int write, 115extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
116 struct file *filp, void __user *buffer, size_t *lenp, 116 void __user *buffer, size_t *lenp,
117 loff_t *ppos); 117 loff_t *ppos);
118extern int dirty_ratio_handler(struct ctl_table *table, int write, 118extern int dirty_ratio_handler(struct ctl_table *table, int write,
119 struct file *filp, void __user *buffer, size_t *lenp, 119 void __user *buffer, size_t *lenp,
120 loff_t *ppos); 120 loff_t *ppos);
121extern int dirty_bytes_handler(struct ctl_table *table, int write, 121extern int dirty_bytes_handler(struct ctl_table *table, int write,
122 struct file *filp, void __user *buffer, size_t *lenp, 122 void __user *buffer, size_t *lenp,
123 loff_t *ppos); 123 loff_t *ppos);
124 124
125struct ctl_table; 125struct ctl_table;
126struct file; 126int dirty_writeback_centisecs_handler(struct ctl_table *, int,
127int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
128 void __user *, size_t *, loff_t *); 127 void __user *, size_t *, loff_t *);
129 128
130void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, 129void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
diff --git a/include/net/ip.h b/include/net/ip.h
index 72c36926c26d..5b26a0bd178e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -399,7 +399,7 @@ extern void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
399 * fed into the routing cache should use these handlers. 399 * fed into the routing cache should use these handlers.
400 */ 400 */
401int ipv4_doint_and_flush(ctl_table *ctl, int write, 401int ipv4_doint_and_flush(ctl_table *ctl, int write,
402 struct file* filp, void __user *buffer, 402 void __user *buffer,
403 size_t *lenp, loff_t *ppos); 403 size_t *lenp, loff_t *ppos);
404int ipv4_doint_and_flush_strategy(ctl_table *table, 404int ipv4_doint_and_flush_strategy(ctl_table *table,
405 void __user *oldval, size_t __user *oldlenp, 405 void __user *oldval, size_t __user *oldlenp,
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 1459ed3e2697..f76f22d05721 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -55,7 +55,6 @@ enum {
55#include <net/neighbour.h> 55#include <net/neighbour.h>
56 56
57struct ctl_table; 57struct ctl_table;
58struct file;
59struct inet6_dev; 58struct inet6_dev;
60struct net_device; 59struct net_device;
61struct net_proto_family; 60struct net_proto_family;
@@ -139,7 +138,6 @@ extern int igmp6_event_report(struct sk_buff *skb);
139#ifdef CONFIG_SYSCTL 138#ifdef CONFIG_SYSCTL
140extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, 139extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
141 int write, 140 int write,
142 struct file * filp,
143 void __user *buffer, 141 void __user *buffer,
144 size_t *lenp, 142 size_t *lenp,
145 loff_t *ppos); 143 loff_t *ppos);
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 40eab7314aeb..7d3704750efc 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -27,18 +27,18 @@ static void *get_ipc(ctl_table *table)
27} 27}
28 28
29#ifdef CONFIG_PROC_SYSCTL 29#ifdef CONFIG_PROC_SYSCTL
30static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, 30static int proc_ipc_dointvec(ctl_table *table, int write,
31 void __user *buffer, size_t *lenp, loff_t *ppos) 31 void __user *buffer, size_t *lenp, loff_t *ppos)
32{ 32{
33 struct ctl_table ipc_table; 33 struct ctl_table ipc_table;
34 memcpy(&ipc_table, table, sizeof(ipc_table)); 34 memcpy(&ipc_table, table, sizeof(ipc_table));
35 ipc_table.data = get_ipc(table); 35 ipc_table.data = get_ipc(table);
36 36
37 return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); 37 return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
38} 38}
39 39
40static int proc_ipc_callback_dointvec(ctl_table *table, int write, 40static int proc_ipc_callback_dointvec(ctl_table *table, int write,
41 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) 41 void __user *buffer, size_t *lenp, loff_t *ppos)
42{ 42{
43 struct ctl_table ipc_table; 43 struct ctl_table ipc_table;
44 size_t lenp_bef = *lenp; 44 size_t lenp_bef = *lenp;
@@ -47,7 +47,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
47 memcpy(&ipc_table, table, sizeof(ipc_table)); 47 memcpy(&ipc_table, table, sizeof(ipc_table));
48 ipc_table.data = get_ipc(table); 48 ipc_table.data = get_ipc(table);
49 49
50 rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); 50 rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
51 51
52 if (write && !rc && lenp_bef == *lenp) 52 if (write && !rc && lenp_bef == *lenp)
53 /* 53 /*
@@ -61,13 +61,13 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
61} 61}
62 62
63static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, 63static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
64 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) 64 void __user *buffer, size_t *lenp, loff_t *ppos)
65{ 65{
66 struct ctl_table ipc_table; 66 struct ctl_table ipc_table;
67 memcpy(&ipc_table, table, sizeof(ipc_table)); 67 memcpy(&ipc_table, table, sizeof(ipc_table));
68 ipc_table.data = get_ipc(table); 68 ipc_table.data = get_ipc(table);
69 69
70 return proc_doulongvec_minmax(&ipc_table, write, filp, buffer, 70 return proc_doulongvec_minmax(&ipc_table, write, buffer,
71 lenp, ppos); 71 lenp, ppos);
72} 72}
73 73
@@ -95,7 +95,7 @@ static void ipc_auto_callback(int val)
95} 95}
96 96
97static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, 97static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
98 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) 98 void __user *buffer, size_t *lenp, loff_t *ppos)
99{ 99{
100 struct ctl_table ipc_table; 100 struct ctl_table ipc_table;
101 size_t lenp_bef = *lenp; 101 size_t lenp_bef = *lenp;
@@ -106,7 +106,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
106 ipc_table.data = get_ipc(table); 106 ipc_table.data = get_ipc(table);
107 oldval = *((int *)(ipc_table.data)); 107 oldval = *((int *)(ipc_table.data));
108 108
109 rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos); 109 rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
110 110
111 if (write && !rc && lenp_bef == *lenp) { 111 if (write && !rc && lenp_bef == *lenp) {
112 int newval = *((int *)(ipc_table.data)); 112 int newval = *((int *)(ipc_table.data));
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 24ae46dfe45d..8a058711fc10 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -31,24 +31,24 @@ static void *get_mq(ctl_table *table)
31 return which; 31 return which;
32} 32}
33 33
34static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp, 34static int proc_mq_dointvec(ctl_table *table, int write,
35 void __user *buffer, size_t *lenp, loff_t *ppos) 35 void __user *buffer, size_t *lenp, loff_t *ppos)
36{ 36{
37 struct ctl_table mq_table; 37 struct ctl_table mq_table;
38 memcpy(&mq_table, table, sizeof(mq_table)); 38 memcpy(&mq_table, table, sizeof(mq_table));
39 mq_table.data = get_mq(table); 39 mq_table.data = get_mq(table);
40 40
41 return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos); 41 return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
42} 42}
43 43
44static int proc_mq_dointvec_minmax(ctl_table *table, int write, 44static int proc_mq_dointvec_minmax(ctl_table *table, int write,
45 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) 45 void __user *buffer, size_t *lenp, loff_t *ppos)
46{ 46{
47 struct ctl_table mq_table; 47 struct ctl_table mq_table;
48 memcpy(&mq_table, table, sizeof(mq_table)); 48 memcpy(&mq_table, table, sizeof(mq_table));
49 mq_table.data = get_mq(table); 49 mq_table.data = get_mq(table);
50 50
51 return proc_dointvec_minmax(&mq_table, write, filp, buffer, 51 return proc_dointvec_minmax(&mq_table, write, buffer,
52 lenp, ppos); 52 lenp, ppos);
53} 53}
54#else 54#else
diff --git a/kernel/Makefile b/kernel/Makefile
index 187c89b4783d..b8d4cd8ac0b9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
59obj-$(CONFIG_COMPAT) += compat.o 59obj-$(CONFIG_COMPAT) += compat.o
60obj-$(CONFIG_CGROUPS) += cgroup.o 60obj-$(CONFIG_CGROUPS) += cgroup.o
61obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 61obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
63obj-$(CONFIG_CPUSETS) += cpuset.o 62obj-$(CONFIG_CPUSETS) += cpuset.o
64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 63obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cd83d9933b6b..7ccba4bc5e3b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
 364 * different cgroups in hierarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
677 */ 783 */
678 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
679 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
680 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
681 } 793 }
682 iput(inode); 794 iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
841 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
842 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
843 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
844 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
845 return 0; 959 return 0;
846} 960}
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
849 unsigned long subsys_bits; 963 unsigned long subsys_bits;
850 unsigned long flags; 964 unsigned long flags;
851 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
852}; 972};
853 973
854/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
863 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
864#endif 984#endif
865 985
866 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
867 opts->flags = 0;
868 opts->release_agent = NULL;
869 987
870 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
871 if (!*token) 989 if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
879 if (!ss->disabled) 997 if (!ss->disabled)
880 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
881 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
882 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
883 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
884 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
885 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
886 if (opts->release_agent) 1007 if (opts->release_agent)
887 return -EINVAL; 1008 return -EINVAL;
888 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
889 if (!opts->release_agent) 1011 if (!opts->release_agent)
890 return -ENOMEM; 1012 return -ENOMEM;
891 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
892 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
893 } else { 1036 } else {
894 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
895 int i; 1038 int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
906 } 1049 }
907 } 1050 }
908 1051
1052 /* Consistency checks */
1053
909 /* 1054 /*
910 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
915 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
916 return -EINVAL; 1061 return -EINVAL;
917 1062
918 /* We can't have an empty hierarchy */ 1063
919 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
920 return -EINVAL; 1073 return -EINVAL;
921 1074
922 return 0; 1075 return 0;
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
944 goto out_unlock; 1097 goto out_unlock;
945 } 1098 }
946 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
947 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
948 if (ret) 1107 if (ret)
949 goto out_unlock; 1108 goto out_unlock;
@@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
955 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
956 out_unlock: 1115 out_unlock:
957 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
958 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
959 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
960 unlock_kernel(); 1120 unlock_kernel();
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
974 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
975 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
976 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
978 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
979} 1139}
1140
980static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
981{ 1142{
982 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
988 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
989} 1150}
990 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
991static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
992{ 1178{
993 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
994 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
995 1181
996 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
997 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
998 return 0; 1184 return 0;
999 1185
1000 /* Next check flags */ 1186 /*
1001 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1002 return 0; 1192 return 0;
1003 1193
1004 return 1; 1194 return 1;
1005} 1195}
1006 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1007static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1008{ 1236{
1009 int ret; 1237 int ret;
1010 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1011 1245
1012 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1013 if (ret) 1247 if (ret)
1014 return ret; 1248 return ret;
1015 1249
1016 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1017 root->sb = sb; 1251 opts->new_root->sb = sb;
1018 1252
1019 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1020 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1051 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1052{ 1286{
1053 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1054 int ret = 0; 1289 int ret = 0;
1055 struct super_block *sb; 1290 struct super_block *sb;
1056 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1057 struct list_head tmp_cg_links;
1058 1292
1059 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1060 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1061 if (ret) { 1295 if (ret)
1062 kfree(opts.release_agent); 1296 goto out_err;
1063 return ret;
1064 }
1065
1066 root = kzalloc(sizeof(*root), GFP_KERNEL);
1067 if (!root) {
1068 kfree(opts.release_agent);
1069 return -ENOMEM;
1070 }
1071 1297
1072 init_cgroup_root(root); 1298 /*
1073 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1074 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1075 if (opts.release_agent) { 1301 */
1076 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1077 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1078 } 1306 }
1307 opts.new_root = new_root;
1079 1308
1080 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1081 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1082 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1083 kfree(root); 1312 ret = PTR_ERR(sb);
1084 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1085 } 1315 }
1086 1316
1087 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1088 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1089 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1090 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1091 root = NULL; 1321 struct list_head tmp_cg_links;
1092 } else {
1093 /* New superblock */
1094 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1095 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1096 int i; 1325 int i;
1097 1326
1098 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1105 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1106 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1107 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1108 /* 1349 /*
1109 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1110 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1123 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1124 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1125 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1126 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1127 } 1369 }
1128 1370
1129 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1155 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1156 1398
1157 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1158 mutex_unlock(&inode->i_mutex);
1159 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1160 } 1408 }
1161 1409
1162 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1163 return 0; 1413 return 0;
1164 1414
1165 free_cg_links:
1166 free_cg_links(&tmp_cg_links);
1167 drop_new_super: 1415 drop_new_super:
1168 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1169 return ret; 1421 return ret;
1170} 1422}
1171 1423
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1211 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1212 1464
1213 kill_litter_super(sb); 1465 kill_litter_super(sb);
1214 kfree(root); 1466 cgroup_drop_root(root);
1215} 1467}
1216 1468
1217static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1276 return 0; 1528 return 0;
1277} 1529}
1278 1530
1279/*
1280 * Return the first subsystem attached to a cgroup's hierarchy, and
1281 * its subsystem id.
1282 */
1283
1284static void get_first_subsys(const struct cgroup *cgrp,
1285 struct cgroup_subsys_state **css, int *subsys_id)
1286{
1287 const struct cgroupfs_root *root = cgrp->root;
1288 const struct cgroup_subsys *test_ss;
1289 BUG_ON(list_empty(&root->subsys_list));
1290 test_ss = list_entry(root->subsys_list.next,
1291 struct cgroup_subsys, sibling);
1292 if (css) {
1293 *css = cgrp->subsys[test_ss->subsys_id];
1294 BUG_ON(!*css);
1295 }
1296 if (subsys_id)
1297 *subsys_id = test_ss->subsys_id;
1298}
1299
1300/** 1531/**
1301 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1302 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1313 struct css_set *cg; 1544 struct css_set *cg;
1314 struct css_set *newcg; 1545 struct css_set *newcg;
1315 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1316 int subsys_id;
1317
1318 get_first_subsys(cgrp, NULL, &subsys_id);
1319 1547
1320 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1321 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1322 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1323 return 0; 1551 return 0;
1324 1552
1325 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1326 if (ss->can_attach) { 1554 if (ss->can_attach) {
1327 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1328 if (retval) 1556 if (retval)
1329 return retval; 1557 return retval;
1330 } 1558 }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1362 1590
1363 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1364 if (ss->attach) 1592 if (ss->attach)
1365 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1366 } 1594 }
1367 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1368 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1423 return ret; 1651 return ret;
1424} 1652}
1425 1653
1426/* The various types of files and directories in a cgroup file system */
1427enum cgroup_filetype {
1428 FILE_ROOT,
1429 FILE_DIR,
1430 FILE_TASKLIST,
1431 FILE_NOTIFY_ON_RELEASE,
1432 FILE_RELEASE_AGENT,
1433};
1434
1435/** 1654/**
1436 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1437 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1876 * the start of a css_set 2095 * the start of a css_set
1877 */ 2096 */
1878static void cgroup_advance_iter(struct cgroup *cgrp, 2097static void cgroup_advance_iter(struct cgroup *cgrp,
1879 struct cgroup_iter *it) 2098 struct cgroup_iter *it)
1880{ 2099{
1881 struct list_head *l = it->cg_link; 2100 struct list_head *l = it->cg_link;
1882 struct cg_cgroup_link *link; 2101 struct cg_cgroup_link *link;
@@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2129} 2348}
2130 2349
2131/* 2350/*
2132 * Stuff for reading the 'tasks' file. 2351 * Stuff for reading the 'tasks'/'procs' files.
2133 * 2352 *
2134 * Reading this file can return large amounts of data if a cgroup has 2353 * Reading this file can return large amounts of data if a cgroup has
2135 * *lots* of attached tasks. So it may need several calls to read(), 2354 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2139 */ 2358 */
2140 2359
2141/* 2360/*
2142 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2361 * The following two functions "fix" the issue where there are more pids
2143 * 'cgrp'. Return actual number of pids loaded. No need to 2362 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2144 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2363 * TODO: replace with a kernel-wide solution to this problem
2145 * read section, so the css_set can't go away, and is 2364 */
2146 * immutable after creation. 2365#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2366static void *pidlist_allocate(int count)
2367{
2368 if (PIDLIST_TOO_LARGE(count))
2369 return vmalloc(count * sizeof(pid_t));
2370 else
2371 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2372}
2373static void pidlist_free(void *p)
2374{
2375 if (is_vmalloc_addr(p))
2376 vfree(p);
2377 else
2378 kfree(p);
2379}
2380static void *pidlist_resize(void *p, int newcount)
2381{
2382 void *newlist;
2383 /* note: if new alloc fails, old p will still be valid either way */
2384 if (is_vmalloc_addr(p)) {
2385 newlist = vmalloc(newcount * sizeof(pid_t));
2386 if (!newlist)
2387 return NULL;
2388 memcpy(newlist, p, newcount * sizeof(pid_t));
2389 vfree(p);
2390 } else {
2391 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2392 }
2393 return newlist;
2394}
2395
2396/*
2397 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2398 * If the new stripped list is sufficiently smaller and there's enough memory
2399 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2400 * number of unique elements.
2401 */
2402/* is the size difference enough that we should re-allocate the array? */
2403#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2404static int pidlist_uniq(pid_t **p, int length)
2405{
2406 int src, dest = 1;
2407 pid_t *list = *p;
2408 pid_t *newlist;
2409
2410 /*
 2411 * we presume the 0th element is unique, so src starts at 1. trivial
2412 * edge cases first; no work needs to be done for either
2413 */
2414 if (length == 0 || length == 1)
2415 return length;
2416 /* src and dest walk down the list; dest counts unique elements */
2417 for (src = 1; src < length; src++) {
2418 /* find next unique element */
2419 while (list[src] == list[src-1]) {
2420 src++;
2421 if (src == length)
2422 goto after;
2423 }
2424 /* dest always points to where the next unique element goes */
2425 list[dest] = list[src];
2426 dest++;
2427 }
2428after:
2429 /*
2430 * if the length difference is large enough, we want to allocate a
2431 * smaller buffer to save memory. if this fails due to out of memory,
2432 * we'll just stay with what we've got.
2433 */
2434 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2435 newlist = pidlist_resize(list, dest);
2436 if (newlist)
2437 *p = newlist;
2438 }
2439 return dest;
2440}
2441
2442static int cmppid(const void *a, const void *b)
2443{
2444 return *(pid_t *)a - *(pid_t *)b;
2445}
2446
2447/*
2448 * find the appropriate pidlist for our purpose (given procs vs tasks)
2449 * returns with the lock on that pidlist already held, and takes care
2450 * of the use count, or returns NULL with no locks held if we're out of
2451 * memory.
2147 */ 2452 */
2148static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2453static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2454 enum cgroup_filetype type)
2149{ 2455{
2150 int n = 0, pid; 2456 struct cgroup_pidlist *l;
2457 /* don't need task_nsproxy() if we're looking at ourself */
2458 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2459 /*
2460 * We can't drop the pidlist_mutex before taking the l->mutex in case
2461 * the last ref-holder is trying to remove l from the list at the same
2462 * time. Holding the pidlist_mutex precludes somebody taking whichever
 2463 * list we find out from under us - compare cgroup_release_pid_array().
2464 */
2465 mutex_lock(&cgrp->pidlist_mutex);
2466 list_for_each_entry(l, &cgrp->pidlists, links) {
2467 if (l->key.type == type && l->key.ns == ns) {
2468 /* found a matching list - drop the extra refcount */
2469 put_pid_ns(ns);
2470 /* make sure l doesn't vanish out from under us */
2471 down_write(&l->mutex);
2472 mutex_unlock(&cgrp->pidlist_mutex);
2473 l->use_count++;
2474 return l;
2475 }
2476 }
2477 /* entry not found; create a new one */
2478 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2479 if (!l) {
2480 mutex_unlock(&cgrp->pidlist_mutex);
2481 put_pid_ns(ns);
2482 return l;
2483 }
2484 init_rwsem(&l->mutex);
2485 down_write(&l->mutex);
2486 l->key.type = type;
2487 l->key.ns = ns;
2488 l->use_count = 0; /* don't increment here */
2489 l->list = NULL;
2490 l->owner = cgrp;
2491 list_add(&l->links, &cgrp->pidlists);
2492 mutex_unlock(&cgrp->pidlist_mutex);
2493 return l;
2494}
2495
2496/*
2497 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2498 */
2499static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2500 struct cgroup_pidlist **lp)
2501{
2502 pid_t *array;
2503 int length;
2504 int pid, n = 0; /* used for populating the array */
2151 struct cgroup_iter it; 2505 struct cgroup_iter it;
2152 struct task_struct *tsk; 2506 struct task_struct *tsk;
2507 struct cgroup_pidlist *l;
2508
2509 /*
2510 * If cgroup gets more users after we read count, we won't have
2511 * enough space - tough. This race is indistinguishable to the
2512 * caller from the case that the additional cgroup users didn't
2513 * show up until sometime later on.
2514 */
2515 length = cgroup_task_count(cgrp);
2516 array = pidlist_allocate(length);
2517 if (!array)
2518 return -ENOMEM;
2519 /* now, populate the array */
2153 cgroup_iter_start(cgrp, &it); 2520 cgroup_iter_start(cgrp, &it);
2154 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2521 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2155 if (unlikely(n == npids)) 2522 if (unlikely(n == length))
2156 break; 2523 break;
2157 pid = task_pid_vnr(tsk); 2524 /* get tgid or pid for procs or tasks file respectively */
2158 if (pid > 0) 2525 if (type == CGROUP_FILE_PROCS)
2159 pidarray[n++] = pid; 2526 pid = task_tgid_vnr(tsk);
2527 else
2528 pid = task_pid_vnr(tsk);
2529 if (pid > 0) /* make sure to only use valid results */
2530 array[n++] = pid;
2160 } 2531 }
2161 cgroup_iter_end(cgrp, &it); 2532 cgroup_iter_end(cgrp, &it);
2162 return n; 2533 length = n;
2534 /* now sort & (if procs) strip out duplicates */
2535 sort(array, length, sizeof(pid_t), cmppid, NULL);
2536 if (type == CGROUP_FILE_PROCS)
2537 length = pidlist_uniq(&array, length);
2538 l = cgroup_pidlist_find(cgrp, type);
2539 if (!l) {
2540 pidlist_free(array);
2541 return -ENOMEM;
2542 }
2543 /* store array, freeing old if necessary - lock already held */
2544 pidlist_free(l->list);
2545 l->list = array;
2546 l->length = length;
2547 l->use_count++;
2548 up_write(&l->mutex);
2549 *lp = l;
2550 return 0;
2163} 2551}
2164 2552
2165/** 2553/**
@@ -2216,37 +2604,14 @@ err:
2216 return ret; 2604 return ret;
2217} 2605}
2218 2606
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
 2229 /* The namespace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
 2232 /* How many files are using this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2238static int cmppid(const void *a, const void *b)
2239{
2240 return *(pid_t *)a - *(pid_t *)b;
2241}
2242 2607
2243/* 2608/*
2244 * seq_file methods for the "tasks" file. The seq_file position is the 2609 * seq_file methods for the tasks/procs files. The seq_file position is the
2245 * next pid to display; the seq_file iterator is a pointer to the pid 2610 * next pid to display; the seq_file iterator is a pointer to the pid
2246 * in the cgroup->tasks_pids array. 2611 * in the cgroup->l->list array.
2247 */ 2612 */
2248 2613
2249static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2614static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2250{ 2615{
2251 /* 2616 /*
2252 * Initially we receive a position value that corresponds to 2617 * Initially we receive a position value that corresponds to
@@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2254 * after a seek to the start). Use a binary-search to find the 2619 * after a seek to the start). Use a binary-search to find the
2255 * next pid to display, if any 2620 * next pid to display, if any
2256 */ 2621 */
2257 struct cgroup_pids *cp = s->private; 2622 struct cgroup_pidlist *l = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2259 int index = 0, pid = *pos; 2623 int index = 0, pid = *pos;
2260 int *iter; 2624 int *iter;
2261 2625
2262 down_read(&cgrp->pids_mutex); 2626 down_read(&l->mutex);
2263 if (pid) { 2627 if (pid) {
2264 int end = cp->length; 2628 int end = l->length;
2265 2629
2266 while (index < end) { 2630 while (index < end) {
2267 int mid = (index + end) / 2; 2631 int mid = (index + end) / 2;
2268 if (cp->tasks_pids[mid] == pid) { 2632 if (l->list[mid] == pid) {
2269 index = mid; 2633 index = mid;
2270 break; 2634 break;
2271 } else if (cp->tasks_pids[mid] <= pid) 2635 } else if (l->list[mid] <= pid)
2272 index = mid + 1; 2636 index = mid + 1;
2273 else 2637 else
2274 end = mid; 2638 end = mid;
2275 } 2639 }
2276 } 2640 }
2277 /* If we're off the end of the array, we're done */ 2641 /* If we're off the end of the array, we're done */
2278 if (index >= cp->length) 2642 if (index >= l->length)
2279 return NULL; 2643 return NULL;
2280 /* Update the abstract position to be the actual pid that we found */ 2644 /* Update the abstract position to be the actual pid that we found */
2281 iter = cp->tasks_pids + index; 2645 iter = l->list + index;
2282 *pos = *iter; 2646 *pos = *iter;
2283 return iter; 2647 return iter;
2284} 2648}
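The loop above is a lower-bound binary search: given the resume position in *pos, it finds the first entry greater than or equal to that pid (entries may have disappeared between reads), or falls off the end. A small self-contained sketch of the same search, assuming a sorted pid_t array; names and test values are illustrative, not kernel API.

#include <stdio.h>
#include <sys/types.h>

/* return the index of the first entry >= pid, or length if none */
static int pidlist_lower_bound(const pid_t *list, int length, pid_t pid)
{
	int index = 0, end = length;

	while (index < end) {
		int mid = (index + end) / 2;

		if (list[mid] == pid)
			return mid;
		else if (list[mid] <= pid)
			index = mid + 1;
		else
			end = mid;
	}
	return index;
}

int main(void)
{
	pid_t list[] = { 5, 17, 42, 99 };

	printf("%d\n", pidlist_lower_bound(list, 4, 17));	/* 1: exact hit */
	printf("%d\n", pidlist_lower_bound(list, 4, 18));	/* 2: resume at 42 */
	printf("%d\n", pidlist_lower_bound(list, 4, 100));	/* 4: off the end */
	return 0;
}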
2285 2649
2286static void cgroup_tasks_stop(struct seq_file *s, void *v) 2650static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2287{ 2651{
2288 struct cgroup_pids *cp = s->private; 2652 struct cgroup_pidlist *l = s->private;
2289 struct cgroup *cgrp = cp->cgrp; 2653 up_read(&l->mutex);
2290 up_read(&cgrp->pids_mutex);
2291} 2654}
2292 2655
2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2656static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2294{ 2657{
2295 struct cgroup_pids *cp = s->private; 2658 struct cgroup_pidlist *l = s->private;
2296 int *p = v; 2659 pid_t *p = v;
2297 int *end = cp->tasks_pids + cp->length; 2660 pid_t *end = l->list + l->length;
2298
2299 /* 2661 /*
2300 * Advance to the next pid in the array. If this goes off the 2662 * Advance to the next pid in the array. If this goes off the
2301 * end, we're done 2663 * end, we're done
@@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2309 } 2671 }
2310} 2672}
2311 2673
2312static int cgroup_tasks_show(struct seq_file *s, void *v) 2674static int cgroup_pidlist_show(struct seq_file *s, void *v)
2313{ 2675{
2314 return seq_printf(s, "%d\n", *(int *)v); 2676 return seq_printf(s, "%d\n", *(int *)v);
2315} 2677}
2316 2678
2317static const struct seq_operations cgroup_tasks_seq_operations = { 2679/*
2318 .start = cgroup_tasks_start, 2680 * seq_operations functions for iterating on pidlists through seq_file -
2319 .stop = cgroup_tasks_stop, 2681 * independent of whether it's tasks or procs
2320 .next = cgroup_tasks_next, 2682 */
2321 .show = cgroup_tasks_show, 2683static const struct seq_operations cgroup_pidlist_seq_operations = {
2684 .start = cgroup_pidlist_start,
2685 .stop = cgroup_pidlist_stop,
2686 .next = cgroup_pidlist_next,
2687 .show = cgroup_pidlist_show,
2322}; 2688};
2323 2689
2324static void release_cgroup_pid_array(struct cgroup_pids *cp) 2690static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2325{ 2691{
2326 struct cgroup *cgrp = cp->cgrp; 2692 /*
2327 2693 * the case where we're the last user of this particular pidlist will
2328 down_write(&cgrp->pids_mutex); 2694 * have us remove it from the cgroup's list, which entails taking the
2329 BUG_ON(!cp->use_count); 2695 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2330 if (!--cp->use_count) { 2696 * pidlist_mutex, we have to take pidlist_mutex first.
2331 list_del(&cp->list); 2697 */
2332 put_pid_ns(cp->ns); 2698 mutex_lock(&l->owner->pidlist_mutex);
2333 kfree(cp->tasks_pids); 2699 down_write(&l->mutex);
2334 kfree(cp); 2700 BUG_ON(!l->use_count);
2701 if (!--l->use_count) {
2702 /* we're the last user if refcount is 0; remove and free */
2703 list_del(&l->links);
2704 mutex_unlock(&l->owner->pidlist_mutex);
2705 pidlist_free(l->list);
2706 put_pid_ns(l->key.ns);
2707 up_write(&l->mutex);
2708 kfree(l);
2709 return;
2335 } 2710 }
2336 up_write(&cgrp->pids_mutex); 2711 mutex_unlock(&l->owner->pidlist_mutex);
2712 up_write(&l->mutex);
2337} 2713}
2338 2714
2339static int cgroup_tasks_release(struct inode *inode, struct file *file) 2715static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2340{ 2716{
2341 struct seq_file *seq; 2717 struct cgroup_pidlist *l;
2342 struct cgroup_pids *cp;
2343
2344 if (!(file->f_mode & FMODE_READ)) 2718 if (!(file->f_mode & FMODE_READ))
2345 return 0; 2719 return 0;
2346 2720 /*
2347 seq = file->private_data; 2721 * the seq_file will only be initialized if the file was opened for
2348 cp = seq->private; 2722 * reading; hence we check if it's not null only in that case.
2349 2723 */
2350 release_cgroup_pid_array(cp); 2724 l = ((struct seq_file *)file->private_data)->private;
2725 cgroup_release_pid_array(l);
2351 return seq_release(inode, file); 2726 return seq_release(inode, file);
2352} 2727}
2353 2728
2354static struct file_operations cgroup_tasks_operations = { 2729static const struct file_operations cgroup_pidlist_operations = {
2355 .read = seq_read, 2730 .read = seq_read,
2356 .llseek = seq_lseek, 2731 .llseek = seq_lseek,
2357 .write = cgroup_file_write, 2732 .write = cgroup_file_write,
2358 .release = cgroup_tasks_release, 2733 .release = cgroup_pidlist_release,
2359}; 2734};
2360 2735
2361/* 2736/*
2362 * Handle an open on 'tasks' file. Prepare an array containing the 2737 * The following functions handle opens on a file that displays a pidlist
2363 * process id's of tasks currently attached to the cgroup being opened. 2738 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2739 * in the cgroup.
2364 */ 2740 */
2365 2741/* helper function for the two below it */
2366static int cgroup_tasks_open(struct inode *unused, struct file *file) 2742static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2367{ 2743{
2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2744 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns; 2745 struct cgroup_pidlist *l;
2370 struct cgroup_pids *cp;
2371 pid_t *pidarray;
2372 int npids;
2373 int retval; 2746 int retval;
2374 2747
2375 /* Nothing to do for write-only files */ 2748 /* Nothing to do for write-only files */
2376 if (!(file->f_mode & FMODE_READ)) 2749 if (!(file->f_mode & FMODE_READ))
2377 return 0; 2750 return 0;
2378 2751
2379 /* 2752 /* have the array populated */
2380 * If cgroup gets more users after we read count, we won't have 2753 retval = pidlist_array_load(cgrp, type, &l);
2381 * enough space - tough. This race is indistinguishable to the 2754 if (retval)
2382 * caller from the case that the additional cgroup users didn't 2755 return retval;
2383 * show up until sometime later on. 2756 /* configure file information */
2384 */ 2757 file->f_op = &cgroup_pidlist_operations;
2385 npids = cgroup_task_count(cgrp);
2386 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2387 if (!pidarray)
2388 return -ENOMEM;
2389 npids = pid_array_load(pidarray, npids, cgrp);
2390 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2391
2392 /*
2393 * Store the array in the cgroup, freeing the old
2394 * array if necessary
2395 */
2396 down_write(&cgrp->pids_mutex);
2397
2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2399 if (ns == cp->ns)
2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2418 up_write(&cgrp->pids_mutex);
2419
2420 file->f_op = &cgroup_tasks_operations;
2421 2758
2422 retval = seq_open(file, &cgroup_tasks_seq_operations); 2759 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2423 if (retval) { 2760 if (retval) {
2424 release_cgroup_pid_array(cp); 2761 cgroup_release_pid_array(l);
2425 return retval; 2762 return retval;
2426 } 2763 }
2427 ((struct seq_file *)file->private_data)->private = cp; 2764 ((struct seq_file *)file->private_data)->private = l;
2428 return 0; 2765 return 0;
2429} 2766}
2767static int cgroup_tasks_open(struct inode *unused, struct file *file)
2768{
2769 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2770}
2771static int cgroup_procs_open(struct inode *unused, struct file *file)
2772{
2773 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2774}
2430 2775
2431static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2776static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2432 struct cftype *cft) 2777 struct cftype *cft)
@@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2449/* 2794/*
2450 * for the common functions, 'private' gives the type of file 2795 * for the common functions, 'private' gives the type of file
2451 */ 2796 */
2797/* for hysterical raisins, we can't put this on the older files */
2798#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2452static struct cftype files[] = { 2799static struct cftype files[] = {
2453 { 2800 {
2454 .name = "tasks", 2801 .name = "tasks",
2455 .open = cgroup_tasks_open, 2802 .open = cgroup_tasks_open,
2456 .write_u64 = cgroup_tasks_write, 2803 .write_u64 = cgroup_tasks_write,
2457 .release = cgroup_tasks_release, 2804 .release = cgroup_pidlist_release,
2458 .private = FILE_TASKLIST,
2459 .mode = S_IRUGO | S_IWUSR, 2805 .mode = S_IRUGO | S_IWUSR,
2460 }, 2806 },
2461 2807 {
2808 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2809 .open = cgroup_procs_open,
2810 /* .write_u64 = cgroup_procs_write, TODO */
2811 .release = cgroup_pidlist_release,
2812 .mode = S_IRUGO,
2813 },
2462 { 2814 {
2463 .name = "notify_on_release", 2815 .name = "notify_on_release",
2464 .read_u64 = cgroup_read_notify_on_release, 2816 .read_u64 = cgroup_read_notify_on_release,
2465 .write_u64 = cgroup_write_notify_on_release, 2817 .write_u64 = cgroup_write_notify_on_release,
2466 .private = FILE_NOTIFY_ON_RELEASE,
2467 }, 2818 },
2468}; 2819};
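The user-visible difference between the two pidlist files registered above: "tasks" lists one entry per thread (task_pid_vnr), while "cgroup.procs" lists one entry per thread group (task_tgid_vnr) with duplicates stripped. A hedged userspace sketch that dumps both for a cgroup directory; the /cgroup/foo mount point is only an example path, substitute a real cgroup hierarchy.

#include <stdio.h>

static void dump(const char *path)
{
	char line[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("%s:\n", path);
	while (fgets(line, sizeof(line), f))
		printf("  %s", line);
	fclose(f);
}

int main(void)
{
	dump("/cgroup/foo/tasks");		/* one pid per thread */
	dump("/cgroup/foo/cgroup.procs");	/* one tgid per process */
	return 0;
}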
2469 2820
@@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = {
2472 .read_seq_string = cgroup_release_agent_show, 2823 .read_seq_string = cgroup_release_agent_show,
2473 .write_string = cgroup_release_agent_write, 2824 .write_string = cgroup_release_agent_write,
2474 .max_write_len = PATH_MAX, 2825 .max_write_len = PATH_MAX,
2475 .private = FILE_RELEASE_AGENT,
2476}; 2826};
2477 2827
2478static int cgroup_populate_dir(struct cgroup *cgrp) 2828static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void)
2879 init_task.cgroups = &init_css_set; 3229 init_task.cgroups = &init_css_set;
2880 3230
2881 init_css_set_link.cg = &init_css_set; 3231 init_css_set_link.cg = &init_css_set;
3232 init_css_set_link.cgrp = dummytop;
2882 list_add(&init_css_set_link.cgrp_link_list, 3233 list_add(&init_css_set_link.cgrp_link_list,
2883 &rootnode.top_cgroup.css_sets); 3234 &rootnode.top_cgroup.css_sets);
2884 list_add(&init_css_set_link.cg_link_list, 3235 list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3284,7 @@ int __init cgroup_init(void)
2933 /* Add init_css_set to the hash table */ 3284 /* Add init_css_set to the hash table */
2934 hhead = css_set_hash(init_css_set.subsys); 3285 hhead = css_set_hash(init_css_set.subsys);
2935 hlist_add_head(&init_css_set.hlist, hhead); 3286 hlist_add_head(&init_css_set.hlist, hhead);
2936 3287 BUG_ON(!init_root_id(&rootnode));
2937 err = register_filesystem(&cgroup_fs_type); 3288 err = register_filesystem(&cgroup_fs_type);
2938 if (err < 0) 3289 if (err < 0)
2939 goto out; 3290 goto out;
@@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2986 for_each_active_root(root) { 3337 for_each_active_root(root) {
2987 struct cgroup_subsys *ss; 3338 struct cgroup_subsys *ss;
2988 struct cgroup *cgrp; 3339 struct cgroup *cgrp;
2989 int subsys_id;
2990 int count = 0; 3340 int count = 0;
2991 3341
2992 seq_printf(m, "%lu:", root->subsys_bits); 3342 seq_printf(m, "%d:", root->hierarchy_id);
2993 for_each_subsys(root, ss) 3343 for_each_subsys(root, ss)
2994 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3344 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3345 if (strlen(root->name))
3346 seq_printf(m, "%sname=%s", count ? "," : "",
3347 root->name);
2995 seq_putc(m, ':'); 3348 seq_putc(m, ':');
2996 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3349 cgrp = task_cgroup_from_root(tsk, root);
2997 cgrp = task_cgroup(tsk, subsys_id);
2998 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3350 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2999 if (retval < 0) 3351 if (retval < 0)
3000 goto out_unlock; 3352 goto out_unlock;
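With this hunk, each line of /proc/<pid>/cgroup starts with the hierarchy id instead of the subsystem bitmask, followed by the attached subsystems, an optional name= tag, and the cgroup path. A minimal sketch that just echoes the file for the current task; the sample line in the comment is illustrative output, not taken from a real system.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/cgroup", "r");

	if (!f) {
		perror("/proc/self/cgroup");
		return 1;
	}
	/*
	 * After this change a line reads, for example:
	 *   2:cpu,cpuacct:/mygroup
	 * i.e. hierarchy_id:subsystems[,name=X]:cgroup path
	 * (the old format printed the subsystem bitmask as the first field).
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}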
@@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3033 mutex_lock(&cgroup_mutex); 3385 mutex_lock(&cgroup_mutex);
3034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3035 struct cgroup_subsys *ss = subsys[i]; 3387 struct cgroup_subsys *ss = subsys[i];
3036 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3388 seq_printf(m, "%s\t%d\t%d\t%d\n",
3037 ss->name, ss->root->subsys_bits, 3389 ss->name, ss->root->hierarchy_id,
3038 ss->root->number_of_cgroups, !ss->disabled); 3390 ss->root->number_of_cgroups, !ss->disabled);
3039 } 3391 }
3040 mutex_unlock(&cgroup_mutex); 3392 mutex_unlock(&cgroup_mutex);
@@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3320{ 3672{
3321 int ret; 3673 int ret;
3322 struct cgroup *target; 3674 struct cgroup *target;
3323 int subsys_id;
3324 3675
3325 if (cgrp == dummytop) 3676 if (cgrp == dummytop)
3326 return 1; 3677 return 1;
3327 3678
3328 get_first_subsys(cgrp, NULL, &subsys_id); 3679 target = task_cgroup_from_root(task, cgrp->root);
3329 target = task_cgroup(task, subsys_id);
3330 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3680 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3331 cgrp = cgrp->parent; 3681 cgrp = cgrp->parent;
3332 ret = (cgrp == target); 3682 ret = (cgrp == target);
@@ -3693,3 +4043,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3693 return ret; 4043 return ret;
3694} 4044}
3695 4045
4046#ifdef CONFIG_CGROUP_DEBUG
4047static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4048 struct cgroup *cont)
4049{
4050 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4051
4052 if (!css)
4053 return ERR_PTR(-ENOMEM);
4054
4055 return css;
4056}
4057
4058static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4059{
4060 kfree(cont->subsys[debug_subsys_id]);
4061}
4062
4063static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4064{
4065 return atomic_read(&cont->count);
4066}
4067
4068static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4069{
4070 return cgroup_task_count(cont);
4071}
4072
4073static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4074{
4075 return (u64)(unsigned long)current->cgroups;
4076}
4077
4078static u64 current_css_set_refcount_read(struct cgroup *cont,
4079 struct cftype *cft)
4080{
4081 u64 count;
4082
4083 rcu_read_lock();
4084 count = atomic_read(&current->cgroups->refcount);
4085 rcu_read_unlock();
4086 return count;
4087}
4088
4089static int current_css_set_cg_links_read(struct cgroup *cont,
4090 struct cftype *cft,
4091 struct seq_file *seq)
4092{
4093 struct cg_cgroup_link *link;
4094 struct css_set *cg;
4095
4096 read_lock(&css_set_lock);
4097 rcu_read_lock();
4098 cg = rcu_dereference(current->cgroups);
4099 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4100 struct cgroup *c = link->cgrp;
4101 const char *name;
4102
4103 if (c->dentry)
4104 name = c->dentry->d_name.name;
4105 else
4106 name = "?";
4107 seq_printf(seq, "Root %d group %s\n",
4108 c->root->hierarchy_id, name);
4109 }
4110 rcu_read_unlock();
4111 read_unlock(&css_set_lock);
4112 return 0;
4113}
4114
4115#define MAX_TASKS_SHOWN_PER_CSS 25
4116static int cgroup_css_links_read(struct cgroup *cont,
4117 struct cftype *cft,
4118 struct seq_file *seq)
4119{
4120 struct cg_cgroup_link *link;
4121
4122 read_lock(&css_set_lock);
4123 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4124 struct css_set *cg = link->cg;
4125 struct task_struct *task;
4126 int count = 0;
4127 seq_printf(seq, "css_set %p\n", cg);
4128 list_for_each_entry(task, &cg->tasks, cg_list) {
4129 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4130 seq_puts(seq, " ...\n");
4131 break;
4132 } else {
4133 seq_printf(seq, " task %d\n",
4134 task_pid_vnr(task));
4135 }
4136 }
4137 }
4138 read_unlock(&css_set_lock);
4139 return 0;
4140}
4141
4142static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4143{
4144 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4145}
4146
4147static struct cftype debug_files[] = {
4148 {
4149 .name = "cgroup_refcount",
4150 .read_u64 = cgroup_refcount_read,
4151 },
4152 {
4153 .name = "taskcount",
4154 .read_u64 = debug_taskcount_read,
4155 },
4156
4157 {
4158 .name = "current_css_set",
4159 .read_u64 = current_css_set_read,
4160 },
4161
4162 {
4163 .name = "current_css_set_refcount",
4164 .read_u64 = current_css_set_refcount_read,
4165 },
4166
4167 {
4168 .name = "current_css_set_cg_links",
4169 .read_seq_string = current_css_set_cg_links_read,
4170 },
4171
4172 {
4173 .name = "cgroup_css_links",
4174 .read_seq_string = cgroup_css_links_read,
4175 },
4176
4177 {
4178 .name = "releasable",
4179 .read_u64 = releasable_read,
4180 },
4181};
4182
4183static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4184{
4185 return cgroup_add_files(cont, ss, debug_files,
4186 ARRAY_SIZE(debug_files));
4187}
4188
4189struct cgroup_subsys debug_subsys = {
4190 .name = "debug",
4191 .create = debug_create,
4192 .destroy = debug_destroy,
4193 .populate = debug_populate,
4194 .subsys_id = debug_subsys_id,
4195};
4196#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..b5cb469d2545 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1324static cpumask_var_t cpus_attach;
1325 1325
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1327static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1328 struct task_struct *tsk, bool threadgroup)
1329{ 1329{
1330 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1331 struct cpuset *cs = cgroup_cs(cont);
1331 1332
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1333 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1344 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1345 return -EINVAL;
1345 1346
1346 return security_task_setscheduler(tsk, 0, NULL); 1347 ret = security_task_setscheduler(tsk, 0, NULL);
1348 if (ret)
1349 return ret;
1350 if (threadgroup) {
1351 struct task_struct *c;
1352
1353 rcu_read_lock();
1354 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1355 ret = security_task_setscheduler(c, 0, NULL);
1356 if (ret) {
1357 rcu_read_unlock();
1358 return ret;
1359 }
1360 }
1361 rcu_read_unlock();
1362 }
1363 return 0;
1364}
1365
1366static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1367 struct cpuset *cs)
1368{
1369 int err;
1370 /*
1371 * can_attach beforehand should guarantee that this doesn't fail.
1372 * TODO: have a better way to handle failure here
1373 */
1374 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1375 WARN_ON_ONCE(err);
1376
1377 task_lock(tsk);
1378 cpuset_change_task_nodemask(tsk, to);
1379 task_unlock(tsk);
1380 cpuset_update_task_spread_flag(cs, tsk);
1381
1347} 1382}
1348 1383
1349static void cpuset_attach(struct cgroup_subsys *ss, 1384static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1350 struct cgroup *cont, struct cgroup *oldcont, 1385 struct cgroup *oldcont, struct task_struct *tsk,
1351 struct task_struct *tsk) 1386 bool threadgroup)
1352{ 1387{
1353 nodemask_t from, to; 1388 nodemask_t from, to;
1354 struct mm_struct *mm; 1389 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1390 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1391 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1392
1359 if (cs == &top_cpuset) { 1393 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1394 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1397 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1398 guarantee_online_mems(cs, &to);
1365 } 1399 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1400
1370 task_lock(tsk); 1401 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1402 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1403 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1404 struct task_struct *c;
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 cpuset_attach_task(c, &to, cs);
1408 }
1409 rcu_read_unlock();
1410 }
1374 1411
1412 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1413 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1414 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1415 mm = get_task_mm(tsk);
diff --git a/kernel/exit.c b/kernel/exit.c
index 60d6fdcc9265..5859f598c951 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -976,8 +976,6 @@ NORET_TYPE void do_exit(long code)
976 disassociate_ctty(1); 976 disassociate_ctty(1);
977 977
978 module_put(task_thread_info(tsk)->exec_domain->module); 978 module_put(task_thread_info(tsk)->exec_domain->module);
979 if (tsk->binfmt)
980 module_put(tsk->binfmt->module);
981 979
982 proc_exit_connector(tsk); 980 proc_exit_connector(tsk);
983 981
@@ -1097,28 +1095,28 @@ struct wait_opts {
1097 int __user *wo_stat; 1095 int __user *wo_stat;
1098 struct rusage __user *wo_rusage; 1096 struct rusage __user *wo_rusage;
1099 1097
1098 wait_queue_t child_wait;
1100 int notask_error; 1099 int notask_error;
1101}; 1100};
1102 1101
1103static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1102static inline
1103struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1104{ 1104{
1105 struct pid *pid = NULL; 1105 if (type != PIDTYPE_PID)
1106 if (type == PIDTYPE_PID) 1106 task = task->group_leader;
1107 pid = task->pids[type].pid; 1107 return task->pids[type].pid;
1108 else if (type < PIDTYPE_MAX)
1109 pid = task->group_leader->pids[type].pid;
1110 return pid;
1111} 1108}
1112 1109
1113static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1110static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1114{ 1111{
1115 int err; 1112 return wo->wo_type == PIDTYPE_MAX ||
1116 1113 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1117 if (wo->wo_type < PIDTYPE_MAX) { 1114}
1118 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1119 return 0;
1120 }
1121 1115
1116static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1117{
1118 if (!eligible_pid(wo, p))
1119 return 0;
1122 /* Wait for all children (clone and not) if __WALL is set; 1120 /* Wait for all children (clone and not) if __WALL is set;
1123 * otherwise, wait for clone children *only* if __WCLONE is 1121 * otherwise, wait for clone children *only* if __WCLONE is
1124 * set; otherwise, wait for non-clone children *only*. (Note: 1122 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1128,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1128 && !(wo->wo_flags & __WALL)) 1126 && !(wo->wo_flags & __WALL))
1129 return 0; 1127 return 0;
1130 1128
1131 err = security_task_wait(p);
1132 if (err)
1133 return err;
1134
1135 return 1; 1129 return 1;
1136} 1130}
1137 1131
@@ -1144,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1144 1138
1145 put_task_struct(p); 1139 put_task_struct(p);
1146 infop = wo->wo_info; 1140 infop = wo->wo_info;
1147 if (!retval) 1141 if (infop) {
1148 retval = put_user(SIGCHLD, &infop->si_signo); 1142 if (!retval)
1149 if (!retval) 1143 retval = put_user(SIGCHLD, &infop->si_signo);
1150 retval = put_user(0, &infop->si_errno); 1144 if (!retval)
1151 if (!retval) 1145 retval = put_user(0, &infop->si_errno);
1152 retval = put_user((short)why, &infop->si_code); 1146 if (!retval)
1153 if (!retval) 1147 retval = put_user((short)why, &infop->si_code);
1154 retval = put_user(pid, &infop->si_pid); 1148 if (!retval)
1155 if (!retval) 1149 retval = put_user(pid, &infop->si_pid);
1156 retval = put_user(uid, &infop->si_uid); 1150 if (!retval)
1157 if (!retval) 1151 retval = put_user(uid, &infop->si_uid);
1158 retval = put_user(status, &infop->si_status); 1152 if (!retval)
1153 retval = put_user(status, &infop->si_status);
1154 }
1159 if (!retval) 1155 if (!retval)
1160 retval = pid; 1156 retval = pid;
1161 return retval; 1157 return retval;
@@ -1485,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1485 * then ->notask_error is 0 if @p is an eligible child, 1481 * then ->notask_error is 0 if @p is an eligible child,
1486 * or another error from security_task_wait(), or still -ECHILD. 1482 * or another error from security_task_wait(), or still -ECHILD.
1487 */ 1483 */
1488static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1484static int wait_consider_task(struct wait_opts *wo, int ptrace,
1489 int ptrace, struct task_struct *p) 1485 struct task_struct *p)
1490{ 1486{
1491 int ret = eligible_child(wo, p); 1487 int ret = eligible_child(wo, p);
1492 if (!ret) 1488 if (!ret)
1493 return ret; 1489 return ret;
1494 1490
1491 ret = security_task_wait(p);
1495 if (unlikely(ret < 0)) { 1492 if (unlikely(ret < 0)) {
1496 /* 1493 /*
1497 * If we have not yet seen any eligible child, 1494 * If we have not yet seen any eligible child,
@@ -1553,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1553 * Do not consider detached threads. 1550 * Do not consider detached threads.
1554 */ 1551 */
1555 if (!task_detached(p)) { 1552 if (!task_detached(p)) {
1556 int ret = wait_consider_task(wo, tsk, 0, p); 1553 int ret = wait_consider_task(wo, 0, p);
1557 if (ret) 1554 if (ret)
1558 return ret; 1555 return ret;
1559 } 1556 }
@@ -1567,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1567 struct task_struct *p; 1564 struct task_struct *p;
1568 1565
1569 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1566 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1570 int ret = wait_consider_task(wo, tsk, 1, p); 1567 int ret = wait_consider_task(wo, 1, p);
1571 if (ret) 1568 if (ret)
1572 return ret; 1569 return ret;
1573 } 1570 }
@@ -1575,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1575 return 0; 1572 return 0;
1576} 1573}
1577 1574
1575static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1576 int sync, void *key)
1577{
1578 struct wait_opts *wo = container_of(wait, struct wait_opts,
1579 child_wait);
1580 struct task_struct *p = key;
1581
1582 if (!eligible_pid(wo, p))
1583 return 0;
1584
1585 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1586 return 0;
1587
1588 return default_wake_function(wait, mode, sync, key);
1589}
1590
1591void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1592{
1593 __wake_up_sync_key(&parent->signal->wait_chldexit,
1594 TASK_INTERRUPTIBLE, 1, p);
1595}
1596
1578static long do_wait(struct wait_opts *wo) 1597static long do_wait(struct wait_opts *wo)
1579{ 1598{
1580 DECLARE_WAITQUEUE(wait, current);
1581 struct task_struct *tsk; 1599 struct task_struct *tsk;
1582 int retval; 1600 int retval;
1583 1601
1584 trace_sched_process_wait(wo->wo_pid); 1602 trace_sched_process_wait(wo->wo_pid);
1585 1603
1586 add_wait_queue(&current->signal->wait_chldexit,&wait); 1604 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1605 wo->child_wait.private = current;
1606 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1587repeat: 1607repeat:
1588 /* 1608 /*
1589 * If there is nothing that can match our criteria just get out. 1609
@@ -1624,32 +1644,7 @@ notask:
1624 } 1644 }
1625end: 1645end:
1626 __set_current_state(TASK_RUNNING); 1646 __set_current_state(TASK_RUNNING);
1627 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1647 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1628 if (wo->wo_info) {
1629 struct siginfo __user *infop = wo->wo_info;
1630
1631 if (retval > 0)
1632 retval = 0;
1633 else {
1634 /*
1635 * For a WNOHANG return, clear out all the fields
1636 * we would set so the user can easily tell the
1637 * difference.
1638 */
1639 if (!retval)
1640 retval = put_user(0, &infop->si_signo);
1641 if (!retval)
1642 retval = put_user(0, &infop->si_errno);
1643 if (!retval)
1644 retval = put_user(0, &infop->si_code);
1645 if (!retval)
1646 retval = put_user(0, &infop->si_pid);
1647 if (!retval)
1648 retval = put_user(0, &infop->si_uid);
1649 if (!retval)
1650 retval = put_user(0, &infop->si_status);
1651 }
1652 }
1653 return retval; 1648 return retval;
1654} 1649}
1655 1650
@@ -1694,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1694 wo.wo_stat = NULL; 1689 wo.wo_stat = NULL;
1695 wo.wo_rusage = ru; 1690 wo.wo_rusage = ru;
1696 ret = do_wait(&wo); 1691 ret = do_wait(&wo);
1692
1693 if (ret > 0) {
1694 ret = 0;
1695 } else if (infop) {
1696 /*
1697 * For a WNOHANG return, clear out all the fields
1698 * we would set so the user can easily tell the
1699 * difference.
1700 */
1701 if (!ret)
1702 ret = put_user(0, &infop->si_signo);
1703 if (!ret)
1704 ret = put_user(0, &infop->si_errno);
1705 if (!ret)
1706 ret = put_user(0, &infop->si_code);
1707 if (!ret)
1708 ret = put_user(0, &infop->si_pid);
1709 if (!ret)
1710 ret = put_user(0, &infop->si_uid);
1711 if (!ret)
1712 ret = put_user(0, &infop->si_status);
1713 }
1714
1697 put_pid(pid); 1715 put_pid(pid);
1698 1716
1699 /* avoid REGPARM breakage on x86: */ 1717 /* avoid REGPARM breakage on x86: */
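This hunk moves the WNOHANG "clear out the siginfo fields" logic from do_wait() into the waitid() syscall path, so the behaviour callers observe stays the same: if WNOHANG finds nothing to report, the siginfo comes back zeroed and si_pid == 0 distinguishes that from a real status. A hedged userspace sketch of that contract (the poison value is only there to make the clearing visible):

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	siginfo_t info;
	pid_t child = fork();

	if (child == 0) {		/* child just lingers */
		pause();
		_exit(0);
	}
	info.si_pid = 1;		/* poison so the clearing is visible */
	/* child is alive, so WNOHANG has nothing to report: waitid()
	 * returns 0 and the siginfo fields are zero-filled */
	if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0)
		printf("si_pid=%d si_signo=%d\n", (int)info.si_pid,
		       info.si_signo);
	kill(child, SIGKILL);
	return 0;
}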
diff --git a/kernel/fork.c b/kernel/fork.c
index 51ad0b0b7266..266c6af6ef1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -434,6 +434,14 @@ __setup("coredump_filter=", coredump_filter_setup);
434 434
435#include <linux/init_task.h> 435#include <linux/init_task.h>
436 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
437static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
438{ 446{
439 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
@@ -447,10 +455,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
447 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
448 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
449 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
450 spin_lock_init(&mm->ioctx_lock);
451 INIT_HLIST_HEAD(&mm->ioctx_list);
452 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
453 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
454 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
455 462
456 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -511,6 +518,8 @@ void mmput(struct mm_struct *mm)
511 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
512 } 519 }
513 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
514 mmdrop(mm); 523 mmdrop(mm);
515 } 524 }
516} 525}
@@ -636,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
636 mm->hiwater_rss = get_mm_rss(mm); 645 mm->hiwater_rss = get_mm_rss(mm);
637 mm->hiwater_vm = mm->total_vm; 646 mm->hiwater_vm = mm->total_vm;
638 647
648 if (mm->binfmt && !try_module_get(mm->binfmt->module))
649 goto free_pt;
650
639 return mm; 651 return mm;
640 652
641free_pt: 653free_pt:
654 /* don't put binfmt in mmput, we haven't got module yet */
655 mm->binfmt = NULL;
642 mmput(mm); 656 mmput(mm);
643 657
644fail_nomem: 658fail_nomem:
@@ -979,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
979 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 993 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
980 return ERR_PTR(-EINVAL); 994 return ERR_PTR(-EINVAL);
981 995
996 /*
997 * Siblings of global init remain as zombies on exit since they are
998 * not reaped by their parent (swapper). To solve this and to avoid
999 * multi-rooted process trees, prevent global and container-inits
1000 * from creating siblings.
1001 */
1002 if ((clone_flags & CLONE_PARENT) &&
1003 current->signal->flags & SIGNAL_UNKILLABLE)
1004 return ERR_PTR(-EINVAL);
1005
982 retval = security_task_create(clone_flags); 1006 retval = security_task_create(clone_flags);
983 if (retval) 1007 if (retval)
984 goto fork_out; 1008 goto fork_out;
@@ -1020,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1020 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1044 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1021 goto bad_fork_cleanup_count; 1045 goto bad_fork_cleanup_count;
1022 1046
1023 if (p->binfmt && !try_module_get(p->binfmt->module))
1024 goto bad_fork_cleanup_put_domain;
1025
1026 p->did_exec = 0; 1047 p->did_exec = 0;
1027 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1048 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1028 copy_flags(clone_flags, p); 1049 copy_flags(clone_flags, p);
@@ -1310,9 +1331,6 @@ bad_fork_cleanup_cgroup:
1310#endif 1331#endif
1311 cgroup_exit(p, cgroup_callbacks_done); 1332 cgroup_exit(p, cgroup_callbacks_done);
1312 delayacct_tsk_free(p); 1333 delayacct_tsk_free(p);
1313 if (p->binfmt)
1314 module_put(p->binfmt->module);
1315bad_fork_cleanup_put_domain:
1316 module_put(task_thread_info(p)->exec_domain->module); 1334 module_put(task_thread_info(p)->exec_domain->module);
1317bad_fork_cleanup_count: 1335bad_fork_cleanup_count:
1318 atomic_dec(&p->cred->user->processes); 1336 atomic_dec(&p->cred->user->processes);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..d4e841747400 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
118{ 118{
119 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
120 return get_pid_ns(old_ns); 120 return get_pid_ns(old_ns);
121 if (flags & CLONE_THREAD) 121 if (flags & (CLONE_THREAD|CLONE_PARENT))
122 return ERR_PTR(-EINVAL); 122 return ERR_PTR(-EINVAL);
123 return create_pid_namespace(old_ns); 123 return create_pid_namespace(old_ns);
124} 124}
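This hunk makes copy_pid_ns() reject CLONE_NEWPID combined with CLONE_PARENT as well as CLONE_THREAD, since a CLONE_PARENT child would be a sibling of the caller and sit outside its new namespace's init. A hedged userspace sketch of the now-rejected combination; it needs CAP_SYS_ADMIN for CLONE_NEWPID (otherwise the failure is EPERM rather than EINVAL), and the stack handling is only good enough for a demo.

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	int flags = CLONE_NEWPID | CLONE_PARENT | SIGCHLD;

	/* after this change the combination is rejected with EINVAL */
	if (clone(child_fn, stack + 64 * 1024, flags, NULL) == -1)
		printf("clone: %s\n", strerror(errno));
	free(stack);
	return 0;
}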
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
266 * or self-reaping. Do notification now if it would have happened earlier. 266 * or self-reaping. Do notification now if it would have happened earlier.
267 * If it should reap itself, return true. 267 * If it should reap itself, return true.
268 * 268 *
269 * If it's our own child, there is no notification to do. 269 * If it's our own child, there is no notification to do. But if our normal
270 * But if our normal children self-reap, then this child 270 * children self-reap, then this child was prevented by ptrace and we must
271 * was prevented by ptrace and we must reap it now. 271 * reap it now, in that case we must also wake up sub-threads sleeping in
272 * do_wait().
272 */ 273 */
273static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 274static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
274{ 275{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278 if (!task_detached(p) && thread_group_empty(p)) { 279 if (!task_detached(p) && thread_group_empty(p)) {
279 if (!same_thread_group(p->real_parent, tracer)) 280 if (!same_thread_group(p->real_parent, tracer))
280 do_notify_parent(p, p->exit_signal); 281 do_notify_parent(p, p->exit_signal);
281 else if (ignoring_children(tracer->sighand)) 282 else if (ignoring_children(tracer->sighand)) {
283 __wake_up_parent(p, tracer);
282 p->exit_signal = -1; 284 p->exit_signal = -1;
285 }
283 } 286 }
284 if (task_detached(p)) { 287 if (task_detached(p)) {
285 /* Mark it as in the process of being reaped. */ 288 /* Mark it as in the process of being reaped. */
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..88faec23e833 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
36} 37}
37 38
38int res_counter_charge(struct res_counter *counter, unsigned long val, 39int res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at) 40 struct res_counter **limit_fail_at,
41 struct res_counter **soft_limit_fail_at)
40{ 42{
41 int ret; 43 int ret;
42 unsigned long flags; 44 unsigned long flags;
43 struct res_counter *c, *u; 45 struct res_counter *c, *u;
44 46
45 *limit_fail_at = NULL; 47 *limit_fail_at = NULL;
48 if (soft_limit_fail_at)
49 *soft_limit_fail_at = NULL;
46 local_irq_save(flags); 50 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) { 51 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock); 52 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val); 53 ret = res_counter_charge_locked(c, val);
54 /*
55 * With soft limits, we return the highest ancestor
56 * that exceeds its soft limit
57 */
58 if (soft_limit_fail_at &&
59 !res_counter_soft_limit_check_locked(c))
60 *soft_limit_fail_at = c;
50 spin_unlock(&c->lock); 61 spin_unlock(&c->lock);
51 if (ret < 0) { 62 if (ret < 0) {
52 *limit_fail_at = c; 63 *limit_fail_at = c;
@@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
74 counter->usage -= val; 85 counter->usage -= val;
75} 86}
76 87
77void res_counter_uncharge(struct res_counter *counter, unsigned long val) 88void res_counter_uncharge(struct res_counter *counter, unsigned long val,
89 bool *was_soft_limit_excess)
78{ 90{
79 unsigned long flags; 91 unsigned long flags;
80 struct res_counter *c; 92 struct res_counter *c;
@@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
82 local_irq_save(flags); 94 local_irq_save(flags);
83 for (c = counter; c != NULL; c = c->parent) { 95 for (c = counter; c != NULL; c = c->parent) {
84 spin_lock(&c->lock); 96 spin_lock(&c->lock);
97 if (was_soft_limit_excess)
98 *was_soft_limit_excess =
99 !res_counter_soft_limit_check_locked(c);
85 res_counter_uncharge_locked(c, val); 100 res_counter_uncharge_locked(c, val);
86 spin_unlock(&c->lock); 101 spin_unlock(&c->lock);
87 } 102 }
@@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 116 return &counter->limit;
102 case RES_FAILCNT: 117 case RES_FAILCNT:
103 return &counter->failcnt; 118 return &counter->failcnt;
119 case RES_SOFT_LIMIT:
120 return &counter->soft_limit;
104 }; 121 };
105 122
106 BUG(); 123 BUG();
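The res_counter changes above make the hierarchical charge optionally report the highest ancestor that is over its soft limit, while still failing on the first hard-limit violation. A simplified userspace model of that walk; it deliberately omits locking, error unwinding and the real res_counter API, and all names and numbers are illustrative.

#include <stdio.h>

struct counter {
	unsigned long usage, limit, soft_limit;
	struct counter *parent;
};

/* charge every ancestor; remember the highest one over its soft limit */
static int charge(struct counter *c, unsigned long val,
		  struct counter **soft_fail)
{
	struct counter *i;

	*soft_fail = NULL;
	for (i = c; i; i = i->parent) {
		if (i->usage + val > i->limit)
			return -1;		/* hard limit: fail */
		i->usage += val;
		if (i->usage > i->soft_limit)
			*soft_fail = i;		/* keep walking: highest wins */
	}
	return 0;
}

int main(void)
{
	struct counter root  = { 0, 1000, 100, NULL };
	struct counter child = { 0, 1000, 400, &root };
	struct counter *soft;

	charge(&child, 200, &soft);
	printf("over soft limit: %s\n",
	       soft == &root ? "root" : soft == &child ? "child" : "none");
	return 0;					/* -> root */
}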
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f76e06bea58..ee61f454a98b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10312,7 +10312,7 @@ static int sched_rt_global_constraints(void)
10312#endif /* CONFIG_RT_GROUP_SCHED */ 10312#endif /* CONFIG_RT_GROUP_SCHED */
10313 10313
10314int sched_rt_handler(struct ctl_table *table, int write, 10314int sched_rt_handler(struct ctl_table *table, int write,
10315 struct file *filp, void __user *buffer, size_t *lenp, 10315 void __user *buffer, size_t *lenp,
10316 loff_t *ppos) 10316 loff_t *ppos)
10317{ 10317{
10318 int ret; 10318 int ret;
@@ -10323,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10323 old_period = sysctl_sched_rt_period; 10323 old_period = sysctl_sched_rt_period;
10324 old_runtime = sysctl_sched_rt_runtime; 10324 old_runtime = sysctl_sched_rt_runtime;
10325 10325
10326 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10326 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10327 10327
10328 if (!ret && write) { 10328 if (!ret && write) {
10329 ret = sched_rt_global_constraints(); 10329 ret = sched_rt_global_constraints();
@@ -10377,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10377} 10377}
10378 10378
10379static int 10379static int
10380cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10380cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10381 struct task_struct *tsk)
10382{ 10381{
10383#ifdef CONFIG_RT_GROUP_SCHED 10382#ifdef CONFIG_RT_GROUP_SCHED
10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10383 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10388,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10388 if (tsk->sched_class != &fair_sched_class) 10387 if (tsk->sched_class != &fair_sched_class)
10389 return -EINVAL; 10388 return -EINVAL;
10390#endif 10389#endif
10390 return 0;
10391}
10391 10392
10393static int
10394cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10395 struct task_struct *tsk, bool threadgroup)
10396{
10397 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10398 if (retval)
10399 return retval;
10400 if (threadgroup) {
10401 struct task_struct *c;
10402 rcu_read_lock();
10403 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10404 retval = cpu_cgroup_can_attach_task(cgrp, c);
10405 if (retval) {
10406 rcu_read_unlock();
10407 return retval;
10408 }
10409 }
10410 rcu_read_unlock();
10411 }
10392 return 0; 10412 return 0;
10393} 10413}
10394 10414
10395static void 10415static void
10396cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10416cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10397 struct cgroup *old_cont, struct task_struct *tsk) 10417 struct cgroup *old_cont, struct task_struct *tsk,
10418 bool threadgroup)
10398{ 10419{
10399 sched_move_task(tsk); 10420 sched_move_task(tsk);
10421 if (threadgroup) {
10422 struct task_struct *c;
10423 rcu_read_lock();
10424 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10425 sched_move_task(c);
10426 }
10427 rcu_read_unlock();
10428 }
10400} 10429}
10401 10430
10402#ifdef CONFIG_FAIR_GROUP_SCHED 10431#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d591..4e777b47eeda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
384 384
385#ifdef CONFIG_SCHED_DEBUG 385#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 386int sched_nr_latency_handler(struct ctl_table *table, int write,
387 struct file *filp, void __user *buffer, size_t *lenp, 387 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 388 loff_t *ppos)
389{ 389{
390 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
391 391
392 if (ret || !write) 392 if (ret || !write)
393 return ret; 393 return ret;
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..6705320784fd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 705
706 if (why) { 706 if (why) {
707 /* 707 /*
708 * The first thread which returns from finish_stop() 708 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 709 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 710 * notify its parent. See get_signal_to_deliver().
711 */ 711 */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 971 return send_signal(sig, info, t, 0);
972} 972}
973 973
974int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
975 bool group)
976{
977 unsigned long flags;
978 int ret = -ESRCH;
979
980 if (lock_task_sighand(p, &flags)) {
981 ret = send_signal(sig, info, p, group);
982 unlock_task_sighand(p, &flags);
983 }
984
985 return ret;
986}
987
974/* 988/*
975 * Force a signal that the process can't ignore: if necessary 989 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 990 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1050 }
1037} 1051}
1038 1052
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1053struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1054{
1047 struct sighand_struct *sighand; 1055 struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1076 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1077int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1078{
1071 unsigned long flags; 1079 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073 1080
1074 ret = check_kill_permission(sig, info, p); 1081 if (!ret && sig)
1075 1082 ret = do_send_sig_info(sig, info, p, true);
1076 if (!ret && sig) {
1077 ret = -ESRCH;
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1083
1084 return ret; 1084 return ret;
1085} 1085}
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1224 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1225 */
1226 1226
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1227int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1228send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1229{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1230 /*
1237 * Make sure legacy kernel users don't send in bad values 1231 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1232 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1234 if (!valid_signal(sig))
1241 return -EINVAL; 1235 return -EINVAL;
1242 1236
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1237 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1238}
1248 1239
1249#define __si_special(priv) \ 1240#define __si_special(priv) \
@@ -1383,15 +1374,6 @@ ret:
1383} 1374}
1384 1375
1385/* 1376/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1377 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1378 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1379 *
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1655 spin_unlock_irq(&current->sighand->siglock);
1674} 1656}
1675 1657
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1658/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1659 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1660 * We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1664static int do_signal_stop(int signr)
1706{ 1665{
1707 struct signal_struct *sig = current->signal; 1666 struct signal_struct *sig = current->signal;
1708 int stop_count; 1667 int notify;
1709 1668
1710 if (sig->group_stop_count > 0) { 1669 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1670 struct task_struct *t;
1718 1671
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1672 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
1725 */ 1678 */
1726 sig->group_exit_code = signr; 1679 sig->group_exit_code = signr;
1727 1680
1728 stop_count = 0; 1681 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1682 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1683 /*
1731 * Setting state to TASK_STOPPED for a group 1684 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
1734 */ 1687 */
1735 if (!(t->flags & PF_EXITING) && 1688 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1689 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1690 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1691 signal_wake_up(t, 0);
1739 } 1692 }
1740 sig->group_stop_count = stop_count;
1741 } 1693 }
1694 /*
1695 * If there are no other threads in the group, or if there is
1696 * a group stop in progress and we are the last to stop, report
1697 * to the parent. When ptraced, every thread reports itself.
1698 */
1699 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1700 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1701 /*
1702 * tracehook_notify_jctl() can drop and reacquire siglock, so
1703 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1704 * or SIGKILL comes in between ->group_stop_count == 0.
1705 */
1706 if (sig->group_stop_count) {
1707 if (!--sig->group_stop_count)
1708 sig->flags = SIGNAL_STOP_STOPPED;
1709 current->exit_code = sig->group_exit_code;
1710 __set_current_state(TASK_STOPPED);
1711 }
1712 spin_unlock_irq(&current->sighand->siglock);
1742 1713
1743 if (stop_count == 0) 1714 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1715 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1716 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1717 read_unlock(&tasklist_lock);
1718 }
1719
1720 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1721 do {
1722 schedule();
1723 } while (try_to_freeze());
1724
1725 tracehook_finish_jctl();
1726 current->exit_code = 0;
1747 1727
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1728 return 1;
1751} 1729}
1752 1730
@@ -1815,14 +1793,15 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1793 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1794 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1795 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1796
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1797 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1798 spin_unlock_irq(&sighand->siglock);
1822 1799
1823 read_lock(&tasklist_lock); 1800 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1801 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1802 do_notify_parent_cldstop(current->group_leader, why);
1803 read_unlock(&tasklist_lock);
1804 }
1826 goto relock; 1805 goto relock;
1827 } 1806 }
1828 1807
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 1966 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 1967 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 1968 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 1969 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 1970 }
1992out: 1971out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 1972 spin_unlock_irq(&tsk->sighand->siglock);
1994 1973
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 1974 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 1975 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1976 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 1977 read_unlock(&tasklist_lock);
1999 } 1978 }
2000} 1979}
@@ -2290,7 +2269,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2269do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2270{
2292 struct task_struct *p; 2271 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2272 int error = -ESRCH;
2295 2273
2296 rcu_read_lock(); 2274 rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2278 /*
2301 * The null signal is a permissions and process existence 2279 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2280 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2281 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2282 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2283 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2284 /*
2285 * If lock_task_sighand() failed we pretend the task
2286 * dies after receiving the signal. The window is tiny,
2287 * and the signal is private anyway.
2288 */
2289 if (unlikely(error == -ESRCH))
2290 error = 0;
2311 } 2291 }
2312 } 2292 }
2313 rcu_read_unlock(); 2293 rcu_read_unlock();
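
The signal.c changes above funnel the send paths through do_send_sig_info(), which takes the sighand lock itself via lock_task_sighand(), so the old "The caller must ensure the task can't exit" requirement on send_sig_info() disappears. A minimal sketch of a caller under the new rules, assuming kernel context; the helper name is made up for illustration:

#include <linux/errno.h>
#include <linux/pid.h>
#include <linux/sched.h>

/*
 * Sketch only: send SIGUSR1 to the task with the given virtual pid.
 * RCU protection of the lookup is enough; send_sig_info() no longer
 * requires the caller to pin the task against exit.
 */
static int sketch_send_sigusr1(pid_t nr)
{
	struct task_struct *p;
	int ret = -ESRCH;

	rcu_read_lock();
	p = pid_task(find_vpid(nr), PIDTYPE_PID);
	if (p)
		ret = send_sig_info(SIGUSR1, SEND_SIG_PRIV, p);
	rcu_read_unlock();

	return ret;
}
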
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..0d31135efbf4 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 26static void slow_work_oom_timeout(unsigned long);
27 27
28#ifdef CONFIG_SYSCTL 28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 29static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
31 31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 32static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 33 void __user *, size_t *, loff_t *);
34#endif 34#endif
35 35
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 493 * Handle adjustment of the minimum number of threads
494 */ 494 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 496 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 497 size_t *lenp, loff_t *ppos)
498{ 498{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 499 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 500 int n;
501 501
502 if (ret == 0) { 502 if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 521 * Handle adjustment of the maximum number of threads
522 */ 522 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 524 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 525 size_t *lenp, loff_t *ppos)
526{ 526{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 527 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 528 int n;
529 529
530 if (ret == 0) { 530 if (ret == 0) {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
diff --git a/kernel/sys.c b/kernel/sys.c
index ebcb15611728..255475d163e0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1542,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1542 current->timer_slack_ns = arg2; 1542 current->timer_slack_ns = arg2;
1543 error = 0; 1543 error = 0;
1544 break; 1544 break;
1545 case PR_MCE_KILL:
1546 if (arg4 | arg5)
1547 return -EINVAL;
1548 switch (arg2) {
1549 case 0:
1550 if (arg3 != 0)
1551 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS;
1553 break;
1554 case 1:
1555 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0)
1557 current->flags |= PF_MCE_EARLY;
1558 else
1559 current->flags &= ~PF_MCE_EARLY;
1560 break;
1561 default:
1562 return -EINVAL;
1563 }
1564 error = 0;
1565 break;
1566
1545 default: 1567 default:
1546 error = -EINVAL; 1568 error = -EINVAL;
1547 break; 1569 break;
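
The PR_MCE_KILL prctl added above selects the per-process machine-check kill policy: arg2 == 0 clears PF_MCE_PROCESS (back to the system default), arg2 == 1 sets it, and arg3 then chooses early kill (non-zero) or late kill (zero). A user-space sketch, assuming a kernel with this patch and that PR_MCE_KILL is defined in <linux/prctl.h>:

/* Sketch: opt this process in to early kill on memory failure. */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33	/* assumed value; check <linux/prctl.h> */
#endif

int main(void)
{
	/* arg2 = 1: set per-process policy, arg3 = 1: early kill */
	if (prctl(PR_MCE_KILL, 1, 1, 0, 0) < 0) {
		perror("prctl(PR_MCE_KILL)");
		return 1;
	}
	return 0;
}
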
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7f4f57bea4ce..0d949c517412 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,7 @@ extern int max_threads;
76extern int core_uses_pid; 76extern int core_uses_pid;
77extern int suid_dumpable; 77extern int suid_dumpable;
78extern char core_pattern[]; 78extern char core_pattern[];
79extern unsigned int core_pipe_limit;
79extern int pid_max; 80extern int pid_max;
80extern int min_free_kbytes; 81extern int min_free_kbytes;
81extern int pid_max_min, pid_max_max; 82extern int pid_max_min, pid_max_max;
@@ -162,9 +163,9 @@ extern int max_lock_depth;
162#endif 163#endif
163 164
164#ifdef CONFIG_PROC_SYSCTL 165#ifdef CONFIG_PROC_SYSCTL
165static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 166static int proc_do_cad_pid(struct ctl_table *table, int write,
166 void __user *buffer, size_t *lenp, loff_t *ppos); 167 void __user *buffer, size_t *lenp, loff_t *ppos);
167static int proc_taint(struct ctl_table *table, int write, struct file *filp, 168static int proc_taint(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 169 void __user *buffer, size_t *lenp, loff_t *ppos);
169#endif 170#endif
170 171
@@ -423,6 +424,14 @@ static struct ctl_table kern_table[] = {
423 .proc_handler = &proc_dostring, 424 .proc_handler = &proc_dostring,
424 .strategy = &sysctl_string, 425 .strategy = &sysctl_string,
425 }, 426 },
427 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int),
432 .mode = 0644,
433 .proc_handler = &proc_dointvec,
434 },
426#ifdef CONFIG_PROC_SYSCTL 435#ifdef CONFIG_PROC_SYSCTL
427 { 436 {
428 .procname = "tainted", 437 .procname = "tainted",
@@ -1389,6 +1398,31 @@ static struct ctl_table vm_table[] = {
1389 .mode = 0644, 1398 .mode = 0644,
1390 .proc_handler = &scan_unevictable_handler, 1399 .proc_handler = &scan_unevictable_handler,
1391 }, 1400 },
1401#ifdef CONFIG_MEMORY_FAILURE
1402 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero,
1411 .extra2 = &one,
1412 },
1413 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero,
1422 .extra2 = &one,
1423 },
1424#endif
1425
1392/* 1426/*
1393 * NOTE: do not add new entries to this table unless you have read 1427 * NOTE: do not add new entries to this table unless you have read
1394 * Documentation/sysctl/ctl_unnumbered.txt 1428 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2217,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2217#ifdef CONFIG_PROC_SYSCTL 2251#ifdef CONFIG_PROC_SYSCTL
2218 2252
2219static int _proc_do_string(void* data, int maxlen, int write, 2253static int _proc_do_string(void* data, int maxlen, int write,
2220 struct file *filp, void __user *buffer, 2254 void __user *buffer,
2221 size_t *lenp, loff_t *ppos) 2255 size_t *lenp, loff_t *ppos)
2222{ 2256{
2223 size_t len; 2257 size_t len;
@@ -2278,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2278 * proc_dostring - read a string sysctl 2312 * proc_dostring - read a string sysctl
2279 * @table: the sysctl table 2313 * @table: the sysctl table
2280 * @write: %TRUE if this is a write to the sysctl file 2314 * @write: %TRUE if this is a write to the sysctl file
2281 * @filp: the file structure
2282 * @buffer: the user buffer 2315 * @buffer: the user buffer
2283 * @lenp: the size of the user buffer 2316 * @lenp: the size of the user buffer
2284 * @ppos: file position 2317 * @ppos: file position
@@ -2292,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2292 * 2325 *
2293 * Returns 0 on success. 2326 * Returns 0 on success.
2294 */ 2327 */
2295int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2328int proc_dostring(struct ctl_table *table, int write,
2296 void __user *buffer, size_t *lenp, loff_t *ppos) 2329 void __user *buffer, size_t *lenp, loff_t *ppos)
2297{ 2330{
2298 return _proc_do_string(table->data, table->maxlen, write, filp, 2331 return _proc_do_string(table->data, table->maxlen, write,
2299 buffer, lenp, ppos); 2332 buffer, lenp, ppos);
2300} 2333}
2301 2334
@@ -2320,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2320} 2353}
2321 2354
2322static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2355static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2323 int write, struct file *filp, void __user *buffer, 2356 int write, void __user *buffer,
2324 size_t *lenp, loff_t *ppos, 2357 size_t *lenp, loff_t *ppos,
2325 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2358 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2326 int write, void *data), 2359 int write, void *data),
@@ -2427,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2427#undef TMPBUFLEN 2460#undef TMPBUFLEN
2428} 2461}
2429 2462
2430static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2463static int do_proc_dointvec(struct ctl_table *table, int write,
2431 void __user *buffer, size_t *lenp, loff_t *ppos, 2464 void __user *buffer, size_t *lenp, loff_t *ppos,
2432 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2465 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2433 int write, void *data), 2466 int write, void *data),
2434 void *data) 2467 void *data)
2435{ 2468{
2436 return __do_proc_dointvec(table->data, table, write, filp, 2469 return __do_proc_dointvec(table->data, table, write,
2437 buffer, lenp, ppos, conv, data); 2470 buffer, lenp, ppos, conv, data);
2438} 2471}
2439 2472
@@ -2441,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2441 * proc_dointvec - read a vector of integers 2474 * proc_dointvec - read a vector of integers
2442 * @table: the sysctl table 2475 * @table: the sysctl table
2443 * @write: %TRUE if this is a write to the sysctl file 2476 * @write: %TRUE if this is a write to the sysctl file
2444 * @filp: the file structure
2445 * @buffer: the user buffer 2477 * @buffer: the user buffer
2446 * @lenp: the size of the user buffer 2478 * @lenp: the size of the user buffer
2447 * @ppos: file position 2479 * @ppos: file position
@@ -2451,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2451 * 2483 *
2452 * Returns 0 on success. 2484 * Returns 0 on success.
2453 */ 2485 */
2454int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2486int proc_dointvec(struct ctl_table *table, int write,
2455 void __user *buffer, size_t *lenp, loff_t *ppos) 2487 void __user *buffer, size_t *lenp, loff_t *ppos)
2456{ 2488{
2457 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2489 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2458 NULL,NULL); 2490 NULL,NULL);
2459} 2491}
2460 2492
@@ -2462,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2462 * Taint values can only be increased 2494 * Taint values can only be increased
2463 * This means we can safely use a temporary. 2495 * This means we can safely use a temporary.
2464 */ 2496 */
2465static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2497static int proc_taint(struct ctl_table *table, int write,
2466 void __user *buffer, size_t *lenp, loff_t *ppos) 2498 void __user *buffer, size_t *lenp, loff_t *ppos)
2467{ 2499{
2468 struct ctl_table t; 2500 struct ctl_table t;
@@ -2474,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2474 2506
2475 t = *table; 2507 t = *table;
2476 t.data = &tmptaint; 2508 t.data = &tmptaint;
2477 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2509 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2478 if (err < 0) 2510 if (err < 0)
2479 return err; 2511 return err;
2480 2512
@@ -2526,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2526 * proc_dointvec_minmax - read a vector of integers with min/max values 2558 * proc_dointvec_minmax - read a vector of integers with min/max values
2527 * @table: the sysctl table 2559 * @table: the sysctl table
2528 * @write: %TRUE if this is a write to the sysctl file 2560 * @write: %TRUE if this is a write to the sysctl file
2529 * @filp: the file structure
2530 * @buffer: the user buffer 2561 * @buffer: the user buffer
2531 * @lenp: the size of the user buffer 2562 * @lenp: the size of the user buffer
2532 * @ppos: file position 2563 * @ppos: file position
@@ -2539,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2539 * 2570 *
2540 * Returns 0 on success. 2571 * Returns 0 on success.
2541 */ 2572 */
2542int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2573int proc_dointvec_minmax(struct ctl_table *table, int write,
2543 void __user *buffer, size_t *lenp, loff_t *ppos) 2574 void __user *buffer, size_t *lenp, loff_t *ppos)
2544{ 2575{
2545 struct do_proc_dointvec_minmax_conv_param param = { 2576 struct do_proc_dointvec_minmax_conv_param param = {
2546 .min = (int *) table->extra1, 2577 .min = (int *) table->extra1,
2547 .max = (int *) table->extra2, 2578 .max = (int *) table->extra2,
2548 }; 2579 };
2549 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2580 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2550 do_proc_dointvec_minmax_conv, &param); 2581 do_proc_dointvec_minmax_conv, &param);
2551} 2582}
2552 2583
2553static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2584static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2554 struct file *filp,
2555 void __user *buffer, 2585 void __user *buffer,
2556 size_t *lenp, loff_t *ppos, 2586 size_t *lenp, loff_t *ppos,
2557 unsigned long convmul, 2587 unsigned long convmul,
@@ -2656,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2656} 2686}
2657 2687
2658static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2688static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2659 struct file *filp,
2660 void __user *buffer, 2689 void __user *buffer,
2661 size_t *lenp, loff_t *ppos, 2690 size_t *lenp, loff_t *ppos,
2662 unsigned long convmul, 2691 unsigned long convmul,
2663 unsigned long convdiv) 2692 unsigned long convdiv)
2664{ 2693{
2665 return __do_proc_doulongvec_minmax(table->data, table, write, 2694 return __do_proc_doulongvec_minmax(table->data, table, write,
2666 filp, buffer, lenp, ppos, convmul, convdiv); 2695 buffer, lenp, ppos, convmul, convdiv);
2667} 2696}
2668 2697
2669/** 2698/**
2670 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2699 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2671 * @table: the sysctl table 2700 * @table: the sysctl table
2672 * @write: %TRUE if this is a write to the sysctl file 2701 * @write: %TRUE if this is a write to the sysctl file
2673 * @filp: the file structure
2674 * @buffer: the user buffer 2702 * @buffer: the user buffer
2675 * @lenp: the size of the user buffer 2703 * @lenp: the size of the user buffer
2676 * @ppos: file position 2704 * @ppos: file position
@@ -2683,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2683 * 2711 *
2684 * Returns 0 on success. 2712 * Returns 0 on success.
2685 */ 2713 */
2686int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2714int proc_doulongvec_minmax(struct ctl_table *table, int write,
2687 void __user *buffer, size_t *lenp, loff_t *ppos) 2715 void __user *buffer, size_t *lenp, loff_t *ppos)
2688{ 2716{
2689 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2717 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2690} 2718}
2691 2719
2692/** 2720/**
2693 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2721 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2694 * @table: the sysctl table 2722 * @table: the sysctl table
2695 * @write: %TRUE if this is a write to the sysctl file 2723 * @write: %TRUE if this is a write to the sysctl file
2696 * @filp: the file structure
2697 * @buffer: the user buffer 2724 * @buffer: the user buffer
2698 * @lenp: the size of the user buffer 2725 * @lenp: the size of the user buffer
2699 * @ppos: file position 2726 * @ppos: file position
@@ -2708,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2708 * Returns 0 on success. 2735 * Returns 0 on success.
2709 */ 2736 */
2710int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2737int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2711 struct file *filp,
2712 void __user *buffer, 2738 void __user *buffer,
2713 size_t *lenp, loff_t *ppos) 2739 size_t *lenp, loff_t *ppos)
2714{ 2740{
2715 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2741 return do_proc_doulongvec_minmax(table, write, buffer,
2716 lenp, ppos, HZ, 1000l); 2742 lenp, ppos, HZ, 1000l);
2717} 2743}
2718 2744
@@ -2788,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2788 * proc_dointvec_jiffies - read a vector of integers as seconds 2814 * proc_dointvec_jiffies - read a vector of integers as seconds
2789 * @table: the sysctl table 2815 * @table: the sysctl table
2790 * @write: %TRUE if this is a write to the sysctl file 2816 * @write: %TRUE if this is a write to the sysctl file
2791 * @filp: the file structure
2792 * @buffer: the user buffer 2817 * @buffer: the user buffer
2793 * @lenp: the size of the user buffer 2818 * @lenp: the size of the user buffer
2794 * @ppos: file position 2819 * @ppos: file position
@@ -2800,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2800 * 2825 *
2801 * Returns 0 on success. 2826 * Returns 0 on success.
2802 */ 2827 */
2803int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2828int proc_dointvec_jiffies(struct ctl_table *table, int write,
2804 void __user *buffer, size_t *lenp, loff_t *ppos) 2829 void __user *buffer, size_t *lenp, loff_t *ppos)
2805{ 2830{
2806 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2831 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2807 do_proc_dointvec_jiffies_conv,NULL); 2832 do_proc_dointvec_jiffies_conv,NULL);
2808} 2833}
2809 2834
@@ -2811,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2811 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2836 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2812 * @table: the sysctl table 2837 * @table: the sysctl table
2813 * @write: %TRUE if this is a write to the sysctl file 2838 * @write: %TRUE if this is a write to the sysctl file
2814 * @filp: the file structure
2815 * @buffer: the user buffer 2839 * @buffer: the user buffer
2816 * @lenp: the size of the user buffer 2840 * @lenp: the size of the user buffer
2817 * @ppos: pointer to the file position 2841 * @ppos: pointer to the file position
@@ -2823,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2823 * 2847 *
2824 * Returns 0 on success. 2848 * Returns 0 on success.
2825 */ 2849 */
2826int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2850int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2827 void __user *buffer, size_t *lenp, loff_t *ppos) 2851 void __user *buffer, size_t *lenp, loff_t *ppos)
2828{ 2852{
2829 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2853 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2830 do_proc_dointvec_userhz_jiffies_conv,NULL); 2854 do_proc_dointvec_userhz_jiffies_conv,NULL);
2831} 2855}
2832 2856
@@ -2834,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2834 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2858 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2835 * @table: the sysctl table 2859 * @table: the sysctl table
2836 * @write: %TRUE if this is a write to the sysctl file 2860 * @write: %TRUE if this is a write to the sysctl file
2837 * @filp: the file structure
2838 * @buffer: the user buffer 2861 * @buffer: the user buffer
2839 * @lenp: the size of the user buffer 2862 * @lenp: the size of the user buffer
2840 * @ppos: file position 2863 * @ppos: file position
@@ -2847,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2847 * 2870 *
2848 * Returns 0 on success. 2871 * Returns 0 on success.
2849 */ 2872 */
2850int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2873int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2851 void __user *buffer, size_t *lenp, loff_t *ppos) 2874 void __user *buffer, size_t *lenp, loff_t *ppos)
2852{ 2875{
2853 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2876 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2854 do_proc_dointvec_ms_jiffies_conv, NULL); 2877 do_proc_dointvec_ms_jiffies_conv, NULL);
2855} 2878}
2856 2879
2857static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2880static int proc_do_cad_pid(struct ctl_table *table, int write,
2858 void __user *buffer, size_t *lenp, loff_t *ppos) 2881 void __user *buffer, size_t *lenp, loff_t *ppos)
2859{ 2882{
2860 struct pid *new_pid; 2883 struct pid *new_pid;
@@ -2863,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2863 2886
2864 tmp = pid_vnr(cad_pid); 2887 tmp = pid_vnr(cad_pid);
2865 2888
2866 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2889 r = __do_proc_dointvec(&tmp, table, write, buffer,
2867 lenp, ppos, NULL, NULL); 2890 lenp, ppos, NULL, NULL);
2868 if (r || !write) 2891 if (r || !write)
2869 return r; 2892 return r;
@@ -2878,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2878 2901
2879#else /* CONFIG_PROC_FS */ 2902#else /* CONFIG_PROC_FS */
2880 2903
2881int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2904int proc_dostring(struct ctl_table *table, int write,
2882 void __user *buffer, size_t *lenp, loff_t *ppos) 2905 void __user *buffer, size_t *lenp, loff_t *ppos)
2883{ 2906{
2884 return -ENOSYS; 2907 return -ENOSYS;
2885} 2908}
2886 2909
2887int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2910int proc_dointvec(struct ctl_table *table, int write,
2888 void __user *buffer, size_t *lenp, loff_t *ppos) 2911 void __user *buffer, size_t *lenp, loff_t *ppos)
2889{ 2912{
2890 return -ENOSYS; 2913 return -ENOSYS;
2891} 2914}
2892 2915
2893int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2916int proc_dointvec_minmax(struct ctl_table *table, int write,
2894 void __user *buffer, size_t *lenp, loff_t *ppos) 2917 void __user *buffer, size_t *lenp, loff_t *ppos)
2895{ 2918{
2896 return -ENOSYS; 2919 return -ENOSYS;
2897} 2920}
2898 2921
2899int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2922int proc_dointvec_jiffies(struct ctl_table *table, int write,
2900 void __user *buffer, size_t *lenp, loff_t *ppos) 2923 void __user *buffer, size_t *lenp, loff_t *ppos)
2901{ 2924{
2902 return -ENOSYS; 2925 return -ENOSYS;
2903} 2926}
2904 2927
2905int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2928int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2906 void __user *buffer, size_t *lenp, loff_t *ppos) 2929 void __user *buffer, size_t *lenp, loff_t *ppos)
2907{ 2930{
2908 return -ENOSYS; 2931 return -ENOSYS;
2909} 2932}
2910 2933
2911int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2934int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2912 void __user *buffer, size_t *lenp, loff_t *ppos) 2935 void __user *buffer, size_t *lenp, loff_t *ppos)
2913{ 2936{
2914 return -ENOSYS; 2937 return -ENOSYS;
2915} 2938}
2916 2939
2917int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2940int proc_doulongvec_minmax(struct ctl_table *table, int write,
2918 void __user *buffer, size_t *lenp, loff_t *ppos) 2941 void __user *buffer, size_t *lenp, loff_t *ppos)
2919{ 2942{
2920 return -ENOSYS; 2943 return -ENOSYS;
2921} 2944}
2922 2945
2923int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2946int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2924 struct file *filp,
2925 void __user *buffer, 2947 void __user *buffer,
2926 size_t *lenp, loff_t *ppos) 2948 size_t *lenp, loff_t *ppos)
2927{ 2949{
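
The common thread in the sysctl changes here (and in the slow-work, softlockup, hugetlb, ftrace and trace_stack hunks) is that proc handlers drop their struct file * argument: every proc_do*() helper now takes (table, write, buffer, lenp, ppos). A sketch of a custom handler written against the new signature; the handler itself is made up for illustration:

#include <linux/kernel.h>
#include <linux/sysctl.h>

/* Sketch: a custom proc handler using the new five-argument form. */
static int example_dointvec(struct ctl_table *table, int write,
			    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret == 0 && write)
		pr_info("example sysctl updated\n");	/* react to the new value */
	return ret;
}
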
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Nonzero if YEAR is a leap year (every 4 years,
35 * except every 100th isn't, and every 400th is).
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
42/* do a mathdiv for long type */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
48/* How many leap years between y1 and y2; y1 must be less than or equal to y2 */
49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* How many days come before each month (0-12). */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time to local broken-down time
71 *
72 * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC).
74 * @offset: offset seconds to add to totalsecs.
75 * @result: pointer to the struct tm variable that receives the broken-down time
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday. */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
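
time_to_tm() gives kernel code a broken-down calendar view of a time_t without help from user space. A short sketch of a caller, assuming get_seconds() for the current time and an offset of 0 for UTC:

#include <linux/kernel.h>
#include <linux/time.h>

/* Sketch: log the current UTC date using the new time_to_tm() helper. */
static void example_log_date(void)
{
	struct tm tm;

	time_to_tm(get_seconds(), 0, &tm);
	printk(KERN_INFO "today is %04ld-%02d-%02d\n",
	       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday);
}
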
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 23df7771c937..a142579765bf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3015,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3015 3015
3016int 3016int
3017ftrace_enable_sysctl(struct ctl_table *table, int write, 3017ftrace_enable_sysctl(struct ctl_table *table, int write,
3018 struct file *file, void __user *buffer, size_t *lenp, 3018 void __user *buffer, size_t *lenp,
3019 loff_t *ppos) 3019 loff_t *ppos)
3020{ 3020{
3021 int ret; 3021 int ret;
@@ -3025,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3025 3025
3026 mutex_lock(&ftrace_lock); 3026 mutex_lock(&ftrace_lock);
3027 3027
3028 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3028 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3029 3029
3030 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3030 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3031 goto out; 3031 goto out;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
296 296
297int 297int
298stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
299 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
300 loff_t *ppos) 300 loff_t *ppos)
301{ 301{
302 int ret; 302 int ret;
303 303
304 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
305 305
306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
307 307
308 if (ret || !write || 308 if (ret || !write ||
309 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 45static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{ 47{
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55} 55}
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index 68dfce59c1b8..fc686c7a0a0d 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -27,6 +27,11 @@
27 27
28#define GZIP_IOBUF_SIZE (16*1024) 28#define GZIP_IOBUF_SIZE (16*1024)
29 29
30static int nofill(void *buffer, unsigned int len)
31{
32 return -1;
33}
34
30/* Included from initramfs et al code */ 35/* Included from initramfs et al code */
31STATIC int INIT gunzip(unsigned char *buf, int len, 36STATIC int INIT gunzip(unsigned char *buf, int len,
32 int(*fill)(void*, unsigned int), 37 int(*fill)(void*, unsigned int),
@@ -76,6 +81,9 @@ STATIC int INIT gunzip(unsigned char *buf, int len,
76 goto gunzip_nomem4; 81 goto gunzip_nomem4;
77 } 82 }
78 83
84 if (!fill)
85 fill = nofill;
86
79 if (len == 0) 87 if (len == 0)
80 len = fill(zbuf, GZIP_IOBUF_SIZE); 88 len = fill(zbuf, GZIP_IOBUF_SIZE);
81 89
diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c
index 0b954e04bd30..ca82fde81c8f 100644
--- a/lib/decompress_unlzma.c
+++ b/lib/decompress_unlzma.c
@@ -82,6 +82,11 @@ struct rc {
82#define RC_MODEL_TOTAL_BITS 11 82#define RC_MODEL_TOTAL_BITS 11
83 83
84 84
85static int nofill(void *buffer, unsigned int len)
86{
87 return -1;
88}
89
85/* Called twice: once at startup and once in rc_normalize() */ 90/* Called twice: once at startup and once in rc_normalize() */
86static void INIT rc_read(struct rc *rc) 91static void INIT rc_read(struct rc *rc)
87{ 92{
@@ -97,7 +102,10 @@ static inline void INIT rc_init(struct rc *rc,
97 int (*fill)(void*, unsigned int), 102 int (*fill)(void*, unsigned int),
98 char *buffer, int buffer_size) 103 char *buffer, int buffer_size)
99{ 104{
100 rc->fill = fill; 105 if (fill)
106 rc->fill = fill;
107 else
108 rc->fill = nofill;
101 rc->buffer = (uint8_t *)buffer; 109 rc->buffer = (uint8_t *)buffer;
102 rc->buffer_size = buffer_size; 110 rc->buffer_size = buffer_size;
103 rc->buffer_end = rc->buffer + rc->buffer_size; 111 rc->buffer_end = rc->buffer + rc->buffer_size;
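
Both decompressors gain a nofill() fallback, so a caller whose compressed image is already entirely in memory can pass fill == NULL instead of supplying a dummy callback. A sketch, assuming the decompress_fn convention from <linux/decompress/generic.h>:

#include <linux/decompress/generic.h>
#include <linux/errno.h>
#include <linux/kernel.h>

static void example_error(char *msg)
{
	printk(KERN_ERR "decompress: %s\n", msg);
}

/*
 * Sketch: decompress an image that is entirely in memory.  With the
 * nofill() fallback above, fill may simply be NULL; flush == NULL
 * means the result is written straight into out_buf.
 */
static int example_decompress(unsigned char *image, int image_len,
			      unsigned char *out_buf)
{
	const char *name;
	decompress_fn decomp = decompress_method(image, image_len, &name);

	if (!decomp)
		return -EINVAL;
	return decomp(image, image_len, NULL, NULL, out_buf, NULL, example_error);
}
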
diff --git a/mm/Kconfig b/mm/Kconfig
index 71eb0b4cce8d..247760729593 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR
245 /proc/sys/vm/mmap_min_addr tunable. 245 /proc/sys/vm/mmap_min_addr tunable.
246 246
247 247
248config MEMORY_FAILURE
249 depends on MMU
250 depends on X86_MCE
251 bool "Enable recovery from hardware memory errors"
252 help
253 Enables code to recover from some memory failures on systems
254 with MCA recovery. This allows a system to continue running
255 even when some of its memory has uncorrected errors. This requires
256 special hardware support and typically ECC memory.
257
258config HWPOISON_INJECT
259 tristate "Poison pages injector"
260 depends on MEMORY_FAILURE && DEBUG_KERNEL
261
248config NOMMU_INITIAL_TRIM_EXCESS 262config NOMMU_INITIAL_TRIM_EXCESS
249 int "Turn on mmap() excess space trimming before booting" 263 int "Turn on mmap() excess space trimming before booting"
250 depends on !MMU 264 depends on !MMU
diff --git a/mm/Makefile b/mm/Makefile
index 88193d73cd1a..515fd793c17f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o
41endif 41endif
42obj-$(CONFIG_QUICKLIST) += quicklist.o 42obj-$(CONFIG_QUICKLIST) += quicklist.o
43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
45obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
44obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 46obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
45obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 47obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/filemap.c b/mm/filemap.c
index bcc7372aebbc..c1fc205a92c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -104,6 +104,10 @@
104 * 104 *
105 * ->task->proc_lock 105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
107 */ 111 */
108 112
109/* 113/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 815dbd4a6dcb..6f048fcc749c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1537 1537
1538#ifdef CONFIG_SYSCTL 1538#ifdef CONFIG_SYSCTL
1539int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1539int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1540 struct file *file, void __user *buffer, 1540 void __user *buffer,
1541 size_t *length, loff_t *ppos) 1541 size_t *length, loff_t *ppos)
1542{ 1542{
1543 struct hstate *h = &default_hstate; 1543 struct hstate *h = &default_hstate;
@@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1548 1548
1549 table->data = &tmp; 1549 table->data = &tmp;
1550 table->maxlen = sizeof(unsigned long); 1550 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1551 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1552
1553 if (write) 1553 if (write)
1554 h->max_huge_pages = set_max_huge_pages(h, tmp); 1554 h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1557} 1557}
1558 1558
1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1560 struct file *file, void __user *buffer, 1560 void __user *buffer,
1561 size_t *length, loff_t *ppos) 1561 size_t *length, loff_t *ppos)
1562{ 1562{
1563 proc_dointvec(table, write, file, buffer, length, ppos); 1563 proc_dointvec(table, write, buffer, length, ppos);
1564 if (hugepages_treat_as_movable) 1564 if (hugepages_treat_as_movable)
1565 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1565 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1566 else 1566 else
@@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1569} 1569}
1570 1570
1571int hugetlb_overcommit_handler(struct ctl_table *table, int write, 1571int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1572 struct file *file, void __user *buffer, 1572 void __user *buffer,
1573 size_t *length, loff_t *ppos) 1573 size_t *length, loff_t *ppos)
1574{ 1574{
1575 struct hstate *h = &default_hstate; 1575 struct hstate *h = &default_hstate;
@@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1580 1580
1581 table->data = &tmp; 1581 table->data = &tmp;
1582 table->maxlen = sizeof(unsigned long); 1582 table->maxlen = sizeof(unsigned long);
1583 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1583 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1584 1584
1585 if (write) { 1585 if (write) {
1586 spin_lock(&hugetlb_lock); 1586 spin_lock(&hugetlb_lock);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
1/* Inject a hwpoison memory failure on an arbitrary pfn */
2#include <linux/module.h>
3#include <linux/debugfs.h>
4#include <linux/kernel.h>
5#include <linux/mm.h>
6
7static struct dentry *hwpoison_dir, *corrupt_pfn;
8
9static int hwpoison_inject(void *data, u64 val)
10{
11 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14 return __memory_failure(val, 18, 0);
15}
16
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
18
19static void pfn_inject_exit(void)
20{
21 if (hwpoison_dir)
22 debugfs_remove_recursive(hwpoison_dir);
23}
24
25static int pfn_inject_init(void)
26{
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL)
29 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) {
33 pfn_inject_exit();
34 return -ENOMEM;
35 }
36 return 0;
37}
38
39module_init(pfn_inject_init);
40module_exit(pfn_inject_exit);
41MODULE_LICENSE("GPL");
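
The hwpoison-inject module exposes one debugfs attribute; writing a page frame number to it calls __memory_failure() on that pfn. A user-space sketch, assuming debugfs is mounted at /sys/kernel/debug and the caller has CAP_SYS_ADMIN:

/* Sketch: inject a memory failure at a given pfn via debugfs. */
#include <stdio.h>

static int inject_pfn(unsigned long long pfn)
{
	FILE *f = fopen("/sys/kernel/debug/hwpoison/corrupt-pfn", "w");

	if (!f)
		return -1;
	fprintf(f, "%llu\n", pfn);	/* the attribute parses a u64 */
	return fclose(f);
}
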
diff --git a/mm/ksm.c b/mm/ksm.c
index 37cc37325094..f7edac356f46 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -30,6 +30,7 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rbtree.h> 31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h> 32#include <linux/mmu_notifier.h>
33#include <linux/swap.h>
33#include <linux/ksm.h> 34#include <linux/ksm.h>
34 35
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
@@ -162,10 +163,10 @@ static unsigned long ksm_pages_unshared;
162static unsigned long ksm_rmap_items; 163static unsigned long ksm_rmap_items;
163 164
164/* Limit on the number of unswappable pages used */ 165/* Limit on the number of unswappable pages used */
165static unsigned long ksm_max_kernel_pages = 2000; 166static unsigned long ksm_max_kernel_pages;
166 167
167/* Number of pages ksmd should scan in one batch */ 168/* Number of pages ksmd should scan in one batch */
168static unsigned int ksm_thread_pages_to_scan = 200; 169static unsigned int ksm_thread_pages_to_scan = 100;
169 170
170/* Milliseconds ksmd should sleep between batches */ 171/* Milliseconds ksmd should sleep between batches */
171static unsigned int ksm_thread_sleep_millisecs = 20; 172static unsigned int ksm_thread_sleep_millisecs = 20;
@@ -173,7 +174,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20;
173#define KSM_RUN_STOP 0 174#define KSM_RUN_STOP 0
174#define KSM_RUN_MERGE 1 175#define KSM_RUN_MERGE 1
175#define KSM_RUN_UNMERGE 2 176#define KSM_RUN_UNMERGE 2
176static unsigned int ksm_run = KSM_RUN_MERGE; 177static unsigned int ksm_run = KSM_RUN_STOP;
177 178
178static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 179static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
179static DEFINE_MUTEX(ksm_thread_mutex); 180static DEFINE_MUTEX(ksm_thread_mutex);
@@ -183,6 +184,11 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
183 sizeof(struct __struct), __alignof__(struct __struct),\ 184 sizeof(struct __struct), __alignof__(struct __struct),\
184 (__flags), NULL) 185 (__flags), NULL)
185 186
187static void __init ksm_init_max_kernel_pages(void)
188{
189 ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
190}
191
186static int __init ksm_slab_init(void) 192static int __init ksm_slab_init(void)
187{ 193{
188 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); 194 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -1667,6 +1673,8 @@ static int __init ksm_init(void)
1667 struct task_struct *ksm_thread; 1673 struct task_struct *ksm_thread;
1668 int err; 1674 int err;
1669 1675
1676 ksm_init_max_kernel_pages();
1677
1670 err = ksm_slab_init(); 1678 err = ksm_slab_init();
1671 if (err) 1679 if (err)
1672 goto out; 1680 goto out;
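
With these defaults KSM no longer starts merging by itself: ksm_run comes up as KSM_RUN_STOP, pages_to_scan drops to 100, and max_kernel_pages is sized at boot to a quarter of the free buffer pages. Merging has to be enabled explicitly through the existing sysfs interface; a sketch, assuming sysfs is mounted at /sys and root privileges:

/* Sketch: turn KSM on from user space now that it defaults to stopped. */
#include <stdio.h>

static int ksm_start(void)
{
	FILE *f = fopen("/sys/kernel/mm/ksm/run", "w");

	if (!f)
		return -1;
	fputs("1\n", f);	/* KSM_RUN_MERGE */
	return fclose(f);
}
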
diff --git a/mm/madvise.c b/mm/madvise.c
index d9ae2067952e..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma,
218 return error; 218 return error;
219} 219}
220 220
221#ifdef CONFIG_MEMORY_FAILURE
222/*
223 * Error injection support for memory error handling.
224 */
225static int madvise_hwpoison(unsigned long start, unsigned long end)
226{
227 int ret = 0;
228
229 if (!capable(CAP_SYS_ADMIN))
230 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) {
232 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1,
234 0, 0, &p, NULL);
235 if (ret != 1)
236 return ret;
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start);
239 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1);
241 put_page(p);
242 }
243 return ret;
244}
245#endif
246
221static long 247static long
222madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 248madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
223 unsigned long start, unsigned long end, int behavior) 249 unsigned long start, unsigned long end, int behavior)
@@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
308 int write; 334 int write;
309 size_t len; 335 size_t len;
310 336
337#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON)
339 return madvise_hwpoison(start, start+len_in);
340#endif
311 if (!madvise_behavior_valid(behavior)) 341 if (!madvise_behavior_valid(behavior))
312 return error; 342 return error;
313 343
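
madvise_hwpoison() wires MADV_HWPOISON into madvise() as an error-injection hook: for each page in the range it takes a reference via get_user_pages() and hands the pfn to __memory_failure(). A user-space sketch, assuming CAP_SYS_ADMIN, a CONFIG_MEMORY_FAILURE kernel, and that MADV_HWPOISON is visible to user space (otherwise it comes from the kernel headers):

/* Sketch: poison one anonymous page through MADV_HWPOISON. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* assumed value; check the kernel headers */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, psz);			/* make sure the page is present */
	if (madvise(p, psz, MADV_HWPOISON) < 0)	/* inject the failure */
		perror("madvise(MADV_HWPOISON)");
	return 0;
}
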
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d8753784..e2b98a6875c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/rbtree.h>
32#include <linux/slab.h> 33#include <linux/slab.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -43,6 +44,7 @@
43 44
44struct cgroup_subsys mem_cgroup_subsys __read_mostly; 45struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 46#define MEM_CGROUP_RECLAIM_RETRIES 5
47struct mem_cgroup *root_mem_cgroup __read_mostly;
46 48
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
53#endif 55#endif
54 56
55static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000)
56 59
57/* 60/*
58 * Statistics for memory cgroup. 61 * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
69 74
70 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
71}; 76};
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
78 struct mem_cgroup_stat_cpu cpustat[0]; 83 struct mem_cgroup_stat_cpu cpustat[0];
79}; 84};
80 85
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
81/* 100/*
82 * For accounting under irq disable, no need for increment preempt count. 101 * For accounting under irq disable, no need for increment preempt count.
83 */ 102 */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
117 unsigned long count[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS];
118 137
119 struct zone_reclaim_stat reclaim_stat; 138 struct zone_reclaim_stat reclaim_stat;
139 struct rb_node tree_node; /* RB tree node */
140 unsigned long long usage_in_excess;/* Set to the value by which */
141 /* the soft limit is exceeded*/
142 bool on_tree;
143 struct mem_cgroup *mem; /* Back pointer, we cannot */
144 /* use container_of */
120}; 145};
121/* Macro for accessing counter */ 146/* Macro for accessing counter */
122#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
130}; 155};
131 156
132/* 157/*
158 * Cgroups above their limits are maintained in a RB-Tree, independent of
159 * their hierarchy representation
160 */
161
162struct mem_cgroup_tree_per_zone {
163 struct rb_root rb_root;
164 spinlock_t lock;
165};
166
167struct mem_cgroup_tree_per_node {
168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169};
170
171struct mem_cgroup_tree {
172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173};
174
175static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176
177/*
133 * The memory controller data structure. The memory controller controls both 178 * The memory controller data structure. The memory controller controls both
134 * page cache and RSS per cgroup. We would eventually like to provide 179 * page cache and RSS per cgroup. We would eventually like to provide
135 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -186,6 +231,13 @@ struct mem_cgroup {
186 struct mem_cgroup_stat stat; 231 struct mem_cgroup_stat stat;
187}; 232};
188 233
234/*
235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236 * limit reclaim to prevent infinite loops, if they ever occur.
237 */
238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
189enum charge_type { 241enum charge_type {
190 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
191 MEM_CGROUP_CHARGE_TYPE_MAPPED, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
200#define PCGF_CACHE (1UL << PCG_CACHE) 252#define PCGF_CACHE (1UL << PCG_CACHE)
201#define PCGF_USED (1UL << PCG_USED) 253#define PCGF_USED (1UL << PCG_USED)
202#define PCGF_LOCK (1UL << PCG_LOCK) 254#define PCGF_LOCK (1UL << PCG_LOCK)
203static const unsigned long 255/* Not used, but added here for completeness */
204pcg_default_flags[NR_CHARGE_TYPE] = { 256#define PCGF_ACCT (1UL << PCG_ACCT)
205 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
206 PCGF_USED | PCGF_LOCK, /* Anon */
207 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
208 0, /* FORCE */
209};
210 257
211/* for encoding cft->private value on file */ 258/* for encoding cft->private value on file */
212#define _MEM (0) 259#define _MEM (0)
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
215#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
216#define MEMFILE_ATTR(val) ((val) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff)
217 264
265/*
266 * Reclaim flags for mem_cgroup_hierarchical_reclaim
267 */
268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
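These flag bits replace the old noswap/shrink boolean arguments of mem_cgroup_hierarchical_reclaim(); callers OR together the behaviour they want. A minimal sketch of the two call patterns that appear later in this patch (the memsw hard-limit resize path and the per-zone soft-limit path):

	/* shrink usage below a freshly lowered hard limit, without swapping */
	progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
						   MEM_CGROUP_RECLAIM_NOSWAP |
						   MEM_CGROUP_RECLAIM_SHRINK);

	/* targeted soft-limit reclaim against one zone */
	reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, gfp_mask,
						    MEM_CGROUP_RECLAIM_SOFT);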
218static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
219static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
220static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
221 278
279static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281{
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283}
284
285static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc)
287{
288 struct mem_cgroup *mem = pc->mem_cgroup;
289 int nid = page_cgroup_nid(pc);
290 int zid = page_cgroup_zid(pc);
291
292 if (!mem)
293 return NULL;
294
295 return mem_cgroup_zoneinfo(mem, nid, zid);
296}
297
298static struct mem_cgroup_tree_per_zone *
299soft_limit_tree_node_zone(int nid, int zid)
300{
301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302}
303
304static struct mem_cgroup_tree_per_zone *
305soft_limit_tree_from_page(struct page *page)
306{
307 int nid = page_to_nid(page);
308 int zid = page_zonenum(page);
309
310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311}
312
313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz)
317{
318 struct rb_node **p = &mctz->rb_root.rb_node;
319 struct rb_node *parent = NULL;
320 struct mem_cgroup_per_zone *mz_node;
321
322 if (mz->on_tree)
323 return;
324
325 mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
326 while (*p) {
327 parent = *p;
328 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
329 tree_node);
330 if (mz->usage_in_excess < mz_node->usage_in_excess)
331 p = &(*p)->rb_left;
332 /*
333 * We can't avoid mem cgroups that are over their soft
334 * limit by the same amount
335 */
336 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
337 p = &(*p)->rb_right;
338 }
339 rb_link_node(&mz->tree_node, parent, p);
340 rb_insert_color(&mz->tree_node, &mctz->rb_root);
341 mz->on_tree = true;
342}
343
344static void
345__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
346 struct mem_cgroup_per_zone *mz,
347 struct mem_cgroup_tree_per_zone *mctz)
348{
349 if (!mz->on_tree)
350 return;
351 rb_erase(&mz->tree_node, &mctz->rb_root);
352 mz->on_tree = false;
353}
354
355static void
356mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
357 struct mem_cgroup_per_zone *mz,
358 struct mem_cgroup_tree_per_zone *mctz)
359{
360 spin_lock(&mctz->lock);
361 __mem_cgroup_insert_exceeded(mem, mz, mctz);
362 spin_unlock(&mctz->lock);
363}
364
365static void
366mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
367 struct mem_cgroup_per_zone *mz,
368 struct mem_cgroup_tree_per_zone *mctz)
369{
370 spin_lock(&mctz->lock);
371 __mem_cgroup_remove_exceeded(mem, mz, mctz);
372 spin_unlock(&mctz->lock);
373}
374
375static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
376{
377 bool ret = false;
378 int cpu;
379 s64 val;
380 struct mem_cgroup_stat_cpu *cpustat;
381
382 cpu = get_cpu();
383 cpustat = &mem->stat.cpustat[cpu];
384 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
385 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
386 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
387 ret = true;
388 }
389 put_cpu();
390 return ret;
391}
392
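The charge and uncharge paths bump the per-cpu MEM_CGROUP_STAT_EVENTS counter; mem_cgroup_soft_limit_check() only reports true (and resets the counter) once SOFTLIMIT_EVENTS_THRESH events have accumulated, so the RB-tree is touched at most once per threshold rather than on every page. The usage pattern, as it appears in __mem_cgroup_try_charge() and __mem_cgroup_uncharge_common() below, is roughly:

	/* after charging or uncharging a page against mem */
	if (mem_cgroup_soft_limit_check(mem))
		mem_cgroup_update_tree(mem, page);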
393static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
394{
395 unsigned long long prev_usage_in_excess, new_usage_in_excess;
396 bool updated_tree = false;
397 struct mem_cgroup_per_zone *mz;
398 struct mem_cgroup_tree_per_zone *mctz;
399
400 mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
401 mctz = soft_limit_tree_from_page(page);
402
403 /*
404 * We do updates in lazy mode; memcgs are removed
405 * lazily from the per-zone, per-node rb tree.
406 */
407 prev_usage_in_excess = mz->usage_in_excess;
408
409 new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
410 if (prev_usage_in_excess) {
411 mem_cgroup_remove_exceeded(mem, mz, mctz);
412 updated_tree = true;
413 }
414 if (!new_usage_in_excess)
415 goto done;
416 mem_cgroup_insert_exceeded(mem, mz, mctz);
417
418done:
419 if (updated_tree) {
420 spin_lock(&mctz->lock);
421 mz->usage_in_excess = new_usage_in_excess;
422 spin_unlock(&mctz->lock);
423 }
424}
425
426static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
427{
428 int node, zone;
429 struct mem_cgroup_per_zone *mz;
430 struct mem_cgroup_tree_per_zone *mctz;
431
432 for_each_node_state(node, N_POSSIBLE) {
433 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
434 mz = mem_cgroup_zoneinfo(mem, node, zone);
435 mctz = soft_limit_tree_node_zone(node, zone);
436 mem_cgroup_remove_exceeded(mem, mz, mctz);
437 }
438 }
439}
440
441static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
442{
443 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
444}
445
446static struct mem_cgroup_per_zone *
447__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
448{
449 struct rb_node *rightmost = NULL;
450 struct mem_cgroup_per_zone *mz = NULL;
451
452retry:
453 rightmost = rb_last(&mctz->rb_root);
454 if (!rightmost)
455 goto done; /* Nothing to reclaim from */
456
457 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
458 /*
459 * Remove the node now but someone else can add it back,
460 * we will add it back at the end of reclaim to its correct
461 * position in the tree.
462 */
463 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
464 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
465 !css_tryget(&mz->mem->css))
466 goto retry;
467done:
468 return mz;
469}
470
471static struct mem_cgroup_per_zone *
472mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
473{
474 struct mem_cgroup_per_zone *mz;
475
476 spin_lock(&mctz->lock);
477 mz = __mem_cgroup_largest_soft_limit_node(mctz);
478 spin_unlock(&mctz->lock);
479 return mz;
480}
481
482static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
483 bool charge)
484{
485 int val = (charge) ? 1 : -1;
486 struct mem_cgroup_stat *stat = &mem->stat;
487 struct mem_cgroup_stat_cpu *cpustat;
488 int cpu = get_cpu();
489
490 cpustat = &stat->cpustat[cpu];
491 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
492 put_cpu();
493}
494
222static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 495static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
223 struct page_cgroup *pc, 496 struct page_cgroup *pc,
224 bool charge) 497 bool charge)
225{ 498{
226 int val = (charge)? 1 : -1; 499 int val = (charge) ? 1 : -1;
227 struct mem_cgroup_stat *stat = &mem->stat; 500 struct mem_cgroup_stat *stat = &mem->stat;
228 struct mem_cgroup_stat_cpu *cpustat; 501 struct mem_cgroup_stat_cpu *cpustat;
229 int cpu = get_cpu(); 502 int cpu = get_cpu();
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
240 else 513 else
241 __mem_cgroup_stat_add_safe(cpustat, 514 __mem_cgroup_stat_add_safe(cpustat,
242 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 515 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
516 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
243 put_cpu(); 517 put_cpu();
244} 518}
245 519
246static struct mem_cgroup_per_zone *
247mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
248{
249 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
250}
251
252static struct mem_cgroup_per_zone *
253page_cgroup_zoneinfo(struct page_cgroup *pc)
254{
255 struct mem_cgroup *mem = pc->mem_cgroup;
256 int nid = page_cgroup_nid(pc);
257 int zid = page_cgroup_zid(pc);
258
259 if (!mem)
260 return NULL;
261
262 return mem_cgroup_zoneinfo(mem, nid, zid);
263}
264
265static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 520static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
266 enum lru_list idx) 521 enum lru_list idx)
267{ 522{
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
354 return ret; 609 return ret;
355} 610}
356 611
612static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
613{
614 return (mem == root_mem_cgroup);
615}
616
357/* 617/*
358 * Following LRU functions are allowed to be used without PCG_LOCK. 618 * Following LRU functions are allowed to be used without PCG_LOCK.
359 * Operations are called by routine of global LRU independently from memcg. 619 * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
371void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 631void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
372{ 632{
373 struct page_cgroup *pc; 633 struct page_cgroup *pc;
374 struct mem_cgroup *mem;
375 struct mem_cgroup_per_zone *mz; 634 struct mem_cgroup_per_zone *mz;
376 635
377 if (mem_cgroup_disabled()) 636 if (mem_cgroup_disabled())
378 return; 637 return;
379 pc = lookup_page_cgroup(page); 638 pc = lookup_page_cgroup(page);
380 /* can happen while we handle swapcache. */ 639 /* can happen while we handle swapcache. */
381 if (list_empty(&pc->lru) || !pc->mem_cgroup) 640 if (!TestClearPageCgroupAcctLRU(pc))
382 return; 641 return;
642 VM_BUG_ON(!pc->mem_cgroup);
383 /* 643 /*
384 * We don't check PCG_USED bit. It's cleared when the "page" is finally 644 * We don't check PCG_USED bit. It's cleared when the "page" is finally
385 * removed from global LRU. 645 * removed from global LRU.
386 */ 646 */
387 mz = page_cgroup_zoneinfo(pc); 647 mz = page_cgroup_zoneinfo(pc);
388 mem = pc->mem_cgroup;
389 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 648 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
649 if (mem_cgroup_is_root(pc->mem_cgroup))
650 return;
651 VM_BUG_ON(list_empty(&pc->lru));
390 list_del_init(&pc->lru); 652 list_del_init(&pc->lru);
391 return; 653 return;
392} 654}
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
410 * For making pc->mem_cgroup visible, insert smp_rmb() here. 672 * For making pc->mem_cgroup visible, insert smp_rmb() here.
411 */ 673 */
412 smp_rmb(); 674 smp_rmb();
413 /* unused page is not rotated. */ 675 /* unused or root page is not rotated. */
414 if (!PageCgroupUsed(pc)) 676 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
415 return; 677 return;
416 mz = page_cgroup_zoneinfo(pc); 678 mz = page_cgroup_zoneinfo(pc);
417 list_move(&pc->lru, &mz->lists[lru]); 679 list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
425 if (mem_cgroup_disabled()) 687 if (mem_cgroup_disabled())
426 return; 688 return;
427 pc = lookup_page_cgroup(page); 689 pc = lookup_page_cgroup(page);
690 VM_BUG_ON(PageCgroupAcctLRU(pc));
428 /* 691 /*
429 * Used bit is set without atomic ops but after smp_wmb(). 692 * Used bit is set without atomic ops but after smp_wmb().
430 * For making pc->mem_cgroup visible, insert smp_rmb() here. 693 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
435 698
436 mz = page_cgroup_zoneinfo(pc); 699 mz = page_cgroup_zoneinfo(pc);
437 MEM_CGROUP_ZSTAT(mz, lru) += 1; 700 MEM_CGROUP_ZSTAT(mz, lru) += 1;
701 SetPageCgroupAcctLRU(pc);
702 if (mem_cgroup_is_root(pc->mem_cgroup))
703 return;
438 list_add(&pc->lru, &mz->lists[lru]); 704 list_add(&pc->lru, &mz->lists[lru]);
439} 705}
440 706
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
469 735
470 spin_lock_irqsave(&zone->lru_lock, flags); 736 spin_lock_irqsave(&zone->lru_lock, flags);
471 /* link when the page is linked to LRU but page_cgroup isn't */ 737 /* link when the page is linked to LRU but page_cgroup isn't */
472 if (PageLRU(page) && list_empty(&pc->lru)) 738 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
473 mem_cgroup_add_lru_list(page, page_lru(page)); 739 mem_cgroup_add_lru_list(page, page_lru(page));
474 spin_unlock_irqrestore(&zone->lru_lock, flags); 740 spin_unlock_irqrestore(&zone->lru_lock, flags);
475} 741}
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
855 * If shrink==true, for avoiding to free too much, this returns immedieately. 1121 * If shrink==true, for avoiding to free too much, this returns immedieately.
856 */ 1122 */
857static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1123static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
858 gfp_t gfp_mask, bool noswap, bool shrink) 1124 struct zone *zone,
1125 gfp_t gfp_mask,
1126 unsigned long reclaim_options)
859{ 1127{
860 struct mem_cgroup *victim; 1128 struct mem_cgroup *victim;
861 int ret, total = 0; 1129 int ret, total = 0;
862 int loop = 0; 1130 int loop = 0;
1131 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1132 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1133 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1134 unsigned long excess = mem_cgroup_get_excess(root_mem);
863 1135
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1136 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum) 1137 if (root_mem->memsw_is_minimum)
866 noswap = true; 1138 noswap = true;
867 1139
868 while (loop < 2) { 1140 while (1) {
869 victim = mem_cgroup_select_victim(root_mem); 1141 victim = mem_cgroup_select_victim(root_mem);
870 if (victim == root_mem) 1142 if (victim == root_mem) {
871 loop++; 1143 loop++;
1144 if (loop >= 2) {
1145 /*
1146 * If we have not been able to reclaim
1147 * anything, it might be because there are
1148 * no reclaimable pages under this hierarchy
1149 */
1150 if (!check_soft || !total) {
1151 css_put(&victim->css);
1152 break;
1153 }
1154 /*
1155 * We want to do more targeted reclaim.
1156 * excess >> 2 is not so large that we
1157 * reclaim too much, nor so small that we keep
1158 * coming back to reclaim from this cgroup.
1159 */
1160 if (total >= (excess >> 2) ||
1161 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1162 css_put(&victim->css);
1163 break;
1164 }
1165 }
1166 }
872 if (!mem_cgroup_local_usage(&victim->stat)) { 1167 if (!mem_cgroup_local_usage(&victim->stat)) {
873 /* this cgroup's local usage == 0 */ 1168 /* this cgroup's local usage == 0 */
874 css_put(&victim->css); 1169 css_put(&victim->css);
875 continue; 1170 continue;
876 } 1171 }
877 /* we use swappiness of local cgroup */ 1172 /* we use swappiness of local cgroup */
878 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, 1173 if (check_soft)
879 get_swappiness(victim)); 1174 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1175 noswap, get_swappiness(victim), zone,
1176 zone->zone_pgdat->node_id);
1177 else
1178 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1179 noswap, get_swappiness(victim));
880 css_put(&victim->css); 1180 css_put(&victim->css);
881 /* 1181 /*
882 * At shrinking usage, we can't check we should stop here or 1182 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
886 if (shrink) 1186 if (shrink)
887 return ret; 1187 return ret;
888 total += ret; 1188 total += ret;
889 if (mem_cgroup_check_under_limit(root_mem)) 1189 if (check_soft) {
1190 if (res_counter_check_under_soft_limit(&root_mem->res))
1191 return total;
1192 } else if (mem_cgroup_check_under_limit(root_mem))
890 return 1 + total; 1193 return 1 + total;
891 } 1194 }
892 return total; 1195 return total;
@@ -965,11 +1268,11 @@ done:
965 */ 1268 */
966static int __mem_cgroup_try_charge(struct mm_struct *mm, 1269static int __mem_cgroup_try_charge(struct mm_struct *mm,
967 gfp_t gfp_mask, struct mem_cgroup **memcg, 1270 gfp_t gfp_mask, struct mem_cgroup **memcg,
968 bool oom) 1271 bool oom, struct page *page)
969{ 1272{
970 struct mem_cgroup *mem, *mem_over_limit; 1273 struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
971 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1274 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
972 struct res_counter *fail_res; 1275 struct res_counter *fail_res, *soft_fail_res = NULL;
973 1276
974 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1277 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
975 /* Don't account this! */ 1278 /* Don't account this! */
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
996 VM_BUG_ON(css_is_removed(&mem->css)); 1299 VM_BUG_ON(css_is_removed(&mem->css));
997 1300
998 while (1) { 1301 while (1) {
999 int ret; 1302 int ret = 0;
1000 bool noswap = false; 1303 unsigned long flags = 0;
1001 1304
1002 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1305 if (mem_cgroup_is_root(mem))
1306 goto done;
1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
1308 &soft_fail_res);
1003 if (likely(!ret)) { 1309 if (likely(!ret)) {
1004 if (!do_swap_account) 1310 if (!do_swap_account)
1005 break; 1311 break;
1006 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1312 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
1007 &fail_res); 1313 &fail_res, NULL);
1008 if (likely(!ret)) 1314 if (likely(!ret))
1009 break; 1315 break;
1010 /* mem+swap counter fails */ 1316 /* mem+swap counter fails */
1011 res_counter_uncharge(&mem->res, PAGE_SIZE); 1317 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1012 noswap = true; 1318 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1013 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1319 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1014 memsw); 1320 memsw);
1015 } else 1321 } else
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1020 if (!(gfp_mask & __GFP_WAIT)) 1326 if (!(gfp_mask & __GFP_WAIT))
1021 goto nomem; 1327 goto nomem;
1022 1328
1023 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 1329 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1024 noswap, false); 1330 gfp_mask, flags);
1025 if (ret) 1331 if (ret)
1026 continue; 1332 continue;
1027 1333
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1046 goto nomem; 1352 goto nomem;
1047 } 1353 }
1048 } 1354 }
1355 /*
1356 * Insert just the ancestor; we should trickle down to the correct
1357 * cgroup for reclaim, since the other nodes will be below their
1358 * soft limit.
1359 */
1360 if (soft_fail_res) {
1361 mem_over_soft_limit =
1362 mem_cgroup_from_res_counter(soft_fail_res, res);
1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1364 mem_cgroup_update_tree(mem_over_soft_limit, page);
1365 }
1366done:
1049 return 0; 1367 return 0;
1050nomem: 1368nomem:
1051 css_put(&mem->css); 1369 css_put(&mem->css);
1052 return -ENOMEM; 1370 return -ENOMEM;
1053} 1371}
1054 1372
1055
1056/* 1373/*
1057 * A helper function to get mem_cgroup from ID. must be called under 1374 * A helper function to get mem_cgroup from ID. must be called under
1058 * rcu_read_lock(). The caller must check css_is_removed() or some if 1375 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1119 lock_page_cgroup(pc); 1436 lock_page_cgroup(pc);
1120 if (unlikely(PageCgroupUsed(pc))) { 1437 if (unlikely(PageCgroupUsed(pc))) {
1121 unlock_page_cgroup(pc); 1438 unlock_page_cgroup(pc);
1122 res_counter_uncharge(&mem->res, PAGE_SIZE); 1439 if (!mem_cgroup_is_root(mem)) {
1123 if (do_swap_account) 1440 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1124 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1441 if (do_swap_account)
1442 res_counter_uncharge(&mem->memsw, PAGE_SIZE,
1443 NULL);
1444 }
1125 css_put(&mem->css); 1445 css_put(&mem->css);
1126 return; 1446 return;
1127 } 1447 }
1448
1128 pc->mem_cgroup = mem; 1449 pc->mem_cgroup = mem;
1450 /*
1451 * We access a page_cgroup asynchronously without lock_page_cgroup().
1452 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1453 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1454 * before USED bit, we need memory barrier here.
1455 * See mem_cgroup_add_lru_list(), etc.
1456 */
1129 smp_wmb(); 1457 smp_wmb();
1130 pc->flags = pcg_default_flags[ctype]; 1458 switch (ctype) {
1459 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1460 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1461 SetPageCgroupCache(pc);
1462 SetPageCgroupUsed(pc);
1463 break;
1464 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1465 ClearPageCgroupCache(pc);
1466 SetPageCgroupUsed(pc);
1467 break;
1468 default:
1469 break;
1470 }
1131 1471
1132 mem_cgroup_charge_statistics(mem, pc, true); 1472 mem_cgroup_charge_statistics(mem, pc, true);
1133 1473
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1178 if (pc->mem_cgroup != from) 1518 if (pc->mem_cgroup != from)
1179 goto out; 1519 goto out;
1180 1520
1181 res_counter_uncharge(&from->res, PAGE_SIZE); 1521 if (!mem_cgroup_is_root(from))
1522 res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
1182 mem_cgroup_charge_statistics(from, pc, false); 1523 mem_cgroup_charge_statistics(from, pc, false);
1183 1524
1184 page = pc->page; 1525 page = pc->page;
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1197 1); 1538 1);
1198 } 1539 }
1199 1540
1200 if (do_swap_account) 1541 if (do_swap_account && !mem_cgroup_is_root(from))
1201 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
1202 css_put(&from->css); 1543 css_put(&from->css);
1203 1544
1204 css_get(&to->css); 1545 css_get(&to->css);
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1238 parent = mem_cgroup_from_cont(pcg); 1579 parent = mem_cgroup_from_cont(pcg);
1239 1580
1240 1581
1241 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1582 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1242 if (ret || !parent) 1583 if (ret || !parent)
1243 return ret; 1584 return ret;
1244 1585
@@ -1268,9 +1609,11 @@ uncharge:
1268 /* drop extra refcnt by try_charge() */ 1609 /* drop extra refcnt by try_charge() */
1269 css_put(&parent->css); 1610 css_put(&parent->css);
1270 /* uncharge if move fails */ 1611 /* uncharge if move fails */
1271 res_counter_uncharge(&parent->res, PAGE_SIZE); 1612 if (!mem_cgroup_is_root(parent)) {
1272 if (do_swap_account) 1613 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
1273 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1614 if (do_swap_account)
1615 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
1616 }
1274 return ret; 1617 return ret;
1275} 1618}
1276 1619
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1295 prefetchw(pc); 1638 prefetchw(pc);
1296 1639
1297 mem = memcg; 1640 mem = memcg;
1298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1641 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1299 if (ret || !mem) 1642 if (ret || !mem)
1300 return ret; 1643 return ret;
1301 1644
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1414 if (!mem) 1757 if (!mem)
1415 goto charge_cur_mm; 1758 goto charge_cur_mm;
1416 *ptr = mem; 1759 *ptr = mem;
1417 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 1760 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1418 /* drop extra refcnt from tryget */ 1761 /* drop extra refcnt from tryget */
1419 css_put(&mem->css); 1762 css_put(&mem->css);
1420 return ret; 1763 return ret;
1421charge_cur_mm: 1764charge_cur_mm:
1422 if (unlikely(!mm)) 1765 if (unlikely(!mm))
1423 mm = &init_mm; 1766 mm = &init_mm;
1424 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1767 return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1425} 1768}
1426 1769
1427static void 1770static void
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1459 * This recorded memcg can be obsolete one. So, avoid 1802 * This recorded memcg can be obsolete one. So, avoid
1460 * calling css_tryget 1803 * calling css_tryget
1461 */ 1804 */
1462 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1805 if (!mem_cgroup_is_root(memcg))
1806 res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
1807 NULL);
1808 mem_cgroup_swap_statistics(memcg, false);
1463 mem_cgroup_put(memcg); 1809 mem_cgroup_put(memcg);
1464 } 1810 }
1465 rcu_read_unlock(); 1811 rcu_read_unlock();
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1484 return; 1830 return;
1485 if (!mem) 1831 if (!mem)
1486 return; 1832 return;
1487 res_counter_uncharge(&mem->res, PAGE_SIZE); 1833 if (!mem_cgroup_is_root(mem)) {
1488 if (do_swap_account) 1834 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1489 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1835 if (do_swap_account)
1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1837 }
1490 css_put(&mem->css); 1838 css_put(&mem->css);
1491} 1839}
1492 1840
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1500 struct page_cgroup *pc; 1848 struct page_cgroup *pc;
1501 struct mem_cgroup *mem = NULL; 1849 struct mem_cgroup *mem = NULL;
1502 struct mem_cgroup_per_zone *mz; 1850 struct mem_cgroup_per_zone *mz;
1851 bool soft_limit_excess = false;
1503 1852
1504 if (mem_cgroup_disabled()) 1853 if (mem_cgroup_disabled())
1505 return NULL; 1854 return NULL;
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1538 break; 1887 break;
1539 } 1888 }
1540 1889
1541 res_counter_uncharge(&mem->res, PAGE_SIZE); 1890 if (!mem_cgroup_is_root(mem)) {
1542 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1891 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
1543 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1892 if (do_swap_account &&
1893 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1894 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1895 }
1896 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1897 mem_cgroup_swap_statistics(mem, true);
1544 mem_cgroup_charge_statistics(mem, pc, false); 1898 mem_cgroup_charge_statistics(mem, pc, false);
1545 1899
1546 ClearPageCgroupUsed(pc); 1900 ClearPageCgroupUsed(pc);
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1554 mz = page_cgroup_zoneinfo(pc); 1908 mz = page_cgroup_zoneinfo(pc);
1555 unlock_page_cgroup(pc); 1909 unlock_page_cgroup(pc);
1556 1910
1911 if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
1912 mem_cgroup_update_tree(mem, page);
1557 /* at swapout, this memcg will be accessed to record to swap */ 1913 /* at swapout, this memcg will be accessed to record to swap */
1558 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1914 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1559 css_put(&mem->css); 1915 css_put(&mem->css);
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1629 * We uncharge this because swap is freed. 1985 * We uncharge this because swap is freed.
1630 * This memcg can be obsolete one. We avoid calling css_tryget 1986 * This memcg can be obsolete one. We avoid calling css_tryget
1631 */ 1987 */
1632 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1988 if (!mem_cgroup_is_root(memcg))
1989 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
1990 mem_cgroup_swap_statistics(memcg, false);
1633 mem_cgroup_put(memcg); 1991 mem_cgroup_put(memcg);
1634 } 1992 }
1635 rcu_read_unlock(); 1993 rcu_read_unlock();
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1658 unlock_page_cgroup(pc); 2016 unlock_page_cgroup(pc);
1659 2017
1660 if (mem) { 2018 if (mem) {
1661 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2020 page);
1662 css_put(&mem->css); 2021 css_put(&mem->css);
1663 } 2022 }
1664 *ptr = mem; 2023 *ptr = mem;
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1798 if (!ret) 2157 if (!ret)
1799 break; 2158 break;
1800 2159
1801 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 2160 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
1802 false, true); 2161 GFP_KERNEL,
2162 MEM_CGROUP_RECLAIM_SHRINK);
1803 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2163 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1804 /* Usage is reduced ? */ 2164 /* Usage is reduced ? */
1805 if (curusage >= oldusage) 2165 if (curusage >= oldusage)
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1851 if (!ret) 2211 if (!ret)
1852 break; 2212 break;
1853 2213
1854 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); 2214 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2215 MEM_CGROUP_RECLAIM_NOSWAP |
2216 MEM_CGROUP_RECLAIM_SHRINK);
1855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2217 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1856 /* Usage is reduced ? */ 2218 /* Usage is reduced ? */
1857 if (curusage >= oldusage) 2219 if (curusage >= oldusage)
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1862 return ret; 2224 return ret;
1863} 2225}
1864 2226
2227unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2228 gfp_t gfp_mask, int nid,
2229 int zid)
2230{
2231 unsigned long nr_reclaimed = 0;
2232 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2233 unsigned long reclaimed;
2234 int loop = 0;
2235 struct mem_cgroup_tree_per_zone *mctz;
2236
2237 if (order > 0)
2238 return 0;
2239
2240 mctz = soft_limit_tree_node_zone(nid, zid);
2241 /*
2242 * This loop can run a while, especially if mem_cgroups continuously
2243 * keep exceeding their soft limit and putting the system under
2244 * pressure.
2245 */
2246 do {
2247 if (next_mz)
2248 mz = next_mz;
2249 else
2250 mz = mem_cgroup_largest_soft_limit_node(mctz);
2251 if (!mz)
2252 break;
2253
2254 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2255 gfp_mask,
2256 MEM_CGROUP_RECLAIM_SOFT);
2257 nr_reclaimed += reclaimed;
2258 spin_lock(&mctz->lock);
2259
2260 /*
2261 * If we failed to reclaim anything from this memory cgroup
2262 * it is time to move on to the next cgroup
2263 */
2264 next_mz = NULL;
2265 if (!reclaimed) {
2266 do {
2267 /*
2268 * Loop until we find yet another one.
2269 *
2270 * By the time we get the soft_limit lock
2271 * again, someone might have added the
2272 * group back on the RB tree. Iterate to
2273 * make sure we get a different mem.
2274 * mem_cgroup_largest_soft_limit_node returns
2275 * NULL if no other cgroup is present on
2276 * the tree
2277 */
2278 next_mz =
2279 __mem_cgroup_largest_soft_limit_node(mctz);
2280 if (next_mz == mz) {
2281 css_put(&next_mz->mem->css);
2282 next_mz = NULL;
2283 } else /* next_mz == NULL or other memcg */
2284 break;
2285 } while (1);
2286 }
2287 mz->usage_in_excess =
2288 res_counter_soft_limit_excess(&mz->mem->res);
2289 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2290 /*
2291 * One school of thought says that we should not add
2292 * back the node to the tree if reclaim returns 0.
2293 * But our reclaim could return 0, simply because, due
2294 * to priority, we are exposing a smaller subset of
2295 * memory to reclaim from. Consider this as a longer
2296 * term TODO.
2297 */
2298 if (mz->usage_in_excess)
2299 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
2300 spin_unlock(&mctz->lock);
2301 css_put(&mz->mem->css);
2302 loop++;
2303 /*
2304 * Could not reclaim anything and there are no more
2305 * mem cgroups to try or we seem to be looping without
2306 * reclaiming anything.
2307 */
2308 if (!nr_reclaimed &&
2309 (next_mz == NULL ||
2310 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2311 break;
2312 } while (!nr_reclaimed);
2313 if (next_mz)
2314 css_put(&next_mz->mem->css);
2315 return nr_reclaimed;
2316}
2317
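mem_cgroup_soft_limit_reclaim() is meant to be driven from per-zone reclaim (kswapd-style), which passes the zone together with its node id and zone index. A hedged sketch of such a call site, assuming the standard zone_to_nid()/zone_idx() helpers; the exact gfp mask and placement in vmscan are not part of this hunk:

	/* inside zone reclaim, before falling back to global LRU scanning */
	nr_soft = mem_cgroup_soft_limit_reclaim(zone, order, GFP_KERNEL,
						zone_to_nid(zone),
						zone_idx(zone));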
1865/* 2318/*
1866 * This routine traverse page_cgroup in given list and drop them all. 2319 * This routine traverse page_cgroup in given list and drop them all.
1867 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2320 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2046 return retval; 2499 return retval;
2047} 2500}
2048 2501
2502struct mem_cgroup_idx_data {
2503 s64 val;
2504 enum mem_cgroup_stat_index idx;
2505};
2506
2507static int
2508mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2509{
2510 struct mem_cgroup_idx_data *d = data;
2511 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2512 return 0;
2513}
2514
2515static void
2516mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2517 enum mem_cgroup_stat_index idx, s64 *val)
2518{
2519 struct mem_cgroup_idx_data d;
2520 d.idx = idx;
2521 d.val = 0;
2522 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2523 *val = d.val;
2524}
2525
2049static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2526static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2050{ 2527{
2051 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2528 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2052 u64 val = 0; 2529 u64 idx_val, val;
2053 int type, name; 2530 int type, name;
2054 2531
2055 type = MEMFILE_TYPE(cft->private); 2532 type = MEMFILE_TYPE(cft->private);
2056 name = MEMFILE_ATTR(cft->private); 2533 name = MEMFILE_ATTR(cft->private);
2057 switch (type) { 2534 switch (type) {
2058 case _MEM: 2535 case _MEM:
2059 val = res_counter_read_u64(&mem->res, name); 2536 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2537 mem_cgroup_get_recursive_idx_stat(mem,
2538 MEM_CGROUP_STAT_CACHE, &idx_val);
2539 val = idx_val;
2540 mem_cgroup_get_recursive_idx_stat(mem,
2541 MEM_CGROUP_STAT_RSS, &idx_val);
2542 val += idx_val;
2543 val <<= PAGE_SHIFT;
2544 } else
2545 val = res_counter_read_u64(&mem->res, name);
2060 break; 2546 break;
2061 case _MEMSWAP: 2547 case _MEMSWAP:
2062 val = res_counter_read_u64(&mem->memsw, name); 2548 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2549 mem_cgroup_get_recursive_idx_stat(mem,
2550 MEM_CGROUP_STAT_CACHE, &idx_val);
2551 val = idx_val;
2552 mem_cgroup_get_recursive_idx_stat(mem,
2553 MEM_CGROUP_STAT_RSS, &idx_val);
2554 val += idx_val;
2555 mem_cgroup_get_recursive_idx_stat(mem,
2556 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2557 val <<= PAGE_SHIFT;
2558 } else
2559 val = res_counter_read_u64(&mem->memsw, name);
2063 break; 2560 break;
2064 default: 2561 default:
2065 BUG(); 2562 BUG();
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2083 name = MEMFILE_ATTR(cft->private); 2580 name = MEMFILE_ATTR(cft->private);
2084 switch (name) { 2581 switch (name) {
2085 case RES_LIMIT: 2582 case RES_LIMIT:
2583 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2584 ret = -EINVAL;
2585 break;
2586 }
2086 /* This function does all necessary parse...reuse it */ 2587 /* This function does all necessary parse...reuse it */
2087 ret = res_counter_memparse_write_strategy(buffer, &val); 2588 ret = res_counter_memparse_write_strategy(buffer, &val);
2088 if (ret) 2589 if (ret)
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2092 else 2593 else
2093 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2594 ret = mem_cgroup_resize_memsw_limit(memcg, val);
2094 break; 2595 break;
2596 case RES_SOFT_LIMIT:
2597 ret = res_counter_memparse_write_strategy(buffer, &val);
2598 if (ret)
2599 break;
2600 /*
2601 * For memsw, soft limits are hard to implement in terms
2602 * of semantics; for now, we support soft limits only for
2603 * memory control without swap.
2604 */
2605 if (type == _MEM)
2606 ret = res_counter_set_soft_limit(&memcg->res, val);
2607 else
2608 ret = -EINVAL;
2609 break;
2095 default: 2610 default:
2096 ret = -EINVAL; /* should be BUG() ? */ 2611 ret = -EINVAL; /* should be BUG() ? */
2097 break; 2612 break;
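RES_SOFT_LIMIT reuses res_counter_memparse_write_strategy(), so the new memory.soft_limit_in_bytes file (registered further down) accepts the usual suffixed values. A hedged userspace sketch, assuming the memory cgroup hierarchy is mounted at /cgroup and a child group named A already exists:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* set a 256M soft limit on the hypothetical group /cgroup/A */
		int fd = open("/cgroup/A/memory.soft_limit_in_bytes", O_WRONLY);
		if (fd < 0)
			return 1;
		write(fd, "256M", 4);
		return close(fd);
	}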
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2149 res_counter_reset_failcnt(&mem->memsw); 2664 res_counter_reset_failcnt(&mem->memsw);
2150 break; 2665 break;
2151 } 2666 }
2667
2152 return 0; 2668 return 0;
2153} 2669}
2154 2670
@@ -2160,6 +2676,7 @@ enum {
2160 MCS_MAPPED_FILE, 2676 MCS_MAPPED_FILE,
2161 MCS_PGPGIN, 2677 MCS_PGPGIN,
2162 MCS_PGPGOUT, 2678 MCS_PGPGOUT,
2679 MCS_SWAP,
2163 MCS_INACTIVE_ANON, 2680 MCS_INACTIVE_ANON,
2164 MCS_ACTIVE_ANON, 2681 MCS_ACTIVE_ANON,
2165 MCS_INACTIVE_FILE, 2682 MCS_INACTIVE_FILE,
@@ -2181,6 +2698,7 @@ struct {
2181 {"mapped_file", "total_mapped_file"}, 2698 {"mapped_file", "total_mapped_file"},
2182 {"pgpgin", "total_pgpgin"}, 2699 {"pgpgin", "total_pgpgin"},
2183 {"pgpgout", "total_pgpgout"}, 2700 {"pgpgout", "total_pgpgout"},
2701 {"swap", "total_swap"},
2184 {"inactive_anon", "total_inactive_anon"}, 2702 {"inactive_anon", "total_inactive_anon"},
2185 {"active_anon", "total_active_anon"}, 2703 {"active_anon", "total_active_anon"},
2186 {"inactive_file", "total_inactive_file"}, 2704 {"inactive_file", "total_inactive_file"},
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2205 s->stat[MCS_PGPGIN] += val; 2723 s->stat[MCS_PGPGIN] += val;
2206 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2724 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2207 s->stat[MCS_PGPGOUT] += val; 2725 s->stat[MCS_PGPGOUT] += val;
2726 if (do_swap_account) {
2727 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2728 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2729 }
2208 2730
2209 /* per zone stat */ 2731 /* per zone stat */
2210 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2732 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2236 memset(&mystat, 0, sizeof(mystat)); 2758 memset(&mystat, 0, sizeof(mystat));
2237 mem_cgroup_get_local_stat(mem_cont, &mystat); 2759 mem_cgroup_get_local_stat(mem_cont, &mystat);
2238 2760
2239 for (i = 0; i < NR_MCS_STAT; i++) 2761 for (i = 0; i < NR_MCS_STAT; i++) {
2762 if (i == MCS_SWAP && !do_swap_account)
2763 continue;
2240 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2764 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2765 }
2241 2766
2242 /* Hierarchical information */ 2767 /* Hierarchical information */
2243 { 2768 {
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2250 2775
2251 memset(&mystat, 0, sizeof(mystat)); 2776 memset(&mystat, 0, sizeof(mystat));
2252 mem_cgroup_get_total_stat(mem_cont, &mystat); 2777 mem_cgroup_get_total_stat(mem_cont, &mystat);
2253 for (i = 0; i < NR_MCS_STAT; i++) 2778 for (i = 0; i < NR_MCS_STAT; i++) {
2779 if (i == MCS_SWAP && !do_swap_account)
2780 continue;
2254 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2781 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2255 2782 }
2256 2783
2257#ifdef CONFIG_DEBUG_VM 2784#ifdef CONFIG_DEBUG_VM
2258 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2785 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = {
2345 .read_u64 = mem_cgroup_read, 2872 .read_u64 = mem_cgroup_read,
2346 }, 2873 },
2347 { 2874 {
2875 .name = "soft_limit_in_bytes",
2876 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2877 .write_string = mem_cgroup_write,
2878 .read_u64 = mem_cgroup_read,
2879 },
2880 {
2348 .name = "failcnt", 2881 .name = "failcnt",
2349 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2882 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2350 .trigger = mem_cgroup_reset, 2883 .trigger = mem_cgroup_reset,
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2438 mz = &pn->zoneinfo[zone]; 2971 mz = &pn->zoneinfo[zone];
2439 for_each_lru(l) 2972 for_each_lru(l)
2440 INIT_LIST_HEAD(&mz->lists[l]); 2973 INIT_LIST_HEAD(&mz->lists[l]);
2974 mz->usage_in_excess = 0;
2975 mz->on_tree = false;
2976 mz->mem = mem;
2441 } 2977 }
2442 return 0; 2978 return 0;
2443} 2979}
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2483{ 3019{
2484 int node; 3020 int node;
2485 3021
3022 mem_cgroup_remove_from_trees(mem);
2486 free_css_id(&mem_cgroup_subsys, &mem->css); 3023 free_css_id(&mem_cgroup_subsys, &mem->css);
2487 3024
2488 for_each_node_state(node, N_POSSIBLE) 3025 for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void)
2531} 3068}
2532#endif 3069#endif
2533 3070
3071static int mem_cgroup_soft_limit_tree_init(void)
3072{
3073 struct mem_cgroup_tree_per_node *rtpn;
3074 struct mem_cgroup_tree_per_zone *rtpz;
3075 int tmp, node, zone;
3076
3077 for_each_node_state(node, N_POSSIBLE) {
3078 tmp = node;
3079 if (!node_state(node, N_NORMAL_MEMORY))
3080 tmp = -1;
3081 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3082 if (!rtpn)
3083 return 1;
3084
3085 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3086
3087 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3088 rtpz = &rtpn->rb_tree_per_zone[zone];
3089 rtpz->rb_root = RB_ROOT;
3090 spin_lock_init(&rtpz->lock);
3091 }
3092 }
3093 return 0;
3094}
3095
2534static struct cgroup_subsys_state * __ref 3096static struct cgroup_subsys_state * __ref
2535mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3097mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2536{ 3098{
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2545 for_each_node_state(node, N_POSSIBLE) 3107 for_each_node_state(node, N_POSSIBLE)
2546 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3108 if (alloc_mem_cgroup_per_zone_info(mem, node))
2547 goto free_out; 3109 goto free_out;
3110
2548 /* root ? */ 3111 /* root ? */
2549 if (cont->parent == NULL) { 3112 if (cont->parent == NULL) {
2550 enable_swap_cgroup(); 3113 enable_swap_cgroup();
2551 parent = NULL; 3114 parent = NULL;
3115 root_mem_cgroup = mem;
3116 if (mem_cgroup_soft_limit_tree_init())
3117 goto free_out;
3118
2552 } else { 3119 } else {
2553 parent = mem_cgroup_from_cont(cont->parent); 3120 parent = mem_cgroup_from_cont(cont->parent);
2554 mem->use_hierarchy = parent->use_hierarchy; 3121 mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2577 return &mem->css; 3144 return &mem->css;
2578free_out: 3145free_out:
2579 __mem_cgroup_free(mem); 3146 __mem_cgroup_free(mem);
3147 root_mem_cgroup = NULL;
2580 return ERR_PTR(error); 3148 return ERR_PTR(error);
2581} 3149}
2582 3150
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
2612static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3180static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2613 struct cgroup *cont, 3181 struct cgroup *cont,
2614 struct cgroup *old_cont, 3182 struct cgroup *old_cont,
2615 struct task_struct *p) 3183 struct task_struct *p,
3184 bool threadgroup)
2616{ 3185{
2617 mutex_lock(&memcg_tasklist); 3186 mutex_lock(&memcg_tasklist);
2618 /* 3187 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted, usually due to a 2-bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non-linear complexity with the number
23 * of mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/rmap.h>
39#include <linux/pagemap.h>
40#include <linux/swap.h>
41#include <linux/backing-dev.h>
42#include "internal.h"
43
44int sysctl_memory_failure_early_kill __read_mostly = 0;
45
46int sysctl_memory_failure_recovery __read_mostly = 1;
47
48atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
49
50/*
51 * Send all the processes that have the page mapped an ``action optional''
52 * signal.
53 */
54static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
55 unsigned long pfn)
56{
57 struct siginfo si;
58 int ret;
59
60 printk(KERN_ERR
61 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
62 pfn, t->comm, t->pid);
63 si.si_signo = SIGBUS;
64 si.si_errno = 0;
65 si.si_code = BUS_MCEERR_AO;
66 si.si_addr = (void *)addr;
67#ifdef __ARCH_SI_TRAPNO
68 si.si_trapno = trapno;
69#endif
70 si.si_addr_lsb = PAGE_SHIFT;
71 /*
72 * Don't use force here, it's convenient if the signal
73 * can be temporarily blocked.
74 * This could cause a loop when the user sets SIGBUS
75 * to SIG_IGN, but hopefully no one will do that?
76 */
77 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
78 if (ret < 0)
79 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
80 t->comm, t->pid, ret);
81 return ret;
82}
83
84/*
85 * Kill all processes that have a poisoned page mapped and then isolate
86 * the page.
87 *
88 * General strategy:
89 * Find all processes having the page mapped and kill them.
90 * But we keep a page reference around so that the page is not
91 * actually freed yet.
92 * Then stash the page away
93 *
94 * There's no convenient way to get back to mapped processes
95 * from the VMAs. So do a brute-force search over all
96 * running processes.
97 *
98 * Remember that machine checks are not common (or rather
99 * if they are common you have other problems), so this shouldn't
100 * be a performance issue.
101 *
102 * Also there are some races possible while we get from the
103 * error detection to actually handle it.
104 */
105
106struct to_kill {
107 struct list_head nd;
108 struct task_struct *tsk;
109 unsigned long addr;
110 unsigned addr_valid:1;
111};
112
113/*
114 * Failure handling: if we can't find or can't kill a process there's
115 * not much we can do. We just print a message and ignore otherwise.
116 */
117
118/*
119 * Schedule a process for later kill.
120 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
121 * TBD would GFP_NOIO be enough?
122 */
123static void add_to_kill(struct task_struct *tsk, struct page *p,
124 struct vm_area_struct *vma,
125 struct list_head *to_kill,
126 struct to_kill **tkc)
127{
128 struct to_kill *tk;
129
130 if (*tkc) {
131 tk = *tkc;
132 *tkc = NULL;
133 } else {
134 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
135 if (!tk) {
136 printk(KERN_ERR
137 "MCE: Out of memory while machine check handling\n");
138 return;
139 }
140 }
141 tk->addr = page_address_in_vma(p, vma);
142 tk->addr_valid = 1;
143
144 /*
145 * In theory we don't have to kill when the page was
146 * munmapped. But it could also be a mremap. Since that's
147 * likely very rare, kill anyway just out of paranoia, but use
148 * a SIGKILL because the error is not contained anymore.
149 */
150 if (tk->addr == -EFAULT) {
151 pr_debug("MCE: Unable to find user space address %lx in %s\n",
152 page_to_pfn(p), tsk->comm);
153 tk->addr_valid = 0;
154 }
155 get_task_struct(tsk);
156 tk->tsk = tsk;
157 list_add_tail(&tk->nd, to_kill);
158}
159
160/*
161 * Kill the processes that have been collected earlier.
162 *
163 * Only do anything when DOIT is set, otherwise just free the list
164 * (this is used for clean pages which do not need killing)
165 * Also when FAIL is set do a force kill because something went
166 * wrong earlier.
167 */
168static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
169 int fail, unsigned long pfn)
170{
171 struct to_kill *tk, *next;
172
173 list_for_each_entry_safe (tk, next, to_kill, nd) {
174 if (doit) {
175 /*
176 * In case something went wrong with munmapping,
177 * make sure the process doesn't catch the
178 * signal in a handler and then access the
179 * memory anyway. Just kill it.
180 */
181 if (fail || tk->addr_valid == 0) {
182 printk(KERN_ERR
183 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
184 pfn, tk->tsk->comm, tk->tsk->pid);
185 force_sig(SIGKILL, tk->tsk);
186 }
187
188 /*
189 * In theory the process could have mapped
190 * something else on the address in-between. We could
191 * check for that, but we need to tell the
192 * process anyways.
193 */
194 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
195 pfn) < 0)
196 printk(KERN_ERR
197 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
198 pfn, tk->tsk->comm, tk->tsk->pid);
199 }
200 put_task_struct(tk->tsk);
201 kfree(tk);
202 }
203}
204
205static int task_early_kill(struct task_struct *tsk)
206{
207 if (!tsk->mm)
208 return 0;
209 if (tsk->flags & PF_MCE_PROCESS)
210 return !!(tsk->flags & PF_MCE_EARLY);
211 return sysctl_memory_failure_early_kill;
212}
213
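task_early_kill() prefers the per-process PF_MCE_PROCESS/PF_MCE_EARLY flags over the global sysctl_memory_failure_early_kill default. A hedged userspace sketch of a process opting in to early kill for itself, assuming the PR_MCE_KILL prctl interface that accompanies this series:

	#include <sys/prctl.h>

	int main(void)
	{
		/* ask for early SIGBUS delivery on memory failure for this process only */
		return prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
	}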
214/*
215 * Collect processes when the error hit an anonymous page.
216 */
217static void collect_procs_anon(struct page *page, struct list_head *to_kill,
218 struct to_kill **tkc)
219{
220 struct vm_area_struct *vma;
221 struct task_struct *tsk;
222 struct anon_vma *av;
223
224 read_lock(&tasklist_lock);
225 av = page_lock_anon_vma(page);
226 if (av == NULL) /* Not actually mapped anymore */
227 goto out;
228 for_each_process (tsk) {
229 if (!task_early_kill(tsk))
230 continue;
231 list_for_each_entry (vma, &av->head, anon_vma_node) {
232 if (!page_mapped_in_vma(page, vma))
233 continue;
234 if (vma->vm_mm == tsk->mm)
235 add_to_kill(tsk, page, vma, to_kill, tkc);
236 }
237 }
238 page_unlock_anon_vma(av);
239out:
240 read_unlock(&tasklist_lock);
241}
242
243/*
244 * Collect processes when the error hit a file mapped page.
245 */
246static void collect_procs_file(struct page *page, struct list_head *to_kill,
247 struct to_kill **tkc)
248{
249 struct vm_area_struct *vma;
250 struct task_struct *tsk;
251 struct prio_tree_iter iter;
252 struct address_space *mapping = page->mapping;
253
254 /*
255 * A note on the locking order between the two locks.
256 * We don't rely on this particular order.
257 * If you have some other code that needs a different order
258 * feel free to switch them around. Or add a reverse link
259 * from mm_struct to task_struct, then this could be all
260 * done without taking tasklist_lock and looping over all tasks.
261 */
262
263 read_lock(&tasklist_lock);
264 spin_lock(&mapping->i_mmap_lock);
265 for_each_process(tsk) {
266 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
267
268 if (!task_early_kill(tsk))
269 continue;
270
271 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
272 pgoff) {
273 /*
274 * Send early kill signal to tasks where a vma covers
275 * the page but the corrupted page is not necessarily
276 * mapped in its pte.
277 * Assume applications that requested early kill want
278 * to be informed of all such data corruptions.
279 */
280 if (vma->vm_mm == tsk->mm)
281 add_to_kill(tsk, page, vma, to_kill, tkc);
282 }
283 }
284 spin_unlock(&mapping->i_mmap_lock);
285 read_unlock(&tasklist_lock);
286}
287
288/*
289 * Collect the processes that have the corrupted page mapped so we can kill them.
290 * This is done in two steps for locking reasons.
291 * First preallocate one tokill structure outside the spin locks,
292 * so that we can kill at least one process reasonably reliably.
293 */
294static void collect_procs(struct page *page, struct list_head *tokill)
295{
296 struct to_kill *tk;
297
298 if (!page->mapping)
299 return;
300
301 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
302 if (!tk)
303 return;
304 if (PageAnon(page))
305 collect_procs_anon(page, tokill, &tk);
306 else
307 collect_procs_file(page, tokill, &tk);
308 kfree(tk);
309}
310
311/*
312 * Error handlers for various types of pages.
313 */
314
315enum outcome {
316 FAILED, /* Error handling failed */
317 DELAYED, /* Will be handled later */
318 IGNORED, /* Error safely ignored */
319 RECOVERED, /* Successfully recovered */
320};
321
322static const char *action_name[] = {
323 [FAILED] = "Failed",
324 [DELAYED] = "Delayed",
325 [IGNORED] = "Ignored",
326 [RECOVERED] = "Recovered",
327};
328
329/*
330 * Error hit kernel page.
331 * Do nothing, try to be lucky and not touch this instead. For a few cases we
332 * could be more sophisticated.
333 */
334static int me_kernel(struct page *p, unsigned long pfn)
335{
336 return DELAYED;
337}
338
339/*
340 * Already poisoned page.
341 */
342static int me_ignore(struct page *p, unsigned long pfn)
343{
344 return IGNORED;
345}
346
347/*
348 * Page in unknown state. Do nothing.
349 */
350static int me_unknown(struct page *p, unsigned long pfn)
351{
352 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
353 return FAILED;
354}
355
356/*
357 * Free memory
358 */
359static int me_free(struct page *p, unsigned long pfn)
360{
361 return DELAYED;
362}
363
364/*
365 * Clean (or cleaned) page cache page.
366 */
367static int me_pagecache_clean(struct page *p, unsigned long pfn)
368{
369 int err;
370 int ret = FAILED;
371 struct address_space *mapping;
372
373 if (!isolate_lru_page(p))
374 page_cache_release(p);
375
376 /*
377 * For anonymous pages we're done; the only reference left
378 * should be the one m_f() holds.
379 */
380 if (PageAnon(p))
381 return RECOVERED;
382
383 /*
384 * Now truncate the page in the page cache. This is really
385 * more like a "temporary hole punch"
386 * Don't do this for block devices when someone else
387 * has a reference, because it could be file system metadata
388 * and that's not safe to truncate.
389 */
390 mapping = page_mapping(p);
391 if (!mapping) {
392 /*
393 * Page has been torn down in the meantime
394 */
395 return FAILED;
396 }
397
398 /*
399 * Truncation is a bit tricky. Enable it per file system for now.
400 *
401 * Open: to take i_mutex or not for this? Right now we don't.
402 */
403 if (mapping->a_ops->error_remove_page) {
404 err = mapping->a_ops->error_remove_page(mapping, p);
405 if (err != 0) {
406 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
407 pfn, err);
408 } else if (page_has_private(p) &&
409 !try_to_release_page(p, GFP_NOIO)) {
410 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
411 } else {
412 ret = RECOVERED;
413 }
414 } else {
415 /*
416 * If the file system doesn't support it, just invalidate.
417 * This fails on dirty pages or anything with private pages.
418 */
419 if (invalidate_inode_page(p))
420 ret = RECOVERED;
421 else
422 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
423 pfn);
424 }
425 return ret;
426}
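
Truncation is opted into per filesystem through the error_remove_page address_space operation used above; mm/truncate.c further down in this patch supplies generic_error_remove_page, and the mm/shmem.c hunk wires it up for tmpfs. A minimal sketch of how a filesystem might opt in (the other a_ops entries are placeholders, not part of this patch):

/*
 * Sketch only: opt a mapping into memory-failure truncation by
 * pointing .error_remove_page at the generic helper. The remaining
 * fields stand in for whatever the filesystem already provides.
 */
static const struct address_space_operations example_aops = {
	.readpage		= simple_readpage,
	.write_begin		= simple_write_begin,
	.write_end		= simple_write_end,
	.error_remove_page	= generic_error_remove_page,
};
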
427
428/*
429 * Dirty pagecache page.
430 * Issues: when the error hits a hole page the error is not
431 * propagated.
432 */
433static int me_pagecache_dirty(struct page *p, unsigned long pfn)
434{
435 struct address_space *mapping = page_mapping(p);
436
437 SetPageError(p);
438 /* TBD: print more information about the file. */
439 if (mapping) {
440 /*
441 * IO error will be reported by write(), fsync(), etc.
442 * who check the mapping.
443 * This way the application knows that something went
444 * wrong with its dirty file data.
445 *
446 * There's one open issue:
447 *
448 * The EIO will only be reported on the next IO
449 * operation and then cleared through the IO map.
450 * Normally Linux has two mechanisms to pass IO error
451 * first through the AS_EIO flag in the address space
452 * and then through the PageError flag in the page.
453 * Since we drop pages on memory failure handling the
454 * only mechanism open to use is through AS_EIO.
455 *
456 * This has the disadvantage that it gets cleared on
457 * the first operation that returns an error, while
458 * the PageError bit is more sticky and only cleared
459 * when the page is reread or dropped. If an
460 * application assumes it will always get error on
461 * fsync, but does other operations on the fd before
462 * and the page is dropped in between, then the error
463 * will not be properly reported.
464 *
465 * This can already happen even without hwpoisoned
466 * pages: first on metadata IO errors (which only
467 * report through AS_EIO) or when the page is dropped
468 * at the wrong time.
469 *
470 * So right now we assume that the application does the right thing
471 * on the first EIO, but we're not worse than other parts
472 * of the kernel.
473 */
474 mapping_set_error(mapping, EIO);
475 }
476
477 return me_pagecache_clean(p, pfn);
478}
479
480/*
481 * Clean and dirty swap cache.
482 *
483 * Dirty swap cache page is tricky to handle. The page could live both in page
484 * cache and swap cache (i.e. the page was freshly swapped in). So it could be
485 * referenced concurrently by 2 types of PTEs:
486 * normal PTEs and swap PTEs. We try to handle them consistently by calling
487 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
488 * and then
489 * - clear dirty bit to prevent IO
490 * - remove from LRU
491 * - but keep in the swap cache, so that when we return to it on
492 * a later page fault, we know the application is accessing
493 * corrupted data and shall be killed (we installed simple
494 * interception code in do_swap_page to catch it).
495 *
496 * Clean swap cache pages can be directly isolated. A later page fault will
497 * bring in the known good data from disk.
498 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{
501 int ret = FAILED;
502
503 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p);
506
507 if (!isolate_lru_page(p)) {
508 page_cache_release(p);
509 ret = DELAYED;
510 }
511
512 return ret;
513}
514
515static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p);
524 return ret;
525}
526
527/*
528 * Huge pages. Needs work.
529 * Issues:
530 * No rmap support so we cannot find the original mapper. In theory could walk
531 * all MMs and look for the mappings, but that would be non atomic and racy.
532 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
533 * like just walking the current process and hoping it has it mapped (that
534 * should usually be true for the common "shared database cache" case).
535 * Should handle free huge pages and dequeue them too, but this needs to
536 * handle huge page accounting correctly.
537 */
538static int me_huge_page(struct page *p, unsigned long pfn)
539{
540 return FAILED;
541}
542
543/*
544 * Various page states we can handle.
545 *
546 * A page state is defined by its current page->flags bits.
547 * The table matches them in order and calls the right handler.
548 *
549 * This is quite tricky because we can access the page at any time
550 * in its life cycle, so all accesses have to be extremely careful.
551 *
552 * This is not complete. More states could be added.
553 * For any missing state don't attempt recovery.
554 */
555
556#define dirty (1UL << PG_dirty)
557#define sc (1UL << PG_swapcache)
558#define unevict (1UL << PG_unevictable)
559#define mlock (1UL << PG_mlocked)
560#define writeback (1UL << PG_writeback)
561#define lru (1UL << PG_lru)
562#define swapbacked (1UL << PG_swapbacked)
563#define head (1UL << PG_head)
564#define tail (1UL << PG_tail)
565#define compound (1UL << PG_compound)
566#define slab (1UL << PG_slab)
567#define buddy (1UL << PG_buddy)
568#define reserved (1UL << PG_reserved)
569
570static struct page_state {
571 unsigned long mask;
572 unsigned long res;
573 char *msg;
574 int (*action)(struct page *p, unsigned long pfn);
575} error_states[] = {
576 { reserved, reserved, "reserved kernel", me_ignore },
577 { buddy, buddy, "free kernel", me_free },
578
579 /*
580 * Could in theory check if slab page is free or if we can drop
581 * currently unused objects without touching them. But just
582 * treat it as standard kernel for now.
583 */
584 { slab, slab, "kernel slab", me_kernel },
585
586#ifdef CONFIG_PAGEFLAGS_EXTENDED
587 { head, head, "huge", me_huge_page },
588 { tail, tail, "huge", me_huge_page },
589#else
590 { compound, compound, "huge", me_huge_page },
591#endif
592
593 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
594 { sc|dirty, sc, "swapcache", me_swapcache_clean },
595
596 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
597 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
598
599#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
600 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
601 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
602#endif
603
604 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
605 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
606 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
607
608 /*
609 * Catchall entry: must be at end.
610 */
611 { 0, 0, "unknown page state", me_unknown },
612};
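
An entry matches when (page->flags & mask) == res, and the table is scanned top to bottom, so more specific states must come before more general ones. As a worked example, a page with PG_lru and PG_dirty set selects the "LRU"/me_pagecache_dirty entry, while a page with only PG_lru falls through to "clean LRU"/me_pagecache_clean. A small sketch of that lookup (illustrative only, it mirrors the loop in __memory_failure() further down):

/*
 * Sketch only: resolve a flags word against error_states.
 * find_page_state(lru | dirty) returns the "LRU" entry;
 * find_page_state(lru) returns "clean LRU".
 */
static struct page_state *find_page_state(unsigned long flags)
{
	struct page_state *ps;

	for (ps = error_states; ; ps++)
		if ((flags & ps->mask) == ps->res)
			return ps;	/* the catchall entry always matches */
}
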
613
614#undef lru
615
616static void action_result(unsigned long pfn, char *msg, int result)
617{
618 struct page *page = NULL;
619 if (pfn_valid(pfn))
620 page = pfn_to_page(pfn);
621
622 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
623 pfn,
624 page && PageDirty(page) ? "dirty " : "",
625 msg, action_name[result]);
626}
627
628static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref)
630{
631 int result;
632
633 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref)
636 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1);
639
640 /* Could do more checks here if page looks ok */
641 /*
642 * Could adjust zone counters here to correct for the missing page.
643 */
644
645 return result == RECOVERED ? 0 : -EBUSY;
646}
647
648#define N_UNMAP_TRIES 5
649
650/*
651 * Do all that is necessary to remove user space mappings. Unmap
652 * the pages and send SIGBUS to the processes if the data was dirty.
653 */
654static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
655 int trapno)
656{
657 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
658 struct address_space *mapping;
659 LIST_HEAD(tokill);
660 int ret;
661 int i;
662 int kill = 1;
663
664 if (PageReserved(p) || PageCompound(p) || PageSlab(p))
665 return;
666
667 if (!PageLRU(p))
668 lru_add_drain_all();
669
670 /*
671 * This check implies we don't kill processes if their pages
672 * are in the swap cache early. Those are always late kills.
673 */
674 if (!page_mapped(p))
675 return;
676
677 if (PageSwapCache(p)) {
678 printk(KERN_ERR
679 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
680 ttu |= TTU_IGNORE_HWPOISON;
681 }
682
683 /*
684 * Propagate the dirty bit from PTEs to struct page first, because we
685 * need this to decide if we should kill or just drop the page.
686 */
687 mapping = page_mapping(p);
688 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
689 if (page_mkclean(p)) {
690 SetPageDirty(p);
691 } else {
692 kill = 0;
693 ttu |= TTU_IGNORE_HWPOISON;
694 printk(KERN_INFO
695 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
696 pfn);
697 }
698 }
699
700 /*
701 * First collect all the processes that have the page
702 * mapped in dirty form. This has to be done before try_to_unmap,
703 * because ttu takes the rmap data structures down.
704 *
705 * Error handling: We ignore errors here because
706 * there's nothing that can be done.
707 */
708 if (kill)
709 collect_procs(p, &tokill);
710
711 /*
712 * try_to_unmap can fail temporarily due to races.
713 * Try a few times (RED-PEN better strategy?)
714 */
715 for (i = 0; i < N_UNMAP_TRIES; i++) {
716 ret = try_to_unmap(p, ttu);
717 if (ret == SWAP_SUCCESS)
718 break;
719 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
720 }
721
722 if (ret != SWAP_SUCCESS)
723 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
724 pfn, page_mapcount(p));
725
726 /*
727 * Now that the dirty bit has been propagated to the
728 * struct page and all unmaps done we can decide if
729 * killing is needed or not. Only kill when the page
730 * was dirty, otherwise the tokill list is merely
731 * freed. When there was a problem unmapping earlier,
732 * use a more forceful uncatchable kill to prevent
733 * any accesses to the poisoned memory.
734 */
735 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
736 ret != SWAP_SUCCESS, pfn);
737}
738
739int __memory_failure(unsigned long pfn, int trapno, int ref)
740{
741 struct page_state *ps;
742 struct page *p;
743 int res;
744
745 if (!sysctl_memory_failure_recovery)
746 panic("Memory failure from trap %d on page %lx", trapno, pfn);
747
748 if (!pfn_valid(pfn)) {
749 action_result(pfn, "memory outside kernel control", IGNORED);
750 return -EIO;
751 }
752
753 p = pfn_to_page(pfn);
754 if (TestSetPageHWPoison(p)) {
755 action_result(pfn, "already hardware poisoned", IGNORED);
756 return 0;
757 }
758
759 atomic_long_add(1, &mce_bad_pages);
760
761 /*
762 * We can do nothing about count=0 pages, nor do we need to:
763 * 1) it's a free page, and therefore in safe hands:
764 * prep_new_page() will be the gate keeper.
765 * 2) it's part of a non-compound high order page.
766 * Implies some kernel user: cannot stop them from
767 * R/W the page; let's pray that the page has been
768 * used and will be freed some time later.
769 * In fact it's dangerous to directly bump up page count from 0,
770 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
771 */
772 if (!get_page_unless_zero(compound_head(p))) {
773 action_result(pfn, "free or high order kernel", IGNORED);
774 return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
775 }
776
777 /*
778 * Lock the page and wait for writeback to finish.
779 * It's very difficult to mess with pages currently under IO
780 * and in many cases impossible, so we just avoid it here.
781 */
782 lock_page_nosync(p);
783 wait_on_page_writeback(p);
784
785 /*
786 * Now take care of user space mappings.
787 */
788 hwpoison_user_mappings(p, pfn, trapno);
789
790 /*
791 * Torn down by someone else?
792 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0;
796 goto out;
797 }
798
799 res = -EBUSY;
800 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref);
803 break;
804 }
805 }
806out:
807 unlock_page(p);
808 return res;
809}
810EXPORT_SYMBOL_GPL(__memory_failure);
811
812/**
813 * memory_failure - Handle memory failure of a page.
814 * @pfn: Page frame number of the corrupted page
815 * @trapno: Trap number reported in the signal to user space.
816 *
817 * This function is called by the low level machine check code
818 * of an architecture when it detects hardware memory corruption
819 * of a page. It tries its best to recover, which includes
820 * dropping pages, killing processes etc.
821 *
822 * The function is primarily of use for corruptions that
823 * happen outside the current execution context (e.g. when
824 * detected by a background scrubber).
825 *
826 * Must run in process context (e.g. a work queue) with interrupts
827 * enabled and no spinlocks held.
828 */
829void memory_failure(unsigned long pfn, int trapno)
830{
831 __memory_failure(pfn, trapno, 0);
832}
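
memory_failure() must be called from process context with interrupts enabled, as the kernel-doc above states. A hedged sketch of how a machine check path could defer into it through a work queue; the example_* names and the single-slot hand-off are illustrative assumptions, not part of this patch:

/* Sketch only: defer the poison report out of the (atomic) MCE handler. */
#include <linux/workqueue.h>
#include <linux/mm.h>

static unsigned long example_poison_pfn;	/* single-slot hand-off, no queueing */
static int example_poison_trapno;

static void example_memory_failure_work(struct work_struct *work)
{
	/* Work queue context: may sleep, interrupts enabled, no locks held. */
	memory_failure(example_poison_pfn, example_poison_trapno);
}

static DECLARE_WORK(example_poison_work, example_memory_failure_work);

/* Called from the machine check handler itself: */
static void example_report_poison(unsigned long pfn, int trapno)
{
	example_poison_pfn = pfn;
	example_poison_trapno = trapno;
	schedule_work(&example_poison_work);
}
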
diff --git a/mm/memory.c b/mm/memory.c
index b1443ac07c00..987389a809e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1325,7 +1325,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1325 if (ret & VM_FAULT_ERROR) { 1325 if (ret & VM_FAULT_ERROR) {
1326 if (ret & VM_FAULT_OOM) 1326 if (ret & VM_FAULT_OOM)
1327 return i ? i : -ENOMEM; 1327 return i ? i : -ENOMEM;
1328 else if (ret & VM_FAULT_SIGBUS) 1328 if (ret &
1329 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1329 return i ? i : -EFAULT; 1330 return i ? i : -EFAULT;
1330 BUG(); 1331 BUG();
1331 } 1332 }
@@ -2559,8 +2560,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2559 goto out; 2560 goto out;
2560 2561
2561 entry = pte_to_swp_entry(orig_pte); 2562 entry = pte_to_swp_entry(orig_pte);
2562 if (is_migration_entry(entry)) { 2563 if (unlikely(non_swap_entry(entry))) {
2563 migration_entry_wait(mm, pmd, address); 2564 if (is_migration_entry(entry)) {
2565 migration_entry_wait(mm, pmd, address);
2566 } else if (is_hwpoison_entry(entry)) {
2567 ret = VM_FAULT_HWPOISON;
2568 } else {
2569 print_bad_pte(vma, address, orig_pte, NULL);
2570 ret = VM_FAULT_OOM;
2571 }
2564 goto out; 2572 goto out;
2565 } 2573 }
2566 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2574 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2584,6 +2592,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2584 /* Had to read the page from swap area: Major fault */ 2592 /* Had to read the page from swap area: Major fault */
2585 ret = VM_FAULT_MAJOR; 2593 ret = VM_FAULT_MAJOR;
2586 count_vm_event(PGMAJFAULT); 2594 count_vm_event(PGMAJFAULT);
2595 } else if (PageHWPoison(page)) {
2596 ret = VM_FAULT_HWPOISON;
2597 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2598 goto out;
2587 } 2599 }
2588 2600
2589 lock_page(page); 2601 lock_page(page);
@@ -2760,6 +2772,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2760 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2772 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2761 return ret; 2773 return ret;
2762 2774
2775 if (unlikely(PageHWPoison(vmf.page))) {
2776 if (ret & VM_FAULT_LOCKED)
2777 unlock_page(vmf.page);
2778 return VM_FAULT_HWPOISON;
2779 }
2780
2763 /* 2781 /*
2764 * For consistency in subsequent calls, make the faulted page always 2782 * For consistency in subsequent calls, make the faulted page always
2765 * locked. 2783 * locked.
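
The new VM_FAULT_HWPOISON bit is meant to be turned into a SIGBUS by the architecture fault handler; the corresponding arch/x86/mm/fault.c change is part of this series but not shown in this excerpt. A hedged sketch of that shape, with the siginfo details (BUS_MCEERR_AR in particular) treated as assumptions:

/* Sketch only: surface a poisoned fault to user space as SIGBUS. */
static void example_hwpoison_sigbus(unsigned long address)
{
	struct siginfo info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_MCEERR_AR;	/* assumed: "action required" machine check */
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, current);
}
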
diff --git a/mm/migrate.c b/mm/migrate.c
index 16052e80aaac..1a4bf4813780 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
675 } 675 }
676 676
677 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
678 try_to_unmap(page, 1); 678 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
679 679
680skip_unmap: 680skip_unmap:
681 if (!page_mapped(page)) 681 if (!page_mapped(page))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f378dd58802..d99664e8607e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -155,37 +155,37 @@ static void update_completion_period(void)
155} 155}
156 156
157int dirty_background_ratio_handler(struct ctl_table *table, int write, 157int dirty_background_ratio_handler(struct ctl_table *table, int write,
158 struct file *filp, void __user *buffer, size_t *lenp, 158 void __user *buffer, size_t *lenp,
159 loff_t *ppos) 159 loff_t *ppos)
160{ 160{
161 int ret; 161 int ret;
162 162
163 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 163 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
164 if (ret == 0 && write) 164 if (ret == 0 && write)
165 dirty_background_bytes = 0; 165 dirty_background_bytes = 0;
166 return ret; 166 return ret;
167} 167}
168 168
169int dirty_background_bytes_handler(struct ctl_table *table, int write, 169int dirty_background_bytes_handler(struct ctl_table *table, int write,
170 struct file *filp, void __user *buffer, size_t *lenp, 170 void __user *buffer, size_t *lenp,
171 loff_t *ppos) 171 loff_t *ppos)
172{ 172{
173 int ret; 173 int ret;
174 174
175 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 175 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
176 if (ret == 0 && write) 176 if (ret == 0 && write)
177 dirty_background_ratio = 0; 177 dirty_background_ratio = 0;
178 return ret; 178 return ret;
179} 179}
180 180
181int dirty_ratio_handler(struct ctl_table *table, int write, 181int dirty_ratio_handler(struct ctl_table *table, int write,
182 struct file *filp, void __user *buffer, size_t *lenp, 182 void __user *buffer, size_t *lenp,
183 loff_t *ppos) 183 loff_t *ppos)
184{ 184{
185 int old_ratio = vm_dirty_ratio; 185 int old_ratio = vm_dirty_ratio;
186 int ret; 186 int ret;
187 187
188 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 188 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
189 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 189 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
190 update_completion_period(); 190 update_completion_period();
191 vm_dirty_bytes = 0; 191 vm_dirty_bytes = 0;
@@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
195 195
196 196
197int dirty_bytes_handler(struct ctl_table *table, int write, 197int dirty_bytes_handler(struct ctl_table *table, int write,
198 struct file *filp, void __user *buffer, size_t *lenp, 198 void __user *buffer, size_t *lenp,
199 loff_t *ppos) 199 loff_t *ppos)
200{ 200{
201 unsigned long old_bytes = vm_dirty_bytes; 201 unsigned long old_bytes = vm_dirty_bytes;
202 int ret; 202 int ret;
203 203
204 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 204 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
205 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 205 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
206 update_completion_period(); 206 update_completion_period();
207 vm_dirty_ratio = 0; 207 vm_dirty_ratio = 0;
@@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
686 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 686 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
687 */ 687 */
688int dirty_writeback_centisecs_handler(ctl_table *table, int write, 688int dirty_writeback_centisecs_handler(ctl_table *table, int write,
689 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 689 void __user *buffer, size_t *length, loff_t *ppos)
690{ 690{
691 proc_dointvec(table, write, file, buffer, length, ppos); 691 proc_dointvec(table, write, buffer, length, ppos);
692 return 0; 692 return 0;
693} 693}
694 694
@@ -1149,6 +1149,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1149EXPORT_SYMBOL(redirty_page_for_writepage); 1149EXPORT_SYMBOL(redirty_page_for_writepage);
1150 1150
1151/* 1151/*
1152 * Dirty a page.
1153 *
1154 * For pages with a mapping this should be done under the page lock
1155 * for the benefit of asynchronous memory errors, which prefer a consistent
1156 * dirty state. This rule can be broken in some special cases,
1157 * but it is better not to.
1158 *
1152 * If the mapping doesn't provide a set_page_dirty a_op, then 1159 * If the mapping doesn't provide a set_page_dirty a_op, then
1153 * just fall through and assume that it wants buffer_heads. 1160 * just fall through and assume that it wants buffer_heads.
1154 */ 1161 */
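
The same mechanical signature change repeats through the sysctl handlers in the rest of this patch: the unused struct file * argument is dropped from the handler and from the proc_do*() helpers it calls. A minimal sketch of a handler in the new style; the my_knob variable and the pr_debug message are only illustrative:

/* Sketch only: a proc handler after the struct file * removal. */
static int my_knob;

static int my_knob_handler(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		pr_debug("my_knob is now %d\n", my_knob);
	return ret;
}
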
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5717f27a0704..bf720550b44d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
234 static unsigned long nr_shown; 234 static unsigned long nr_shown;
235 static unsigned long nr_unshown; 235 static unsigned long nr_unshown;
236 236
237 /* Don't complain about poisoned pages */
238 if (PageHWPoison(page)) {
239 __ClearPageBuddy(page);
240 return;
241 }
242
237 /* 243 /*
238 * Allow a burst of 60 reports, then keep quiet for that minute; 244 * Allow a burst of 60 reports, then keep quiet for that minute;
239 * or allow a steady drip of one report per second. 245 * or allow a steady drip of one report per second.
@@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
666/* 672/*
667 * This page is about to be returned from the page allocator 673 * This page is about to be returned from the page allocator
668 */ 674 */
669static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 675static inline int check_new_page(struct page *page)
670{ 676{
671 if (unlikely(page_mapcount(page) | 677 if (unlikely(page_mapcount(page) |
672 (page->mapping != NULL) | 678 (page->mapping != NULL) |
@@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
675 bad_page(page); 681 bad_page(page);
676 return 1; 682 return 1;
677 } 683 }
684 return 0;
685}
686
687static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
688{
689 int i;
690
691 for (i = 0; i < (1 << order); i++) {
692 struct page *p = page + i;
693 if (unlikely(check_new_page(p)))
694 return 1;
695 }
678 696
679 set_page_private(page, 0); 697 set_page_private(page, 0);
680 set_page_refcounted(page); 698 set_page_refcounted(page);
@@ -2373,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
2373 * sysctl handler for numa_zonelist_order 2391 * sysctl handler for numa_zonelist_order
2374 */ 2392 */
2375int numa_zonelist_order_handler(ctl_table *table, int write, 2393int numa_zonelist_order_handler(ctl_table *table, int write,
2376 struct file *file, void __user *buffer, size_t *length, 2394 void __user *buffer, size_t *length,
2377 loff_t *ppos) 2395 loff_t *ppos)
2378{ 2396{
2379 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2397 char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2382,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2382 if (write) 2400 if (write)
2383 strncpy(saved_string, (char*)table->data, 2401 strncpy(saved_string, (char*)table->data,
2384 NUMA_ZONELIST_ORDER_LEN); 2402 NUMA_ZONELIST_ORDER_LEN);
2385 ret = proc_dostring(table, write, file, buffer, length, ppos); 2403 ret = proc_dostring(table, write, buffer, length, ppos);
2386 if (ret) 2404 if (ret)
2387 return ret; 2405 return ret;
2388 if (write) { 2406 if (write) {
@@ -4706,9 +4724,9 @@ module_init(init_per_zone_wmark_min)
4706 * changes. 4724 * changes.
4707 */ 4725 */
4708int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4726int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4709 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4727 void __user *buffer, size_t *length, loff_t *ppos)
4710{ 4728{
4711 proc_dointvec(table, write, file, buffer, length, ppos); 4729 proc_dointvec(table, write, buffer, length, ppos);
4712 if (write) 4730 if (write)
4713 setup_per_zone_wmarks(); 4731 setup_per_zone_wmarks();
4714 return 0; 4732 return 0;
@@ -4716,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4716 4734
4717#ifdef CONFIG_NUMA 4735#ifdef CONFIG_NUMA
4718int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4736int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4719 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4737 void __user *buffer, size_t *length, loff_t *ppos)
4720{ 4738{
4721 struct zone *zone; 4739 struct zone *zone;
4722 int rc; 4740 int rc;
4723 4741
4724 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4742 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4725 if (rc) 4743 if (rc)
4726 return rc; 4744 return rc;
4727 4745
@@ -4732,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4732} 4750}
4733 4751
4734int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4752int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4735 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4753 void __user *buffer, size_t *length, loff_t *ppos)
4736{ 4754{
4737 struct zone *zone; 4755 struct zone *zone;
4738 int rc; 4756 int rc;
4739 4757
4740 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4758 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4741 if (rc) 4759 if (rc)
4742 return rc; 4760 return rc;
4743 4761
@@ -4758,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4758 * if in function of the boot time zone sizes. 4776 * if in function of the boot time zone sizes.
4759 */ 4777 */
4760int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4778int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4761 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4779 void __user *buffer, size_t *length, loff_t *ppos)
4762{ 4780{
4763 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4781 proc_dointvec_minmax(table, write, buffer, length, ppos);
4764 setup_per_zone_lowmem_reserve(); 4782 setup_per_zone_lowmem_reserve();
4765 return 0; 4783 return 0;
4766} 4784}
@@ -4772,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4772 */ 4790 */
4773 4791
4774int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4792int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4775 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4793 void __user *buffer, size_t *length, loff_t *ppos)
4776{ 4794{
4777 struct zone *zone; 4795 struct zone *zone;
4778 unsigned int cpu; 4796 unsigned int cpu;
4779 int ret; 4797 int ret;
4780 4798
4781 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4799 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4782 if (!write || (ret == -EINVAL)) 4800 if (!write || (ret == -EINVAL))
4783 return ret; 4801 return ret;
4784 for_each_populated_zone(zone) { 4802 for_each_populated_zone(zone) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 720fc03a7bc4..28aafe2b5306 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 *
40 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock
39 */ 44 */
40 45
41#include <linux/mm.h> 46#include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 196 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 197
193 */ 198 */
194static struct anon_vma *page_lock_anon_vma(struct page *page) 199struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 200{
196 struct anon_vma *anon_vma; 201 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 202 unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
211 return NULL; 216 return NULL;
212} 217}
213 218
214static void page_unlock_anon_vma(struct anon_vma *anon_vma) 219void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 220{
216 spin_unlock(&anon_vma->lock); 221 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 222 rcu_read_unlock();
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
311 * if the page is not mapped into the page tables of this VMA. Only 316 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs. 317 * valid for normal file or anonymous VMAs.
313 */ 318 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 319int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{ 320{
316 unsigned long address; 321 unsigned long address;
317 pte_t *pte; 322 pte_t *pte;
@@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page)
756 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
757 */ 762 */
758static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
759 int migration) 764 enum ttu_flags flags)
760{ 765{
761 struct mm_struct *mm = vma->vm_mm; 766 struct mm_struct *mm = vma->vm_mm;
762 unsigned long address; 767 unsigned long address;
@@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
778 * If it's recently referenced (perhaps page_referenced 783 * If it's recently referenced (perhaps page_referenced
779 * skipped over this mm) then we should reactivate it. 784 * skipped over this mm) then we should reactivate it.
780 */ 785 */
781 if (!migration) { 786 if (!(flags & TTU_IGNORE_MLOCK)) {
782 if (vma->vm_flags & VM_LOCKED) { 787 if (vma->vm_flags & VM_LOCKED) {
783 ret = SWAP_MLOCK; 788 ret = SWAP_MLOCK;
784 goto out_unmap; 789 goto out_unmap;
785 } 790 }
791 }
792 if (!(flags & TTU_IGNORE_ACCESS)) {
786 if (ptep_clear_flush_young_notify(vma, address, pte)) { 793 if (ptep_clear_flush_young_notify(vma, address, pte)) {
787 ret = SWAP_FAIL; 794 ret = SWAP_FAIL;
788 goto out_unmap; 795 goto out_unmap;
@@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
800 /* Update high watermark before we lower rss */ 807 /* Update high watermark before we lower rss */
801 update_hiwater_rss(mm); 808 update_hiwater_rss(mm);
802 809
803 if (PageAnon(page)) { 810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss);
813 else
814 dec_mm_counter(mm, file_rss);
815 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) {
804 swp_entry_t entry = { .val = page_private(page) }; 818 swp_entry_t entry = { .val = page_private(page) };
805 819
806 if (PageSwapCache(page)) { 820 if (PageSwapCache(page)) {
@@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822 * pte. do_swap_page() will wait until the migration 836 * pte. do_swap_page() will wait until the migration
823 * pte is removed and then restart fault handling. 837 * pte is removed and then restart fault handling.
824 */ 838 */
825 BUG_ON(!migration); 839 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
826 entry = make_migration_entry(page, pte_write(pteval)); 840 entry = make_migration_entry(page, pte_write(pteval));
827 } 841 }
828 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 842 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
829 BUG_ON(pte_file(*pte)); 843 BUG_ON(pte_file(*pte));
830 } else if (PAGE_MIGRATION && migration) { 844 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
831 /* Establish migration entry for a file page */ 845 /* Establish migration entry for a file page */
832 swp_entry_t entry; 846 swp_entry_t entry;
833 entry = make_migration_entry(page, pte_write(pteval)); 847 entry = make_migration_entry(page, pte_write(pteval));
@@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
996 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1010 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
997 * 'LOCKED. 1011 * 'LOCKED.
998 */ 1012 */
999static int try_to_unmap_anon(struct page *page, int unlock, int migration) 1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1000{ 1014{
1001 struct anon_vma *anon_vma; 1015 struct anon_vma *anon_vma;
1002 struct vm_area_struct *vma; 1016 struct vm_area_struct *vma;
1003 unsigned int mlocked = 0; 1017 unsigned int mlocked = 0;
1004 int ret = SWAP_AGAIN; 1018 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1005 1020
1006 if (MLOCK_PAGES && unlikely(unlock)) 1021 if (MLOCK_PAGES && unlikely(unlock))
1007 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1017 continue; /* must visit all unlocked vmas */ 1032 continue; /* must visit all unlocked vmas */
1018 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1019 } else { 1034 } else {
1020 ret = try_to_unmap_one(page, vma, migration); 1035 ret = try_to_unmap_one(page, vma, flags);
1021 if (ret == SWAP_FAIL || !page_mapped(page)) 1036 if (ret == SWAP_FAIL || !page_mapped(page))
1022 break; 1037 break;
1023 } 1038 }
@@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1041/** 1056/**
1042 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1057 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1043 * @page: the page to unmap/unlock 1058 * @page: the page to unmap/unlock
1044 * @unlock: request for unlock rather than unmap [unlikely] 1059 * @flags: action and flags
1045 * @migration: unmapping for migration - ignored if @unlock
1046 * 1060 *
1047 * Find all the mappings of a page using the mapping pointer and the vma chains 1061 * Find all the mappings of a page using the mapping pointer and the vma chains
1048 * contained in the address_space struct it points to. 1062 * contained in the address_space struct it points to.
@@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1054 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1068 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1055 * 'LOCKED. 1069 * 'LOCKED.
1056 */ 1070 */
1057static int try_to_unmap_file(struct page *page, int unlock, int migration) 1071static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1058{ 1072{
1059 struct address_space *mapping = page->mapping; 1073 struct address_space *mapping = page->mapping;
1060 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1074 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1066 unsigned long max_nl_size = 0; 1080 unsigned long max_nl_size = 0;
1067 unsigned int mapcount; 1081 unsigned int mapcount;
1068 unsigned int mlocked = 0; 1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1069 1084
1070 if (MLOCK_PAGES && unlikely(unlock)) 1085 if (MLOCK_PAGES && unlikely(unlock))
1071 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1078 continue; /* must visit all vmas */ 1093 continue; /* must visit all vmas */
1079 ret = SWAP_MLOCK; 1094 ret = SWAP_MLOCK;
1080 } else { 1095 } else {
1081 ret = try_to_unmap_one(page, vma, migration); 1096 ret = try_to_unmap_one(page, vma, flags);
1082 if (ret == SWAP_FAIL || !page_mapped(page)) 1097 if (ret == SWAP_FAIL || !page_mapped(page))
1083 goto out; 1098 goto out;
1084 } 1099 }
@@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1103 ret = SWAP_MLOCK; /* leave mlocked == 0 */ 1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1104 goto out; /* no need to look further */ 1119 goto out; /* no need to look further */
1105 } 1120 }
1106 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) 1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1107 continue; 1123 continue;
1108 cursor = (unsigned long) vma->vm_private_data; 1124 cursor = (unsigned long) vma->vm_private_data;
1109 if (cursor > max_nl_cursor) 1125 if (cursor > max_nl_cursor)
@@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1137 do { 1153 do {
1138 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1139 shared.vm_set.list) { 1155 shared.vm_set.list) {
1140 if (!MLOCK_PAGES && !migration && 1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1141 (vma->vm_flags & VM_LOCKED)) 1157 (vma->vm_flags & VM_LOCKED))
1142 continue; 1158 continue;
1143 cursor = (unsigned long) vma->vm_private_data; 1159 cursor = (unsigned long) vma->vm_private_data;
@@ -1177,7 +1193,7 @@ out:
1177/** 1193/**
1178 * try_to_unmap - try to remove all page table mappings to a page 1194 * try_to_unmap - try to remove all page table mappings to a page
1179 * @page: the page to get unmapped 1195 * @page: the page to get unmapped
1180 * @migration: migration flag 1196 * @flags: action and flags
1181 * 1197 *
1182 * Tries to remove all the page table entries which are mapping this 1198 * Tries to remove all the page table entries which are mapping this
1183 * page, used in the pageout path. Caller must hold the page lock. 1199 * page, used in the pageout path. Caller must hold the page lock.
@@ -1188,16 +1204,16 @@ out:
1188 * SWAP_FAIL - the page is unswappable 1204 * SWAP_FAIL - the page is unswappable
1189 * SWAP_MLOCK - page is mlocked. 1205 * SWAP_MLOCK - page is mlocked.
1190 */ 1206 */
1191int try_to_unmap(struct page *page, int migration) 1207int try_to_unmap(struct page *page, enum ttu_flags flags)
1192{ 1208{
1193 int ret; 1209 int ret;
1194 1210
1195 BUG_ON(!PageLocked(page)); 1211 BUG_ON(!PageLocked(page));
1196 1212
1197 if (PageAnon(page)) 1213 if (PageAnon(page))
1198 ret = try_to_unmap_anon(page, 0, migration); 1214 ret = try_to_unmap_anon(page, flags);
1199 else 1215 else
1200 ret = try_to_unmap_file(page, 0, migration); 1216 ret = try_to_unmap_file(page, flags);
1201 if (ret != SWAP_MLOCK && !page_mapped(page)) 1217 if (ret != SWAP_MLOCK && !page_mapped(page))
1202 ret = SWAP_SUCCESS; 1218 ret = SWAP_SUCCESS;
1203 return ret; 1219 return ret;
@@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page)
1222 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1223 1239
1224 if (PageAnon(page)) 1240 if (PageAnon(page))
1225 return try_to_unmap_anon(page, 1, 0); 1241 return try_to_unmap_anon(page, TTU_MUNLOCK);
1226 else 1242 else
1227 return try_to_unmap_file(page, 1, 0); 1243 return try_to_unmap_file(page, TTU_MUNLOCK);
1228} 1244}
1229 1245
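
try_to_unmap() now takes an action plus modifier bits instead of a bare migration flag. The enum itself lives in include/linux/rmap.h, which is not part of this excerpt; the sketch below is an assumed layout consistent with how TTU_ACTION() and the TTU_IGNORE_* bits are used in the hunks above:

/* Assumed shape of the try_to_unmap() flags (see include/linux/rmap.h). */
enum ttu_flags {
	TTU_UNMAP = 0,			/* unmap mode */
	TTU_MIGRATION = 1,		/* migration mode */
	TTU_MUNLOCK = 2,		/* munlock mode */
	TTU_ACTION_MASK = 0xff,

	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age the page */
	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted data is recoverable */
};

#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
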
diff --git a/mm/shmem.c b/mm/shmem.c
index b206a7a32e2a..98631c26c200 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1633,8 +1633,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1633 if (pos + copied > inode->i_size) 1633 if (pos + copied > inode->i_size)
1634 i_size_write(inode, pos + copied); 1634 i_size_write(inode, pos + copied);
1635 1635
1636 unlock_page(page);
1637 set_page_dirty(page); 1636 set_page_dirty(page);
1637 unlock_page(page);
1638 page_cache_release(page); 1638 page_cache_release(page);
1639 1639
1640 return copied; 1640 return copied;
@@ -1971,13 +1971,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1971 iput(inode); 1971 iput(inode);
1972 return error; 1972 return error;
1973 } 1973 }
1974 unlock_page(page);
1975 inode->i_mapping->a_ops = &shmem_aops; 1974 inode->i_mapping->a_ops = &shmem_aops;
1976 inode->i_op = &shmem_symlink_inode_operations; 1975 inode->i_op = &shmem_symlink_inode_operations;
1977 kaddr = kmap_atomic(page, KM_USER0); 1976 kaddr = kmap_atomic(page, KM_USER0);
1978 memcpy(kaddr, symname, len); 1977 memcpy(kaddr, symname, len);
1979 kunmap_atomic(kaddr, KM_USER0); 1978 kunmap_atomic(kaddr, KM_USER0);
1980 set_page_dirty(page); 1979 set_page_dirty(page);
1980 unlock_page(page);
1981 page_cache_release(page); 1981 page_cache_release(page);
1982 } 1982 }
1983 if (dir->i_mode & S_ISGID) 1983 if (dir->i_mode & S_ISGID)
@@ -2420,6 +2420,7 @@ static const struct address_space_operations shmem_aops = {
2420 .write_end = shmem_write_end, 2420 .write_end = shmem_write_end,
2421#endif 2421#endif
2422 .migratepage = migrate_page, 2422 .migratepage = migrate_page,
2423 .error_remove_page = generic_error_remove_page,
2423}; 2424};
2424 2425
2425static const struct file_operations shmem_file_operations = { 2426static const struct file_operations shmem_file_operations = {
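
The reordering above follows the rule added to mm/page-writeback.c earlier in this patch: when a page has a mapping, dirty it while still holding the page lock so an asynchronous memory error observes a consistent dirty state. A minimal sketch of that ordering, with the modification step left as a placeholder:

/* Sketch only: keep set_page_dirty() under the page lock. */
static void example_update_page(struct page *page)
{
	lock_page(page);
	/* ... modify page contents ... */
	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);	/* drop the reference from the earlier lookup */
}
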
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1bf19daadc6..4de7f02f820b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
699 struct swap_info_struct *p; 699 struct swap_info_struct *p;
700 struct page *page = NULL; 700 struct page *page = NULL;
701 701
702 if (is_migration_entry(entry)) 702 if (non_swap_entry(entry))
703 return 1; 703 return 1;
704 704
705 p = swap_info_get(entry); 705 p = swap_info_get(entry);
@@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
2085 int count; 2085 int count;
2086 bool has_cache; 2086 bool has_cache;
2087 2087
2088 if (is_migration_entry(entry)) 2088 if (non_swap_entry(entry))
2089 return -EINVAL; 2089 return -EINVAL;
2090 2090
2091 type = swp_type(entry); 2091 type = swp_type(entry);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..a17b3977cfdf 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
95 */ 95 */
96static void 96static int
97truncate_complete_page(struct address_space *mapping, struct page *page) 97truncate_complete_page(struct address_space *mapping, struct page *page)
98{ 98{
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return -EIO;
101 101
102 if (page_has_private(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
108 remove_from_page_cache(page); 108 remove_from_page_cache(page);
109 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
110 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
111 return 0;
111} 112}
112 113
113/* 114/*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
135 return ret; 136 return ret;
136} 137}
137 138
139int truncate_inode_page(struct address_space *mapping, struct page *page)
140{
141 if (page_mapped(page)) {
142 unmap_mapping_range(mapping,
143 (loff_t)page->index << PAGE_CACHE_SHIFT,
144 PAGE_CACHE_SIZE, 0);
145 }
146 return truncate_complete_page(mapping, page);
147}
148
149/*
150 * Used to get rid of pages on hardware memory corruption.
151 */
152int generic_error_remove_page(struct address_space *mapping, struct page *page)
153{
154 if (!mapping)
155 return -EINVAL;
156 /*
157 * Only punch for normal data pages for now.
158 * Handling other types like directories would need more auditing.
159 */
160 if (!S_ISREG(mapping->host->i_mode))
161 return -EIO;
162 return truncate_inode_page(mapping, page);
163}
164EXPORT_SYMBOL(generic_error_remove_page);
165
166/*
167 * Safely invalidate one page from its pagecache mapping.
168 * It only drops clean, unused pages. The page must be locked.
169 *
170 * Returns 1 if the page is successfully invalidated, otherwise 0.
171 */
172int invalidate_inode_page(struct page *page)
173{
174 struct address_space *mapping = page_mapping(page);
175 if (!mapping)
176 return 0;
177 if (PageDirty(page) || PageWriteback(page))
178 return 0;
179 if (page_mapped(page))
180 return 0;
181 return invalidate_complete_page(mapping, page);
182}
183
138/** 184/**
139 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 185 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
140 * @mapping: mapping to truncate 186 * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
196 unlock_page(page); 242 unlock_page(page);
197 continue; 243 continue;
198 } 244 }
199 if (page_mapped(page)) { 245 truncate_inode_page(mapping, page);
200 unmap_mapping_range(mapping,
201 (loff_t)page_index<<PAGE_CACHE_SHIFT,
202 PAGE_CACHE_SIZE, 0);
203 }
204 truncate_complete_page(mapping, page);
205 unlock_page(page); 246 unlock_page(page);
206 } 247 }
207 pagevec_release(&pvec); 248 pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 break; 279 break;
239 lock_page(page); 280 lock_page(page);
240 wait_on_page_writeback(page); 281 wait_on_page_writeback(page);
241 if (page_mapped(page)) { 282 truncate_inode_page(mapping, page);
242 unmap_mapping_range(mapping,
243 (loff_t)page->index<<PAGE_CACHE_SHIFT,
244 PAGE_CACHE_SIZE, 0);
245 }
246 if (page->index > next) 283 if (page->index > next)
247 next = page->index; 284 next = page->index;
248 next++; 285 next++;
249 truncate_complete_page(mapping, page);
250 unlock_page(page); 286 unlock_page(page);
251 } 287 }
252 pagevec_release(&pvec); 288 pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
311 if (lock_failed) 347 if (lock_failed)
312 continue; 348 continue;
313 349
314 if (PageDirty(page) || PageWriteback(page)) 350 ret += invalidate_inode_page(page);
315 goto unlock; 351
316 if (page_mapped(page))
317 goto unlock;
318 ret += invalidate_complete_page(mapping, page);
319unlock:
320 unlock_page(page); 352 unlock_page(page);
321 if (next > end) 353 if (next > end)
322 break; 354 break;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 613e89f471d9..1219ceb8a9b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
663 * processes. Try to unmap it here. 663 * processes. Try to unmap it here.
664 */ 664 */
665 if (page_mapped(page) && mapping) { 665 if (page_mapped(page) && mapping) {
666 switch (try_to_unmap(page, 0)) { 666 switch (try_to_unmap(page, TTU_UNMAP)) {
667 case SWAP_FAIL: 667 case SWAP_FAIL:
668 goto activate_locked; 668 goto activate_locked;
669 case SWAP_AGAIN: 669 case SWAP_AGAIN:
@@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1836 1836
1837#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1837#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1838 1838
1839unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1840 gfp_t gfp_mask, bool noswap,
1841 unsigned int swappiness,
1842 struct zone *zone, int nid)
1843{
1844 struct scan_control sc = {
1845 .may_writepage = !laptop_mode,
1846 .may_unmap = 1,
1847 .may_swap = !noswap,
1848 .swap_cluster_max = SWAP_CLUSTER_MAX,
1849 .swappiness = swappiness,
1850 .order = 0,
1851 .mem_cgroup = mem,
1852 .isolate_pages = mem_cgroup_isolate_pages,
1853 };
1854 nodemask_t nm = nodemask_of_node(nid);
1855
1856 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1857 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1858 sc.nodemask = &nm;
1859 sc.nr_reclaimed = 0;
1860 sc.nr_scanned = 0;
1861 /*
1862 * NOTE: Although we can get the priority field, using it
1863 * here is not a good idea, since it limits the pages we can scan.
1864 * If we don't reclaim here, the shrink_zone from balance_pgdat
1865 * will pick up pages from other mem cgroups as well. We hack
1866 * the priority and make it zero.
1867 */
1868 shrink_zone(0, zone, &sc);
1869 return sc.nr_reclaimed;
1870}
1871
1839unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1872unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1840 gfp_t gfp_mask, 1873 gfp_t gfp_mask,
1841 bool noswap, 1874 bool noswap,
1842 unsigned int swappiness) 1875 unsigned int swappiness)
1843{ 1876{
1877 struct zonelist *zonelist;
1844 struct scan_control sc = { 1878 struct scan_control sc = {
1845 .may_writepage = !laptop_mode, 1879 .may_writepage = !laptop_mode,
1846 .may_unmap = 1, 1880 .may_unmap = 1,
@@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1852 .isolate_pages = mem_cgroup_isolate_pages, 1886 .isolate_pages = mem_cgroup_isolate_pages,
1853 .nodemask = NULL, /* we don't care the placement */ 1887 .nodemask = NULL, /* we don't care the placement */
1854 }; 1888 };
1855 struct zonelist *zonelist;
1856 1889
1857 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1890 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1858 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1891 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1974,6 +2007,7 @@ loop_again:
1974 for (i = 0; i <= end_zone; i++) { 2007 for (i = 0; i <= end_zone; i++) {
1975 struct zone *zone = pgdat->node_zones + i; 2008 struct zone *zone = pgdat->node_zones + i;
1976 int nr_slab; 2009 int nr_slab;
2010 int nid, zid;
1977 2011
1978 if (!populated_zone(zone)) 2012 if (!populated_zone(zone))
1979 continue; 2013 continue;
@@ -1988,6 +2022,15 @@ loop_again:
1988 temp_priority[i] = priority; 2022 temp_priority[i] = priority;
1989 sc.nr_scanned = 0; 2023 sc.nr_scanned = 0;
1990 note_zone_scanning_priority(zone, priority); 2024 note_zone_scanning_priority(zone, priority);
2025
2026 nid = pgdat->node_id;
2027 zid = zone_idx(zone);
2028 /*
2029 * Call soft limit reclaim before calling shrink_zone.
2030 * For now we ignore the return value
2031 */
2032 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
2033 nid, zid);
1991 /* 2034 /*
1992 * We put equal pressure on every zone, unless one 2035 * We put equal pressure on every zone, unless one
1993 * zone has way too many pages free already. 2036 * zone has way too many pages free already.
@@ -2801,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
2801unsigned long scan_unevictable_pages; 2844unsigned long scan_unevictable_pages;
2802 2845
2803int scan_unevictable_handler(struct ctl_table *table, int write, 2846int scan_unevictable_handler(struct ctl_table *table, int write,
2804 struct file *file, void __user *buffer, 2847 void __user *buffer,
2805 size_t *length, loff_t *ppos) 2848 size_t *length, loff_t *ppos)
2806{ 2849{
2807 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 2850 proc_doulongvec_minmax(table, write, buffer, length, ppos);
2808 2851
2809 if (write && *(unsigned long *)table->data) 2852 if (write && *(unsigned long *)table->data)
2810 scan_all_zones_unevictable_pages(); 2853 scan_all_zones_unevictable_pages();
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 907a82e9023d..a16a2342f6bf 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -965,12 +965,12 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
965 965
966#ifdef CONFIG_SYSCTL 966#ifdef CONFIG_SYSCTL
967static 967static
968int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp, 968int brnf_sysctl_call_tables(ctl_table * ctl, int write,
969 void __user * buffer, size_t * lenp, loff_t * ppos) 969 void __user * buffer, size_t * lenp, loff_t * ppos)
970{ 970{
971 int ret; 971 int ret;
972 972
973 ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 973 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
974 974
975 if (write && *(int *)(ctl->data)) 975 if (write && *(int *)(ctl->data))
976 *(int *)(ctl->data) = 1; 976 *(int *)(ctl->data) = 1;
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 1c6a5bb6f0c8..6e1f085db06a 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -164,7 +164,7 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU
164static int min_priority[1]; 164static int min_priority[1];
165static int max_priority[] = { 127 }; /* From DECnet spec */ 165static int max_priority[] = { 127 }; /* From DECnet spec */
166 166
167static int dn_forwarding_proc(ctl_table *, int, struct file *, 167static int dn_forwarding_proc(ctl_table *, int,
168 void __user *, size_t *, loff_t *); 168 void __user *, size_t *, loff_t *);
169static int dn_forwarding_sysctl(ctl_table *table, 169static int dn_forwarding_sysctl(ctl_table *table,
170 void __user *oldval, size_t __user *oldlenp, 170 void __user *oldval, size_t __user *oldlenp,
@@ -274,7 +274,6 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
274} 274}
275 275
276static int dn_forwarding_proc(ctl_table *table, int write, 276static int dn_forwarding_proc(ctl_table *table, int write,
277 struct file *filep,
278 void __user *buffer, 277 void __user *buffer,
279 size_t *lenp, loff_t *ppos) 278 size_t *lenp, loff_t *ppos)
280{ 279{
@@ -290,7 +289,7 @@ static int dn_forwarding_proc(ctl_table *table, int write,
290 dn_db = dev->dn_ptr; 289 dn_db = dev->dn_ptr;
291 old = dn_db->parms.forwarding; 290 old = dn_db->parms.forwarding;
292 291
293 err = proc_dointvec(table, write, filep, buffer, lenp, ppos); 292 err = proc_dointvec(table, write, buffer, lenp, ppos);
294 293
295 if ((err >= 0) && write) { 294 if ((err >= 0) && write) {
296 if (dn_db->parms.forwarding < 0) 295 if (dn_db->parms.forwarding < 0)
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 5bcd592ae6dd..26b0ab1e9f56 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -165,7 +165,6 @@ static int dn_node_address_strategy(ctl_table *table,
165} 165}
166 166
167static int dn_node_address_handler(ctl_table *table, int write, 167static int dn_node_address_handler(ctl_table *table, int write,
168 struct file *filp,
169 void __user *buffer, 168 void __user *buffer,
170 size_t *lenp, loff_t *ppos) 169 size_t *lenp, loff_t *ppos)
171{ 170{
@@ -276,7 +275,6 @@ static int dn_def_dev_strategy(ctl_table *table,
276 275
277 276
278static int dn_def_dev_handler(ctl_table *table, int write, 277static int dn_def_dev_handler(ctl_table *table, int write,
279 struct file * filp,
280 void __user *buffer, 278 void __user *buffer,
281 size_t *lenp, loff_t *ppos) 279 size_t *lenp, loff_t *ppos)
282{ 280{
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 07336c6201f0..e92f1fd28aa5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1270,10 +1270,10 @@ static void inet_forward_change(struct net *net)
 }
 
 static int devinet_conf_proc(ctl_table *ctl, int write,
-			     struct file *filp, void __user *buffer,
+			     void __user *buffer,
			     size_t *lenp, loff_t *ppos)
 {
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write) {
 		struct ipv4_devconf *cnf = ctl->extra1;
@@ -1342,12 +1342,12 @@ static int devinet_conf_sysctl(ctl_table *table,
 }
 
 static int devinet_sysctl_forward(ctl_table *ctl, int write,
-				  struct file *filp, void __user *buffer,
+				  void __user *buffer,
				  size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write && *valp != val) {
 		struct net *net = ctl->extra2;
@@ -1372,12 +1372,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 }
 
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-			 struct file *filp, void __user *buffer,
+			 void __user *buffer,
			 size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 	struct net *net = ctl->extra2;
 
 	if (write && *valp != val)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index df9347314538..bb4199252026 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3036,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 
 #ifdef CONFIG_SYSCTL
 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
-					struct file *filp, void __user *buffer,
+					void __user *buffer,
					size_t *lenp, loff_t *ppos)
 {
 	if (write) {
@@ -3046,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 
 		memcpy(&ctl, __ctl, sizeof(ctl));
 		ctl.data = &flush_delay;
-		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
+		proc_dointvec(&ctl, write, buffer, lenp, ppos);
 
 		net = (struct net *)__ctl->extra1;
 		rt_cache_flush(net, flush_delay);
@@ -3106,12 +3106,11 @@ static void rt_secret_reschedule(int old)
 }
 
 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-					   struct file *filp,
					   void __user *buffer, size_t *lenp,
					   loff_t *ppos)
 {
 	int old = ip_rt_secret_interval;
-	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
 
 	rt_secret_reschedule(old);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4710d219f06a..2dcf04d9b005 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -36,7 +36,7 @@ static void set_local_port_range(int range[2])
 }
 
 /* Validate changes from /proc interface. */
-static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
+static int ipv4_local_port_range(ctl_table *table, int write,
				 void __user *buffer,
				 size_t *lenp, loff_t *ppos)
 {
@@ -51,7 +51,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
 	};
 
 	inet_get_local_port_range(range, range + 1);
-	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
@@ -91,7 +91,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table,
 }
 
 
-static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+static int proc_tcp_congestion_control(ctl_table *ctl, int write,
				       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char val[TCP_CA_NAME_MAX];
@@ -103,7 +103,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 
 	tcp_get_default_congestion_control(val);
 
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		ret = tcp_set_default_congestion_control(val);
 	return ret;
@@ -129,7 +129,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table,
 }
 
 static int proc_tcp_available_congestion_control(ctl_table *ctl,
-						 int write, struct file * filp,
+						 int write,
						 void __user *buffer, size_t *lenp,
						 loff_t *ppos)
 {
@@ -140,13 +140,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
 	if (!tbl.data)
 		return -ENOMEM;
 	tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	kfree(tbl.data);
 	return ret;
 }
 
 static int proc_allowed_congestion_control(ctl_table *ctl,
-					   int write, struct file * filp,
+					   int write,
					   void __user *buffer, size_t *lenp,
					   loff_t *ppos)
 {
@@ -158,7 +158,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 		return -ENOMEM;
 
 	tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
-	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		ret = tcp_set_allowed_congestion_control(tbl.data);
 	kfree(tbl.data);
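The string handlers above (proc_tcp_congestion_control and friends) share a temporary-table idiom: copy the current value into a local buffer, point a throwaway ctl_table at it, and let proc_dostring() handle the user-space copy. A hedged sketch of that idiom under the new signature (illustrative only; example_name and example_string_handler are invented names, not from this patch):

static char example_name[TCP_CA_NAME_MAX];

static int example_string_handler(ctl_table *ctl, int write,
				  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char val[TCP_CA_NAME_MAX];
	ctl_table tbl = {
		.data = val,
		.maxlen = TCP_CA_NAME_MAX,
	};
	int ret;

	/* snapshot the current value, then let the core copy to/from user space */
	strlcpy(val, example_name, sizeof(val));
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		strlcpy(example_name, val, sizeof(example_name));
	return ret;
}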
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 55f486d89c88..1fd0a3d775d2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3986,14 +3986,14 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 #ifdef CONFIG_SYSCTL
 
 static
-int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_forward(ctl_table *ctl, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write)
 		ret = addrconf_fixup_forwarding(ctl, valp, val);
@@ -4090,14 +4090,14 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
 }
 
 static
-int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_disable(ctl_table *ctl, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
 	int ret;
 
-	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write)
 		ret = addrconf_disable_ipv6(ctl, valp, val);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7015478797f6..498b9b0b0fad 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1735,7 +1735,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
 	}
 }
 
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net_device *dev = ctl->extra1;
 	struct inet6_dev *idev;
@@ -1746,16 +1746,16 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
 	ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
 
 	if (strcmp(ctl->procname, "retrans_time") == 0)
-		ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	else if (strcmp(ctl->procname, "base_reachable_time") == 0)
 		ret = proc_dointvec_jiffies(ctl, write,
-					    filp, buffer, lenp, ppos);
+					    buffer, lenp, ppos);
 
 	else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
 		 (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
 		ret = proc_dointvec_ms_jiffies(ctl, write,
-					       filp, buffer, lenp, ppos);
+					       buffer, lenp, ppos);
 	else
 		ret = -1;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 77aecbe8ff6c..d6fe7646a8ff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2524,13 +2524,13 @@ static const struct file_operations rt6_stats_seq_fops = {
 #ifdef CONFIG_SYSCTL
 
 static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
+int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net *net = current->nsproxy->net_ns;
 	int delay = net->ipv6.sysctl.flush_delay;
 	if (write) {
-		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		proc_dointvec(ctl, write, buffer, lenp, ppos);
 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
 		return 0;
 	} else
diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c
index 57f8817c3979..5c86567e5a78 100644
--- a/net/irda/irsysctl.c
+++ b/net/irda/irsysctl.c
@@ -73,12 +73,12 @@ static int min_lap_keepalive_time = 100;	/* 100us */
 /* For other sysctl, I've no idea of the range. Maybe Dag could help
  * us on that - Jean II */
 
-static int do_devname(ctl_table *table, int write, struct file *filp,
+static int do_devname(ctl_table *table, int write,
		      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_dostring(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dostring(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write) {
 		struct ias_value *val;
 
@@ -90,12 +90,12 @@ static int do_devname(ctl_table *table, int write, struct file *filp,
 }
 
 
-static int do_discovery(ctl_table *table, int write, struct file *filp,
+static int do_discovery(ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (ret)
 		return ret;
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fba2892b99e1..446e9bd4b4bc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1496,14 +1496,14 @@ static int ip_vs_zero_all(void)
 
 
 static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+proc_do_defense_mode(ctl_table *table, int write,
		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = table->data;
 	int val = *valp;
 	int rc;
 
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (write && (*valp != val)) {
 		if ((*valp < 0) || (*valp > 3)) {
 			/* Restore the correct value */
@@ -1517,7 +1517,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
 
 
 static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+proc_do_sync_threshold(ctl_table *table, int write,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *valp = table->data;
@@ -1527,7 +1527,7 @@ proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
 	/* backup the value first */
 	memcpy(val, valp, sizeof(val));
 
-	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
 		/* Restore the correct value */
 		memcpy(valp, val, sizeof(val));
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 4e620305f28c..c93494fef8ef 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -226,7 +226,7 @@ static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
 static struct ctl_table_header *nf_log_dir_header;
 
-static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
+static int nf_log_proc_dostring(ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	const struct nf_logger *logger;
@@ -260,7 +260,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
 			table->data = "NONE";
 		else
 			table->data = logger->name;
-		r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+		r = proc_dostring(table, write, buffer, lenp, ppos);
 		mutex_unlock(&nf_log_mutex);
 	}
 
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
index 7b5749ee2765..2220f3322326 100644
--- a/net/phonet/sysctl.c
+++ b/net/phonet/sysctl.c
@@ -56,7 +56,7 @@ void phonet_get_local_port_range(int *min, int *max)
 	} while (read_seqretry(&local_port_range_lock, seq));
 }
 
-static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
+static int proc_local_port_range(ctl_table *table, int write,
				void __user *buffer,
				size_t *lenp, loff_t *ppos)
 {
@@ -70,7 +70,7 @@ static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
 		.extra2 = &local_port_range_max,
 	};
 
-	ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
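The range handlers above (ipv4_local_port_range and proc_local_port_range) use a third variation: a temporary ctl_table whose extra1/extra2 bounds clamp what proc_dointvec_minmax() will accept, with the result copied back only after a successful, consistent write. A minimal sketch under the same assumptions as the earlier examples (example_range, example_min, example_max and example_range_handler are invented names; the usual linux/sysctl.h declarations are assumed):

static int example_range[2];
static int example_min = 1;
static int example_max = 65535;

static int example_range_handler(ctl_table *table, int write,
				 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int range[2] = { example_range[0], example_range[1] };
	ctl_table tmp = {
		.data = &range,
		.maxlen = sizeof(range),
		.mode = table->mode,
		.extra1 = &example_min,
		.extra2 = &example_max,
	};
	int ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0) {
		/* reject an inverted range, otherwise commit the new values */
		if (range[1] < range[0])
			ret = -EINVAL;
		else {
			example_range[0] = range[0];
			example_range[1] = range[1];
		}
	}
	return ret;
}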
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 5231f7aaac0e..42f9748ae093 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -56,7 +56,7 @@ rpc_unregister_sysctl(void)
 	}
 }
 
-static int proc_do_xprt(ctl_table *table, int write, struct file *file,
+static int proc_do_xprt(ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char tmpbuf[256];
@@ -71,7 +71,7 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file,
 }
 
 static int
-proc_dodebug(ctl_table *table, int write, struct file *file,
+proc_dodebug(ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char tmpbuf[20], c, *s;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 87101177825b..35fb68b9c8ec 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -80,7 +80,7 @@ struct kmem_cache *svc_rdma_ctxt_cachep;
  * current value.
  */
 static int read_reset_stat(ctl_table *table, int write,
-			   struct file *filp, void __user *buffer, size_t *lenp,
+			   void __user *buffer, size_t *lenp,
			   loff_t *ppos)
 {
 	atomic_t *stat = (atomic_t *)table->data;
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index b8186bac8b7e..6cf8fd2b79e8 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,7 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 struct cgroup_subsys devices_subsys;
 
 static int devcgroup_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *new_cgroup, struct task_struct *task)
+		struct cgroup *new_cgroup, struct task_struct *task,
+		bool threadgroup)
 {
 	if (current != task && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
diff --git a/security/min_addr.c b/security/min_addr.c
index 14cc7b3b8d03..c844eed7915d 100644
--- a/security/min_addr.c
+++ b/security/min_addr.c
@@ -28,12 +28,12 @@ static void update_mmap_min_addr(void)
  * sysctl handler which just sets dac_mmap_min_addr = the new value and then
  * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly
  */
-int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+int mmap_min_addr_handler(struct ctl_table *table, int write,
			  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 
-	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
 	update_mmap_min_addr();
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 417f7c994522..bb230d5d7085 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2411,7 +2411,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 	/* Wake up the parent if it is waiting so that it can recheck
 	 * wait permission to the new task SID. */
 	read_lock(&tasklist_lock);
-	wake_up_interruptible(&current->real_parent->signal->wait_chldexit);
+	__wake_up_parent(current, current->real_parent);
 	read_unlock(&tasklist_lock);
 }
 