-rw-r--r--  Documentation/SubmitChecklist | 4
-rw-r--r--  Documentation/SubmittingPatches | 3
-rw-r--r--  Documentation/cpu-hotplug.txt | 9
-rw-r--r--  Documentation/device-mapper/delay.txt | 26
-rw-r--r--  Documentation/fb/arkfb.txt | 68
-rw-r--r--  Documentation/fb/vt8623fb.txt | 64
-rw-r--r--  Documentation/md.txt | 72
-rw-r--r--  Documentation/power/userland-swsusp.txt | 26
-rw-r--r--  Documentation/vm/slabinfo.c | 426
-rw-r--r--  arch/avr32/Makefile | 2
-rw-r--r--  arch/avr32/kernel/process.c | 6
-rw-r--r--  arch/avr32/kernel/ptrace.c | 2
-rw-r--r--  arch/avr32/kernel/syscall_table.S | 1
-rw-r--r--  arch/avr32/kernel/traps.c | 2
-rw-r--r--  arch/avr32/kernel/vmlinux.lds.c | 2
-rw-r--r--  arch/avr32/mach-at32ap/clock.c | 2
-rw-r--r--  arch/avr32/mm/dma-coherent.c | 12
-rw-r--r--  arch/blackfin/kernel/asm-offsets.c | 2
-rw-r--r--  arch/blackfin/kernel/ptrace.c | 6
-rw-r--r--  arch/frv/Kconfig | 6
-rw-r--r--  arch/frv/kernel/process.c | 4
-rw-r--r--  arch/frv/mm/pgalloc.c | 22
-rw-r--r--  arch/h8300/kernel/asm-offsets.c | 2
-rw-r--r--  arch/i386/kernel/cpu/intel_cacheinfo.c | 2
-rw-r--r--  arch/i386/kernel/cpu/mcheck/therm_throt.c | 2
-rw-r--r--  arch/i386/kernel/cpu/transmeta.c | 6
-rw-r--r--  arch/i386/kernel/cpuid.c | 2
-rw-r--r--  arch/i386/kernel/microcode.c | 59
-rw-r--r--  arch/i386/kernel/msr.c | 2
-rw-r--r--  arch/i386/kernel/traps.c | 2
-rw-r--r--  arch/i386/mach-generic/probe.c | 2
-rw-r--r--  arch/i386/mach-voyager/voyager_basic.c | 4
-rw-r--r--  arch/i386/pci/init.c | 2
-rw-r--r--  arch/ia64/kernel/err_inject.c | 2
-rw-r--r--  arch/ia64/kernel/mca.c | 2
-rw-r--r--  arch/ia64/kernel/palinfo.c | 2
-rw-r--r--  arch/ia64/kernel/salinfo.c | 2
-rw-r--r--  arch/ia64/kernel/topology.c | 2
-rw-r--r--  arch/m68knommu/kernel/asm-offsets.c | 2
-rw-r--r--  arch/mips/kernel/asm-offsets.c | 2
-rw-r--r--  arch/mips/kernel/smtc.c | 2
-rw-r--r--  arch/parisc/kernel/asm-offsets.c | 2
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 2
-rw-r--r--  arch/powerpc/kernel/sysfs.c | 2
-rw-r--r--  arch/powerpc/mm/numa.c | 3
-rw-r--r--  arch/ppc/kernel/asm-offsets.c | 2
-rw-r--r--  arch/s390/appldata/appldata_base.c | 2
-rw-r--r--  arch/s390/kernel/asm-offsets.c | 2
-rw-r--r--  arch/s390/kernel/smp.c | 2
-rw-r--r--  arch/sparc/kernel/asm-offsets.c | 2
-rw-r--r--  arch/sparc64/kernel/traps.c | 1
-rw-r--r--  arch/um/Kconfig | 16
-rw-r--r--  arch/um/Kconfig.scsi | 58
-rw-r--r--  arch/um/kernel/skas/process.c | 9
-rw-r--r--  arch/um/os-Linux/process.c | 2
-rw-r--r--  arch/um/os-Linux/skas/mem.c | 2
-rw-r--r--  arch/um/os-Linux/skas/process.c | 2
-rw-r--r--  arch/v850/kernel/asm-offsets.c | 2
-rw-r--r--  arch/x86_64/kernel/irq.c | 2
-rw-r--r--  arch/x86_64/kernel/mce.c | 2
-rw-r--r--  arch/x86_64/kernel/mce_amd.c | 2
-rw-r--r--  arch/x86_64/kernel/vsyscall.c | 2
-rw-r--r--  arch/xtensa/kernel/asm-offsets.c | 2
-rw-r--r--  block/as-iosched.c | 2
-rw-r--r--  block/genhd.c | 53
-rw-r--r--  block/ll_rw_blk.c | 9
-rw-r--r--  drivers/acpi/sleep/main.c | 67
-rw-r--r--  drivers/acpi/sleep/proc.c | 2
-rw-r--r--  drivers/ata/libata-core.c | 8
-rw-r--r--  drivers/base/topology.c | 3
-rw-r--r--  drivers/block/loop.c | 6
-rw-r--r--  drivers/block/nbd.c | 15
-rw-r--r--  drivers/char/hw_random/Kconfig | 14
-rw-r--r--  drivers/char/hw_random/Makefile | 1
-rw-r--r--  drivers/char/hw_random/pasemi-rng.c | 156
-rw-r--r--  drivers/char/pcmcia/Kconfig | 1
-rw-r--r--  drivers/char/pcmcia/cm4000_cs.c | 44
-rw-r--r--  drivers/char/pcmcia/cm4040_cs.c | 7
-rw-r--r--  drivers/char/tty_io.c | 14
-rw-r--r--  drivers/cpufreq/cpufreq.c | 3
-rw-r--r--  drivers/cpufreq/cpufreq_stats.c | 2
-rw-r--r--  drivers/hwmon/coretemp.c | 2
-rw-r--r--  drivers/i2c/chips/tps65010.c | 2
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.c | 6
-rw-r--r--  drivers/kvm/kvm_main.c | 3
-rw-r--r--  drivers/mca/mca-bus.c | 28
-rw-r--r--  drivers/mca/mca-driver.c | 13
-rw-r--r--  drivers/md/Kconfig | 9
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/dm-bio-list.h | 26
-rw-r--r--  drivers/md/dm-crypt.c | 91
-rw-r--r--  drivers/md/dm-delay.c | 383
-rw-r--r--  drivers/md/dm-exception-store.c | 54
-rw-r--r--  drivers/md/dm-hw-handler.h | 1
-rw-r--r--  drivers/md/dm-io.c | 232
-rw-r--r--  drivers/md/dm-io.h | 83
-rw-r--r--  drivers/md/dm-log.c | 77
-rw-r--r--  drivers/md/dm-mpath.c | 3
-rw-r--r--  drivers/md/dm-raid1.c | 187
-rw-r--r--  drivers/md/dm-table.c | 10
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/md/kcopyd.c | 28
-rw-r--r--  drivers/md/md.c | 186
-rw-r--r--  drivers/md/raid1.c | 1
-rw-r--r--  drivers/md/raid5.c | 6
-rw-r--r--  drivers/mmc/core/core.c | 4
-rw-r--r--  drivers/net/e1000/e1000_main.c | 2
-rw-r--r--  drivers/net/phy/phy.c | 6
-rw-r--r--  drivers/net/tg3.c | 11
-rw-r--r--  drivers/net/tg3.h | 2
-rw-r--r--  drivers/spi/atmel_spi.c | 5
-rw-r--r--  drivers/usb/atm/usbatm.c | 2
-rw-r--r--  drivers/video/Kconfig | 28
-rw-r--r--  drivers/video/Makefile | 2
-rw-r--r--  drivers/video/arkfb.c | 1200
-rw-r--r--  drivers/video/fbmem.c | 4
-rw-r--r--  drivers/video/nvidia/nv_hw.c | 7
-rw-r--r--  drivers/video/nvidia/nvidia.c | 1
-rw-r--r--  drivers/video/s3fb.c | 19
-rw-r--r--  drivers/video/svgalib.c | 17
-rw-r--r--  drivers/video/vt8623fb.c | 927
-rw-r--r--  fs/affs/file.c | 6
-rw-r--r--  fs/afs/Makefile | 3
-rw-r--r--  fs/afs/afs_fs.h | 2
-rw-r--r--  fs/afs/callback.c | 9
-rw-r--r--  fs/afs/dir.c | 19
-rw-r--r--  fs/afs/file.c | 107
-rw-r--r--  fs/afs/fsclient.c | 369
-rw-r--r--  fs/afs/inode.c | 70
-rw-r--r--  fs/afs/internal.h | 95
-rw-r--r--  fs/afs/main.c | 2
-rw-r--r--  fs/afs/misc.c | 1
-rw-r--r--  fs/afs/mntpt.c | 5
-rw-r--r--  fs/afs/rxrpc.c | 80
-rw-r--r--  fs/afs/security.c | 12
-rw-r--r--  fs/afs/server.c | 3
-rw-r--r--  fs/afs/super.c | 5
-rw-r--r--  fs/afs/vnode.c | 121
-rw-r--r--  fs/afs/write.c | 835
-rw-r--r--  fs/aio.c | 7
-rw-r--r--  fs/binfmt_misc.c | 13
-rw-r--r--  fs/buffer.c | 58
-rw-r--r--  fs/configfs/file.c | 33
-rw-r--r--  fs/direct-io.c | 8
-rw-r--r--  fs/ext3/inode.c | 12
-rw-r--r--  fs/mpage.c | 15
-rw-r--r--  fs/namei.c | 8
-rw-r--r--  fs/nfsd/Makefile | 1
-rw-r--r--  fs/nfsd/export.c | 14
-rw-r--r--  fs/nfsd/nfs3proc.c | 2
-rw-r--r--  fs/nfsd/nfs3xdr.c | 71
-rw-r--r--  fs/nfsd/nfs4acl.c | 17
-rw-r--r--  fs/nfsd/nfs4state.c | 2
-rw-r--r--  fs/nfsd/nfsfh.c | 56
-rw-r--r--  fs/nfsd/nfsproc.c | 2
-rw-r--r--  fs/nfsd/nfsxdr.c | 53
-rw-r--r--  fs/reiserfs/file.c | 39
-rw-r--r--  fs/reiserfs/inode.c | 13
-rw-r--r--  fs/sysfs/file.c | 33
-rw-r--r--  fs/xfs/xfs_mount.c | 3
-rw-r--r--  include/asm-alpha/smp.h | 1
-rw-r--r--  include/asm-alpha/thread_info.h | 8
-rw-r--r--  include/asm-arm/arch-at91/cpu.h | 6
-rw-r--r--  include/asm-avr32/arch-at32ap/cpu.h | 33
-rw-r--r--  include/asm-avr32/setup.h | 2
-rw-r--r--  include/asm-avr32/unistd.h | 4
-rw-r--r--  include/asm-blackfin/processor.h | 6
-rw-r--r--  include/asm-blackfin/system.h | 4
-rw-r--r--  include/asm-frv/tlb.h | 4
-rw-r--r--  include/asm-i386/mmzone.h | 6
-rw-r--r--  include/asm-i386/msr.h | 56
-rw-r--r--  include/asm-i386/paravirt.h | 5
-rw-r--r--  include/asm-i386/smp.h | 37
-rw-r--r--  include/asm-i386/thread_info.h | 2
-rw-r--r--  include/asm-ia64/smp.h | 6
-rw-r--r--  include/asm-ia64/thread_info.h | 2
-rw-r--r--  include/asm-m32r/smp.h | 6
-rw-r--r--  include/asm-m68k/thread_info.h | 6
-rw-r--r--  include/asm-mips/system.h | 2
-rw-r--r--  include/asm-parisc/compat.h | 2
-rw-r--r--  include/asm-powerpc/smp.h | 1
-rw-r--r--  include/asm-s390/smp.h | 1
-rw-r--r--  include/asm-sh/cpu-sh3/dma.h | 2
-rw-r--r--  include/asm-sh/cpu-sh4/dma-sh7780.h | 2
-rw-r--r--  include/asm-sh/cpu-sh4/dma.h | 2
-rw-r--r--  include/asm-sparc/smp.h | 1
-rw-r--r--  include/asm-sparc64/smp.h | 1
-rw-r--r--  include/asm-um/required-features.h | 9
-rw-r--r--  include/asm-um/smp.h | 4
-rw-r--r--  include/asm-x86_64/smp.h | 14
-rw-r--r--  include/asm-x86_64/system.h | 2
-rw-r--r--  include/asm-x86_64/thread_info.h | 2
-rw-r--r--  include/linux/aio.h | 3
-rw-r--r--  include/linux/blkdev.h | 2
-rw-r--r--  include/linux/clocksource.h | 3
-rw-r--r--  include/linux/compat.h | 3
-rw-r--r--  include/linux/compiler-gcc.h | 1
-rw-r--r--  include/linux/compiler-gcc3.h | 6
-rw-r--r--  include/linux/compiler-gcc4.h | 3
-rw-r--r--  include/linux/compiler.h | 21
-rw-r--r--  include/linux/fb.h | 2
-rw-r--r--  include/linux/futex.h | 42
-rw-r--r--  include/linux/genhd.h | 1
-rw-r--r--  include/linux/gfp.h | 6
-rw-r--r--  include/linux/highmem.h | 27
-rw-r--r--  include/linux/init_task.h | 2
-rw-r--r--  include/linux/kthread.h | 3
-rw-r--r--  include/linux/ktime.h | 6
-rw-r--r--  include/linux/mca.h | 2
-rw-r--r--  include/linux/mmzone.h | 3
-rw-r--r--  include/linux/module.h | 3
-rw-r--r--  include/linux/mutex.h | 5
-rw-r--r--  include/linux/nfs4_acl.h | 1
-rw-r--r--  include/linux/notifier.h | 66
-rw-r--r--  include/linux/pm.h | 31
-rw-r--r--  include/linux/raid/md_k.h | 1
-rw-r--r--  include/linux/relay.h | 3
-rw-r--r--  include/linux/sched.h | 9
-rw-r--r--  include/linux/signal.h | 125
-rw-r--r--  include/linux/smp.h | 1
-rw-r--r--  include/linux/sunrpc/svc.h | 19
-rw-r--r--  include/linux/sunrpc/svcsock.h | 3
-rw-r--r--  include/linux/suspend.h | 24
-rw-r--r--  include/linux/svga.h | 2
-rw-r--r--  include/linux/syscalls.h | 2
-rw-r--r--  include/linux/vmstat.h | 3
-rw-r--r--  include/linux/workqueue.h | 95
-rw-r--r--  init/Kconfig | 24
-rw-r--r--  init/do_mounts.c | 7
-rw-r--r--  init/main.c | 5
-rw-r--r--  kernel/configs.c | 15
-rw-r--r--  kernel/cpu.c | 66
-rw-r--r--  kernel/cpuset.c | 7
-rw-r--r--  kernel/exit.c | 18
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/futex.c | 988
-rw-r--r--  kernel/futex_compat.c | 22
-rw-r--r--  kernel/hrtimer.c | 2
-rw-r--r--  kernel/irq/handle.c | 1
-rw-r--r--  kernel/kmod.c | 6
-rw-r--r--  kernel/kthread.c | 113
-rw-r--r--  kernel/mutex.c | 8
-rw-r--r--  kernel/power/disk.c | 195
-rw-r--r--  kernel/power/main.c | 42
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/user.c | 13
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/rcupdate.c | 2
-rw-r--r--  kernel/relay.c | 37
-rw-r--r--  kernel/rtmutex.c | 41
-rw-r--r--  kernel/rtmutex_common.h | 34
-rw-r--r--  kernel/sched.c | 38
-rw-r--r--  kernel/signal.c | 140
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/softlockup.c | 4
-rw-r--r--  kernel/sys.c | 96
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/time/clocksource.c | 51
-rw-r--r--  kernel/time/timer_list.c | 25
-rw-r--r--  kernel/timer.c | 14
-rw-r--r--  kernel/workqueue.c | 783
-rw-r--r--  lib/radix-tree.c | 2
-rw-r--r--  mm/filemap.c | 2
-rw-r--r--  mm/filemap_xip.c | 7
-rw-r--r--  mm/hugetlb.c | 33
-rw-r--r--  mm/page_alloc.c | 50
-rw-r--r--  mm/slab.c | 41
-rw-r--r--  mm/slub.c | 1050
-rw-r--r--  mm/swap.c | 2
-rw-r--r--  mm/truncate.c | 3
-rw-r--r--  mm/vmscan.c | 2
-rw-r--r--  mm/vmstat.c | 95
-rw-r--r--  net/core/dev.c | 2
-rw-r--r--  net/core/flow.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 1
-rw-r--r--  net/iucv/iucv.c | 6
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c | 15
-rw-r--r--  net/sunrpc/rpc_pipe.c | 9
-rw-r--r--  net/sunrpc/sunrpc_syms.c | 6
-rw-r--r--  net/sunrpc/svc.c | 2
-rw-r--r--  net/sunrpc/svcauth_unix.c | 10
-rw-r--r--  net/sunrpc/svcsock.c | 34
-rwxr-xr-x  scripts/kernel-doc | 7
-rw-r--r--  scripts/mod/modpost.c | 1
285 files changed, 9123 insertions(+), 3127 deletions(-)
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index bd23dc0bc0c7..6491b2c45dd4 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -80,3 +80,7 @@ kernel patches.
 23: Tested after it has been merged into the -mm patchset to make sure
     that it still works with all of the other queued patches and various
     changes in the VM, VFS, and other subsystems.
+
+24: Avoid whitespace damage such as indenting with spaces or whitespace
+    at the end of lines.  You can test this by feeding the patch to
+    "git apply --check --whitespace=error-all"
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index b0d0043f7c46..a417b25fb1aa 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -363,7 +363,8 @@ area or subsystem of the kernel is being patched.
363The "summary phrase" in the email's Subject should concisely 363The "summary phrase" in the email's Subject should concisely
364describe the patch which that email contains. The "summary 364describe the patch which that email contains. The "summary
365phrase" should not be a filename. Do not use the same "summary 365phrase" should not be a filename. Do not use the same "summary
366phrase" for every patch in a whole patch series. 366phrase" for every patch in a whole patch series (where a "patch
367series" is an ordered sequence of multiple, related patches).
367 368
368Bear in mind that the "summary phrase" of your email becomes 369Bear in mind that the "summary phrase" of your email becomes
369a globally-unique identifier for that patch. It propagates 370a globally-unique identifier for that patch. It propagates
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index cc60d29b954c..b6d24c22274b 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -217,14 +217,17 @@ Q: What happens when a CPU is being logically offlined?
 A: The following happen, listed in no particular order :-)
 
 - A notification is sent to in-kernel registered modules by sending an event
-  CPU_DOWN_PREPARE
+  CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
+  CPU is being offlined while tasks are frozen due to a suspend operation in
+  progress
 - All process is migrated away from this outgoing CPU to a new CPU
 - All interrupts targeted to this CPU is migrated to a new CPU
 - timers/bottom half/task lets are also migrated to a new CPU
 - Once all services are migrated, kernel calls an arch specific routine
   __cpu_disable() to perform arch specific cleanup.
 - Once this is successful, an event for successful cleanup is sent by an event
-  CPU_DEAD.
+  CPU_DEAD (or CPU_DEAD_FROZEN if tasks are frozen due to a suspend while the
+  CPU is being offlined).
 
   "It is expected that each service cleans up when the CPU_DOWN_PREPARE
   notifier is called, when CPU_DEAD is called its expected there is nothing
@@ -242,9 +245,11 @@ A: This is what you would need in your kernel code to receive notifications.
 
         switch (action) {
         case CPU_ONLINE:
+        case CPU_ONLINE_FROZEN:
                 foobar_online_action(cpu);
                 break;
         case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
                 foobar_dead_action(cpu);
                 break;
         }
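
The switch fragment above comes straight from the documentation; expanded
into a self-contained sketch it would look roughly like the following (the
foobar_* hooks are hypothetical placeholders, and registration goes through
the usual register_cpu_notifier()):

        #include <linux/cpu.h>
        #include <linux/notifier.h>

        /* Hypothetical per-driver hooks, as in the documentation's example. */
        static void foobar_online_action(unsigned int cpu) { /* bring per-CPU state up */ }
        static void foobar_dead_action(unsigned int cpu) { /* tear per-CPU state down */ }

        static int foobar_cpu_callback(struct notifier_block *nb,
                                       unsigned long action, void *hcpu)
        {
                unsigned int cpu = (unsigned long)hcpu;

                switch (action) {
                case CPU_ONLINE:
                case CPU_ONLINE_FROZEN:  /* same event, but tasks frozen by suspend */
                        foobar_online_action(cpu);
                        break;
                case CPU_DEAD:
                case CPU_DEAD_FROZEN:
                        foobar_dead_action(cpu);
                        break;
                }
                return NOTIFY_OK;
        }

        static struct notifier_block foobar_cpu_notifier = {
                .notifier_call = foobar_cpu_callback,
        };

        /* in the driver's init path: register_cpu_notifier(&foobar_cpu_notifier); */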
diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.txt
new file mode 100644
index 000000000000..15adc55359e5
--- /dev/null
+++ b/Documentation/device-mapper/delay.txt
@@ -0,0 +1,26 @@
+dm-delay
+========
+
+Device-Mapper's "delay" target delays reads and/or writes
+and maps them to different devices.
+
+Parameters:
+    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+
+With separate write parameters, the first set is only used for reads.
+Delays are specified in milliseconds.
+
+Example scripts
+===============
+[[
+#!/bin/sh
+# Create device delaying rw operation for 500ms
+echo "0 `blockdev --getsize $1` delay $1 0 500" | dmsetup create delayed
+]]
+
+[[
+#!/bin/sh
+# Create device delaying only write operation for 500ms and
+# splitting reads and writes to different devices $1 $2
+echo "0 `blockdev --getsize $1` delay $1 0 0 $2 0 500" | dmsetup create delayed
+]]
diff --git a/Documentation/fb/arkfb.txt b/Documentation/fb/arkfb.txt
new file mode 100644
index 000000000000..e8487a9d6a05
--- /dev/null
+++ b/Documentation/fb/arkfb.txt
@@ -0,0 +1,68 @@
+
+	arkfb - fbdev driver for ARK Logic chips
+	========================================
+
+
+Supported Hardware
+==================
+
+	ARK 2000PV chip
+	ICS 5342 ramdac
+
+	- only BIOS initialized VGA devices supported
+	- probably not working on big endian
+
+
+Supported Features
+==================
+
+	* 4 bpp pseudocolor modes (with 18bit palette, two variants)
+	* 8 bpp pseudocolor mode (with 18bit palette)
+	* 16 bpp truecolor modes (RGB 555 and RGB 565)
+	* 24 bpp truecolor mode (RGB 888)
+	* 32 bpp truecolor mode (RGB 888)
+	* text mode (activated by bpp = 0)
+	* doublescan mode variant (not available in text mode)
+	* panning in both directions
+	* suspend/resume support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (i got maximum about 70 MHz, it is dependent on specific
+hardware). This limitation is not enforced by driver. Text mode supports 8bit
+wide fonts only (hardware limitation) and 16bit tall fonts (driver
+limitation). Unfortunately character attributes (like color) in text mode are
+broken for unknown reason, so its usefulness is limited.
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+	* secondary (not initialized by BIOS) device support
+	* big endian support
+	* DPMS support
+	* MMIO support
+	* interlaced mode variant
+	* support for fontwidths != 8 in 4 bpp modes
+	* support for fontheight != 16 in text mode
+	* hardware cursor
+	* vsync synchronization
+	* feature connector support
+	* acceleration support (8514-like 2D)
+
+
+Known bugs
+==========
+
+	* character attributes (and cursor) in text mode are broken
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
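
The nonstd selection described in the text above goes through the standard
fbdev ioctl interface; a minimal user-space sketch, assuming /dev/fb0 is
driven by arkfb, that switches to the 4 bpp interleaved-planes variant:

        #include <fcntl.h>
        #include <linux/fb.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        int main(void)
        {
                struct fb_var_screeninfo var;
                int fd = open("/dev/fb0", O_RDWR);  /* assumes arkfb owns fb0 */

                if (fd < 0 || ioctl(fd, FBIOGET_VSCREENINFO, &var) < 0) {
                        perror("fb0");
                        return 1;
                }
                var.bits_per_pixel = 4;
                var.nonstd = 1;  /* 0 = packed pixels, 1 = interleaved planes */
                if (ioctl(fd, FBIOPUT_VSCREENINFO, &var) < 0)
                        perror("FBIOPUT_VSCREENINFO");
                close(fd);
                return 0;
        }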
diff --git a/Documentation/fb/vt8623fb.txt b/Documentation/fb/vt8623fb.txt
new file mode 100644
index 000000000000..f654576c56b7
--- /dev/null
+++ b/Documentation/fb/vt8623fb.txt
@@ -0,0 +1,64 @@
+
+	vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset
+	===============================================================
+
+
+Supported Hardware
+==================
+
+	VIA VT8623 [CLE266] chipset and its graphics core
+	(known as CastleRock or Unichrome)
+
+I tested vt8623fb on VIA EPIA ML-6000
+
+
+Supported Features
+==================
+
+	* 4 bpp pseudocolor modes (with 18bit palette, two variants)
+	* 8 bpp pseudocolor mode (with 18bit palette)
+	* 16 bpp truecolor mode (RGB 565)
+	* 32 bpp truecolor mode (RGB 888)
+	* text mode (activated by bpp = 0)
+	* doublescan mode variant (not available in text mode)
+	* panning in both directions
+	* suspend/resume support
+	* DPMS support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum about 100 MHz). This limitation is not enforced by
+driver. Text mode supports 8bit wide fonts only (hardware limitation) and
+16bit tall fonts (driver limitation).
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+	* secondary (not initialized by BIOS) device support
+	* MMIO support
+	* interlaced mode variant
+	* support for fontwidths != 8 in 4 bpp modes
+	* support for fontheight != 16 in text mode
+	* hardware cursor
+	* video overlay support
+	* vsync synchronization
+	* acceleration support (8514-like 2D, busmaster transfers)
+
+
+Known bugs
+==========
+
+	* cursor disable in text mode doesn't work
+
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 2202f5dc8ac2..5818628207b5 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -178,6 +178,21 @@ All md devices contain:
      The size should be at least PAGE_SIZE (4k) and should be a power
      of 2. This can only be set while assembling an array
 
+   layout
+     The "layout" for the array for the particular level.  This is
+     simply a number that is interpretted differently by different
+     levels.  It can be written while assembling an array.
+
+   reshape_position
+     This is either "none" or a sector number within the devices of
+     the array where "reshape" is up to.  If this is set, the three
+     attributes mentioned above (raid_disks, chunk_size, layout) can
+     potentially have 2 values, an old and a new value.  If these
+     values differ, reading the attribute returns
+        new (old)
+     and writing will effect the 'new' value, leaving the 'old'
+     unchanged.
+
    component_size
      For arrays with data redundancy (i.e. not raid0, linear, faulty,
      multipath), all components must be the same size - or at least
@@ -193,11 +208,6 @@ All md devices contain:
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
 
-   layout
-     The "layout" for the array for the particular level.  This is
-     simply a number that is interpretted differently by different
-     levels.  It can be written while assembling an array.
-
    resync_start
      The point at which resync should start.  If no resync is needed,
      this will be a very large number.  At array creation it will
@@ -259,29 +269,6 @@ All md devices contain:
      like active, but no writes have been seen for a while (safe_mode_delay).
 
 
-   sync_speed_min
-   sync_speed_max
-     This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
-     however they only apply to the particular array.
-     If no value has been written to these, of if the word 'system'
-     is written, then the system-wide value is used.  If a value,
-     in kibibytes-per-second is written, then it is used.
-     When the files are read, they show the currently active value
-     followed by "(local)" or "(system)" depending on whether it is
-     a locally set or system-wide value.
-
-   sync_completed
-     This shows the number of sectors that have been completed of
-     whatever the current sync_action is, followed by the number of
-     sectors in total that could need to be processed.  The two
-     numbers are separated by a '/' thus effectively showing one
-     value, a fraction of the process that is complete.
-
-   sync_speed
-     This shows the current actual speed, in K/sec, of the current
-     sync_action.  It is averaged over the last 30 seconds.
-
-
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
@@ -412,6 +399,35 @@ also have
      Note that the numbers are 'bit' numbers, not 'block' numbers.
      They should be scaled by the bitmap_chunksize.
 
+   sync_speed_min
+   sync_speed_max
+     This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
+     however they only apply to the particular array.
+     If no value has been written to these, of if the word 'system'
+     is written, then the system-wide value is used.  If a value,
+     in kibibytes-per-second is written, then it is used.
+     When the files are read, they show the currently active value
+     followed by "(local)" or "(system)" depending on whether it is
+     a locally set or system-wide value.
+
+   sync_completed
+     This shows the number of sectors that have been completed of
+     whatever the current sync_action is, followed by the number of
+     sectors in total that could need to be processed.  The two
+     numbers are separated by a '/' thus effectively showing one
+     value, a fraction of the process that is complete.
+
+   sync_speed
+     This shows the current actual speed, in K/sec, of the current
+     sync_action.  It is averaged over the last 30 seconds.
+
+   suspend_lo
+   suspend_hi
+     The two values, given as numbers of sectors, indicate a range
+     within the array where IO will be blocked.  This is currently
+     only supported for raid4/5/6.
+
+
 Each active md device may also have attributes specific to the
 personality module that manages it.
 These are specific to the implementation of the module and could
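
The attributes shuffled above live in each array's 'md' sysfs directory
(for an array named md0 that would be /sys/block/md0/md/ -- the name here
is only an assumption for illustration); a minimal sketch of reading the
current sync speed values and setting a local minimum:

        #include <stdio.h>

        #define MD_SYSFS "/sys/block/md0/md/"  /* assumed array name */

        static void show(const char *attr)
        {
                char path[128], buf[64];
                FILE *f;

                snprintf(path, sizeof(path), MD_SYSFS "%s", attr);
                f = fopen(path, "r");
                if (f && fgets(buf, sizeof(buf), f))
                        printf("%s = %s", attr, buf);  /* e.g. "1000 (system)" */
                if (f)
                        fclose(f);
        }

        int main(void)
        {
                FILE *f;

                show("sync_speed_min");
                show("sync_speed");
                show("sync_completed");

                /* Set a local minimum of 5000 KiB/sec; writing the word
                 * "system" instead reverts to the system-wide value. */
                f = fopen(MD_SYSFS "sync_speed_min", "w");
                if (f) {
                        fputs("5000", f);
                        fclose(f);
                }
                return 0;
        }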
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index 000556c932e9..e00c6cf09e85 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -93,21 +93,23 @@ SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
 	to resume the system from RAM if there's enough battery power or restore
 	its state on the basis of the saved suspend image otherwise)
 
-SNAPSHOT_PMOPS - enable the usage of the pmops->prepare, pmops->enter and
-	pmops->finish methods (the in-kernel swsusp knows these as the "platform
-	method") which are needed on many machines to (among others) speed up
-	the resume by letting the BIOS skip some steps or to let the system
-	recognise the correct state of the hardware after the resume (in
-	particular on many machines this ensures that unplugged AC
-	adapters get correctly detected and that kacpid does not run wild after
-	the resume). The last ioctl() argument can take one of the three
-	values, defined in kernel/power/power.h:
+SNAPSHOT_PMOPS - enable the usage of the hibernation_ops->prepare,
+	hibernate_ops->enter and hibernation_ops->finish methods (the in-kernel
+	swsusp knows these as the "platform method") which are needed on many
+	machines to (among others) speed up the resume by letting the BIOS skip
+	some steps or to let the system recognise the correct state of the
+	hardware after the resume (in particular on many machines this ensures
+	that unplugged AC adapters get correctly detected and that kacpid does
+	not run wild after the resume). The last ioctl() argument can take one
+	of the three values, defined in kernel/power/power.h:
 	PMOPS_PREPARE - make the kernel carry out the
-		pm_ops->prepare(PM_SUSPEND_DISK) operation
+		hibernation_ops->prepare() operation
 	PMOPS_ENTER - make the kernel power off the system by calling
-		pm_ops->enter(PM_SUSPEND_DISK)
+		hibernation_ops->enter()
 	PMOPS_FINISH - make the kernel carry out the
-		pm_ops->finish(PM_SUSPEND_DISK) operation
+		hibernation_ops->finish() operation
+	Note that the actual constants are misnamed because they surface
+	internal kernel implementation details that have changed.
 
 The device's read() operation can be used to transfer the snapshot image from
 the kernel.  It has the following limitations:
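
For illustration, a user-space sketch of driving the platform method
through /dev/snapshot.  The constant definitions below are copied by hand
because kernel/power/power.h is not exported to user space; they are an
assumption mirroring that header at the time of this patch, so verify them
against your tree before relying on this:

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        /* Assumed to match kernel/power/power.h -- verify before use. */
        #define SNAPSHOT_IOC_MAGIC      '3'
        #define SNAPSHOT_PMOPS          _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
        #define PMOPS_PREPARE           1
        #define PMOPS_ENTER             2
        #define PMOPS_FINISH            3

        int main(void)
        {
                int fd = open("/dev/snapshot", O_RDWR);

                if (fd < 0) {
                        perror("/dev/snapshot");
                        return 1;
                }
                /* run hibernation_ops->prepare() before writing the image out */
                if (ioctl(fd, SNAPSHOT_PMOPS, PMOPS_PREPARE) < 0)
                        perror("SNAPSHOT_PMOPS(PMOPS_PREPARE)");
                /* ... transfer the snapshot image via read() ... */
                if (ioctl(fd, SNAPSHOT_PMOPS, PMOPS_FINISH) < 0)
                        perror("SNAPSHOT_PMOPS(PMOPS_FINISH)");
                close(fd);
                return 0;
        }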
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index 41710ccf3a29..686a8e04a4f3 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -16,6 +16,7 @@
 #include <stdarg.h>
 #include <getopt.h>
 #include <regex.h>
+#include <errno.h>
 
 #define MAX_SLABS 500
 #define MAX_ALIASES 500
@@ -41,12 +42,15 @@ struct aliasinfo {
 } aliasinfo[MAX_ALIASES];
 
 int slabs = 0;
+int actual_slabs = 0;
 int aliases = 0;
 int alias_targets = 0;
 int highest_node = 0;
 
 char buffer[4096];
 
+int show_empty = 0;
+int show_report = 0;
 int show_alias = 0;
 int show_slab = 0;
 int skip_zero = 1;
@@ -59,6 +63,15 @@ int show_inverted = 0;
 int show_single_ref = 0;
 int show_totals = 0;
 int sort_size = 0;
+int set_debug = 0;
+int show_ops = 0;
+
+/* Debug options */
+int sanity = 0;
+int redzone = 0;
+int poison = 0;
+int tracking = 0;
+int tracing = 0;
 
 int page_size;
 
@@ -76,20 +89,33 @@ void fatal(const char *x, ...)
 
 void usage(void)
 {
-	printf("slabinfo [-ahnpvtsz] [slab-regexp]\n"
+	printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n"
+		"slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
 		"-a|--aliases           Show aliases\n"
+		"-d<options>|--debug=<options> Set/Clear Debug options\n"
+		"-e|--empty             Show empty slabs\n"
+		"-f|--first-alias       Show first alias\n"
 		"-h|--help              Show usage information\n"
+		"-i|--inverted          Inverted list\n"
+		"-l|--slabs             Show slabs\n"
 		"-n|--numa              Show NUMA information\n"
+		"-o|--ops               Show kmem_cache_ops\n"
 		"-s|--shrink            Shrink slabs\n"
-		"-v|--validate          Validate slabs\n"
+		"-r|--report            Detailed report on single slabs\n"
+		"-S|--Size              Sort by size\n"
 		"-t|--tracking          Show alloc/free information\n"
 		"-T|--Totals            Show summary information\n"
-		"-l|--slabs             Show slabs\n"
-		"-S|--Size              Sort by size\n"
+		"-v|--validate          Validate slabs\n"
 		"-z|--zero              Include empty slabs\n"
-		"-f|--first-alias       Show first alias\n"
-		"-i|--inverted          Inverted list\n"
 		"-1|--1ref              Single reference\n"
+		"\nValid debug options (FZPUT may be combined)\n"
+		"a / A          Switch on all debug options (=FZUP)\n"
+		"-              Switch off all debug options\n"
+		"f / F          Sanity Checks (SLAB_DEBUG_FREE)\n"
+		"z / Z          Redzoning\n"
+		"p / P          Poisoning\n"
+		"u / U          Tracking\n"
+		"t / T          Tracing\n"
 	);
 }
 
@@ -143,11 +169,10 @@ unsigned long get_obj_and_str(char *name, char **x)
 void set_obj(struct slabinfo *s, char *name, int n)
 {
 	char x[100];
+	FILE *f;
 
 	sprintf(x, "%s/%s", s->name, name);
-
-	FILE *f = fopen(x, "w");
-
+	f = fopen(x, "w");
 	if (!f)
 		fatal("Cannot write to %s\n", x);
 
@@ -155,6 +180,26 @@ void set_obj(struct slabinfo *s, char *name, int n)
 	fclose(f);
 }
 
+unsigned long read_slab_obj(struct slabinfo *s, char *name)
+{
+	char x[100];
+	FILE *f;
+	int l;
+
+	sprintf(x, "%s/%s", s->name, name);
+	f = fopen(x, "r");
+	if (!f) {
+		buffer[0] = 0;
+		l = 0;
+	} else {
+		l = fread(buffer, 1, sizeof(buffer), f);
+		buffer[l] = 0;
+		fclose(f);
+	}
+	return l;
+}
+
+
 /*
  * Put a size string together
  */
@@ -226,7 +271,7 @@ int line = 0;
 
 void first_line(void)
 {
-	printf("Name                 Objects Objsize   Space "
+	printf("Name                   Objects Objsize    Space "
 		"Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n");
 }
 
@@ -246,10 +291,7 @@ struct aliasinfo *find_one_alias(struct slabinfo *find)
 				return best;
 			}
 		}
-	if (best)
-		return best;
-	fatal("Cannot find alias for %s\n", find->name);
-	return NULL;
+	return best;
 }
 
 unsigned long slab_size(struct slabinfo *s)
@@ -257,6 +299,126 @@ unsigned long slab_size(struct slabinfo *s)
 	return s->slabs * (page_size << s->order);
 }
 
+void slab_numa(struct slabinfo *s, int mode)
+{
+	int node;
+
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (!highest_node) {
+		printf("\n%s: No NUMA information available.\n", s->name);
+		return;
+	}
+
+	if (skip_zero && !s->slabs)
+		return;
+
+	if (!line) {
+		printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+		for(node = 0; node <= highest_node; node++)
+			printf(" %4d", node);
+		printf("\n----------------------");
+		for(node = 0; node <= highest_node; node++)
+			printf("-----");
+		printf("\n");
+	}
+	printf("%-21s ", mode ? "All slabs" : s->name);
+	for(node = 0; node <= highest_node; node++) {
+		char b[20];
+
+		store_size(b, s->numa[node]);
+		printf(" %4s", b);
+	}
+	printf("\n");
+	if (mode) {
+		printf("%-21s ", "Partial slabs");
+		for(node = 0; node <= highest_node; node++) {
+			char b[20];
+
+			store_size(b, s->numa_partial[node]);
+			printf(" %4s", b);
+		}
+		printf("\n");
+	}
+	line++;
+}
+
+void show_tracking(struct slabinfo *s)
+{
+	printf("\n%s: Kernel object allocation\n", s->name);
+	printf("-----------------------------------------------------------------------\n");
+	if (read_slab_obj(s, "alloc_calls"))
+		printf(buffer);
+	else
+		printf("No Data\n");
+
+	printf("\n%s: Kernel object freeing\n", s->name);
+	printf("------------------------------------------------------------------------\n");
+	if (read_slab_obj(s, "free_calls"))
+		printf(buffer);
+	else
+		printf("No Data\n");
+
+}
+
+void ops(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (read_slab_obj(s, "ops")) {
+		printf("\n%s: kmem_cache operations\n", s->name);
+		printf("--------------------------------------------\n");
+		printf(buffer);
+	} else
+		printf("\n%s has no kmem_cache operations\n", s->name);
+}
+
+const char *onoff(int x)
+{
+	if (x)
+		return "On ";
+	return "Off";
+}
+
+void report(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+	printf("\nSlabcache: %-20s  Aliases: %2d Order : %2d\n", s->name, s->aliases, s->order);
+	if (s->hwcache_align)
+		printf("** Hardware cacheline aligned\n");
+	if (s->cache_dma)
+		printf("** Memory is allocated in a special DMA zone\n");
+	if (s->destroy_by_rcu)
+		printf("** Slabs are destroyed via RCU\n");
+	if (s->reclaim_account)
+		printf("** Reclaim accounting active\n");
+
+	printf("\nSizes (bytes)     Slabs              Debug                Memory\n");
+	printf("------------------------------------------------------------------------\n");
+	printf("Object : %7d  Total  : %7ld   Sanity Checks : %s  Total: %7ld\n",
+			s->object_size, s->slabs, onoff(s->sanity_checks),
+			s->slabs * (page_size << s->order));
+	printf("SlabObj: %7d  Full   : %7ld   Redzoning     : %s  Used : %7ld\n",
+			s->slab_size, s->slabs - s->partial - s->cpu_slabs,
+			onoff(s->red_zone), s->objects * s->object_size);
+	printf("SlabSiz: %7d  Partial: %7ld   Poisoning     : %s  Loss : %7ld\n",
+			page_size << s->order, s->partial, onoff(s->poison),
+			s->slabs * (page_size << s->order) - s->objects * s->object_size);
+	printf("Loss   : %7d  CpuSlab: %7d   Tracking      : %s  Lalig: %7ld\n",
+			s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
+			(s->slab_size - s->object_size) * s->objects);
+	printf("Align  : %7d  Objects: %7d   Tracing       : %s  Lpadd: %7ld\n",
+			s->align, s->objs_per_slab, onoff(s->trace),
+			((page_size << s->order) - s->objs_per_slab * s->slab_size) *
+			s->slabs);
+
+	ops(s);
+	show_tracking(s);
+	slab_numa(s, 1);
+}
 
 void slabcache(struct slabinfo *s)
 {
@@ -265,7 +427,18 @@ void slabcache(struct slabinfo *s)
 	char flags[20];
 	char *p = flags;
 
-	if (skip_zero && !s->slabs)
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (actual_slabs == 1) {
+		report(s);
+		return;
+	}
+
+	if (skip_zero && !show_empty && !s->slabs)
+		return;
+
+	if (show_empty && s->slabs)
 		return;
 
 	store_size(size_str, slab_size(s));
@@ -303,48 +476,128 @@ void slabcache(struct slabinfo *s)
 		flags);
 }
 
-void slab_numa(struct slabinfo *s)
+/*
+ * Analyze debug options. Return false if something is amiss.
+ */
+int debug_opt_scan(char *opt)
 {
-	int node;
+	if (!opt || !opt[0] || strcmp(opt, "-") == 0)
+		return 1;
+
+	if (strcasecmp(opt, "a") == 0) {
+		sanity = 1;
+		poison = 1;
+		redzone = 1;
+		tracking = 1;
+		return 1;
+	}
 
-	if (!highest_node)
-		fatal("No NUMA information available.\n");
+	for ( ; *opt; opt++)
+		switch (*opt) {
+		case 'F' : case 'f':
+			if (sanity)
+				return 0;
+			sanity = 1;
+			break;
+		case 'P' : case 'p':
+			if (poison)
+				return 0;
+			poison = 1;
+			break;
 
-	if (skip_zero && !s->slabs)
-		return;
+		case 'Z' : case 'z':
+			if (redzone)
+				return 0;
+			redzone = 1;
+			break;
 
-	if (!line) {
-		printf("\nSlab                 Node ");
-		for(node = 0; node <= highest_node; node++)
-			printf(" %4d", node);
-		printf("\n----------------------");
-		for(node = 0; node <= highest_node; node++)
-			printf("-----");
-		printf("\n");
-	}
-	printf("%-21s ", s->name);
-	for(node = 0; node <= highest_node; node++) {
-		char b[20];
+		case 'U' : case 'u':
+			if (tracking)
+				return 0;
+			tracking = 1;
+			break;
 
-		store_size(b, s->numa[node]);
-		printf(" %4s", b);
-	}
-	printf("\n");
-	line++;
+		case 'T' : case 't':
+			if (tracing)
+				return 0;
+			tracing = 1;
+			break;
+		default:
+			return 0;
+		}
+	return 1;
 }
 
-void show_tracking(struct slabinfo *s)
+int slab_empty(struct slabinfo *s)
 {
-	printf("\n%s: Calls to allocate a slab object\n", s->name);
-	printf("---------------------------------------------------\n");
-	if (read_obj("alloc_calls"))
-		printf(buffer);
+	if (s->objects > 0)
+		return 0;
 
-	printf("%s: Calls to free a slab object\n", s->name);
-	printf("-----------------------------------------------\n");
-	if (read_obj("free_calls"))
-		printf(buffer);
+	/*
+	 * We may still have slabs even if there are no objects. Shrinking will
+	 * remove them.
+	 */
+	if (s->slabs != 0)
+		set_obj(s, "shrink", 1);
 
+	return 1;
+}
+
+void slab_debug(struct slabinfo *s)
+{
+	if (sanity && !s->sanity_checks) {
+		set_obj(s, "sanity", 1);
+	}
+	if (!sanity && s->sanity_checks) {
+		if (slab_empty(s))
+			set_obj(s, "sanity", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
+	}
+	if (redzone && !s->red_zone) {
+		if (slab_empty(s))
+			set_obj(s, "red_zone", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
+	}
+	if (!redzone && s->red_zone) {
+		if (slab_empty(s))
+			set_obj(s, "red_zone", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
+	}
+	if (poison && !s->poison) {
+		if (slab_empty(s))
+			set_obj(s, "poison", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
+	}
+	if (!poison && s->poison) {
+		if (slab_empty(s))
+			set_obj(s, "poison", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
+	}
+	if (tracking && !s->store_user) {
+		if (slab_empty(s))
+			set_obj(s, "store_user", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
+	}
+	if (!tracking && s->store_user) {
+		if (slab_empty(s))
+			set_obj(s, "store_user", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
+	}
+	if (tracing && !s->trace) {
+		if (slabs == 1)
+			set_obj(s, "trace", 1);
+		else
+			fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
+	}
+	if (!tracing && s->trace)
+		set_obj(s, "trace", 1);
 }
 
 void totals(void)
@@ -673,7 +926,7 @@ void link_slabs(void)
 
 	for (a = aliasinfo; a < aliasinfo + aliases; a++) {
 
-		for(s = slabinfo; s < slabinfo + slabs; s++)
+		for (s = slabinfo; s < slabinfo + slabs; s++)
 			if (strcmp(a->ref, s->name) == 0) {
 				a->slab = s;
 				s->refs++;
@@ -704,7 +957,7 @@ void alias(void)
 					continue;
 				}
 			}
-			printf("\n%-20s <- %s", a->slab->name, a->name);
+			printf("\n%-12s <- %s", a->slab->name, a->name);
 			active = a->slab->name;
 		}
 		else
@@ -729,7 +982,12 @@ void rename_slabs(void)
 
 		a = find_one_alias(s);
 
-		s->name = a->name;
+		if (a)
+			s->name = a->name;
+		else {
+			s->name = "*";
+			actual_slabs--;
+		}
 	}
 }
 
@@ -748,11 +1006,14 @@ void read_slab_dir(void)
 	char *t;
 	int count;
 
+	if (chdir("/sys/slab"))
+		fatal("SYSFS support for SLUB not active\n");
+
 	dir = opendir(".");
 	while ((de = readdir(dir))) {
 		if (de->d_name[0] == '.' ||
-			slab_mismatch(de->d_name))
+			(de->d_name[0] != ':' && slab_mismatch(de->d_name)))
 			continue;
 		switch (de->d_type) {
 		case DT_LNK:
 			alias->name = strdup(de->d_name);
@@ -807,6 +1068,7 @@ void read_slab_dir(void)
 	}
 	closedir(dir);
 	slabs = slab - slabinfo;
+	actual_slabs = slabs;
 	aliases = alias - aliasinfo;
 	if (slabs > MAX_SLABS)
 		fatal("Too many slabs\n");
@@ -825,34 +1087,37 @@ void output_slabs(void)
 
 
 		if (show_numa)
-			slab_numa(slab);
-		else
-		if (show_track)
+			slab_numa(slab, 0);
+		else if (show_track)
 			show_tracking(slab);
-		else
-		if (validate)
+		else if (validate)
 			slab_validate(slab);
-		else
-		if (shrink)
+		else if (shrink)
 			slab_shrink(slab);
-		else {
-			if (show_slab)
-				slabcache(slab);
-		}
+		else if (set_debug)
+			slab_debug(slab);
+		else if (show_ops)
+			ops(slab);
+		else if (show_slab)
+			slabcache(slab);
 	}
 }
 
struct option opts[] = {
 	{ "aliases", 0, NULL, 'a' },
-	{ "slabs", 0, NULL, 'l' },
-	{ "numa", 0, NULL, 'n' },
-	{ "zero", 0, NULL, 'z' },
-	{ "help", 0, NULL, 'h' },
-	{ "validate", 0, NULL, 'v' },
+	{ "debug", 2, NULL, 'd' },
+	{ "empty", 0, NULL, 'e' },
 	{ "first-alias", 0, NULL, 'f' },
+	{ "help", 0, NULL, 'h' },
+	{ "inverted", 0, NULL, 'i'},
+	{ "numa", 0, NULL, 'n' },
+	{ "ops", 0, NULL, 'o' },
+	{ "report", 0, NULL, 'r' },
 	{ "shrink", 0, NULL, 's' },
+	{ "slabs", 0, NULL, 'l' },
 	{ "track", 0, NULL, 't'},
-	{ "inverted", 0, NULL, 'i'},
+	{ "validate", 0, NULL, 'v' },
+	{ "zero", 0, NULL, 'z' },
 	{ "1ref", 0, NULL, '1'},
 	{ NULL, 0, NULL, 0 }
 };
@@ -864,10 +1129,9 @@ int main(int argc, char *argv[])
 	char *pattern_source;
 
 	page_size = getpagesize();
-	if (chdir("/sys/slab"))
-		fatal("This kernel does not have SLUB support.\n");
 
-	while ((c = getopt_long(argc, argv, "afhil1npstvzTS", opts, NULL)) != -1)
+	while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
+						opts, NULL)) != -1)
 		switch(c) {
 		case '1':
 			show_single_ref = 1;
@@ -875,6 +1139,14 @@ int main(int argc, char *argv[])
 		case 'a':
 			show_alias = 1;
 			break;
+		case 'd':
+			set_debug = 1;
+			if (!debug_opt_scan(optarg))
+				fatal("Invalid debug option '%s'\n", optarg);
+			break;
+		case 'e':
+			show_empty = 1;
+			break;
 		case 'f':
 			show_first_alias = 1;
 			break;
@@ -887,6 +1159,12 @@ int main(int argc, char *argv[])
 		case 'n':
 			show_numa = 1;
 			break;
+		case 'o':
+			show_ops = 1;
+			break;
+		case 'r':
+			show_report = 1;
+			break;
 		case 's':
 			shrink = 1;
 			break;
@@ -914,8 +1192,8 @@ int main(int argc, char *argv[])
 
 	}
 
-	if (!show_slab && !show_alias && !show_track
-		&& !validate && !shrink)
+	if (!show_slab && !show_alias && !show_track && !show_report
+		&& !validate && !shrink && !set_debug && !show_ops)
 		show_slab = 1;
 
 	if (argc > optind)
diff --git a/arch/avr32/Makefile b/arch/avr32/Makefile
index 6115fc1f0cfa..dc6bc01f232c 100644
--- a/arch/avr32/Makefile
+++ b/arch/avr32/Makefile
@@ -16,7 +16,7 @@ AFLAGS += -mrelax -mno-pic
 CFLAGS_MODULE	+= -mno-relax
 LDFLAGS_vmlinux	+= --relax
 
-cpuflags-$(CONFIG_CPU_AP7000)		+= -mcpu=ap7000
+cpuflags-$(CONFIG_CPU_AT32AP7000)	+= -mcpu=ap7000
 
 CFLAGS	+= $(cpuflags-y)
 AFLAGS	+= $(cpuflags-y)
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 4e4181ed1c6d..13f988402613 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -330,13 +330,13 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
 {
 	struct pt_regs *childregs;
 
-	childregs = ((struct pt_regs *)(THREAD_SIZE + (unsigned long)p->thread_info)) - 1;
+	childregs = ((struct pt_regs *)(THREAD_SIZE + (unsigned long)task_stack_page(p))) - 1;
 	*childregs = *regs;
 
 	if (user_mode(regs))
 		childregs->sp = usp;
 	else
-		childregs->sp = (unsigned long)p->thread_info + THREAD_SIZE;
+		childregs->sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 
 	childregs->r12 = 0; /* Set return value for child */
 
@@ -403,7 +403,7 @@ unsigned long get_wchan(struct task_struct *p)
 	if (!p || p == current || p->state == TASK_RUNNING)
 		return 0;
 
-	stack_page = (unsigned long)p->thread_info;
+	stack_page = (unsigned long)task_stack_page(p);
 	BUG_ON(!stack_page);
 
 	/*
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
index 8ac74dddbbde..3c36c2d16148 100644
--- a/arch/avr32/kernel/ptrace.c
+++ b/arch/avr32/kernel/ptrace.c
@@ -24,7 +24,7 @@
 
 static struct pt_regs *get_user_regs(struct task_struct *tsk)
 {
-	return (struct pt_regs *)((unsigned long) tsk->thread_info +
+	return (struct pt_regs *)((unsigned long)task_stack_page(tsk) +
 				  THREAD_SIZE - sizeof(struct pt_regs));
 }
 
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
index 7c279586fbba..07f6a6fa340d 100644
--- a/arch/avr32/kernel/syscall_table.S
+++ b/arch/avr32/kernel/syscall_table.S
@@ -291,4 +291,5 @@ sys_call_table:
 	.long	sys_shmget		/* 275 */
 	.long	sys_shmdt
 	.long	sys_shmctl
+	.long	sys_utimensat
 	.long	sys_ni_syscall		/* r8 is saturated at nr_syscalls */
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index 4de9edf96ed2..86d107511dd4 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -123,7 +123,7 @@ asmlinkage void do_address_exception(unsigned long ecr, struct pt_regs *regs)
 
 /* This way of handling undefined instructions is stolen from ARM */
 static LIST_HEAD(undef_hook);
-static spinlock_t undef_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(undef_lock);
 
 void register_undef_hook(struct undef_hook *hook)
 {
diff --git a/arch/avr32/kernel/vmlinux.lds.c b/arch/avr32/kernel/vmlinux.lds.c
index 7ad20cfb48a8..e7f72c995a32 100644
--- a/arch/avr32/kernel/vmlinux.lds.c
+++ b/arch/avr32/kernel/vmlinux.lds.c
@@ -35,7 +35,7 @@ SECTIONS
 		_einittext = .;
 		. = ALIGN(4);
 		__tagtable_begin = .;
-		*(.taglist)
+		*(.taglist.init)
 		__tagtable_end = .;
 		*(.init.data)
 		. = ALIGN(16);
diff --git a/arch/avr32/mach-at32ap/clock.c b/arch/avr32/mach-at32ap/clock.c
index 00c435452d7e..0f8c89c9f832 100644
--- a/arch/avr32/mach-at32ap/clock.c
+++ b/arch/avr32/mach-at32ap/clock.c
@@ -18,7 +18,7 @@
 
 #include "clock.h"
 
-static spinlock_t clk_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(clk_lock);
 
 struct clk *clk_get(struct device *dev, const char *id)
 {
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index b68d669f823d..099212d4567c 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -112,16 +112,21 @@ void dma_free_coherent(struct device *dev, size_t size,
 }
 EXPORT_SYMBOL(dma_free_coherent);
 
-#if 0
 void *dma_alloc_writecombine(struct device *dev, size_t size,
 			     dma_addr_t *handle, gfp_t gfp)
 {
 	struct page *page;
+	dma_addr_t phys;
 
 	page = __dma_alloc(dev, size, handle, gfp);
+	if (!page)
+		return NULL;
+
+	phys = page_to_phys(page);
+	*handle = phys;
 
 	/* Now, map the page into P3 with write-combining turned on */
-	return __ioremap(page_to_phys(page), size, _PAGE_BUFFER);
+	return __ioremap(phys, size, _PAGE_BUFFER);
 }
 EXPORT_SYMBOL(dma_alloc_writecombine);
 
@@ -132,8 +137,7 @@ void dma_free_writecombine(struct device *dev, size_t size,
 
 	iounmap(cpu_addr);
 
-	page = bus_to_page(handle);
+	page = phys_to_page(handle);
 	__dma_free(dev, size, page, handle);
 }
 EXPORT_SYMBOL(dma_free_writecombine);
-#endif
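
With the #if 0 dropped, dma_alloc_writecombine() becomes available to
drivers; a hypothetical caller, using only the signatures visible in the
hunk above (example_probe/example_remove are illustrative names, not real
kernel entry points):

        #include <linux/dma-mapping.h>

        static void *fb_virt;
        static dma_addr_t fb_bus;

        static int example_probe(struct device *dev)
        {
                /* 64 KiB write-combined buffer, e.g. for a frame buffer */
                fb_virt = dma_alloc_writecombine(dev, 64 * 1024, &fb_bus,
                                                 GFP_KERNEL);
                if (!fb_virt)
                        return -ENOMEM;
                /* ... hand fb_bus to the device ... */
                return 0;
        }

        static void example_remove(struct device *dev)
        {
                dma_free_writecombine(dev, 64 * 1024, fb_virt, fb_bus);
        }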
diff --git a/arch/blackfin/kernel/asm-offsets.c b/arch/blackfin/kernel/asm-offsets.c
index 41d9a9f89700..e455f4504509 100644
--- a/arch/blackfin/kernel/asm-offsets.c
+++ b/arch/blackfin/kernel/asm-offsets.c
@@ -46,7 +46,7 @@ int main(void)
 	DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
 	DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
 	DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
-	DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+	DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
 	DEFINE(TASK_MM, offsetof(struct task_struct, mm));
 	DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
 	DEFINE(TASK_SIGPENDING, offsetof(struct task_struct, pending));
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index d7c8e514cb92..e718bb4a1ef0 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -73,7 +73,7 @@
 static inline struct pt_regs *get_user_regs(struct task_struct *task)
 {
 	return (struct pt_regs *)
-	    ((unsigned long)task->thread_info +
+	    ((unsigned long)task_stack_page(task) +
 	     (THREAD_SIZE - sizeof(struct pt_regs)));
 }
 
@@ -99,7 +99,7 @@ static inline long get_reg(struct task_struct *task, int regno)
 	unsigned char *reg_ptr;
 
 	struct pt_regs *regs =
-	    (struct pt_regs *)((unsigned long)task->thread_info +
+	    (struct pt_regs *)((unsigned long)task_stack_page(task) +
 			       (THREAD_SIZE - sizeof(struct pt_regs)));
 	reg_ptr = (char *)regs;
 
@@ -125,7 +125,7 @@ put_reg(struct task_struct *task, int regno, unsigned long data)
 	char * reg_ptr;
 
 	struct pt_regs *regs =
-	    (struct pt_regs *)((unsigned long)task->thread_info +
+	    (struct pt_regs *)((unsigned long)task_stack_page(task) +
 			       (THREAD_SIZE - sizeof(struct pt_regs)));
 	reg_ptr = (char *)regs;
 
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index eed694312a79..114738a45582 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -45,15 +45,15 @@ config TIME_LOW_RES
 	bool
 	default y
 
-config ARCH_HAS_ILOG2_U32
+config QUICKLIST
 	bool
 	default y
 
-config ARCH_HAS_ILOG2_U64
+config ARCH_HAS_ILOG2_U32
 	bool
 	default y
 
-config ARCH_USES_SLAB_PAGE_STRUCT
+config ARCH_HAS_ILOG2_U64
 	bool
 	default y
 
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 515a5cea5469..9583a338e9d6 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -25,12 +25,14 @@
 #include <linux/elf.h>
 #include <linux/reboot.h>
 #include <linux/interrupt.h>
+#include <linux/pagemap.h>
 
 #include <asm/asm-offsets.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/setup.h>
 #include <asm/pgtable.h>
+#include <asm/tlb.h>
 #include <asm/gdb-stub.h>
 #include <asm/mb-regs.h>
 
@@ -88,6 +90,8 @@ void cpu_idle(void)
 		while (!need_resched()) {
 			irq_stat[cpu].idle_timestamp = jiffies;
 
+			check_pgt_cache();
+
 			if (!frv_dma_inprogress && idle)
 				idle();
 		}
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index 598a26ab8ad8..7787c3cc52c6 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -13,12 +13,12 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/quicklist.h>
16#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
17#include <asm/page.h> 18#include <asm/page.h>
18#include <asm/cacheflush.h> 19#include <asm/cacheflush.h>
19 20
20pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE))); 21pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE)));
21struct kmem_cache *pgd_cache;
22 22
23pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 23pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
24{ 24{
@@ -100,7 +100,7 @@ static inline void pgd_list_del(pgd_t *pgd)
100 set_page_private(next, (unsigned long) pprev); 100 set_page_private(next, (unsigned long) pprev);
101} 101}
102 102
103void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) 103void pgd_ctor(void *pgd)
104{ 104{
105 unsigned long flags; 105 unsigned long flags;
106 106
@@ -120,7 +120,7 @@ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
120} 120}
121 121
122/* never called when PTRS_PER_PMD > 1 */ 122/* never called when PTRS_PER_PMD > 1 */
123void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) 123void pgd_dtor(void *pgd)
124{ 124{
125 unsigned long flags; /* can be called from interrupt context */ 125 unsigned long flags; /* can be called from interrupt context */
126 126
@@ -133,7 +133,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
133{ 133{
134 pgd_t *pgd; 134 pgd_t *pgd;
135 135
136 pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); 136 pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
137 if (!pgd) 137 if (!pgd)
138 return pgd; 138 return pgd;
139 139
@@ -143,15 +143,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
143void pgd_free(pgd_t *pgd) 143void pgd_free(pgd_t *pgd)
144{ 144{
145 /* in the non-PAE case, clear_page_tables() clears user pgd entries */ 145 /* in the non-PAE case, clear_page_tables() clears user pgd entries */
146 kmem_cache_free(pgd_cache, pgd); 146 quicklist_free(0, pgd_dtor, pgd);
147} 147}
148 148
149void __init pgtable_cache_init(void) 149void __init pgtable_cache_init(void)
150{ 150{
151 pgd_cache = kmem_cache_create("pgd",
152 PTRS_PER_PGD * sizeof(pgd_t),
153 PTRS_PER_PGD * sizeof(pgd_t),
154 SLAB_PANIC,
155 pgd_ctor,
156 pgd_dtor);
157} 151}
152
153void check_pgt_cache(void)
154{
155 quicklist_trim(0, pgd_dtor, 25, 16);
156}
157
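The frv conversion above drops the dedicated pgd slab cache in favour of the generic quicklist allocator: freed pages sit on a per-CPU list and are recycled without re-running the constructor, and the idle loop trims the list via check_pgt_cache(). A minimal sketch of the same pattern for any page-sized table, using list index 0 as the frv code does:

    #include <linux/quicklist.h>

    static void table_ctor(void *p) { /* init a page newly entering the quicklist */ }
    static void table_dtor(void *p) { /* teardown before the page goes back to the buddy */ }

    static void *table_get(void)
    {
            return quicklist_alloc(0, GFP_KERNEL, table_ctor);
    }

    static void table_put(void *p)
    {
            quicklist_free(0, table_dtor, p);
    }

    /* From idle: keep at least 25 pages cached, free at most 16 per
     * call, mirroring the quicklist_trim(0, pgd_dtor, 25, 16) above. */
    static void table_trim(void)
    {
            quicklist_trim(0, table_dtor, 25, 16);
    }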
diff --git a/arch/h8300/kernel/asm-offsets.c b/arch/h8300/kernel/asm-offsets.c
index b78b82ad28a3..fc30b4fd0914 100644
--- a/arch/h8300/kernel/asm-offsets.c
+++ b/arch/h8300/kernel/asm-offsets.c
@@ -30,7 +30,7 @@ int main(void)
30 DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace)); 30 DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
31 DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked)); 31 DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
32 DEFINE(TASK_THREAD, offsetof(struct task_struct, thread)); 32 DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
33 DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info)); 33 DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
34 DEFINE(TASK_MM, offsetof(struct task_struct, mm)); 34 DEFINE(TASK_MM, offsetof(struct task_struct, mm));
35 DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); 35 DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
36 36
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 80b4c5d421b1..e5be819492ef 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -733,9 +733,11 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
733 sys_dev = get_cpu_sysdev(cpu); 733 sys_dev = get_cpu_sysdev(cpu);
734 switch (action) { 734 switch (action) {
735 case CPU_ONLINE: 735 case CPU_ONLINE:
736 case CPU_ONLINE_FROZEN:
736 cache_add_dev(sys_dev); 737 cache_add_dev(sys_dev);
737 break; 738 break;
738 case CPU_DEAD: 739 case CPU_DEAD:
740 case CPU_DEAD_FROZEN:
739 cache_remove_dev(sys_dev); 741 cache_remove_dev(sys_dev);
740 break; 742 break;
741 } 743 }
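This hunk is the first of many applying one mechanical rule: suspend/resume now takes CPUs through _FROZEN variants of the hotplug events, so any notifier that should behave identically across a system sleep transition must match both forms. A sketch of the pattern, hedged to the common case where no suspend-specific handling is wanted:

    static int example_cpu_callback(struct notifier_block *nfb,
                                    unsigned long action, void *hcpu)
    {
            switch (action) {
            case CPU_ONLINE:
            case CPU_ONLINE_FROZEN:
                    /* bring per-CPU state up */
                    break;
            case CPU_DEAD:
            case CPU_DEAD_FROZEN:
                    /* tear per-CPU state down */
                    break;
            }
            return NOTIFY_OK;
    }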
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 2f28540caae2..7ba7c3abd3a4 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -137,10 +137,12 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
137 mutex_lock(&therm_cpu_lock); 137 mutex_lock(&therm_cpu_lock);
138 switch (action) { 138 switch (action) {
139 case CPU_ONLINE: 139 case CPU_ONLINE:
140 case CPU_ONLINE_FROZEN:
140 err = thermal_throttle_add_dev(sys_dev); 141 err = thermal_throttle_add_dev(sys_dev);
141 WARN_ON(err); 142 WARN_ON(err);
142 break; 143 break;
143 case CPU_DEAD: 144 case CPU_DEAD:
145 case CPU_DEAD_FROZEN:
144 thermal_throttle_remove_dev(sys_dev); 146 thermal_throttle_remove_dev(sys_dev);
145 break; 147 break;
146 } 148 }
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 6471a5a13202..200fb3f9ebfb 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -77,8 +77,10 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
78 78
79 /* If we can run i686 user-space code, call us an i686 */ 79 /* If we can run i686 user-space code, call us an i686 */
80#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) 80#define USER686 ((1 << X86_FEATURE_TSC)|\
81 if ( c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686 ) 81 (1 << X86_FEATURE_CX8)|\
82 (1 << X86_FEATURE_CMOV))
83 if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
82 c->x86 = 6; 84 c->x86 = 6;
83 85
84#ifdef CONFIG_SYSCTL 86#ifdef CONFIG_SYSCTL
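The old USER686 definition OR'ed together feature indices: the X86_FEATURE_* constants are bit numbers, not masks, so the test examined the wrong capability bits. A worked illustration, assuming the era's word-0 bit numbers (TSC=4, CX8=8, CMOV=15):

    /* Buggy: 4 | 8 | 15 == 0x0f, i.e. bits 0-3 of capability word 0. */
    #define USER686_WRONG (X86_FEATURE_TSC | X86_FEATURE_CX8 | X86_FEATURE_CMOV)

    /* Fixed: shift each index into a real mask (bits 4, 8 and 15). */
    #define USER686 ((1 << X86_FEATURE_TSC) | \
                     (1 << X86_FEATURE_CX8) | \
                     (1 << X86_FEATURE_CMOV))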
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index eeae0d992337..5c2faa10e9fa 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -169,9 +169,11 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac
169 169
170 switch (action) { 170 switch (action) {
171 case CPU_ONLINE: 171 case CPU_ONLINE:
172 case CPU_ONLINE_FROZEN:
172 cpuid_device_create(cpu); 173 cpuid_device_create(cpu);
173 break; 174 break;
174 case CPU_DEAD: 175 case CPU_DEAD:
176 case CPU_DEAD_FROZEN:
175 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 177 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
176 break; 178 break;
177 } 179 }
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index cbe7ec8dbb9f..83f825f2e2d7 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -567,7 +567,7 @@ static int cpu_request_microcode(int cpu)
567 return error; 567 return error;
568} 568}
569 569
570static int apply_microcode_on_cpu(int cpu) 570static int apply_microcode_check_cpu(int cpu)
571{ 571{
572 struct cpuinfo_x86 *c = cpu_data + cpu; 572 struct cpuinfo_x86 *c = cpu_data + cpu;
573 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 573 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -575,8 +575,9 @@ static int apply_microcode_on_cpu(int cpu)
575 unsigned int val[2]; 575 unsigned int val[2];
576 int err = 0; 576 int err = 0;
577 577
578 /* Check if the microcode is available */
578 if (!uci->mc) 579 if (!uci->mc)
579 return -EINVAL; 580 return 0;
580 581
581 old = current->cpus_allowed; 582 old = current->cpus_allowed;
582 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 583 set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -614,7 +615,7 @@ static int apply_microcode_on_cpu(int cpu)
614 return err; 615 return err;
615} 616}
616 617
617static void microcode_init_cpu(int cpu) 618static void microcode_init_cpu(int cpu, int resume)
618{ 619{
619 cpumask_t old; 620 cpumask_t old;
620 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 621 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -624,8 +625,7 @@ static void microcode_init_cpu(int cpu)
624 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 625 set_cpus_allowed(current, cpumask_of_cpu(cpu));
625 mutex_lock(&microcode_mutex); 626 mutex_lock(&microcode_mutex);
626 collect_cpu_info(cpu); 627 collect_cpu_info(cpu);
627 if (uci->valid && system_state == SYSTEM_RUNNING && 628 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
628 !suspend_cpu_hotplug)
629 cpu_request_microcode(cpu); 629 cpu_request_microcode(cpu);
630 mutex_unlock(&microcode_mutex); 630 mutex_unlock(&microcode_mutex);
631 set_cpus_allowed(current, old); 631 set_cpus_allowed(current, old);
@@ -702,7 +702,7 @@ static struct attribute_group mc_attr_group = {
702 .name = "microcode", 702 .name = "microcode",
703}; 703};
704 704
705static int mc_sysdev_add(struct sys_device *sys_dev) 705static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
706{ 706{
707 int err, cpu = sys_dev->id; 707 int err, cpu = sys_dev->id;
708 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 708 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -711,39 +711,31 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
711 return 0; 711 return 0;
712 712
713 pr_debug("Microcode:CPU %d added\n", cpu); 713 pr_debug("Microcode:CPU %d added\n", cpu);
714 /* If suspend_cpu_hotplug is set, the system is resuming and we should 714 memset(uci, 0, sizeof(*uci));
715 * use the data from before the suspend.
716 */
717 if (suspend_cpu_hotplug) {
718 err = apply_microcode_on_cpu(cpu);
719 if (err)
720 microcode_fini_cpu(cpu);
721 }
722 if (!uci->valid)
723 memset(uci, 0, sizeof(*uci));
724 715
725 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 716 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
726 if (err) 717 if (err)
727 return err; 718 return err;
728 719
729 if (!uci->valid) 720 microcode_init_cpu(cpu, resume);
730 microcode_init_cpu(cpu);
731 721
732 return 0; 722 return 0;
733} 723}
734 724
725static int mc_sysdev_add(struct sys_device *sys_dev)
726{
727 return __mc_sysdev_add(sys_dev, 0);
728}
729
735static int mc_sysdev_remove(struct sys_device *sys_dev) 730static int mc_sysdev_remove(struct sys_device *sys_dev)
736{ 731{
737 int cpu = sys_dev->id; 732 int cpu = sys_dev->id;
738 733
739 if (!cpu_online(cpu)) 734 if (!cpu_online(cpu))
740 return 0; 735 return 0;
736
741 pr_debug("Microcode:CPU %d removed\n", cpu); 737 pr_debug("Microcode:CPU %d removed\n", cpu);
742 /* If suspend_cpu_hotplug is set, the system is suspending and we should 738 microcode_fini_cpu(cpu);
743 * keep the microcode in memory for the resume.
744 */
745 if (!suspend_cpu_hotplug)
746 microcode_fini_cpu(cpu);
747 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 739 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
748 return 0; 740 return 0;
749} 741}
@@ -774,13 +766,34 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
774 766
775 sys_dev = get_cpu_sysdev(cpu); 767 sys_dev = get_cpu_sysdev(cpu);
776 switch (action) { 768 switch (action) {
769 case CPU_UP_CANCELED_FROZEN:
770 /* The CPU refused to come up during a system resume */
771 microcode_fini_cpu(cpu);
772 break;
777 case CPU_ONLINE: 773 case CPU_ONLINE:
778 case CPU_DOWN_FAILED: 774 case CPU_DOWN_FAILED:
779 mc_sysdev_add(sys_dev); 775 mc_sysdev_add(sys_dev);
780 break; 776 break;
777 case CPU_ONLINE_FROZEN:
778 /* System-wide resume is in progress, try to apply microcode */
779 if (apply_microcode_check_cpu(cpu)) {
780 /* The application of microcode failed */
781 microcode_fini_cpu(cpu);
782 __mc_sysdev_add(sys_dev, 1);
783 break;
784 }
785 case CPU_DOWN_FAILED_FROZEN:
786 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
787 printk(KERN_ERR "Microcode: Failed to create the sysfs "
788 "group for CPU%d\n", cpu);
789 break;
781 case CPU_DOWN_PREPARE: 790 case CPU_DOWN_PREPARE:
782 mc_sysdev_remove(sys_dev); 791 mc_sysdev_remove(sys_dev);
783 break; 792 break;
793 case CPU_DOWN_PREPARE_FROZEN:
794 /* Suspend is in progress, only remove the interface */
795 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
796 break;
784 } 797 }
785 return NOTIFY_OK; 798 return NOTIFY_OK;
786} 799}
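Note the control flow in the resume leg above: the CPU_ONLINE_FROZEN case has no break when apply_microcode_check_cpu() succeeds, so it deliberately falls through to CPU_DOWN_FAILED_FROZEN and recreates only the sysfs group. An annotated sketch of that fragment (the fall-through comment is editorial, not in the patch):

    case CPU_ONLINE_FROZEN:
            if (apply_microcode_check_cpu(cpu)) {
                    /* re-apply failed: rebuild CPU state from scratch */
                    microcode_fini_cpu(cpu);
                    __mc_sysdev_add(sys_dev, 1);
                    break;
            }
            /* fall through: re-apply succeeded, only restore the sysfs group */
    case CPU_DOWN_FAILED_FROZEN:
            if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
                    printk(KERN_ERR "Microcode: Failed to create the sysfs "
                                    "group for CPU%d\n", cpu);
            break;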
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 8cd0a91ce107..0c1069b8d638 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -153,9 +153,11 @@ static int msr_class_cpu_callback(struct notifier_block *nfb,
153 153
154 switch (action) { 154 switch (action) {
155 case CPU_ONLINE: 155 case CPU_ONLINE:
156 case CPU_ONLINE_FROZEN:
156 msr_device_create(cpu); 157 msr_device_create(cpu);
157 break; 158 break;
158 case CPU_DEAD: 159 case CPU_DEAD:
160 case CPU_DEAD_FROZEN:
159 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 161 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
160 break; 162 break;
161 } 163 }
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4bec0cbf407a..c05e7e861b29 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -305,7 +305,7 @@ void show_registers(struct pt_regs *regs)
305 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); 305 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
306 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 306 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
307 TASK_COMM_LEN, current->comm, current->pid, 307 TASK_COMM_LEN, current->comm, current->pid,
308 current_thread_info(), current, current->thread_info); 308 current_thread_info(), current, task_thread_info(current));
309 /* 309 /*
310 * When in-kernel, we also print out the stack and code at the 310 * When in-kernel, we also print out the stack and code at the
311 * time of the fault.. 311 * time of the fault..
diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index a7b3999bb37a..74f3da634423 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -119,9 +119,7 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
119 return 0; 119 return 0;
120} 120}
121 121
122#ifdef CONFIG_SMP
123int hard_smp_processor_id(void) 122int hard_smp_processor_id(void)
124{ 123{
125 return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID)); 124 return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID));
126} 125}
127#endif
diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c
index 8fe7e4593d5f..9b77b39b71a6 100644
--- a/arch/i386/mach-voyager/voyager_basic.c
+++ b/arch/i386/mach-voyager/voyager_basic.c
@@ -292,8 +292,8 @@ machine_emergency_restart(void)
292void 292void
293mca_nmi_hook(void) 293mca_nmi_hook(void)
294{ 294{
295 __u8 dumpval __attribute__((unused)) = inb(0xf823); 295 __u8 dumpval __maybe_unused = inb(0xf823);
296 __u8 swnmi __attribute__((unused)) = inb(0xf813); 296 __u8 swnmi __maybe_unused = inb(0xf813);
297 297
298 /* FIXME: assume dump switch pressed */ 298 /* FIXME: assume dump switch pressed */
299 /* check to see if the dump switch was pressed */ 299 /* check to see if the dump switch was pressed */
diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c
index 1cf11af96de2..3de9f9ba2da6 100644
--- a/arch/i386/pci/init.c
+++ b/arch/i386/pci/init.c
@@ -6,7 +6,7 @@
6 in the right sequence from here. */ 6 in the right sequence from here. */
7static __init int pci_access_init(void) 7static __init int pci_access_init(void)
8{ 8{
9 int type __attribute__((unused)) = 0; 9 int type __maybe_unused = 0;
10 10
11#ifdef CONFIG_PCI_DIRECT 11#ifdef CONFIG_PCI_DIRECT
12 type = pci_direct_probe(); 12 type = pci_direct_probe();
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index d3e9f33e8bdd..6a49600cf337 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -236,9 +236,11 @@ static int __cpuinit err_inject_cpu_callback(struct notifier_block *nfb,
236 sys_dev = get_cpu_sysdev(cpu); 236 sys_dev = get_cpu_sysdev(cpu);
237 switch (action) { 237 switch (action) {
238 case CPU_ONLINE: 238 case CPU_ONLINE:
239 case CPU_ONLINE_FROZEN:
239 err_inject_add_dev(sys_dev); 240 err_inject_add_dev(sys_dev);
240 break; 241 break;
241 case CPU_DEAD: 242 case CPU_DEAD:
243 case CPU_DEAD_FROZEN:
242 err_inject_remove_dev(sys_dev); 244 err_inject_remove_dev(sys_dev);
243 break; 245 break;
244 } 246 }
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 1d7cc7e2ce32..f8ae709de0b5 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1689,7 +1689,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
1689 ti->preempt_count = 1; 1689 ti->preempt_count = 1;
1690 ti->task = p; 1690 ti->task = p;
1691 ti->cpu = cpu; 1691 ti->cpu = cpu;
1692 p->thread_info = ti; 1692 p->stack = ti;
1693 p->state = TASK_UNINTERRUPTIBLE; 1693 p->state = TASK_UNINTERRUPTIBLE;
1694 cpu_set(cpu, p->cpus_allowed); 1694 cpu_set(cpu, p->cpus_allowed);
1695 INIT_LIST_HEAD(&p->tasks); 1695 INIT_LIST_HEAD(&p->tasks);
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index a71df9ae0397..85829e27785c 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -975,9 +975,11 @@ static int palinfo_cpu_callback(struct notifier_block *nfb,
975 975
976 switch (action) { 976 switch (action) {
977 case CPU_ONLINE: 977 case CPU_ONLINE:
978 case CPU_ONLINE_FROZEN:
978 create_palinfo_proc_entries(hotcpu); 979 create_palinfo_proc_entries(hotcpu);
979 break; 980 break;
980 case CPU_DEAD: 981 case CPU_DEAD:
982 case CPU_DEAD_FROZEN:
981 remove_palinfo_proc_entries(hotcpu); 983 remove_palinfo_proc_entries(hotcpu);
982 break; 984 break;
983 } 985 }
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index a51f1d0bfb70..89f6b138a62c 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -582,6 +582,7 @@ salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu
582 struct salinfo_data *data; 582 struct salinfo_data *data;
583 switch (action) { 583 switch (action) {
584 case CPU_ONLINE: 584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN:
585 spin_lock_irqsave(&data_saved_lock, flags); 586 spin_lock_irqsave(&data_saved_lock, flags);
586 for (i = 0, data = salinfo_data; 587 for (i = 0, data = salinfo_data;
587 i < ARRAY_SIZE(salinfo_data); 588 i < ARRAY_SIZE(salinfo_data);
@@ -592,6 +593,7 @@ salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu
592 spin_unlock_irqrestore(&data_saved_lock, flags); 593 spin_unlock_irqrestore(&data_saved_lock, flags);
593 break; 594 break;
594 case CPU_DEAD: 595 case CPU_DEAD:
596 case CPU_DEAD_FROZEN:
595 spin_lock_irqsave(&data_saved_lock, flags); 597 spin_lock_irqsave(&data_saved_lock, flags);
596 for (i = 0, data = salinfo_data; 598 for (i = 0, data = salinfo_data;
597 i < ARRAY_SIZE(salinfo_data); 599 i < ARRAY_SIZE(salinfo_data);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 687500ddb4b8..94ae3c87d828 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -412,9 +412,11 @@ static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
412 sys_dev = get_cpu_sysdev(cpu); 412 sys_dev = get_cpu_sysdev(cpu);
413 switch (action) { 413 switch (action) {
414 case CPU_ONLINE: 414 case CPU_ONLINE:
415 case CPU_ONLINE_FROZEN:
415 cache_add_dev(sys_dev); 416 cache_add_dev(sys_dev);
416 break; 417 break;
417 case CPU_DEAD: 418 case CPU_DEAD:
419 case CPU_DEAD_FROZEN:
418 cache_remove_dev(sys_dev); 420 cache_remove_dev(sys_dev);
419 break; 421 break;
420 } 422 }
diff --git a/arch/m68knommu/kernel/asm-offsets.c b/arch/m68knommu/kernel/asm-offsets.c
index b988c7bdc6e4..7cd183d346ef 100644
--- a/arch/m68knommu/kernel/asm-offsets.c
+++ b/arch/m68knommu/kernel/asm-offsets.c
@@ -31,7 +31,7 @@ int main(void)
31 DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace)); 31 DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
32 DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked)); 32 DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
33 DEFINE(TASK_THREAD, offsetof(struct task_struct, thread)); 33 DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
34 DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info)); 34 DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
35 DEFINE(TASK_MM, offsetof(struct task_struct, mm)); 35 DEFINE(TASK_MM, offsetof(struct task_struct, mm));
36 DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); 36 DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
37 37
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 761a779d5c4f..3b27309d54b1 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -82,7 +82,7 @@ void output_task_defines(void)
82{ 82{
83 text("/* MIPS task_struct offsets. */"); 83 text("/* MIPS task_struct offsets. */");
84 offset("#define TASK_STATE ", struct task_struct, state); 84 offset("#define TASK_STATE ", struct task_struct, state);
85 offset("#define TASK_THREAD_INFO ", struct task_struct, thread_info); 85 offset("#define TASK_THREAD_INFO ", struct task_struct, stack);
86 offset("#define TASK_FLAGS ", struct task_struct, flags); 86 offset("#define TASK_FLAGS ", struct task_struct, flags);
87 offset("#define TASK_MM ", struct task_struct, mm); 87 offset("#define TASK_MM ", struct task_struct, mm);
88 offset("#define TASK_PID ", struct task_struct, pid); 88 offset("#define TASK_PID ", struct task_struct, pid);
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 5dcfab6b288e..b361edb83dc6 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -560,7 +560,7 @@ void smtc_boot_secondary(int cpu, struct task_struct *idle)
560 write_tc_gpr_sp(__KSTK_TOS(idle)); 560 write_tc_gpr_sp(__KSTK_TOS(idle));
561 561
562 /* global pointer */ 562 /* global pointer */
563 write_tc_gpr_gp((unsigned long)idle->thread_info); 563 write_tc_gpr_gp((unsigned long)task_thread_info(idle));
564 564
565 smtc_status |= SMTC_MTC_ACTIVE; 565 smtc_status |= SMTC_MTC_ACTIVE;
566 write_tc_c0_tchalt(0); 566 write_tc_c0_tchalt(0);
diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
index 54fdb959149c..d3b7917a87cb 100644
--- a/arch/parisc/kernel/asm-offsets.c
+++ b/arch/parisc/kernel/asm-offsets.c
@@ -54,7 +54,7 @@
54 54
55int main(void) 55int main(void)
56{ 56{
57 DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info)); 57 DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
58 DEFINE(TASK_STATE, offsetof(struct task_struct, state)); 58 DEFINE(TASK_STATE, offsetof(struct task_struct, state));
59 DEFINE(TASK_FLAGS, offsetof(struct task_struct, flags)); 59 DEFINE(TASK_FLAGS, offsetof(struct task_struct, flags));
60 DEFINE(TASK_SIGPENDING, offsetof(struct task_struct, pending)); 60 DEFINE(TASK_SIGPENDING, offsetof(struct task_struct, pending));
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8f48560b7ee2..37bc35e69dbe 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -58,7 +58,7 @@ int main(void)
58#ifdef CONFIG_PPC64 58#ifdef CONFIG_PPC64
59 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context)); 59 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
60#else 60#else
61 DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info)); 61 DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
62 DEFINE(PTRACE, offsetof(struct task_struct, ptrace)); 62 DEFINE(PTRACE, offsetof(struct task_struct, ptrace));
63#endif /* CONFIG_PPC64 */ 63#endif /* CONFIG_PPC64 */
64 64
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index cae39d9dfe48..68991c2d4a1b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -342,10 +342,12 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
342 342
343 switch (action) { 343 switch (action) {
344 case CPU_ONLINE: 344 case CPU_ONLINE:
345 case CPU_ONLINE_FROZEN:
345 register_cpu_online(cpu); 346 register_cpu_online(cpu);
346 break; 347 break;
347#ifdef CONFIG_HOTPLUG_CPU 348#ifdef CONFIG_HOTPLUG_CPU
348 case CPU_DEAD: 349 case CPU_DEAD:
350 case CPU_DEAD_FROZEN:
349 unregister_cpu_online(cpu); 351 unregister_cpu_online(cpu);
350 break; 352 break;
351#endif 353#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b3a592b25ab3..de45aa82d97b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -252,12 +252,15 @@ static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
252 252
253 switch (action) { 253 switch (action) {
254 case CPU_UP_PREPARE: 254 case CPU_UP_PREPARE:
255 case CPU_UP_PREPARE_FROZEN:
255 numa_setup_cpu(lcpu); 256 numa_setup_cpu(lcpu);
256 ret = NOTIFY_OK; 257 ret = NOTIFY_OK;
257 break; 258 break;
258#ifdef CONFIG_HOTPLUG_CPU 259#ifdef CONFIG_HOTPLUG_CPU
259 case CPU_DEAD: 260 case CPU_DEAD:
261 case CPU_DEAD_FROZEN:
260 case CPU_UP_CANCELED: 262 case CPU_UP_CANCELED:
263 case CPU_UP_CANCELED_FROZEN:
261 unmap_cpu_from_node(lcpu); 264 unmap_cpu_from_node(lcpu);
262 break; 265 break;
263 ret = NOTIFY_OK; 266 ret = NOTIFY_OK;
diff --git a/arch/ppc/kernel/asm-offsets.c b/arch/ppc/kernel/asm-offsets.c
index c5850a272650..e8e94321b59e 100644
--- a/arch/ppc/kernel/asm-offsets.c
+++ b/arch/ppc/kernel/asm-offsets.c
@@ -35,7 +35,7 @@ int
35main(void) 35main(void)
36{ 36{
37 DEFINE(THREAD, offsetof(struct task_struct, thread)); 37 DEFINE(THREAD, offsetof(struct task_struct, thread));
38 DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info)); 38 DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
39 DEFINE(MM, offsetof(struct task_struct, mm)); 39 DEFINE(MM, offsetof(struct task_struct, mm));
40 DEFINE(PTRACE, offsetof(struct task_struct, ptrace)); 40 DEFINE(PTRACE, offsetof(struct task_struct, ptrace));
41 DEFINE(KSP, offsetof(struct thread_struct, ksp)); 41 DEFINE(KSP, offsetof(struct thread_struct, ksp));
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index ee89b33145d5..81a2b92ab0c2 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -567,9 +567,11 @@ appldata_cpu_notify(struct notifier_block *self,
567{ 567{
568 switch (action) { 568 switch (action) {
569 case CPU_ONLINE: 569 case CPU_ONLINE:
570 case CPU_ONLINE_FROZEN:
570 appldata_online_cpu((long) hcpu); 571 appldata_online_cpu((long) hcpu);
571 break; 572 break;
572 case CPU_DEAD: 573 case CPU_DEAD:
574 case CPU_DEAD_FROZEN:
573 appldata_offline_cpu((long) hcpu); 575 appldata_offline_cpu((long) hcpu);
574 break; 576 break;
575 default: 577 default:
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ec514fe5ccd0..1375f8a4469e 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -15,7 +15,7 @@
15 15
16int main(void) 16int main(void)
17{ 17{
18 DEFINE(__THREAD_info, offsetof(struct task_struct, thread_info),); 18 DEFINE(__THREAD_info, offsetof(struct task_struct, stack),);
19 DEFINE(__THREAD_ksp, offsetof(struct task_struct, thread.ksp),); 19 DEFINE(__THREAD_ksp, offsetof(struct task_struct, thread.ksp),);
20 DEFINE(__THREAD_per, offsetof(struct task_struct, thread.per_info),); 20 DEFINE(__THREAD_per, offsetof(struct task_struct, thread.per_info),);
21 DEFINE(__THREAD_mm_segment, 21 DEFINE(__THREAD_mm_segment,
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b7977027a28f..09f028a3266b 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -789,10 +789,12 @@ static int __cpuinit smp_cpu_notify(struct notifier_block *self,
789 789
790 switch (action) { 790 switch (action) {
791 case CPU_ONLINE: 791 case CPU_ONLINE:
792 case CPU_ONLINE_FROZEN:
792 if (sysdev_create_file(s, &attr_capability)) 793 if (sysdev_create_file(s, &attr_capability))
793 return NOTIFY_BAD; 794 return NOTIFY_BAD;
794 break; 795 break;
795 case CPU_DEAD: 796 case CPU_DEAD:
797 case CPU_DEAD_FROZEN:
796 sysdev_remove_file(s, &attr_capability); 798 sysdev_remove_file(s, &attr_capability);
797 break; 799 break;
798 } 800 }
diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
index 29d7cfd1c970..6773ed76e414 100644
--- a/arch/sparc/kernel/asm-offsets.c
+++ b/arch/sparc/kernel/asm-offsets.c
@@ -28,7 +28,7 @@ int foo(void)
28 DEFINE(AOFF_task_gid, offsetof(struct task_struct, gid)); 28 DEFINE(AOFF_task_gid, offsetof(struct task_struct, gid));
29 DEFINE(AOFF_task_euid, offsetof(struct task_struct, euid)); 29 DEFINE(AOFF_task_euid, offsetof(struct task_struct, euid));
30 DEFINE(AOFF_task_egid, offsetof(struct task_struct, egid)); 30 DEFINE(AOFF_task_egid, offsetof(struct task_struct, egid));
31 /* DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info)); */ 31 /* DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); */
32 DEFINE(ASIZ_task_uid, sizeof(current->uid)); 32 DEFINE(ASIZ_task_uid, sizeof(current->uid));
33 DEFINE(ASIZ_task_gid, sizeof(current->gid)); 33 DEFINE(ASIZ_task_gid, sizeof(current->gid));
34 DEFINE(ASIZ_task_euid, sizeof(current->euid)); 34 DEFINE(ASIZ_task_euid, sizeof(current->euid));
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index dc652f210290..d0fde36395b4 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -19,6 +19,7 @@
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/kdebug.h> 20#include <linux/kdebug.h>
21 21
22#include <asm/smp.h>
22#include <asm/delay.h> 23#include <asm/delay.h>
23#include <asm/system.h> 24#include <asm/system.h>
24#include <asm/ptrace.h> 25#include <asm/ptrace.h>
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 354cc6b70530..b9c0f307a8fa 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -320,21 +320,7 @@ source "crypto/Kconfig"
320 320
321source "lib/Kconfig" 321source "lib/Kconfig"
322 322
323menu "SCSI support" 323source "drivers/scsi/Kconfig"
324depends on BROKEN
325
326config SCSI
327 tristate "SCSI support"
328
329# This gives us free_dma, which scsi.c wants.
330config GENERIC_ISA_DMA
331 bool
332 depends on SCSI
333 default y
334
335source "arch/um/Kconfig.scsi"
336
337endmenu
338 324
339source "drivers/md/Kconfig" 325source "drivers/md/Kconfig"
340 326
diff --git a/arch/um/Kconfig.scsi b/arch/um/Kconfig.scsi
deleted file mode 100644
index c291c942b1a8..000000000000
--- a/arch/um/Kconfig.scsi
+++ /dev/null
@@ -1,58 +0,0 @@
1comment "SCSI support type (disk, tape, CD-ROM)"
2 depends on SCSI
3
4config BLK_DEV_SD
5 tristate "SCSI disk support"
6 depends on SCSI
7
8config SD_EXTRA_DEVS
9 int "Maximum number of SCSI disks that can be loaded as modules"
10 depends on BLK_DEV_SD
11 default "40"
12
13config CHR_DEV_ST
14 tristate "SCSI tape support"
15 depends on SCSI
16
17config BLK_DEV_SR
18 tristate "SCSI CD-ROM support"
19 depends on SCSI
20
21config BLK_DEV_SR_VENDOR
22 bool "Enable vendor-specific extensions (for SCSI CDROM)"
23 depends on BLK_DEV_SR
24
25config SR_EXTRA_DEVS
26 int "Maximum number of CDROM devices that can be loaded as modules"
27 depends on BLK_DEV_SR
28 default "2"
29
30config CHR_DEV_SG
31 tristate "SCSI generic support"
32 depends on SCSI
33
34comment "Some SCSI devices (e.g. CD jukebox) support multiple LUNs"
35 depends on SCSI
36
37#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
38config SCSI_DEBUG_QUEUES
39 bool "Enable extra checks in new queueing code"
40 depends on SCSI
41
42#fi
43config SCSI_MULTI_LUN
44 bool "Probe all LUNs on each SCSI device"
45 depends on SCSI
46
47config SCSI_CONSTANTS
48 bool "Verbose SCSI error reporting (kernel size +=12K)"
49 depends on SCSI
50
51config SCSI_LOGGING
52 bool "SCSI logging facility"
53 depends on SCSI
54
55config SCSI_DEBUG
56 tristate "SCSI debugging host simulator (EXPERIMENTAL)"
57 depends on SCSI
58
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index ef36facd8fe9..a96ae1a0610e 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -178,20 +178,23 @@ int start_uml_skas(void)
178 178
179int external_pid_skas(struct task_struct *task) 179int external_pid_skas(struct task_struct *task)
180{ 180{
181#warning Need to look up userspace_pid by cpu 181 /* FIXME: Need to look up userspace_pid by cpu */
182 return(userspace_pid[0]); 182 return(userspace_pid[0]);
183} 183}
184 184
185int thread_pid_skas(struct task_struct *task) 185int thread_pid_skas(struct task_struct *task)
186{ 186{
187#warning Need to look up userspace_pid by cpu 187 /* FIXME: Need to look up userspace_pid by cpu */
188 return(userspace_pid[0]); 188 return(userspace_pid[0]);
189} 189}
190 190
191void kill_off_processes_skas(void) 191void kill_off_processes_skas(void)
192{ 192{
193 if(proc_mm) 193 if(proc_mm)
194#warning need to loop over userspace_pids in kill_off_processes_skas 194 /*
195 * FIXME: need to loop over userspace_pids in
196 * kill_off_processes_skas
197 */
195 os_kill_ptraced_process(userspace_pid[0], 1); 198 os_kill_ptraced_process(userspace_pid[0], 1);
196 else { 199 else {
197 struct task_struct *p; 200 struct task_struct *p;
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 92a7b59120d6..2d9d2ca39299 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -239,6 +239,7 @@ out:
239 return ok; 239 return ok;
240} 240}
241 241
242#ifdef UML_CONFIG_MODE_TT
242void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)) 243void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int))
243{ 244{
244 int flags = 0, pages; 245 int flags = 0, pages;
@@ -260,6 +261,7 @@ void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int))
260 "errno = %d\n", errno); 261 "errno = %d\n", errno);
261 } 262 }
262} 263}
264#endif
263 265
264void init_new_thread_signals(void) 266void init_new_thread_signals(void)
265{ 267{
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index 8e490fff3d47..5c8946320799 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -68,7 +68,7 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr)
68 int err, pid = mm_idp->u.pid; 68 int err, pid = mm_idp->u.pid;
69 69
70 if(proc_mm) 70 if(proc_mm)
71#warning Need to look up userspace_pid by cpu 71 /* FIXME: Need to look up userspace_pid by cpu */
72 pid = userspace_pid[0]; 72 pid = userspace_pid[0];
73 73
74 multi_count++; 74 multi_count++;
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 5c088a55396c..6a0e466d01e3 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -586,7 +586,7 @@ void switch_mm_skas(struct mm_id *mm_idp)
586{ 586{
587 int err; 587 int err;
588 588
589#warning need cpu pid in switch_mm_skas 589 /* FIXME: need cpu pid in switch_mm_skas */
590 if(proc_mm){ 590 if(proc_mm){
591 err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0, 591 err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0,
592 mm_idp->u.mm_fd); 592 mm_idp->u.mm_fd);
diff --git a/arch/v850/kernel/asm-offsets.c b/arch/v850/kernel/asm-offsets.c
index 24f291369070..cee5c3142d41 100644
--- a/arch/v850/kernel/asm-offsets.c
+++ b/arch/v850/kernel/asm-offsets.c
@@ -29,7 +29,7 @@ int main (void)
29 DEFINE (TASK_PTRACE, offsetof (struct task_struct, ptrace)); 29 DEFINE (TASK_PTRACE, offsetof (struct task_struct, ptrace));
30 DEFINE (TASK_BLOCKED, offsetof (struct task_struct, blocked)); 30 DEFINE (TASK_BLOCKED, offsetof (struct task_struct, blocked));
31 DEFINE (TASK_THREAD, offsetof (struct task_struct, thread)); 31 DEFINE (TASK_THREAD, offsetof (struct task_struct, thread));
32 DEFINE (TASK_THREAD_INFO, offsetof (struct task_struct, thread_info)); 32 DEFINE (TASK_THREAD_INFO, offsetof (struct task_struct, stack));
33 DEFINE (TASK_MM, offsetof (struct task_struct, mm)); 33 DEFINE (TASK_MM, offsetof (struct task_struct, mm));
34 DEFINE (TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm)); 34 DEFINE (TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm));
35 DEFINE (TASK_PID, offsetof (struct task_struct, pid)); 35 DEFINE (TASK_PID, offsetof (struct task_struct, pid));
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 3bc30d2c13d3..3eaceac32481 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -32,7 +32,7 @@ atomic_t irq_err_count;
32 */ 32 */
33static inline void stack_overflow_check(struct pt_regs *regs) 33static inline void stack_overflow_check(struct pt_regs *regs)
34{ 34{
35 u64 curbase = (u64) current->thread_info; 35 u64 curbase = (u64)task_stack_page(current);
36 static unsigned long warned = -60*HZ; 36 static unsigned long warned = -60*HZ;
37 37
38 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && 38 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 442169640e45..a14375dd5425 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -720,9 +720,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
720 720
721 switch (action) { 721 switch (action) {
722 case CPU_ONLINE: 722 case CPU_ONLINE:
723 case CPU_ONLINE_FROZEN:
723 mce_create_device(cpu); 724 mce_create_device(cpu);
724 break; 725 break;
725 case CPU_DEAD: 726 case CPU_DEAD:
727 case CPU_DEAD_FROZEN:
726 mce_remove_device(cpu); 728 mce_remove_device(cpu);
727 break; 729 break;
728 } 730 }
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d0bd5d66e103..03356e64f9c8 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -654,9 +654,11 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
654 654
655 switch (action) { 655 switch (action) {
656 case CPU_ONLINE: 656 case CPU_ONLINE:
657 case CPU_ONLINE_FROZEN:
657 threshold_create_device(cpu); 658 threshold_create_device(cpu);
658 break; 659 break;
659 case CPU_DEAD: 660 case CPU_DEAD:
661 case CPU_DEAD_FROZEN:
660 threshold_remove_device(cpu); 662 threshold_remove_device(cpu);
661 break; 663 break;
662 default: 664 default:
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index dc32cef96195..51d4c6fa88c8 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -327,7 +327,7 @@ static int __cpuinit
327cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 327cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
328{ 328{
329 long cpu = (long)arg; 329 long cpu = (long)arg;
330 if (action == CPU_ONLINE) 330 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
331 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); 331 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
332 return NOTIFY_DONE; 332 return NOTIFY_DONE;
333} 333}
diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
index b256cfbef344..698079b3a336 100644
--- a/arch/xtensa/kernel/asm-offsets.c
+++ b/arch/xtensa/kernel/asm-offsets.c
@@ -70,7 +70,7 @@ int main(void)
70 DEFINE(TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm)); 70 DEFINE(TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm));
71 DEFINE(TASK_PID, offsetof (struct task_struct, pid)); 71 DEFINE(TASK_PID, offsetof (struct task_struct, pid));
72 DEFINE(TASK_THREAD, offsetof (struct task_struct, thread)); 72 DEFINE(TASK_THREAD, offsetof (struct task_struct, thread));
73 DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, thread_info)); 73 DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, stack));
74 DEFINE(TASK_STRUCT_SIZE, sizeof (struct task_struct)); 74 DEFINE(TASK_STRUCT_SIZE, sizeof (struct task_struct));
75 BLANK(); 75 BLANK();
76 76
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 640aa839d63f..109e91b91ffa 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1306,7 +1306,7 @@ static void as_exit_queue(elevator_t *e)
1306 struct as_data *ad = e->elevator_data; 1306 struct as_data *ad = e->elevator_data;
1307 1307
1308 del_timer_sync(&ad->antic_timer); 1308 del_timer_sync(&ad->antic_timer);
1309 kblockd_flush(); 1309 kblockd_flush_work(&ad->antic_work);
1310 1310
1311 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); 1311 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
1312 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); 1312 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
diff --git a/block/genhd.c b/block/genhd.c
index b5664440896c..93a2cf654597 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -213,6 +213,59 @@ struct gendisk *get_gendisk(dev_t dev, int *part)
213 return kobj ? to_disk(kobj) : NULL; 213 return kobj ? to_disk(kobj) : NULL;
214} 214}
215 215
216/*
217 * print a full list of all partitions - intended for places where the root
218 * filesystem can't be mounted and thus to give the victim some idea of what
219 * went wrong
220 */
221void __init printk_all_partitions(void)
222{
223 int n;
224 struct gendisk *sgp;
225
226 mutex_lock(&block_subsys_lock);
227 /* For each block device... */
228 list_for_each_entry(sgp, &block_subsys.list, kobj.entry) {
229 char buf[BDEVNAME_SIZE];
230 /*
231 * Don't show empty devices or things that have been suppressed
232 */
233 if (get_capacity(sgp) == 0 ||
234 (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
235 continue;
236
237 /*
238 * Note, unlike /proc/partitions, I am showing the numbers in
239 * hex - the same format as the root= option takes.
240 */
241 printk("%02x%02x %10llu %s",
242 sgp->major, sgp->first_minor,
243 (unsigned long long)get_capacity(sgp) >> 1,
244 disk_name(sgp, 0, buf));
245 if (sgp->driverfs_dev != NULL &&
246 sgp->driverfs_dev->driver != NULL)
247 printk(" driver: %s\n",
248 sgp->driverfs_dev->driver->name);
249 else
250 printk(" (driver?)\n");
251
252 /* now show the partitions */
253 for (n = 0; n < sgp->minors - 1; ++n) {
254 if (sgp->part[n] == NULL)
255 continue;
256 if (sgp->part[n]->nr_sects == 0)
257 continue;
258 printk(" %02x%02x %10llu %s\n",
259 sgp->major, n + 1 + sgp->first_minor,
260 (unsigned long long)sgp->part[n]->nr_sects >> 1,
261 disk_name(sgp, n + 1, buf));
262 } /* partition subloop */
263 } /* Block device loop */
264
265 mutex_unlock(&block_subsys_lock);
266 return;
267}
268
216#ifdef CONFIG_PROC_FS 269#ifdef CONFIG_PROC_FS
217/* iterator */ 270/* iterator */
218static void *part_start(struct seq_file *part, loff_t *pos) 271static void *part_start(struct seq_file *part, loff_t *pos)
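printk_all_partitions() exists for exactly one consumer class: code paths that are about to give up on mounting root. A hedged sketch of such a call site (mount_root_failed and the message are illustrative, not from this patch):

    if (mount_root_failed) {
            /* show every disk and partition the kernel can see */
            printk_all_partitions();
            panic("VFS: unable to mount root fs");
    }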
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index f294f1538f1e..17e188973428 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1712,7 +1712,6 @@ EXPORT_SYMBOL(blk_stop_queue);
1712void blk_sync_queue(struct request_queue *q) 1712void blk_sync_queue(struct request_queue *q)
1713{ 1713{
1714 del_timer_sync(&q->unplug_timer); 1714 del_timer_sync(&q->unplug_timer);
1715 kblockd_flush();
1716} 1715}
1717EXPORT_SYMBOL(blk_sync_queue); 1716EXPORT_SYMBOL(blk_sync_queue);
1718 1717
@@ -3508,7 +3507,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
3508 * If a CPU goes away, splice its entries to the current CPU 3507 * If a CPU goes away, splice its entries to the current CPU
3509 * and trigger a run of the softirq 3508 * and trigger a run of the softirq
3510 */ 3509 */
3511 if (action == CPU_DEAD) { 3510 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3512 int cpu = (unsigned long) hcpu; 3511 int cpu = (unsigned long) hcpu;
3513 3512
3514 local_irq_disable(); 3513 local_irq_disable();
@@ -3632,11 +3631,11 @@ int kblockd_schedule_work(struct work_struct *work)
3632 3631
3633EXPORT_SYMBOL(kblockd_schedule_work); 3632EXPORT_SYMBOL(kblockd_schedule_work);
3634 3633
3635void kblockd_flush(void) 3634void kblockd_flush_work(struct work_struct *work)
3636{ 3635{
3637 flush_workqueue(kblockd_workqueue); 3636 cancel_work_sync(work);
3638} 3637}
3639EXPORT_SYMBOL(kblockd_flush); 3638EXPORT_SYMBOL(kblockd_flush_work);
3640 3639
3641int __init blk_dev_init(void) 3640int __init blk_dev_init(void)
3642{ 3641{
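The kblockd_flush() to kblockd_flush_work() change narrows the wait: flushing the whole shared kblockd workqueue stalls on every queued item, while cancel_work_sync() cancels and waits for a single owned item. A sketch of the before/after, reusing the as-iosched example from the hunk above:

    /* before: wait for everything queued on the shared workqueue */
    flush_workqueue(kblockd_workqueue);

    /* after: cancel and wait for only the work item we own */
    cancel_work_sync(&ad->antic_work);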
diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index f8c63410bcbf..52b23471dd69 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -29,7 +29,6 @@ static u32 acpi_suspend_states[] = {
29 [PM_SUSPEND_ON] = ACPI_STATE_S0, 29 [PM_SUSPEND_ON] = ACPI_STATE_S0,
30 [PM_SUSPEND_STANDBY] = ACPI_STATE_S1, 30 [PM_SUSPEND_STANDBY] = ACPI_STATE_S1,
31 [PM_SUSPEND_MEM] = ACPI_STATE_S3, 31 [PM_SUSPEND_MEM] = ACPI_STATE_S3,
32 [PM_SUSPEND_DISK] = ACPI_STATE_S4,
33 [PM_SUSPEND_MAX] = ACPI_STATE_S5 32 [PM_SUSPEND_MAX] = ACPI_STATE_S5
34}; 33};
35 34
@@ -94,14 +93,6 @@ static int acpi_pm_enter(suspend_state_t pm_state)
94 do_suspend_lowlevel(); 93 do_suspend_lowlevel();
95 break; 94 break;
96 95
97 case PM_SUSPEND_DISK:
98 if (acpi_pm_ops.pm_disk_mode == PM_DISK_PLATFORM)
99 status = acpi_enter_sleep_state(acpi_state);
100 break;
101 case PM_SUSPEND_MAX:
102 acpi_power_off();
103 break;
104
105 default: 96 default:
106 return -EINVAL; 97 return -EINVAL;
107 } 98 }
@@ -157,12 +148,13 @@ int acpi_suspend(u32 acpi_state)
157 suspend_state_t states[] = { 148 suspend_state_t states[] = {
158 [1] = PM_SUSPEND_STANDBY, 149 [1] = PM_SUSPEND_STANDBY,
159 [3] = PM_SUSPEND_MEM, 150 [3] = PM_SUSPEND_MEM,
160 [4] = PM_SUSPEND_DISK,
161 [5] = PM_SUSPEND_MAX 151 [5] = PM_SUSPEND_MAX
162 }; 152 };
163 153
164 if (acpi_state < 6 && states[acpi_state]) 154 if (acpi_state < 6 && states[acpi_state])
165 return pm_suspend(states[acpi_state]); 155 return pm_suspend(states[acpi_state]);
156 if (acpi_state == 4)
157 return hibernate();
166 return -EINVAL; 158 return -EINVAL;
167} 159}
168 160
@@ -189,6 +181,49 @@ static struct pm_ops acpi_pm_ops = {
189 .finish = acpi_pm_finish, 181 .finish = acpi_pm_finish,
190}; 182};
191 183
184#ifdef CONFIG_SOFTWARE_SUSPEND
185static int acpi_hibernation_prepare(void)
186{
187 return acpi_sleep_prepare(ACPI_STATE_S4);
188}
189
190static int acpi_hibernation_enter(void)
191{
192 acpi_status status = AE_OK;
193 unsigned long flags = 0;
194
195 ACPI_FLUSH_CPU_CACHE();
196
197 local_irq_save(flags);
198 acpi_enable_wakeup_device(ACPI_STATE_S4);
199 /* This shouldn't return. If it returns, we have a problem */
200 status = acpi_enter_sleep_state(ACPI_STATE_S4);
201 local_irq_restore(flags);
202
203 return ACPI_SUCCESS(status) ? 0 : -EFAULT;
204}
205
206static void acpi_hibernation_finish(void)
207{
208 acpi_leave_sleep_state(ACPI_STATE_S4);
209 acpi_disable_wakeup_device(ACPI_STATE_S4);
210
211 /* reset firmware waking vector */
212 acpi_set_firmware_waking_vector((acpi_physical_address) 0);
213
214 if (init_8259A_after_S1) {
215 printk("Broken toshiba laptop -> kicking interrupts\n");
216 init_8259A(0);
217 }
218}
219
220static struct hibernation_ops acpi_hibernation_ops = {
221 .prepare = acpi_hibernation_prepare,
222 .enter = acpi_hibernation_enter,
223 .finish = acpi_hibernation_finish,
224};
225#endif /* CONFIG_SOFTWARE_SUSPEND */
226
192/* 227/*
193 * Toshiba fails to preserve interrupts over S1, reinitialization 228 * Toshiba fails to preserve interrupts over S1, reinitialization
194 * of 8259 is needed after S1 resume. 229 * of 8259 is needed after S1 resume.
@@ -227,14 +262,18 @@ int __init acpi_sleep_init(void)
227 sleep_states[i] = 1; 262 sleep_states[i] = 1;
228 printk(" S%d", i); 263 printk(" S%d", i);
229 } 264 }
230 if (i == ACPI_STATE_S4) {
231 if (sleep_states[i])
232 acpi_pm_ops.pm_disk_mode = PM_DISK_PLATFORM;
233 }
234 } 265 }
235 printk(")\n"); 266 printk(")\n");
236 267
237 pm_set_ops(&acpi_pm_ops); 268 pm_set_ops(&acpi_pm_ops);
269
270#ifdef CONFIG_SOFTWARE_SUSPEND
271 if (sleep_states[ACPI_STATE_S4])
272 hibernation_set_ops(&acpi_hibernation_ops);
273#else
274 sleep_states[ACPI_STATE_S4] = 0;
275#endif
276
238 return 0; 277 return 0;
239} 278}
240 279
diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c
index 5a76e5be61d5..76b45f0b8341 100644
--- a/drivers/acpi/sleep/proc.c
+++ b/drivers/acpi/sleep/proc.c
@@ -60,7 +60,7 @@ acpi_system_write_sleep(struct file *file,
60 state = simple_strtoul(str, NULL, 0); 60 state = simple_strtoul(str, NULL, 0);
61#ifdef CONFIG_SOFTWARE_SUSPEND 61#ifdef CONFIG_SOFTWARE_SUSPEND
62 if (state == 4) { 62 if (state == 4) {
63 error = pm_suspend(PM_SUSPEND_DISK); 63 error = hibernate();
64 goto Done; 64 goto Done;
65 } 65 }
66#endif 66#endif
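With PM_SUSPEND_DISK removed from the pm_suspend() path, S4 requests route to the new hibernate() entry point instead. A sketch of the resulting ACPI dispatch (sketch_acpi_suspend is a hypothetical name condensing the acpi_suspend() table above):

    int sketch_acpi_suspend(u32 acpi_state)
    {
            switch (acpi_state) {
            case 1: return pm_suspend(PM_SUSPEND_STANDBY);
            case 3: return pm_suspend(PM_SUSPEND_MEM);
            case 4: return hibernate();      /* was PM_SUSPEND_DISK */
            case 5: return pm_suspend(PM_SUSPEND_MAX);
            default: return -EINVAL;
            }
    }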
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index a7950885d18e..fef87dd70d17 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1316,7 +1316,7 @@ void ata_port_flush_task(struct ata_port *ap)
1316 spin_unlock_irqrestore(ap->lock, flags); 1316 spin_unlock_irqrestore(ap->lock, flags);
1317 1317
1318 DPRINTK("flush #1\n"); 1318 DPRINTK("flush #1\n");
1319 flush_workqueue(ata_wq); 1319 cancel_work_sync(&ap->port_task.work); /* akpm: seems unneeded */
1320 1320
1321 /* 1321 /*
1322 * At this point, if a task is running, it's guaranteed to see 1322 * At this point, if a task is running, it's guaranteed to see
@@ -1327,7 +1327,7 @@ void ata_port_flush_task(struct ata_port *ap)
1327 if (ata_msg_ctl(ap)) 1327 if (ata_msg_ctl(ap))
1328 ata_port_printk(ap, KERN_DEBUG, "%s: flush #2\n", 1328 ata_port_printk(ap, KERN_DEBUG, "%s: flush #2\n",
1329 __FUNCTION__); 1329 __FUNCTION__);
1330 flush_workqueue(ata_wq); 1330 cancel_work_sync(&ap->port_task.work);
1331 } 1331 }
1332 1332
1333 spin_lock_irqsave(ap->lock, flags); 1333 spin_lock_irqsave(ap->lock, flags);
@@ -6475,9 +6475,9 @@ void ata_port_detach(struct ata_port *ap)
6475 /* Flush hotplug task. The sequence is similar to 6475 /* Flush hotplug task. The sequence is similar to
6476 * ata_port_flush_task(). 6476 * ata_port_flush_task().
6477 */ 6477 */
6478 flush_workqueue(ata_aux_wq); 6478 cancel_work_sync(&ap->hotplug_task.work); /* akpm: why? */
6479 cancel_delayed_work(&ap->hotplug_task); 6479 cancel_delayed_work(&ap->hotplug_task);
6480 flush_workqueue(ata_aux_wq); 6480 cancel_work_sync(&ap->hotplug_task.work);
6481 6481
6482 skip_eh: 6482 skip_eh:
6483 /* remove the associated SCSI host */ 6483 /* remove the associated SCSI host */
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 067a9e8bc377..8d8cdfec6529 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -126,10 +126,13 @@ static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
126 126
127 switch (action) { 127 switch (action) {
128 case CPU_UP_PREPARE: 128 case CPU_UP_PREPARE:
129 case CPU_UP_PREPARE_FROZEN:
129 rc = topology_add_dev(cpu); 130 rc = topology_add_dev(cpu);
130 break; 131 break;
131 case CPU_UP_CANCELED: 132 case CPU_UP_CANCELED:
133 case CPU_UP_CANCELED_FROZEN:
132 case CPU_DEAD: 134 case CPU_DEAD:
135 case CPU_DEAD_FROZEN:
133 topology_remove_dev(cpu); 136 topology_remove_dev(cpu);
134 break; 137 break;
135 } 138 }
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index af6d7274a7cc..18cdd8c77626 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -243,17 +243,13 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
243 transfer_result = lo_do_transfer(lo, WRITE, page, offset, 243 transfer_result = lo_do_transfer(lo, WRITE, page, offset,
244 bvec->bv_page, bv_offs, size, IV); 244 bvec->bv_page, bv_offs, size, IV);
245 if (unlikely(transfer_result)) { 245 if (unlikely(transfer_result)) {
246 char *kaddr;
247
248 /* 246 /*
249 * The transfer failed, but we still write the data to 247 * The transfer failed, but we still write the data to
250 * keep prepare/commit calls balanced. 248 * keep prepare/commit calls balanced.
251 */ 249 */
252 printk(KERN_ERR "loop: transfer error block %llu\n", 250 printk(KERN_ERR "loop: transfer error block %llu\n",
253 (unsigned long long)index); 251 (unsigned long long)index);
254 kaddr = kmap_atomic(page, KM_USER0); 252 zero_user_page(page, offset, size, KM_USER0);
255 memset(kaddr + offset, 0, size);
256 kunmap_atomic(kaddr, KM_USER0);
257 } 253 }
258 flush_dcache_page(page); 254 flush_dcache_page(page);
259 ret = aops->commit_write(file, page, offset, 255 ret = aops->commit_write(file, page, offset,
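zero_user_page() collapses the removed kmap_atomic()/memset()/kunmap_atomic() triple into one call; the fourth argument is the kmap slot, matching the 2.6.22-era signature. A sketch of the equivalence:

    /* open-coded form removed above */
    char *kaddr = kmap_atomic(page, KM_USER0);
    memset(kaddr + offset, 0, size);
    kunmap_atomic(kaddr, KM_USER0);

    /* single-call replacement */
    zero_user_page(page, offset, size, KM_USER0);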
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 090796bef78f..069ae39a9cd9 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -366,20 +366,25 @@ static struct disk_attribute pid_attr = {
366 .show = pid_show, 366 .show = pid_show,
367}; 367};
368 368
369static void nbd_do_it(struct nbd_device *lo) 369static int nbd_do_it(struct nbd_device *lo)
370{ 370{
371 struct request *req; 371 struct request *req;
372 int ret;
372 373
373 BUG_ON(lo->magic != LO_MAGIC); 374 BUG_ON(lo->magic != LO_MAGIC);
374 375
375 lo->pid = current->pid; 376 lo->pid = current->pid;
376 sysfs_create_file(&lo->disk->kobj, &pid_attr.attr); 377 ret = sysfs_create_file(&lo->disk->kobj, &pid_attr.attr);
378 if (ret) {
379 printk(KERN_ERR "nbd: sysfs_create_file failed!");
380 return ret;
381 }
377 382
378 while ((req = nbd_read_stat(lo)) != NULL) 383 while ((req = nbd_read_stat(lo)) != NULL)
379 nbd_end_request(req); 384 nbd_end_request(req);
380 385
381 sysfs_remove_file(&lo->disk->kobj, &pid_attr.attr); 386 sysfs_remove_file(&lo->disk->kobj, &pid_attr.attr);
382 return; 387 return 0;
383} 388}
384 389
385static void nbd_clear_que(struct nbd_device *lo) 390static void nbd_clear_que(struct nbd_device *lo)
@@ -569,7 +574,9 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
569 case NBD_DO_IT: 574 case NBD_DO_IT:
570 if (!lo->file) 575 if (!lo->file)
571 return -EINVAL; 576 return -EINVAL;
572 nbd_do_it(lo); 577 error = nbd_do_it(lo);
578 if (error)
579 return error;
573 /* on return tidy up in case we have a signal */ 580 /* on return tidy up in case we have a signal */
574 /* Forcibly shutdown the socket causing all listeners 581 /* Forcibly shutdown the socket causing all listeners
575 * to error 582 * to error
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 5f3acd8e64b8..7cda04b33534 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -91,3 +91,17 @@ config HW_RANDOM_OMAP
91 module will be called omap-rng. 91 module will be called omap-rng.
92 92
93 If unsure, say Y. 93 If unsure, say Y.
94
95config HW_RANDOM_PASEMI
96 tristate "PA Semi HW Random Number Generator support"
97 depends on HW_RANDOM && PPC_PASEMI
98 default HW_RANDOM
99 ---help---
100 This driver provides kernel-side support for the Random Number
101 Generator hardware found on the PA6T-1682M processor.
102
103 To compile this driver as a module, choose M here: the
104 module will be called pasemi-rng.
105
106 If unsure, say Y.
107
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index c41fa19454e3..c8b7300e2fb1 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_HW_RANDOM_GEODE) += geode-rng.o
10obj-$(CONFIG_HW_RANDOM_VIA) += via-rng.o 10obj-$(CONFIG_HW_RANDOM_VIA) += via-rng.o
11obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o 11obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o
12obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o 12obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o
13obj-$(CONFIG_HW_RANDOM_PASEMI) += pasemi-rng.o
diff --git a/drivers/char/hw_random/pasemi-rng.c b/drivers/char/hw_random/pasemi-rng.c
new file mode 100644
index 000000000000..fa6040b6c8f2
--- /dev/null
+++ b/drivers/char/hw_random/pasemi-rng.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2006-2007 PA Semi, Inc
+ *
+ * Maintained by: Olof Johansson <olof@lixom.net>
+ *
+ * Driver for the PWRficient on-chip RNG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/hw_random.h>
+#include <asm/of_platform.h>
+#include <asm/io.h>
+
+#define SDCRNG_CTL_REG			0x00
+#define SDCRNG_CTL_FVLD_M		0x0000f000
+#define SDCRNG_CTL_FVLD_S		12
+#define SDCRNG_CTL_KSZ			0x00000800
+#define SDCRNG_CTL_RSRC_CRG		0x00000010
+#define SDCRNG_CTL_RSRC_RRG		0x00000000
+#define SDCRNG_CTL_CE			0x00000004
+#define SDCRNG_CTL_RE			0x00000002
+#define SDCRNG_CTL_DR			0x00000001
+#define SDCRNG_CTL_SELECT_RRG_RNG	(SDCRNG_CTL_RE | SDCRNG_CTL_RSRC_RRG)
+#define SDCRNG_CTL_SELECT_CRG_RNG	(SDCRNG_CTL_CE | SDCRNG_CTL_RSRC_CRG)
+#define SDCRNG_VAL_REG			0x20
+
+#define MODULE_NAME "pasemi_rng"
+
+static int pasemi_rng_data_present(struct hwrng *rng)
+{
+	void __iomem *rng_regs = (void __iomem *)rng->priv;
+
+	return (in_le32(rng_regs + SDCRNG_CTL_REG)
+		& SDCRNG_CTL_FVLD_M) ? 1 : 0;
+}
+
+static int pasemi_rng_data_read(struct hwrng *rng, u32 *data)
+{
+	void __iomem *rng_regs = (void __iomem *)rng->priv;
+	*data = in_le32(rng_regs + SDCRNG_VAL_REG);
+	return 4;
+}
+
+static int pasemi_rng_init(struct hwrng *rng)
+{
+	void __iomem *rng_regs = (void __iomem *)rng->priv;
+	u32 ctl;
+
+	ctl = SDCRNG_CTL_DR | SDCRNG_CTL_SELECT_RRG_RNG | SDCRNG_CTL_KSZ;
+	out_le32(rng_regs + SDCRNG_CTL_REG, ctl);
+	out_le32(rng_regs + SDCRNG_CTL_REG, ctl & ~SDCRNG_CTL_DR);
+
+	return 0;
+}
+
+static void pasemi_rng_cleanup(struct hwrng *rng)
+{
+	void __iomem *rng_regs = (void __iomem *)rng->priv;
+	u32 ctl;
+
+	ctl = SDCRNG_CTL_RE | SDCRNG_CTL_CE;
+	out_le32(rng_regs + SDCRNG_CTL_REG,
+		 in_le32(rng_regs + SDCRNG_CTL_REG) & ~ctl);
+}
+
+static struct hwrng pasemi_rng = {
+	.name		= MODULE_NAME,
+	.init		= pasemi_rng_init,
+	.cleanup	= pasemi_rng_cleanup,
+	.data_present	= pasemi_rng_data_present,
+	.data_read	= pasemi_rng_data_read,
+};
+
+static int __devinit rng_probe(struct of_device *ofdev,
+			       const struct of_device_id *match)
+{
+	void __iomem *rng_regs;
+	struct device_node *rng_np = ofdev->node;
+	struct resource res;
+	int err = 0;
+
+	err = of_address_to_resource(rng_np, 0, &res);
+	if (err)
+		return -ENODEV;
+
+	rng_regs = ioremap(res.start, 0x100);
+
+	if (!rng_regs)
+		return -ENOMEM;
+
+	pasemi_rng.priv = (unsigned long)rng_regs;
+
+	printk(KERN_INFO "Registering PA Semi RNG\n");
+
+	err = hwrng_register(&pasemi_rng);
+
+	if (err)
+		iounmap(rng_regs);
+
+	return err;
+}
+
+static int __devexit rng_remove(struct of_device *dev)
+{
+	void __iomem *rng_regs = (void __iomem *)pasemi_rng.priv;
+
+	hwrng_unregister(&pasemi_rng);
+	iounmap(rng_regs);
+
+	return 0;
+}
+
+static struct of_device_id rng_match[] = {
+	{
+		.compatible = "1682m-rng",
+	},
+	{},
+};
+
+static struct of_platform_driver rng_driver = {
+	.name		= "pasemi-rng",
+	.match_table	= rng_match,
+	.probe		= rng_probe,
+	.remove		= rng_remove,
+};
+
+static int __init rng_init(void)
+{
+	return of_register_platform_driver(&rng_driver);
+}
+module_init(rng_init);
+
+static void __exit rng_exit(void)
+{
+	of_unregister_platform_driver(&rng_driver);
+}
+module_exit(rng_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Egor Martovetsky <egor@pasemi.com>");
+MODULE_DESCRIPTION("H/W RNG driver for PA Semi processor");
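For orientation, the driver never copies random data anywhere itself; it only exposes the two register accessors through struct hwrng, and the hw_random core drives them. A rough sketch of that contract under the API version used here, where data_present() reports whether the FVLD field is non-zero and data_read() hands back one 32-bit word (the polling loop below is illustrative, not the core's actual code):

	static int drain_one_word(struct hwrng *rng, u32 *word)
	{
		int tries = 20;

		while (tries--) {
			if (rng->data_present(rng))
				return rng->data_read(rng, word);	/* returns 4 */
			udelay(10);	/* FVLD not set yet: no valid data */
		}
		return 0;	/* nothing became available */
	}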
diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig
index 27c1179ee527..f25facd97bb4 100644
--- a/drivers/char/pcmcia/Kconfig
+++ b/drivers/char/pcmcia/Kconfig
@@ -21,6 +21,7 @@ config SYNCLINK_CS
 config CARDMAN_4000
 	tristate "Omnikey Cardman 4000 support"
 	depends on PCMCIA
+	select BITREVERSE
 	help
 	  Enable support for the Omnikey Cardman 4000 PCMCIA Smartcard
 	  reader.
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 4ea587983aef..fee58e03dbe2 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -31,6 +31,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/delay.h>
+#include <linux/bitrev.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -194,41 +195,17 @@ static inline unsigned char xinb(unsigned short port)
 }
 #endif
 
-#define b_0000 15
-#define b_0001 14
-#define b_0010 13
-#define b_0011 12
-#define b_0100 11
-#define b_0101 10
-#define b_0110 9
-#define b_0111 8
-#define b_1000 7
-#define b_1001 6
-#define b_1010 5
-#define b_1011 4
-#define b_1100 3
-#define b_1101 2
-#define b_1110 1
-#define b_1111 0
-
-static unsigned char irtab[16] = {
-	b_0000, b_1000, b_0100, b_1100,
-	b_0010, b_1010, b_0110, b_1110,
-	b_0001, b_1001, b_0101, b_1101,
-	b_0011, b_1011, b_0111, b_1111
-};
+static inline unsigned char invert_revert(unsigned char ch)
+{
+	return bitrev8(~ch);
+}
 
 static void str_invert_revert(unsigned char *b, int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++)
-		b[i] = (irtab[b[i] & 0x0f] << 4) | irtab[b[i] >> 4];
-}
-
-static unsigned char invert_revert(unsigned char ch)
-{
-	return (irtab[ch & 0x0f] << 4) | irtab[ch >> 4];
+		b[i] = invert_revert(b[i]);
 }
 
 #define ATRLENCK(dev,pos) \
@@ -1881,8 +1858,11 @@ static int cm4000_probe(struct pcmcia_device *link)
 	init_waitqueue_head(&dev->readq);
 
 	ret = cm4000_config(link, i);
-	if (ret)
+	if (ret) {
+		dev_table[i] = NULL;
+		kfree(dev);
 		return ret;
+	}
 
 	class_device_create(cmm_class, NULL, MKDEV(major, i), NULL,
 			    "cmm%d", i);
@@ -1907,7 +1887,7 @@ static void cm4000_detach(struct pcmcia_device *link)
 	cm4000_release(link);
 
 	dev_table[devno] = NULL;
 	kfree(dev);
 
 	class_device_destroy(cmm_class, MKDEV(major, devno));
 
@@ -1956,12 +1936,14 @@ static int __init cmm_init(void)
 	if (major < 0) {
 		printk(KERN_WARNING MODULE_NAME
 			": could not get major number\n");
+		class_destroy(cmm_class);
 		return major;
 	}
 
 	rc = pcmcia_register_driver(&cm4000_driver);
 	if (rc < 0) {
 		unregister_chrdev(major, DEVICE_NAME);
+		class_destroy(cmm_class);
 		return rc;
 	}
 
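The invert_revert() rewrite above is behaviour-preserving: the deleted irtab[] encoded, for each nibble n, the bit reversal of ~n, so the old two-lookup expression computed the bit-reversed complement of the whole byte, which is exactly bitrev8(~ch). A standalone userspace check of that equivalence (bitrev8() is reimplemented locally here, since <linux/bitrev.h> is kernel-only):

	#include <assert.h>
	#include <stdio.h>

	static unsigned char bitrev8(unsigned char b)
	{
		b = (b & 0xF0) >> 4 | (b & 0x0F) << 4;
		b = (b & 0xCC) >> 2 | (b & 0x33) << 2;
		b = (b & 0xAA) >> 1 | (b & 0x55) << 1;
		return b;
	}

	/* The table the patch deletes: irtab[n] is the 4-bit reversal of ~n. */
	static const unsigned char irtab[16] = {
		15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
	};

	int main(void)
	{
		unsigned ch;

		for (ch = 0; ch < 256; ch++) {
			unsigned char old = (irtab[ch & 0x0f] << 4) | irtab[ch >> 4];
			assert(old == (unsigned char)bitrev8((unsigned char)~ch));
		}
		printf("bitrev8(~ch) matches the old irtab lookup for all bytes\n");
		return 0;
	}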
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index f2e4ec4fd407..af88181a17f4 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -636,8 +636,11 @@ static int reader_probe(struct pcmcia_device *link)
 	setup_timer(&dev->poll_timer, cm4040_do_poll, 0);
 
 	ret = reader_config(link, i);
-	if (ret)
+	if (ret) {
+		dev_table[i] = NULL;
+		kfree(dev);
 		return ret;
+	}
 
 	class_device_create(cmx_class, NULL, MKDEV(major, i), NULL,
 			    "cmx%d", i);
@@ -708,12 +711,14 @@ static int __init cm4040_init(void)
 	if (major < 0) {
 		printk(KERN_WARNING MODULE_NAME
 			": could not get major number\n");
+		class_destroy(cmx_class);
 		return major;
 	}
 
 	rc = pcmcia_register_driver(&reader_driver);
 	if (rc < 0) {
 		unregister_chrdev(major, DEVICE_NAME);
+		class_destroy(cmx_class);
 		return rc;
 	}
 
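Both init-path hunks in cm4000_cs.c and cm4040_cs.c fix the same leak: on a late failure, everything set up earlier, including the device class, must be torn down in reverse order. A generic sketch of the idiom the fixes follow; example_fops, example_driver and the names here are placeholders, not the drivers' own:

	static struct class *cls;

	static int __init example_init(void)
	{
		int rc, major;

		cls = class_create(THIS_MODULE, "example");		/* step 1 */
		if (IS_ERR(cls))
			return PTR_ERR(cls);

		major = register_chrdev(0, "example", &example_fops);	/* step 2 */
		if (major < 0) {
			class_destroy(cls);			/* undo step 1 */
			return major;
		}

		rc = pcmcia_register_driver(&example_driver);		/* step 3 */
		if (rc < 0) {
			unregister_chrdev(major, "example");	/* undo step 2 */
			class_destroy(cls);			/* undo step 1 */
			return rc;
		}
		return 0;
	}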
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index f6ac1d316ea4..fc662e4ce58a 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -934,13 +934,6 @@ restart:
 		return -EINVAL;
 
 	/*
-	 * No more input please, we are switching. The new ldisc
-	 * will update this value in the ldisc open function
-	 */
-
-	tty->receive_room = 0;
-
-	/*
 	 * Problem: What do we do if this blocks ?
 	 */
 
@@ -951,6 +944,13 @@ restart:
 		return 0;
 	}
 
+	/*
+	 * No more input please, we are switching. The new ldisc
+	 * will update this value in the ldisc open function
+	 */
+
+	tty->receive_room = 0;
+
 	o_ldisc = tty->ldisc;
 	o_tty = tty->link;
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 893dbaf386fb..eb37fba9b7ef 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1685,9 +1685,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 	if (sys_dev) {
 		switch (action) {
 		case CPU_ONLINE:
+		case CPU_ONLINE_FROZEN:
 			cpufreq_add_dev(sys_dev);
 			break;
 		case CPU_DOWN_PREPARE:
+		case CPU_DOWN_PREPARE_FROZEN:
 			if (unlikely(lock_policy_rwsem_write(cpu)))
 				BUG();
 
@@ -1699,6 +1701,7 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 			__cpufreq_remove_dev(sys_dev);
 			break;
 		case CPU_DOWN_FAILED:
+		case CPU_DOWN_FAILED_FROZEN:
 			cpufreq_add_dev(sys_dev);
 			break;
 		}
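The new *_FROZEN case labels here and in the following hotplug notifiers come from the suspend-time CPU hotplug rework: during a system freeze the same transitions are reported as a frozen variant of each event, and callbacks that should not care about the difference simply list both labels. If, as in later kernels, the frozen variants are the base event with a CPU_TASKS_FROZEN bit or'ed in (an assumption about this kernel's headers, not something shown in the patch), an equivalent handler could mask the bit once instead:

	static int cpufreq_like_callback(struct notifier_block *nfb,
					 unsigned long action, void *hcpu)
	{
		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_ONLINE:		/* also matches CPU_ONLINE_FROZEN */
			/* ... add the device ... */
			break;
		case CPU_DOWN_PREPARE:		/* also CPU_DOWN_PREPARE_FROZEN */
			/* ... prepare removal ... */
			break;
		}
		return NOTIFY_OK;
	}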
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index d1c7cac9316c..d2f0cbd8b8f3 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -313,9 +313,11 @@ static int cpufreq_stat_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		cpufreq_update_policy(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		cpufreq_stats_free_table(cpu);
 		break;
 	}
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 03b1f650d1c4..75e3911810a3 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -309,9 +309,11 @@ static int coretemp_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		coretemp_device_add(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		coretemp_device_remove(cpu);
 		break;
 	}
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index 7ed92dc3d833..3c3f2ebf3fc9 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -354,7 +354,7 @@ static void tps65010_interrupt(struct tps65010 *tps)
 	 * also needs to get error handling and probably
 	 * an #ifdef CONFIG_SOFTWARE_SUSPEND
 	 */
-	pm_suspend(PM_SUSPEND_DISK);
+	hibernate();
 #endif
 	poll = 1;
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index f284be1c9166..82dda2faf4d0 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -745,6 +745,7 @@ static int comp_pool_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
 		if(!create_comp_task(pool, cpu)) {
 			ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
@@ -752,24 +753,29 @@ static int comp_pool_callback(struct notifier_block *nfb,
 		}
 		break;
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
 		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 		kthread_bind(cct->task, any_online_cpu(cpu_online_map));
 		destroy_comp_task(pool, cpu);
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
 		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 		kthread_bind(cct->task, cpu);
 		wake_up_process(cct->task);
 		break;
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
 		break;
 	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
 		destroy_comp_task(pool, cpu);
 		take_over_work(pool, cpu);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index c8b8cfa332bb..0d892600ff00 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2889,7 +2889,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 
 	switch (val) {
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
 		       cpu);
 		decache_vcpus_on_cpu(cpu);
@@ -2897,6 +2899,7 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 			       NULL, 0, 1);
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
 		       cpu);
 		smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
diff --git a/drivers/mca/mca-bus.c b/drivers/mca/mca-bus.c
index da862e4632dd..67b8e9453b19 100644
--- a/drivers/mca/mca-bus.c
+++ b/drivers/mca/mca-bus.c
@@ -47,19 +47,25 @@ static int mca_bus_match (struct device *dev, struct device_driver *drv)
 {
 	struct mca_device *mca_dev = to_mca_device (dev);
 	struct mca_driver *mca_drv = to_mca_driver (drv);
-	const short *mca_ids = mca_drv->id_table;
-	int i;
+	const unsigned short *mca_ids = mca_drv->id_table;
+	int i = 0;
 
-	if (!mca_ids)
-		return 0;
-
-	for(i = 0; mca_ids[i]; i++) {
-		if (mca_ids[i] == mca_dev->pos_id) {
-			mca_dev->index = i;
-			return 1;
-		}
-	}
-
+	if (mca_ids) {
+		for(i = 0; mca_ids[i]; i++) {
+			if (mca_ids[i] == mca_dev->pos_id) {
+				mca_dev->index = i;
+				return 1;
+			}
+		}
+	}
+	/* If the integrated id is present, treat it as though it were an
+	 * additional id in the id_table (it can't actually be there because,
+	 * by definition, integrated ids overflow a short). */
+	if (mca_drv->integrated_id && mca_dev->pos_id ==
+	    mca_drv->integrated_id) {
+		mca_dev->index = i;
+		return 1;
+	}
 	return 0;
 }
 
65 71
diff --git a/drivers/mca/mca-driver.c b/drivers/mca/mca-driver.c
index 2223466b3d8a..32cd39bcc715 100644
--- a/drivers/mca/mca-driver.c
+++ b/drivers/mca/mca-driver.c
@@ -36,12 +36,25 @@ int mca_register_driver(struct mca_driver *mca_drv)
 		mca_drv->driver.bus = &mca_bus_type;
 		if ((r = driver_register(&mca_drv->driver)) < 0)
 			return r;
+		mca_drv->integrated_id = 0;
 	}
 
 	return 0;
 }
 EXPORT_SYMBOL(mca_register_driver);
 
+int mca_register_driver_integrated(struct mca_driver *mca_driver,
+				   int integrated_id)
+{
+	int r = mca_register_driver(mca_driver);
+
+	if (!r)
+		mca_driver->integrated_id = integrated_id;
+
+	return r;
+}
+EXPORT_SYMBOL(mca_register_driver_integrated);
+
 void mca_unregister_driver(struct mca_driver *mca_drv)
 {
 	if (MCA_bus)
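mca_register_driver_integrated() exists because integrated (on-board) MCA devices have POS ids that, as the mca-bus.c comment notes, do not fit in the 16-bit id_table. A sketch of how a driver might use it; the id values and names below are invented for illustration:

	static short example_ids[] = { 0x0041, 0x004f, 0 };	/* adapter POS ids */

	static struct mca_driver example_mca_driver = {
		.id_table = example_ids,
		/* .driver fields omitted */
	};

	static int __init example_init(void)
	{
		/* also match the integrated controller, whose id exceeds 0xffff */
		return mca_register_driver_integrated(&example_mca_driver, 0x10041);
	}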
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4540ade6b6b5..7df934d69134 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -262,6 +262,15 @@ config DM_MULTIPATH_EMC
 	---help---
 	  Multipath support for EMC CX/AX series hardware.
 
+config DM_DELAY
+	tristate "I/O delaying target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	A target that delays reads and/or writes and can send
+	them to different devices.  Useful for testing.
+
+	If unsure, say N.
+
 endmenu
 
 endif
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 34957a68d921..38754084eac7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
+obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_EMC)	+= dm-emc.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
index da4349649f7f..c6be88826fae 100644
--- a/drivers/md/dm-bio-list.h
+++ b/drivers/md/dm-bio-list.h
@@ -8,17 +8,43 @@
 #define DM_BIO_LIST_H
 
 #include <linux/bio.h>
+#include <linux/prefetch.h>
 
 struct bio_list {
 	struct bio *head;
 	struct bio *tail;
 };
 
+static inline int bio_list_empty(const struct bio_list *bl)
+{
+	return bl->head == NULL;
+}
+
+#define BIO_LIST_INIT { .head = NULL, .tail = NULL }
+
+#define BIO_LIST(bl) \
+	struct bio_list bl = BIO_LIST_INIT
+
 static inline void bio_list_init(struct bio_list *bl)
 {
 	bl->head = bl->tail = NULL;
 }
 
+#define bio_list_for_each(bio, bl) \
+	for (bio = (bl)->head; bio && ({ prefetch(bio->bi_next); 1; }); \
+	     bio = bio->bi_next)
+
+static inline unsigned bio_list_size(const struct bio_list *bl)
+{
+	unsigned sz = 0;
+	struct bio *bio;
+
+	bio_list_for_each(bio, bl)
+		sz++;
+
+	return sz;
+}
+
 static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
 {
 	bio->bi_next = NULL;
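The new helpers are enough to build and inspect a private queue of bios without touching bi_next by hand, which is how dm-delay (added later in this series) collects its delayed requests. A small sketch of them used together (kernel context assumed; the function name is made up):

	static unsigned queue_two(struct bio *a, struct bio *b)
	{
		BIO_LIST(pending);	/* struct bio_list pending = BIO_LIST_INIT */

		if (!bio_list_empty(&pending))
			return 0;	/* a freshly initialised list is empty */

		bio_list_add(&pending, a);
		bio_list_add(&pending, b);

		return bio_list_size(&pending);	/* walks the list: 2 */
	}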
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d8121234c347..7b0fcfc9eaa5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@
 struct crypt_io {
 	struct dm_target *target;
 	struct bio *base_bio;
-	struct bio *first_clone;
 	struct work_struct work;
 	atomic_t pending;
 	int error;
@@ -107,6 +106,8 @@ struct crypt_config {
 
 static struct kmem_cache *_crypt_io_pool;
 
+static void clone_init(struct crypt_io *, struct bio *);
+
 /*
  * Different IV generation algorithms:
  *
@@ -120,6 +121,9 @@ static struct kmem_cache *_crypt_io_pool;
  * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
  *        (needed for LRW-32-AES and possible other narrow block modes)
  *
+ * null: the initial vector is always zero.  Provides compatibility with
+ *       obsolete loop_fish2 devices.  Do not use for new devices.
+ *
  * plumb: unimplemented, see:
  *   http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
@@ -256,6 +260,13 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+	memset(iv, 0, cc->iv_size);
+
+	return 0;
+}
+
 static struct crypt_iv_operations crypt_iv_plain_ops = {
 	.generator = crypt_iv_plain_gen
 };
@@ -272,6 +283,10 @@ static struct crypt_iv_operations crypt_iv_benbi_ops = {
 	.generator = crypt_iv_benbi_gen
 };
 
+static struct crypt_iv_operations crypt_iv_null_ops = {
+	.generator = crypt_iv_null_gen
+};
+
 static int
 crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
                           struct scatterlist *in, unsigned int length,
@@ -378,36 +393,21 @@ static int crypt_convert(struct crypt_config *cc,
  * This should never violate the device limitations
  * May return a smaller bio when running out of pages
  */
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
-		   struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size)
 {
+	struct crypt_config *cc = io->target->private;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
 	unsigned int i;
 
-	if (base_bio) {
-		clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
-		__bio_clone(clone, base_bio);
-	} else
-		clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
 		return NULL;
 
-	clone->bi_destructor = dm_crypt_bio_destructor;
-
-	/* if the last bio was not complete, continue where that one ended */
-	clone->bi_idx = *bio_vec_idx;
-	clone->bi_vcnt = *bio_vec_idx;
-	clone->bi_size = 0;
-	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
-
-	/* clone->bi_idx pages have already been allocated */
-	size -= clone->bi_idx * PAGE_SIZE;
+	clone_init(io, clone);
 
-	for (i = clone->bi_idx; i < nr_iovecs; i++) {
+	for (i = 0; i < nr_iovecs; i++) {
 		struct bio_vec *bv = bio_iovec_idx(clone, i);
 
 		bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
@@ -419,7 +419,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
 		 * return a partially allocated bio, the caller will then try
 		 * to allocate additional bios while submitting this partial bio
 		 */
-		if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1))
+		if (i == (MIN_BIO_PAGES - 1))
 			gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 		bv->bv_offset = 0;
@@ -438,12 +438,6 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
 		return NULL;
 	}
 
-	/*
-	 * Remember the last bio_vec allocated to be able
-	 * to correctly continue after the splitting.
-	 */
-	*bio_vec_idx = clone->bi_vcnt;
-
 	return clone;
 }
 
@@ -495,9 +489,6 @@ static void dec_pending(struct crypt_io *io, int error)
 	if (!atomic_dec_and_test(&io->pending))
 		return;
 
-	if (io->first_clone)
-		bio_put(io->first_clone);
-
 	bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
 
 	mempool_free(io, cc->io_pool);
@@ -562,6 +553,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone)
 	clone->bi_end_io = crypt_endio;
 	clone->bi_bdev = cc->dev->bdev;
 	clone->bi_rw = io->base_bio->bi_rw;
+	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
 static void process_read(struct crypt_io *io)
@@ -585,7 +577,6 @@ static void process_read(struct crypt_io *io)
 	}
 
 	clone_init(io, clone);
-	clone->bi_destructor = dm_crypt_bio_destructor;
 	clone->bi_idx = 0;
 	clone->bi_vcnt = bio_segments(base_bio);
 	clone->bi_size = base_bio->bi_size;
@@ -604,7 +595,6 @@ static void process_write(struct crypt_io *io)
 	struct convert_context ctx;
 	unsigned remaining = base_bio->bi_size;
 	sector_t sector = base_bio->bi_sector - io->target->begin;
-	unsigned bvec_idx = 0;
 
 	atomic_inc(&io->pending);
 
@@ -615,14 +605,14 @@ static void process_write(struct crypt_io *io)
 	 * so repeat the whole process until all the data can be handled.
 	 */
 	while (remaining) {
-		clone = crypt_alloc_buffer(cc, base_bio->bi_size,
-					   io->first_clone, &bvec_idx);
+		clone = crypt_alloc_buffer(io, remaining);
 		if (unlikely(!clone)) {
 			dec_pending(io, -ENOMEM);
 			return;
 		}
 
 		ctx.bio_out = clone;
+		ctx.idx_out = 0;
 
 		if (unlikely(crypt_convert(cc, &ctx) < 0)) {
 			crypt_free_buffer_pages(cc, clone, clone->bi_size);
@@ -631,31 +621,26 @@ static void process_write(struct crypt_io *io)
 			return;
 		}
 
-		clone_init(io, clone);
-		clone->bi_sector = cc->start + sector;
-
-		if (!io->first_clone) {
-			/*
-			 * hold a reference to the first clone, because it
-			 * holds the bio_vec array and that can't be freed
-			 * before all other clones are released
-			 */
-			bio_get(clone);
-			io->first_clone = clone;
-		}
+		/* crypt_convert should have filled the clone bio */
+		BUG_ON(ctx.idx_out < clone->bi_vcnt);
 
+		clone->bi_sector = cc->start + sector;
 		remaining -= clone->bi_size;
 		sector += bio_sectors(clone);
 
-		/* prevent bio_put of first_clone */
+		/* Grab another reference to the io struct
+		 * before we kick off the request */
 		if (remaining)
 			atomic_inc(&io->pending);
 
 		generic_make_request(clone);
 
+		/* Do not reference clone after this - it
+		 * may be gone already. */
+
 		/* out of memory -> run queues */
 		if (remaining)
-			congestion_wait(bio_data_dir(clone), HZ/100);
+			congestion_wait(WRITE, HZ/100);
 	}
 }
 
@@ -832,6 +817,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		cc->iv_gen_ops = &crypt_iv_essiv_ops;
 	else if (strcmp(ivmode, "benbi") == 0)
 		cc->iv_gen_ops = &crypt_iv_benbi_ops;
+	else if (strcmp(ivmode, "null") == 0)
+		cc->iv_gen_ops = &crypt_iv_null_ops;
 	else {
 		ti->error = "Invalid IV mode";
 		goto bad2;
@@ -954,10 +941,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct crypt_config *cc = ti->private;
 	struct crypt_io *io;
 
+	if (bio_barrier(bio))
+		return -EOPNOTSUPP;
+
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->target = ti;
 	io->base_bio = bio;
-	io->first_clone = NULL;
 	io->error = io->post_process = 0;
 	atomic_set(&io->pending, 0);
 	kcryptd_queue_io(io);
@@ -1057,7 +1046,7 @@ error:
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version= {1, 3, 0},
+	.version= {1, 5, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
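Two user-visible notes on the dm-crypt changes: barrier bios are now refused up front with -EOPNOTSUPP rather than being handled incorrectly, and the new "null" IV mode is selected through the usual cipher-chainmode-ivmode string in the crypt target's table. An illustrative mapping line for the loop_fish2 compatibility case the comment describes; the sector count, key and device are placeholders, and the <cipher> <key> <iv_offset> <device> <offset> layout is quoted from memory of the crypt target's parameters rather than from this patch:

	0 409600 crypt aes-cbc-null 0123456789abcdef0123456789abcdef 0 /dev/sdb1 0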
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
new file mode 100644
index 000000000000..52c7cf9e5803
--- /dev/null
+++ b/drivers/md/dm-delay.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (C) 2005-2007 Red Hat GmbH
+ *
+ * A target that delays reads and/or writes and can send
+ * them to different devices.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+
+#define DM_MSG_PREFIX "delay"
+
+struct delay_c {
+	struct timer_list delay_timer;
+	struct semaphore timer_lock;
+	struct work_struct flush_expired_bios;
+	struct list_head delayed_bios;
+	atomic_t may_delay;
+	mempool_t *delayed_pool;
+
+	struct dm_dev *dev_read;
+	sector_t start_read;
+	unsigned read_delay;
+	unsigned reads;
+
+	struct dm_dev *dev_write;
+	sector_t start_write;
+	unsigned write_delay;
+	unsigned writes;
+};
+
+struct delay_info {
+	struct delay_c *context;
+	struct list_head list;
+	struct bio *bio;
+	unsigned long expires;
+};
+
+static DEFINE_MUTEX(delayed_bios_lock);
+
+static struct workqueue_struct *kdelayd_wq;
+static struct kmem_cache *delayed_cache;
+
+static void handle_delayed_timer(unsigned long data)
+{
+	struct delay_c *dc = (struct delay_c *)data;
+
+	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+}
+
+static void queue_timeout(struct delay_c *dc, unsigned long expires)
+{
+	down(&dc->timer_lock);
+
+	if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
+		mod_timer(&dc->delay_timer, expires);
+
+	up(&dc->timer_lock);
+}
+
+static void flush_bios(struct bio *bio)
+{
+	struct bio *n;
+
+	while (bio) {
+		n = bio->bi_next;
+		bio->bi_next = NULL;
+		generic_make_request(bio);
+		bio = n;
+	}
+}
+
+static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+{
+	struct delay_info *delayed, *next;
+	unsigned long next_expires = 0;
+	int start_timer = 0;
+	BIO_LIST(flush_bios);
+
+	mutex_lock(&delayed_bios_lock);
+	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+			list_del(&delayed->list);
+			bio_list_add(&flush_bios, delayed->bio);
+			if ((bio_data_dir(delayed->bio) == WRITE))
+				delayed->context->writes--;
+			else
+				delayed->context->reads--;
+			mempool_free(delayed, dc->delayed_pool);
+			continue;
+		}
+
+		if (!start_timer) {
+			start_timer = 1;
+			next_expires = delayed->expires;
+		} else
+			next_expires = min(next_expires, delayed->expires);
+	}
+
+	mutex_unlock(&delayed_bios_lock);
+
+	if (start_timer)
+		queue_timeout(dc, next_expires);
+
+	return bio_list_get(&flush_bios);
+}
+
+static void flush_expired_bios(struct work_struct *work)
+{
+	struct delay_c *dc;
+
+	dc = container_of(work, struct delay_c, flush_expired_bios);
+	flush_bios(flush_delayed_bios(dc, 0));
+}
+
+/*
+ * Mapping parameters:
+ *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+ *
+ * With separate write parameters, the first set is only used for reads.
+ * Delays are specified in milliseconds.
+ */
+static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct delay_c *dc;
+	unsigned long long tmpll;
+
+	if (argc != 3 && argc != 6) {
+		ti->error = "requires exactly 3 or 6 arguments";
+		return -EINVAL;
+	}
+
+	dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+	if (!dc) {
+		ti->error = "Cannot allocate context";
+		return -ENOMEM;
+	}
+
+	dc->reads = dc->writes = 0;
+
+	if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+		ti->error = "Invalid device sector";
+		goto bad;
+	}
+	dc->start_read = tmpll;
+
+	if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+		ti->error = "Invalid delay";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[0], dc->start_read, ti->len,
+			  dm_table_get_mode(ti->table), &dc->dev_read)) {
+		ti->error = "Device lookup failed";
+		goto bad;
+	}
+
+	if (argc == 3) {
+		dc->dev_write = NULL;
+		goto out;
+	}
+
+	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+		ti->error = "Invalid write device sector";
+		goto bad;
+	}
+	dc->start_write = tmpll;
+
+	if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+		ti->error = "Invalid write delay";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[3], dc->start_write, ti->len,
+			  dm_table_get_mode(ti->table), &dc->dev_write)) {
+		ti->error = "Write device lookup failed";
+		dm_put_device(ti, dc->dev_read);
+		goto bad;
+	}
+
+out:
+	dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
+	if (!dc->delayed_pool) {
+		DMERR("Couldn't create delayed bio pool.");
+		goto bad;
+	}
+
+	init_timer(&dc->delay_timer);
+	dc->delay_timer.function = handle_delayed_timer;
+	dc->delay_timer.data = (unsigned long)dc;
+
+	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+	INIT_LIST_HEAD(&dc->delayed_bios);
+	init_MUTEX(&dc->timer_lock);
+	atomic_set(&dc->may_delay, 1);
+
+	ti->private = dc;
+	return 0;
+
+bad:
+	kfree(dc);
+	return -EINVAL;
+}
+
+static void delay_dtr(struct dm_target *ti)
+{
+	struct delay_c *dc = ti->private;
+
+	flush_workqueue(kdelayd_wq);
+
+	dm_put_device(ti, dc->dev_read);
+
+	if (dc->dev_write)
+		dm_put_device(ti, dc->dev_write);
+
+	mempool_destroy(dc->delayed_pool);
+	kfree(dc);
+}
+
+static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
+{
+	struct delay_info *delayed;
+	unsigned long expires = 0;
+
+	if (!delay || !atomic_read(&dc->may_delay))
+		return 1;
+
+	delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
+
+	delayed->context = dc;
+	delayed->bio = bio;
+	delayed->expires = expires = jiffies + (delay * HZ / 1000);
+
+	mutex_lock(&delayed_bios_lock);
+
+	if (bio_data_dir(bio) == WRITE)
+		dc->writes++;
+	else
+		dc->reads++;
+
+	list_add_tail(&delayed->list, &dc->delayed_bios);
+
+	mutex_unlock(&delayed_bios_lock);
+
+	queue_timeout(dc, expires);
+
+	return 0;
+}
+
+static void delay_presuspend(struct dm_target *ti)
+{
+	struct delay_c *dc = ti->private;
+
+	atomic_set(&dc->may_delay, 0);
+	del_timer_sync(&dc->delay_timer);
+	flush_bios(flush_delayed_bios(dc, 1));
+}
+
+static void delay_resume(struct dm_target *ti)
+{
+	struct delay_c *dc = ti->private;
+
+	atomic_set(&dc->may_delay, 1);
+}
+
+static int delay_map(struct dm_target *ti, struct bio *bio,
+		     union map_info *map_context)
+{
+	struct delay_c *dc = ti->private;
+
+	if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
+		bio->bi_bdev = dc->dev_write->bdev;
+		bio->bi_sector = dc->start_write +
+				 (bio->bi_sector - ti->begin);
+
+		return delay_bio(dc, dc->write_delay, bio);
+	}
+
+	bio->bi_bdev = dc->dev_read->bdev;
+	bio->bi_sector = dc->start_read +
+			 (bio->bi_sector - ti->begin);
+
+	return delay_bio(dc, dc->read_delay, bio);
+}
+
+static int delay_status(struct dm_target *ti, status_type_t type,
+			char *result, unsigned maxlen)
+{
+	struct delay_c *dc = ti->private;
+	int sz = 0;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%u %u", dc->reads, dc->writes);
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %llu %u", dc->dev_read->name,
+		       (unsigned long long) dc->start_read,
+		       dc->read_delay);
+		if (dc->dev_write)
+			DMEMIT("%s %llu %u", dc->dev_write->name,
+			       (unsigned long long) dc->start_write,
+			       dc->write_delay);
+		break;
+	}
+
+	return 0;
+}
+
+static struct target_type delay_target = {
+	.name	    = "delay",
+	.version    = {1, 0, 2},
+	.module     = THIS_MODULE,
+	.ctr	    = delay_ctr,
+	.dtr	    = delay_dtr,
+	.map	    = delay_map,
+	.presuspend = delay_presuspend,
+	.resume     = delay_resume,
+	.status     = delay_status,
+};
+
+static int __init dm_delay_init(void)
+{
+	int r = -ENOMEM;
+
+	kdelayd_wq = create_workqueue("kdelayd");
+	if (!kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
+	delayed_cache = kmem_cache_create("dm-delay",
+					  sizeof(struct delay_info),
+					  __alignof__(struct delay_info),
+					  0, NULL, NULL);
+	if (!delayed_cache) {
+		DMERR("Couldn't create delayed bio cache.");
+		goto bad_memcache;
+	}
+
+	r = dm_register_target(&delay_target);
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		goto bad_register;
+	}
+
+	return 0;
+
+bad_register:
+	kmem_cache_destroy(delayed_cache);
+bad_memcache:
+	destroy_workqueue(kdelayd_wq);
+bad_queue:
+	return r;
+}
+
+static void __exit dm_delay_exit(void)
+{
+	int r = dm_unregister_target(&delay_target);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+
+	kmem_cache_destroy(delayed_cache);
+	destroy_workqueue(kdelayd_wq);
+}
+
+/* Module hooks */
+module_init(dm_delay_init);
+module_exit(dm_delay_exit);
+
+MODULE_DESCRIPTION(DM_NAME " delay target");
+MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
+MODULE_LICENSE("GPL");
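Following the mapping syntax documented above delay_ctr() (<device> <offset> <delay>, optionally followed by <write_device> <write_offset> <write_delay>, delays in milliseconds), two illustrative table lines; the device names and the 2097152-sector length are placeholders:

	0 2097152 delay /dev/sdb1 0 100
	0 2097152 delay /dev/sdb1 0 0 /dev/sdc1 0 500

The first delays every bio to sdb1 by 100 ms; the second passes reads through undelayed and delays writes, redirected to sdc1, by 500 ms.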
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 99cdffa7fbfe..07e0a0c84f6e 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,7 +1,8 @@
 /*
- * dm-snapshot.c
+ * dm-exception-store.c
  *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
  *
  * This file is released under the GPL.
  */
@@ -123,6 +124,7 @@ struct pstore {
 	atomic_t pending_count;
 	uint32_t callback_count;
 	struct commit_callback *callbacks;
+	struct dm_io_client *io_client;
 };
 
 static inline unsigned int sectors_to_pages(unsigned int sectors)
@@ -159,14 +161,20 @@ static void free_area(struct pstore *ps)
  */
 static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
 {
-	struct io_region where;
-	unsigned long bits;
-
-	where.bdev = ps->snap->cow->bdev;
-	where.sector = ps->snap->chunk_size * chunk;
-	where.count = ps->snap->chunk_size;
-
-	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
+	struct io_region where = {
+		.bdev = ps->snap->cow->bdev,
+		.sector = ps->snap->chunk_size * chunk,
+		.count = ps->snap->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = rw,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = ps->area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+
+	return dm_io(&io_req, 1, &where, NULL);
 }
 
 /*
@@ -213,17 +221,18 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 		chunk_size_supplied = 0;
 	}
 
-	r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
-	if (r)
-		return r;
+	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+							     chunk_size));
+	if (IS_ERR(ps->io_client))
+		return PTR_ERR(ps->io_client);
 
 	r = alloc_area(ps);
 	if (r)
-		goto bad1;
+		return r;
 
 	r = chunk_io(ps, 0, READ);
 	if (r)
-		goto bad2;
+		goto bad;
 
 	dh = (struct disk_header *) ps->area;
 
@@ -235,7 +244,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
 		DMWARN("Invalid or corrupt snapshot");
 		r = -ENXIO;
-		goto bad2;
+		goto bad;
 	}
 
 	*new_snapshot = 0;
@@ -252,27 +261,22 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 		       (unsigned long long)ps->snap->chunk_size);
 
 	/* We had a bogus chunk_size. Fix stuff up. */
-	dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 	free_area(ps);
 
 	ps->snap->chunk_size = chunk_size;
 	ps->snap->chunk_mask = chunk_size - 1;
 	ps->snap->chunk_shift = ffs(chunk_size) - 1;
 
-	r = dm_io_get(sectors_to_pages(chunk_size));
+	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+				ps->io_client);
 	if (r)
 		return r;
 
 	r = alloc_area(ps);
-	if (r)
-		goto bad1;
-
-	return 0;
+	return r;
 
-bad2:
+bad:
 	free_area(ps);
-bad1:
-	dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 	return r;
 }
 
@@ -405,7 +409,7 @@ static void persistent_destroy(struct exception_store *store)
 {
 	struct pstore *ps = get_info(store);
 
-	dm_io_put(sectors_to_pages(ps->snap->chunk_size));
+	dm_io_client_destroy(ps->io_client);
 	vfree(ps->callbacks);
 	free_area(ps);
 	kfree(ps);
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
index 32eff28e4adc..e0832e6fcf36 100644
--- a/drivers/md/dm-hw-handler.h
+++ b/drivers/md/dm-hw-handler.h
@@ -16,6 +16,7 @@
 struct hw_handler_type;
 struct hw_handler {
 	struct hw_handler_type *type;
+	struct mapped_device *md;
 	void *context;
 };
21 22
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 8bdc8a87b249..352c6fbeac53 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2003 Sistina Software 2 * Copyright (C) 2003 Sistina Software
3 * Copyright (C) 2006 Red Hat GmbH
3 * 4 *
4 * This file is released under the GPL. 5 * This file is released under the GPL.
5 */ 6 */
@@ -12,13 +13,17 @@
12#include <linux/sched.h> 13#include <linux/sched.h>
13#include <linux/slab.h> 14#include <linux/slab.h>
14 15
15static struct bio_set *_bios; 16struct dm_io_client {
17 mempool_t *pool;
18 struct bio_set *bios;
19};
16 20
17/* FIXME: can we shrink this ? */ 21/* FIXME: can we shrink this ? */
18struct io { 22struct io {
19 unsigned long error; 23 unsigned long error;
20 atomic_t count; 24 atomic_t count;
21 struct task_struct *sleeper; 25 struct task_struct *sleeper;
26 struct dm_io_client *client;
22 io_notify_fn callback; 27 io_notify_fn callback;
23 void *context; 28 void *context;
24}; 29};
@@ -26,63 +31,58 @@ struct io {
26/* 31/*
27 * io contexts are only dynamically allocated for asynchronous 32 * io contexts are only dynamically allocated for asynchronous
28 * io. Since async io is likely to be the majority of io we'll 33 * io. Since async io is likely to be the majority of io we'll
29 * have the same number of io contexts as buffer heads ! (FIXME: 34 * have the same number of io contexts as bios! (FIXME: must reduce this).
30 * must reduce this).
31 */ 35 */
32static unsigned _num_ios;
33static mempool_t *_io_pool;
34 36
35static unsigned int pages_to_ios(unsigned int pages) 37static unsigned int pages_to_ios(unsigned int pages)
36{ 38{
37 return 4 * pages; /* too many ? */ 39 return 4 * pages; /* too many ? */
38} 40}
39 41
40static int resize_pool(unsigned int new_ios) 42/*
43 * Create a client with mempool and bioset.
44 */
45struct dm_io_client *dm_io_client_create(unsigned num_pages)
41{ 46{
42 int r = 0; 47 unsigned ios = pages_to_ios(num_pages);
43 48 struct dm_io_client *client;
44 if (_io_pool) {
45 if (new_ios == 0) {
46 /* free off the pool */
47 mempool_destroy(_io_pool);
48 _io_pool = NULL;
49 bioset_free(_bios);
50
51 } else {
52 /* resize the pool */
53 r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
54 }
55 49
56 } else { 50 client = kmalloc(sizeof(*client), GFP_KERNEL);
57 /* create new pool */ 51 if (!client)
58 _io_pool = mempool_create_kmalloc_pool(new_ios, 52 return ERR_PTR(-ENOMEM);
59 sizeof(struct io)); 53
60 if (!_io_pool) 54 client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
61 return -ENOMEM; 55 if (!client->pool)
62 56 goto bad;
63 _bios = bioset_create(16, 16);
64 if (!_bios) {
65 mempool_destroy(_io_pool);
66 _io_pool = NULL;
67 return -ENOMEM;
68 }
69 }
70 57
71 if (!r) 58 client->bios = bioset_create(16, 16);
72 _num_ios = new_ios; 59 if (!client->bios)
60 goto bad;
73 61
74 return r; 62 return client;
63
64 bad:
65 if (client->pool)
66 mempool_destroy(client->pool);
67 kfree(client);
68 return ERR_PTR(-ENOMEM);
75} 69}
70EXPORT_SYMBOL(dm_io_client_create);
76 71
77int dm_io_get(unsigned int num_pages) 72int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
78{ 73{
79 return resize_pool(_num_ios + pages_to_ios(num_pages)); 74 return mempool_resize(client->pool, pages_to_ios(num_pages),
75 GFP_KERNEL);
80} 76}
77EXPORT_SYMBOL(dm_io_client_resize);
81 78
82void dm_io_put(unsigned int num_pages) 79void dm_io_client_destroy(struct dm_io_client *client)
83{ 80{
84 resize_pool(_num_ios - pages_to_ios(num_pages)); 81 mempool_destroy(client->pool);
82 bioset_free(client->bios);
83 kfree(client);
85} 84}
85EXPORT_SYMBOL(dm_io_client_destroy);
86 86
87/*----------------------------------------------------------------- 87/*-----------------------------------------------------------------
88 * We need to keep track of which region a bio is doing io for. 88 * We need to keep track of which region a bio is doing io for.
@@ -118,7 +118,7 @@ static void dec_count(struct io *io, unsigned int region, int error)
118 io_notify_fn fn = io->callback; 118 io_notify_fn fn = io->callback;
119 void *context = io->context; 119 void *context = io->context;
120 120
121 mempool_free(io, _io_pool); 121 mempool_free(io, io->client->pool);
122 fn(r, context); 122 fn(r, context);
123 } 123 }
124 } 124 }
@@ -126,7 +126,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
126 126
127static int endio(struct bio *bio, unsigned int done, int error) 127static int endio(struct bio *bio, unsigned int done, int error)
128{ 128{
129 struct io *io = (struct io *) bio->bi_private; 129 struct io *io;
130 unsigned region;
130 131
131 /* keep going until we've finished */ 132 /* keep going until we've finished */
132 if (bio->bi_size) 133 if (bio->bi_size)
@@ -135,10 +136,17 @@ static int endio(struct bio *bio, unsigned int done, int error)
135 if (error && bio_data_dir(bio) == READ) 136 if (error && bio_data_dir(bio) == READ)
136 zero_fill_bio(bio); 137 zero_fill_bio(bio);
137 138
138 dec_count(io, bio_get_region(bio), error); 139 /*
140 * The bio destructor in bio_put() may use the io object.
141 */
142 io = bio->bi_private;
143 region = bio_get_region(bio);
144
139 bio->bi_max_vecs++; 145 bio->bi_max_vecs++;
140 bio_put(bio); 146 bio_put(bio);
141 147
148 dec_count(io, region, error);
149
142 return 0; 150 return 0;
143} 151}
144 152
@@ -209,6 +217,9 @@ static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
209 dp->context_ptr = bvec; 217 dp->context_ptr = bvec;
210} 218}
211 219
220/*
221 * Functions for getting the pages from a VMA.
222 */
212static void vm_get_page(struct dpages *dp, 223static void vm_get_page(struct dpages *dp,
213 struct page **p, unsigned long *len, unsigned *offset) 224 struct page **p, unsigned long *len, unsigned *offset)
214{ 225{
@@ -233,7 +244,34 @@ static void vm_dp_init(struct dpages *dp, void *data)
233 244
234static void dm_bio_destructor(struct bio *bio) 245static void dm_bio_destructor(struct bio *bio)
235{ 246{
236 bio_free(bio, _bios); 247 struct io *io = bio->bi_private;
248
249 bio_free(bio, io->client->bios);
250}
251
252/*
253 * Functions for getting the pages from kernel memory.
254 */
255static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
256 unsigned *offset)
257{
258 *p = virt_to_page(dp->context_ptr);
259 *offset = dp->context_u;
260 *len = PAGE_SIZE - dp->context_u;
261}
262
263static void km_next_page(struct dpages *dp)
264{
265 dp->context_ptr += PAGE_SIZE - dp->context_u;
266 dp->context_u = 0;
267}
268
269static void km_dp_init(struct dpages *dp, void *data)
270{
271 dp->get_page = km_get_page;
272 dp->next_page = km_next_page;
273 dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
274 dp->context_ptr = data;
237} 275}
238 276
239/*----------------------------------------------------------------- 277/*-----------------------------------------------------------------
@@ -256,7 +294,7 @@ static void do_region(int rw, unsigned int region, struct io_region *where,
256 * to hide it from bio_add_page(). 294 * to hide it from bio_add_page().
257 */ 295 */
258 num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2; 296 num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
259 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); 297 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
260 bio->bi_sector = where->sector + (where->count - remaining); 298 bio->bi_sector = where->sector + (where->count - remaining);
261 bio->bi_bdev = where->bdev; 299 bio->bi_bdev = where->bdev;
262 bio->bi_end_io = endio; 300 bio->bi_end_io = endio;
@@ -311,8 +349,9 @@ static void dispatch_io(int rw, unsigned int num_regions,
311 dec_count(io, 0, 0); 349 dec_count(io, 0, 0);
312} 350}
313 351
314static int sync_io(unsigned int num_regions, struct io_region *where, 352static int sync_io(struct dm_io_client *client, unsigned int num_regions,
315 int rw, struct dpages *dp, unsigned long *error_bits) 353 struct io_region *where, int rw, struct dpages *dp,
354 unsigned long *error_bits)
316{ 355{
317 struct io io; 356 struct io io;
318 357
@@ -324,6 +363,7 @@ static int sync_io(unsigned int num_regions, struct io_region *where,
324 io.error = 0; 363 io.error = 0;
325 atomic_set(&io.count, 1); /* see dispatch_io() */ 364 atomic_set(&io.count, 1); /* see dispatch_io() */
326 io.sleeper = current; 365 io.sleeper = current;
366 io.client = client;
327 367
328 dispatch_io(rw, num_regions, where, dp, &io, 1); 368 dispatch_io(rw, num_regions, where, dp, &io, 1);
329 369
@@ -340,12 +380,15 @@ static int sync_io(unsigned int num_regions, struct io_region *where,
340 if (atomic_read(&io.count)) 380 if (atomic_read(&io.count))
341 return -EINTR; 381 return -EINTR;
342 382
343 *error_bits = io.error; 383 if (error_bits)
384 *error_bits = io.error;
385
344 return io.error ? -EIO : 0; 386 return io.error ? -EIO : 0;
345} 387}
346 388
347static int async_io(unsigned int num_regions, struct io_region *where, int rw, 389static int async_io(struct dm_io_client *client, unsigned int num_regions,
348 struct dpages *dp, io_notify_fn fn, void *context) 390 struct io_region *where, int rw, struct dpages *dp,
391 io_notify_fn fn, void *context)
349{ 392{
350 struct io *io; 393 struct io *io;
351 394
@@ -355,10 +398,11 @@ static int async_io(unsigned int num_regions, struct io_region *where, int rw,
355 return -EIO; 398 return -EIO;
356 } 399 }
357 400
358 io = mempool_alloc(_io_pool, GFP_NOIO); 401 io = mempool_alloc(client->pool, GFP_NOIO);
359 io->error = 0; 402 io->error = 0;
360 atomic_set(&io->count, 1); /* see dispatch_io() */ 403 atomic_set(&io->count, 1); /* see dispatch_io() */
361 io->sleeper = NULL; 404 io->sleeper = NULL;
405 io->client = client;
362 io->callback = fn; 406 io->callback = fn;
363 io->context = context; 407 io->context = context;
364 408
@@ -366,61 +410,51 @@ static int async_io(unsigned int num_regions, struct io_region *where, int rw,
366 return 0; 410 return 0;
367} 411}
368 412
369int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, 413static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
370 struct page_list *pl, unsigned int offset,
371 unsigned long *error_bits)
372{ 414{
373 struct dpages dp; 415 /* Set up dpages based on memory type */
374 list_dp_init(&dp, pl, offset); 416 switch (io_req->mem.type) {
375 return sync_io(num_regions, where, rw, &dp, error_bits); 417 case DM_IO_PAGE_LIST:
376} 418 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
419 break;
420
421 case DM_IO_BVEC:
422 bvec_dp_init(dp, io_req->mem.ptr.bvec);
423 break;
424
425 case DM_IO_VMA:
426 vm_dp_init(dp, io_req->mem.ptr.vma);
427 break;
428
429 case DM_IO_KMEM:
430 km_dp_init(dp, io_req->mem.ptr.addr);
431 break;
432
433 default:
434 return -EINVAL;
435 }
377 436
378int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, 437 return 0;
379 struct bio_vec *bvec, unsigned long *error_bits)
380{
381 struct dpages dp;
382 bvec_dp_init(&dp, bvec);
383 return sync_io(num_regions, where, rw, &dp, error_bits);
384} 438}
385 439
386int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, 440/*
387 void *data, unsigned long *error_bits) 441 * New collapsed (a)synchronous interface
442 */
443int dm_io(struct dm_io_request *io_req, unsigned num_regions,
444 struct io_region *where, unsigned long *sync_error_bits)
388{ 445{
446 int r;
389 struct dpages dp; 447 struct dpages dp;
390 vm_dp_init(&dp, data);
391 return sync_io(num_regions, where, rw, &dp, error_bits);
392}
393 448
394int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, 449 r = dp_init(io_req, &dp);
395 struct page_list *pl, unsigned int offset, 450 if (r)
396 io_notify_fn fn, void *context) 451 return r;
397{
398 struct dpages dp;
399 list_dp_init(&dp, pl, offset);
400 return async_io(num_regions, where, rw, &dp, fn, context);
401}
402 452
403int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, 453 if (!io_req->notify.fn)
404 struct bio_vec *bvec, io_notify_fn fn, void *context) 454 return sync_io(io_req->client, num_regions, where,
405{ 455 io_req->bi_rw, &dp, sync_error_bits);
406 struct dpages dp;
407 bvec_dp_init(&dp, bvec);
408 return async_io(num_regions, where, rw, &dp, fn, context);
409}
410 456
411int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, 457 return async_io(io_req->client, num_regions, where, io_req->bi_rw,
412 void *data, io_notify_fn fn, void *context) 458 &dp, io_req->notify.fn, io_req->notify.context);
413{
414 struct dpages dp;
415 vm_dp_init(&dp, data);
416 return async_io(num_regions, where, rw, &dp, fn, context);
417} 459}
418 460EXPORT_SYMBOL(dm_io);
419EXPORT_SYMBOL(dm_io_get);
420EXPORT_SYMBOL(dm_io_put);
421EXPORT_SYMBOL(dm_io_sync);
422EXPORT_SYMBOL(dm_io_async);
423EXPORT_SYMBOL(dm_io_sync_bvec);
424EXPORT_SYMBOL(dm_io_async_bvec);
425EXPORT_SYMBOL(dm_io_sync_vm);
426EXPORT_SYMBOL(dm_io_async_vm);
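The collapsed interface above replaces the six dm_io_sync_*/dm_io_async_* entry points with a single dm_io() call driven by a request descriptor. As a rough illustration only, a synchronous read into a vmalloc'd buffer under the new API could look like the sketch below; the caller, client and region are hypothetical and error handling is abbreviated.

/*
 * Illustrative caller (not part of the patch): synchronously read
 * 'where' into a vmalloc'd buffer. 'client' came from an earlier
 * dm_io_client_create().
 */
static int example_sync_read(struct dm_io_client *client,
			     struct io_region *where, void *buf)
{
	unsigned long error_bits;
	struct dm_io_request io_req = {
		.bi_rw = READ,
		.mem.type = DM_IO_VMA,	/* buffer came from vmalloc() */
		.mem.ptr.vma = buf,
		.notify.fn = NULL,	/* NULL notify.fn selects sync_io() */
		.client = client,
	};

	return dm_io(&io_req, 1, where, &error_bits);
}

Supplying a notify.fn instead routes the same request through async_io(), as dm_io() shows above.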
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
index f9035bfd1a9f..f647e2cceaa6 100644
--- a/drivers/md/dm-io.h
+++ b/drivers/md/dm-io.h
@@ -12,7 +12,7 @@
12struct io_region { 12struct io_region {
13 struct block_device *bdev; 13 struct block_device *bdev;
14 sector_t sector; 14 sector_t sector;
15 sector_t count; 15 sector_t count; /* If this is zero the region is ignored. */
16}; 16};
17 17
18struct page_list { 18struct page_list {
@@ -20,55 +20,60 @@ struct page_list {
20 struct page *page; 20 struct page *page;
21}; 21};
22 22
23
24/*
25 * 'error' is a bitset, with each bit indicating whether an error
26 * occurred doing io to the corresponding region.
27 */
28typedef void (*io_notify_fn)(unsigned long error, void *context); 23typedef void (*io_notify_fn)(unsigned long error, void *context);
29 24
25enum dm_io_mem_type {
26 DM_IO_PAGE_LIST,/* Page list */
27 DM_IO_BVEC, /* Bio vector */
28 DM_IO_VMA, /* Virtual memory area */
29 DM_IO_KMEM, /* Kernel memory */
30};
31
32struct dm_io_memory {
33 enum dm_io_mem_type type;
34
35 union {
36 struct page_list *pl;
37 struct bio_vec *bvec;
38 void *vma;
39 void *addr;
40 } ptr;
41
42 unsigned offset;
43};
44
45struct dm_io_notify {
46 io_notify_fn fn; /* Callback for asynchronous requests */
47 void *context; /* Passed to callback */
48};
30 49
31/* 50/*
32 * Before anyone uses the IO interface they should call 51 * IO request structure
33 * dm_io_get(), specifying roughly how many pages they are
34 * expecting to perform io on concurrently.
35 *
36 * This function may block.
37 */ 52 */
38int dm_io_get(unsigned int num_pages); 53struct dm_io_client;
39void dm_io_put(unsigned int num_pages); 54struct dm_io_request {
55 int bi_rw; /* READ|WRITE - not READA */
56 struct dm_io_memory mem; /* Memory to use for io */
57 struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */
58 struct dm_io_client *client; /* Client memory handler */
59};
40 60
41/* 61/*
42 * Synchronous IO. 62 * For async io calls, users can alternatively use the dm_io() function below
63 * and dm_io_client_create() to create private mempools for the client.
43 * 64 *
44 * Please ensure that the rw flag in the next two functions is 65 * Create/destroy may block.
45 * either READ or WRITE, ie. we don't take READA. Any
46 * regions with a zero count field will be ignored.
47 */ 66 */
48int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, 67struct dm_io_client *dm_io_client_create(unsigned num_pages);
49 struct page_list *pl, unsigned int offset, 68int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client);
50 unsigned long *error_bits); 69void dm_io_client_destroy(struct dm_io_client *client);
51
52int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
53 struct bio_vec *bvec, unsigned long *error_bits);
54
55int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
56 void *data, unsigned long *error_bits);
57 70
58/* 71/*
 59 * Asynchronous IO. 72 * IO interface using private per-client pools.
60 * 73 * Each bit in the optional 'sync_error_bits' bitset indicates whether an
61 * The 'where' array may be safely allocated on the stack since 74 * error occurred doing io to the corresponding region.
62 * the function takes a copy.
63 */ 75 */
64int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, 76int dm_io(struct dm_io_request *io_req, unsigned num_regions,
65 struct page_list *pl, unsigned int offset, 77 struct io_region *region, unsigned long *sync_error_bits);
66 io_notify_fn fn, void *context);
67
68int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
69 struct bio_vec *bvec, io_notify_fn fn, void *context);
70
71int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
72 void *data, io_notify_fn fn, void *context);
73 78
74#endif 79#endif
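Taken together, the header now defines a small client lifecycle: create a mempool sized for the expected concurrency, optionally resize it as the workload changes, and destroy it on teardown. A minimal sketch, with the page counts chosen arbitrarily:

struct dm_io_client *client;

client = dm_io_client_create(16);	/* reserve roughly 16 pages of mempool */
if (IS_ERR(client))
	return PTR_ERR(client);

/* ... workload grows: enlarge the reserve (may block) ... */
dm_io_client_resize(32, client);

/* ... on teardown ... */
dm_io_client_destroy(client);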
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6a9261351848..a66428d860fe 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -149,9 +149,12 @@ struct log_c {
149 FORCESYNC, /* Force a sync to happen */ 149 FORCESYNC, /* Force a sync to happen */
150 } sync; 150 } sync;
151 151
152 struct dm_io_request io_req;
153
152 /* 154 /*
153 * Disk log fields 155 * Disk log fields
154 */ 156 */
157 int log_dev_failed;
155 struct dm_dev *log_dev; 158 struct dm_dev *log_dev;
156 struct log_header header; 159 struct log_header header;
157 160
@@ -199,13 +202,20 @@ static void header_from_disk(struct log_header *core, struct log_header *disk)
199 core->nr_regions = le64_to_cpu(disk->nr_regions); 202 core->nr_regions = le64_to_cpu(disk->nr_regions);
200} 203}
201 204
205static int rw_header(struct log_c *lc, int rw)
206{
207 lc->io_req.bi_rw = rw;
208 lc->io_req.mem.ptr.vma = lc->disk_header;
209 lc->io_req.notify.fn = NULL;
210
211 return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
212}
213
202static int read_header(struct log_c *log) 214static int read_header(struct log_c *log)
203{ 215{
204 int r; 216 int r;
205 unsigned long ebits;
206 217
207 r = dm_io_sync_vm(1, &log->header_location, READ, 218 r = rw_header(log, READ);
208 log->disk_header, &ebits);
209 if (r) 219 if (r)
210 return r; 220 return r;
211 221
@@ -233,11 +243,8 @@ static int read_header(struct log_c *log)
233 243
234static inline int write_header(struct log_c *log) 244static inline int write_header(struct log_c *log)
235{ 245{
236 unsigned long ebits;
237
238 header_to_disk(&log->header, log->disk_header); 246 header_to_disk(&log->header, log->disk_header);
239 return dm_io_sync_vm(1, &log->header_location, WRITE, 247 return rw_header(log, WRITE);
240 log->disk_header, &ebits);
241} 248}
242 249
243/*---------------------------------------------------------------- 250/*----------------------------------------------------------------
@@ -256,6 +263,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
256 uint32_t region_size; 263 uint32_t region_size;
257 unsigned int region_count; 264 unsigned int region_count;
258 size_t bitset_size, buf_size; 265 size_t bitset_size, buf_size;
266 int r;
259 267
260 if (argc < 1 || argc > 2) { 268 if (argc < 1 || argc > 2) {
261 DMWARN("wrong number of arguments to mirror log"); 269 DMWARN("wrong number of arguments to mirror log");
@@ -315,6 +323,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
315 lc->disk_header = NULL; 323 lc->disk_header = NULL;
316 } else { 324 } else {
317 lc->log_dev = dev; 325 lc->log_dev = dev;
326 lc->log_dev_failed = 0;
318 lc->header_location.bdev = lc->log_dev->bdev; 327 lc->header_location.bdev = lc->log_dev->bdev;
319 lc->header_location.sector = 0; 328 lc->header_location.sector = 0;
320 329
@@ -324,6 +333,15 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
324 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + 333 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
325 bitset_size, ti->limits.hardsect_size); 334 bitset_size, ti->limits.hardsect_size);
326 lc->header_location.count = buf_size >> SECTOR_SHIFT; 335 lc->header_location.count = buf_size >> SECTOR_SHIFT;
336 lc->io_req.mem.type = DM_IO_VMA;
337 lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
338 PAGE_SIZE));
339 if (IS_ERR(lc->io_req.client)) {
340 r = PTR_ERR(lc->io_req.client);
341 DMWARN("couldn't allocate disk io client");
342 kfree(lc);
 343 return r;
344 }
327 345
328 lc->disk_header = vmalloc(buf_size); 346 lc->disk_header = vmalloc(buf_size);
329 if (!lc->disk_header) { 347 if (!lc->disk_header) {
@@ -424,6 +442,7 @@ static void disk_dtr(struct dirty_log *log)
424 442
425 dm_put_device(lc->ti, lc->log_dev); 443 dm_put_device(lc->ti, lc->log_dev);
426 vfree(lc->disk_header); 444 vfree(lc->disk_header);
445 dm_io_client_destroy(lc->io_req.client);
427 destroy_log_context(lc); 446 destroy_log_context(lc);
428} 447}
429 448
@@ -437,6 +456,15 @@ static int count_bits32(uint32_t *addr, unsigned size)
437 return count; 456 return count;
438} 457}
439 458
459static void fail_log_device(struct log_c *lc)
460{
461 if (lc->log_dev_failed)
462 return;
463
464 lc->log_dev_failed = 1;
465 dm_table_event(lc->ti->table);
466}
467
440static int disk_resume(struct dirty_log *log) 468static int disk_resume(struct dirty_log *log)
441{ 469{
442 int r; 470 int r;
@@ -446,8 +474,19 @@ static int disk_resume(struct dirty_log *log)
446 474
447 /* read the disk header */ 475 /* read the disk header */
448 r = read_header(lc); 476 r = read_header(lc);
449 if (r) 477 if (r) {
450 return r; 478 DMWARN("%s: Failed to read header on mirror log device",
479 lc->log_dev->name);
480 fail_log_device(lc);
481 /*
482 * If the log device cannot be read, we must assume
483 * all regions are out-of-sync. If we simply return
484 * here, the state will be uninitialized and could
485 * lead us to return 'in-sync' status for regions
486 * that are actually 'out-of-sync'.
487 */
488 lc->header.nr_regions = 0;
489 }
451 490
452 /* set or clear any new bits -- device has grown */ 491 /* set or clear any new bits -- device has grown */
453 if (lc->sync == NOSYNC) 492 if (lc->sync == NOSYNC)
@@ -472,7 +511,14 @@ static int disk_resume(struct dirty_log *log)
472 lc->header.nr_regions = lc->region_count; 511 lc->header.nr_regions = lc->region_count;
473 512
474 /* write the new header */ 513 /* write the new header */
475 return write_header(lc); 514 r = write_header(lc);
515 if (r) {
516 DMWARN("%s: Failed to write header on mirror log device",
517 lc->log_dev->name);
518 fail_log_device(lc);
519 }
520
521 return r;
476} 522}
477 523
478static uint32_t core_get_region_size(struct dirty_log *log) 524static uint32_t core_get_region_size(struct dirty_log *log)
@@ -516,7 +562,9 @@ static int disk_flush(struct dirty_log *log)
516 return 0; 562 return 0;
517 563
518 r = write_header(lc); 564 r = write_header(lc);
519 if (!r) 565 if (r)
566 fail_log_device(lc);
567 else
520 lc->touched = 0; 568 lc->touched = 0;
521 569
522 return r; 570 return r;
@@ -591,6 +639,7 @@ static int core_status(struct dirty_log *log, status_type_t status,
591 639
592 switch(status) { 640 switch(status) {
593 case STATUSTYPE_INFO: 641 case STATUSTYPE_INFO:
642 DMEMIT("1 %s", log->type->name);
594 break; 643 break;
595 644
596 case STATUSTYPE_TABLE: 645 case STATUSTYPE_TABLE:
@@ -606,17 +655,17 @@ static int disk_status(struct dirty_log *log, status_type_t status,
606 char *result, unsigned int maxlen) 655 char *result, unsigned int maxlen)
607{ 656{
608 int sz = 0; 657 int sz = 0;
609 char buffer[16];
610 struct log_c *lc = log->context; 658 struct log_c *lc = log->context;
611 659
612 switch(status) { 660 switch(status) {
613 case STATUSTYPE_INFO: 661 case STATUSTYPE_INFO:
662 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
663 lc->log_dev_failed ? 'D' : 'A');
614 break; 664 break;
615 665
616 case STATUSTYPE_TABLE: 666 case STATUSTYPE_TABLE:
617 format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
618 DMEMIT("%s %u %s %u ", log->type->name, 667 DMEMIT("%s %u %s %u ", log->type->name,
619 lc->sync == DEFAULTSYNC ? 2 : 3, buffer, 668 lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name,
620 lc->region_size); 669 lc->region_size);
621 DMEMIT_SYNC; 670 DMEMIT_SYNC;
622 } 671 }
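With the log_dev_failed flag wired into disk_status(), STATUSTYPE_INFO output now reports the log device and its health. For a healthy disk log the emitted fragment would be along the lines of

    3 disk 253:4 A

with 'D' in place of 'A' once fail_log_device() has run; the device name shown here is illustrative.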
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3aa013506967..de54b39e6ffe 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -668,6 +668,9 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
668 return -EINVAL; 668 return -EINVAL;
669 } 669 }
670 670
671 m->hw_handler.md = dm_table_get_md(ti->table);
672 dm_put(m->hw_handler.md);
673
671 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); 674 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
672 if (r) { 675 if (r) {
673 dm_put_hw_handler(hwht); 676 dm_put_hw_handler(hwht);
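The get-then-put pair added here is deliberate, not a leak fix gone wrong: dm_table_get_md() returns the mapped_device with a reference held, and the immediate dm_put() (newly exported in dm.c below) drops that reference so the hardware handler keeps only a borrowed pointer, valid for the lifetime of the table. Restated with comments, under that lifetime assumption:

m->hw_handler.md = dm_table_get_md(ti->table);	/* takes a reference */
dm_put(m->hw_handler.md);			/* drop it; the pointer stays valid while the table lives */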
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 23a642619bed..ef124b71ccc8 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -21,15 +21,11 @@
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22 22
23#define DM_MSG_PREFIX "raid1" 23#define DM_MSG_PREFIX "raid1"
24#define DM_IO_PAGES 64
24 25
25static struct workqueue_struct *_kmirrord_wq; 26#define DM_RAID1_HANDLE_ERRORS 0x01
26static struct work_struct _kmirrord_work;
27static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
28 27
29static inline void wake(void) 28static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
30{
31 queue_work(_kmirrord_wq, &_kmirrord_work);
32}
33 29
34/*----------------------------------------------------------------- 30/*-----------------------------------------------------------------
35 * Region hash 31 * Region hash
@@ -125,17 +121,23 @@ struct mirror_set {
125 struct list_head list; 121 struct list_head list;
126 struct region_hash rh; 122 struct region_hash rh;
127 struct kcopyd_client *kcopyd_client; 123 struct kcopyd_client *kcopyd_client;
124 uint64_t features;
128 125
129 spinlock_t lock; /* protects the next two lists */ 126 spinlock_t lock; /* protects the next two lists */
130 struct bio_list reads; 127 struct bio_list reads;
131 struct bio_list writes; 128 struct bio_list writes;
132 129
130 struct dm_io_client *io_client;
131
133 /* recovery */ 132 /* recovery */
134 region_t nr_regions; 133 region_t nr_regions;
135 int in_sync; 134 int in_sync;
136 135
137 struct mirror *default_mirror; /* Default mirror */ 136 struct mirror *default_mirror; /* Default mirror */
138 137
138 struct workqueue_struct *kmirrord_wq;
139 struct work_struct kmirrord_work;
140
139 unsigned int nr_mirrors; 141 unsigned int nr_mirrors;
140 struct mirror mirror[0]; 142 struct mirror mirror[0];
141}; 143};
@@ -153,6 +155,11 @@ static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
153 return region << rh->region_shift; 155 return region << rh->region_shift;
154} 156}
155 157
158static void wake(struct mirror_set *ms)
159{
160 queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
161}
162
156/* FIXME move this */ 163/* FIXME move this */
157static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); 164static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
158 165
@@ -398,8 +405,7 @@ static void rh_update_states(struct region_hash *rh)
398 mempool_free(reg, rh->region_pool); 405 mempool_free(reg, rh->region_pool);
399 } 406 }
400 407
401 if (!list_empty(&recovered)) 408 rh->log->type->flush(rh->log);
402 rh->log->type->flush(rh->log);
403 409
404 list_for_each_entry_safe (reg, next, &clean, list) 410 list_for_each_entry_safe (reg, next, &clean, list)
405 mempool_free(reg, rh->region_pool); 411 mempool_free(reg, rh->region_pool);
@@ -471,7 +477,7 @@ static void rh_dec(struct region_hash *rh, region_t region)
471 spin_unlock_irqrestore(&rh->region_lock, flags); 477 spin_unlock_irqrestore(&rh->region_lock, flags);
472 478
473 if (should_wake) 479 if (should_wake)
474 wake(); 480 wake(rh->ms);
475} 481}
476 482
477/* 483/*
@@ -558,7 +564,7 @@ static void rh_recovery_end(struct region *reg, int success)
558 list_add(&reg->list, &reg->rh->recovered_regions); 564 list_add(&reg->list, &reg->rh->recovered_regions);
559 spin_unlock_irq(&rh->region_lock); 565 spin_unlock_irq(&rh->region_lock);
560 566
561 wake(); 567 wake(rh->ms);
562} 568}
563 569
564static void rh_flush(struct region_hash *rh) 570static void rh_flush(struct region_hash *rh)
@@ -592,7 +598,7 @@ static void rh_start_recovery(struct region_hash *rh)
592 for (i = 0; i < MAX_RECOVERY; i++) 598 for (i = 0; i < MAX_RECOVERY; i++)
593 up(&rh->recovery_count); 599 up(&rh->recovery_count);
594 600
595 wake(); 601 wake(rh->ms);
596} 602}
597 603
598/* 604/*
@@ -735,7 +741,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
735 /* 741 /*
736 * We can only read balance if the region is in sync. 742 * We can only read balance if the region is in sync.
737 */ 743 */
738 if (rh_in_sync(&ms->rh, region, 0)) 744 if (rh_in_sync(&ms->rh, region, 1))
739 m = choose_mirror(ms, bio->bi_sector); 745 m = choose_mirror(ms, bio->bi_sector);
740 else 746 else
741 m = ms->default_mirror; 747 m = ms->default_mirror;
@@ -792,6 +798,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
792 unsigned int i; 798 unsigned int i;
793 struct io_region io[KCOPYD_MAX_REGIONS+1]; 799 struct io_region io[KCOPYD_MAX_REGIONS+1];
794 struct mirror *m; 800 struct mirror *m;
801 struct dm_io_request io_req = {
802 .bi_rw = WRITE,
803 .mem.type = DM_IO_BVEC,
804 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
805 .notify.fn = write_callback,
806 .notify.context = bio,
807 .client = ms->io_client,
808 };
795 809
796 for (i = 0; i < ms->nr_mirrors; i++) { 810 for (i = 0; i < ms->nr_mirrors; i++) {
797 m = ms->mirror + i; 811 m = ms->mirror + i;
@@ -802,9 +816,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
802 } 816 }
803 817
804 bio_set_ms(bio, ms); 818 bio_set_ms(bio, ms);
805 dm_io_async_bvec(ms->nr_mirrors, io, WRITE, 819
806 bio->bi_io_vec + bio->bi_idx, 820 (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
807 write_callback, bio);
808} 821}
809 822
810static void do_writes(struct mirror_set *ms, struct bio_list *writes) 823static void do_writes(struct mirror_set *ms, struct bio_list *writes)
@@ -870,11 +883,10 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
870/*----------------------------------------------------------------- 883/*-----------------------------------------------------------------
871 * kmirrord 884 * kmirrord
872 *---------------------------------------------------------------*/ 885 *---------------------------------------------------------------*/
873static LIST_HEAD(_mirror_sets); 886static void do_mirror(struct work_struct *work)
874static DECLARE_RWSEM(_mirror_sets_lock);
875
876static void do_mirror(struct mirror_set *ms)
877{ 887{
 888 struct mirror_set *ms = container_of(work, struct mirror_set,
889 kmirrord_work);
878 struct bio_list reads, writes; 890 struct bio_list reads, writes;
879 891
880 spin_lock(&ms->lock); 892 spin_lock(&ms->lock);
@@ -890,16 +902,6 @@ static void do_mirror(struct mirror_set *ms)
890 do_writes(ms, &writes); 902 do_writes(ms, &writes);
891} 903}
892 904
893static void do_work(struct work_struct *ignored)
894{
895 struct mirror_set *ms;
896
897 down_read(&_mirror_sets_lock);
898 list_for_each_entry (ms, &_mirror_sets, list)
899 do_mirror(ms);
900 up_read(&_mirror_sets_lock);
901}
902
903/*----------------------------------------------------------------- 905/*-----------------------------------------------------------------
904 * Target functions 906 * Target functions
905 *---------------------------------------------------------------*/ 907 *---------------------------------------------------------------*/
@@ -931,6 +933,13 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
931 ms->in_sync = 0; 933 ms->in_sync = 0;
932 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; 934 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
933 935
936 ms->io_client = dm_io_client_create(DM_IO_PAGES);
937 if (IS_ERR(ms->io_client)) {
938 ti->error = "Error creating dm_io client";
939 kfree(ms);
940 return NULL;
941 }
942
934 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 943 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
935 ti->error = "Error creating dirty region hash"; 944 ti->error = "Error creating dirty region hash";
936 kfree(ms); 945 kfree(ms);
@@ -946,6 +955,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
946 while (m--) 955 while (m--)
947 dm_put_device(ti, ms->mirror[m].dev); 956 dm_put_device(ti, ms->mirror[m].dev);
948 957
958 dm_io_client_destroy(ms->io_client);
949 rh_exit(&ms->rh); 959 rh_exit(&ms->rh);
950 kfree(ms); 960 kfree(ms);
951} 961}
@@ -978,23 +988,6 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
978 return 0; 988 return 0;
979} 989}
980 990
981static int add_mirror_set(struct mirror_set *ms)
982{
983 down_write(&_mirror_sets_lock);
984 list_add_tail(&ms->list, &_mirror_sets);
985 up_write(&_mirror_sets_lock);
986 wake();
987
988 return 0;
989}
990
991static void del_mirror_set(struct mirror_set *ms)
992{
993 down_write(&_mirror_sets_lock);
994 list_del(&ms->list);
995 up_write(&_mirror_sets_lock);
996}
997
998/* 991/*
999 * Create dirty log: log_type #log_params <log_params> 992 * Create dirty log: log_type #log_params <log_params>
1000 */ 993 */
@@ -1037,16 +1030,55 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
1037 return dl; 1030 return dl;
1038} 1031}
1039 1032
1033static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
1034 unsigned *args_used)
1035{
1036 unsigned num_features;
1037 struct dm_target *ti = ms->ti;
1038
1039 *args_used = 0;
1040
1041 if (!argc)
1042 return 0;
1043
1044 if (sscanf(argv[0], "%u", &num_features) != 1) {
1045 ti->error = "Invalid number of features";
1046 return -EINVAL;
1047 }
1048
1049 argc--;
1050 argv++;
1051 (*args_used)++;
1052
1053 if (num_features > argc) {
1054 ti->error = "Not enough arguments to support feature count";
1055 return -EINVAL;
1056 }
1057
1058 if (!strcmp("handle_errors", argv[0]))
1059 ms->features |= DM_RAID1_HANDLE_ERRORS;
1060 else {
1061 ti->error = "Unrecognised feature requested";
1062 return -EINVAL;
1063 }
1064
1065 (*args_used)++;
1066
1067 return 0;
1068}
1069
1040/* 1070/*
1041 * Construct a mirror mapping: 1071 * Construct a mirror mapping:
1042 * 1072 *
1043 * log_type #log_params <log_params> 1073 * log_type #log_params <log_params>
1044 * #mirrors [mirror_path offset]{2,} 1074 * #mirrors [mirror_path offset]{2,}
1075 * [#features <features>]
1045 * 1076 *
1046 * log_type is "core" or "disk" 1077 * log_type is "core" or "disk"
1047 * #log_params is between 1 and 3 1078 * #log_params is between 1 and 3
1079 *
1080 * If present, features must be "handle_errors".
1048 */ 1081 */
1049#define DM_IO_PAGES 64
1050static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) 1082static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1051{ 1083{
1052 int r; 1084 int r;
@@ -1070,8 +1102,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1070 1102
1071 argv++, argc--; 1103 argv++, argc--;
1072 1104
1073 if (argc != nr_mirrors * 2) { 1105 if (argc < nr_mirrors * 2) {
1074 ti->error = "Wrong number of mirror arguments"; 1106 ti->error = "Too few mirror arguments";
1075 dm_destroy_dirty_log(dl); 1107 dm_destroy_dirty_log(dl);
1076 return -EINVAL; 1108 return -EINVAL;
1077 } 1109 }
@@ -1096,13 +1128,37 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1096 ti->private = ms; 1128 ti->private = ms;
1097 ti->split_io = ms->rh.region_size; 1129 ti->split_io = ms->rh.region_size;
1098 1130
1131 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1132 if (!ms->kmirrord_wq) {
1133 DMERR("couldn't start kmirrord");
1134 free_context(ms, ti, m);
1135 return -ENOMEM;
1136 }
1137 INIT_WORK(&ms->kmirrord_work, do_mirror);
1138
1139 r = parse_features(ms, argc, argv, &args_used);
1140 if (r) {
1141 free_context(ms, ti, ms->nr_mirrors);
1142 return r;
1143 }
1144
1145 argv += args_used;
1146 argc -= args_used;
1147
1148 if (argc) {
1149 ti->error = "Too many mirror arguments";
1150 free_context(ms, ti, ms->nr_mirrors);
1151 return -EINVAL;
1152 }
1153
1099 r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); 1154 r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
1100 if (r) { 1155 if (r) {
1156 destroy_workqueue(ms->kmirrord_wq);
1101 free_context(ms, ti, ms->nr_mirrors); 1157 free_context(ms, ti, ms->nr_mirrors);
1102 return r; 1158 return r;
1103 } 1159 }
1104 1160
1105 add_mirror_set(ms); 1161 wake(ms);
1106 return 0; 1162 return 0;
1107} 1163}
1108 1164
@@ -1110,8 +1166,9 @@ static void mirror_dtr(struct dm_target *ti)
1110{ 1166{
1111 struct mirror_set *ms = (struct mirror_set *) ti->private; 1167 struct mirror_set *ms = (struct mirror_set *) ti->private;
1112 1168
1113 del_mirror_set(ms); 1169 flush_workqueue(ms->kmirrord_wq);
1114 kcopyd_client_destroy(ms->kcopyd_client); 1170 kcopyd_client_destroy(ms->kcopyd_client);
1171 destroy_workqueue(ms->kmirrord_wq);
1115 free_context(ms, ti, ms->nr_mirrors); 1172 free_context(ms, ti, ms->nr_mirrors);
1116} 1173}
1117 1174
@@ -1127,7 +1184,7 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
1127 spin_unlock(&ms->lock); 1184 spin_unlock(&ms->lock);
1128 1185
1129 if (should_wake) 1186 if (should_wake)
1130 wake(); 1187 wake(ms);
1131} 1188}
1132 1189
1133/* 1190/*
@@ -1222,11 +1279,9 @@ static void mirror_resume(struct dm_target *ti)
1222static int mirror_status(struct dm_target *ti, status_type_t type, 1279static int mirror_status(struct dm_target *ti, status_type_t type,
1223 char *result, unsigned int maxlen) 1280 char *result, unsigned int maxlen)
1224{ 1281{
1225 unsigned int m, sz; 1282 unsigned int m, sz = 0;
1226 struct mirror_set *ms = (struct mirror_set *) ti->private; 1283 struct mirror_set *ms = (struct mirror_set *) ti->private;
1227 1284
1228 sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
1229
1230 switch (type) { 1285 switch (type) {
1231 case STATUSTYPE_INFO: 1286 case STATUSTYPE_INFO:
1232 DMEMIT("%d ", ms->nr_mirrors); 1287 DMEMIT("%d ", ms->nr_mirrors);
@@ -1237,13 +1292,21 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1237 (unsigned long long)ms->rh.log->type-> 1292 (unsigned long long)ms->rh.log->type->
1238 get_sync_count(ms->rh.log), 1293 get_sync_count(ms->rh.log),
1239 (unsigned long long)ms->nr_regions); 1294 (unsigned long long)ms->nr_regions);
1295
1296 sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
1297
1240 break; 1298 break;
1241 1299
1242 case STATUSTYPE_TABLE: 1300 case STATUSTYPE_TABLE:
1301 sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
1302
1243 DMEMIT("%d", ms->nr_mirrors); 1303 DMEMIT("%d", ms->nr_mirrors);
1244 for (m = 0; m < ms->nr_mirrors; m++) 1304 for (m = 0; m < ms->nr_mirrors; m++)
1245 DMEMIT(" %s %llu", ms->mirror[m].dev->name, 1305 DMEMIT(" %s %llu", ms->mirror[m].dev->name,
1246 (unsigned long long)ms->mirror[m].offset); 1306 (unsigned long long)ms->mirror[m].offset);
1307
1308 if (ms->features & DM_RAID1_HANDLE_ERRORS)
1309 DMEMIT(" 1 handle_errors");
1247 } 1310 }
1248 1311
1249 return 0; 1312 return 0;
@@ -1251,7 +1314,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1251 1314
1252static struct target_type mirror_target = { 1315static struct target_type mirror_target = {
1253 .name = "mirror", 1316 .name = "mirror",
1254 .version = {1, 0, 2}, 1317 .version = {1, 0, 3},
1255 .module = THIS_MODULE, 1318 .module = THIS_MODULE,
1256 .ctr = mirror_ctr, 1319 .ctr = mirror_ctr,
1257 .dtr = mirror_dtr, 1320 .dtr = mirror_dtr,
@@ -1270,20 +1333,11 @@ static int __init dm_mirror_init(void)
1270 if (r) 1333 if (r)
1271 return r; 1334 return r;
1272 1335
1273 _kmirrord_wq = create_singlethread_workqueue("kmirrord");
1274 if (!_kmirrord_wq) {
1275 DMERR("couldn't start kmirrord");
1276 dm_dirty_log_exit();
1277 return r;
1278 }
1279 INIT_WORK(&_kmirrord_work, do_work);
1280
1281 r = dm_register_target(&mirror_target); 1336 r = dm_register_target(&mirror_target);
1282 if (r < 0) { 1337 if (r < 0) {
1283 DMERR("%s: Failed to register mirror target", 1338 DMERR("%s: Failed to register mirror target",
1284 mirror_target.name); 1339 mirror_target.name);
1285 dm_dirty_log_exit(); 1340 dm_dirty_log_exit();
1286 destroy_workqueue(_kmirrord_wq);
1287 } 1341 }
1288 1342
1289 return r; 1343 return r;
@@ -1297,7 +1351,6 @@ static void __exit dm_mirror_exit(void)
1297 if (r < 0) 1351 if (r < 0)
1298 DMERR("%s: unregister failed %d", mirror_target.name, r); 1352 DMERR("%s: unregister failed %d", mirror_target.name, r);
1299 1353
1300 destroy_workqueue(_kmirrord_wq);
1301 dm_dirty_log_exit(); 1354 dm_dirty_log_exit();
1302} 1355}
1303 1356
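With the optional feature area, a mirror table line may now end in a feature count plus feature names. A hypothetical two-way mirror with a core log and error handling enabled could be loaded as

    0 2097152 mirror core 2 1024 nosync 2 /dev/sda1 0 /dev/sdb1 0 1 handle_errors

where the devices and sizes are illustrative; mirror_status() echoes the same "1 handle_errors" suffix in its STATUSTYPE_TABLE output.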
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 05befa91807a..2fc199b0016b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -425,13 +425,15 @@ static void close_dev(struct dm_dev *d, struct mapped_device *md)
425} 425}
426 426
427/* 427/*
 428 * If possible (ie. blk_size[major] is set), this checks an area 428 * If possible, this checks that an area of a destination device is valid.
429 * of a destination device is valid.
430 */ 429 */
431static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) 430static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
432{ 431{
433 sector_t dev_size; 432 sector_t dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
434 dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; 433
434 if (!dev_size)
435 return 1;
436
435 return ((start < dev_size) && (len <= (dev_size - start))); 437 return ((start < dev_size) && (len <= (dev_size - start)));
436} 438}
437 439
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 11a98df298ec..2717a355dc5b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1236,6 +1236,7 @@ void dm_put(struct mapped_device *md)
1236 free_dev(md); 1236 free_dev(md);
1237 } 1237 }
1238} 1238}
1239EXPORT_SYMBOL_GPL(dm_put);
1239 1240
1240/* 1241/*
1241 * Process the deferred bios 1242 * Process the deferred bios
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index b46f6c575f7e..dbc234e3c69f 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2006 Red Hat GmbH
3 * 4 *
4 * This file is released under the GPL. 5 * This file is released under the GPL.
5 * 6 *
@@ -45,6 +46,8 @@ struct kcopyd_client {
45 unsigned int nr_pages; 46 unsigned int nr_pages;
46 unsigned int nr_free_pages; 47 unsigned int nr_free_pages;
47 48
49 struct dm_io_client *io_client;
50
48 wait_queue_head_t destroyq; 51 wait_queue_head_t destroyq;
49 atomic_t nr_jobs; 52 atomic_t nr_jobs;
50}; 53};
@@ -342,16 +345,20 @@ static void complete_io(unsigned long error, void *context)
342static int run_io_job(struct kcopyd_job *job) 345static int run_io_job(struct kcopyd_job *job)
343{ 346{
344 int r; 347 int r;
348 struct dm_io_request io_req = {
349 .bi_rw = job->rw,
350 .mem.type = DM_IO_PAGE_LIST,
351 .mem.ptr.pl = job->pages,
352 .mem.offset = job->offset,
353 .notify.fn = complete_io,
354 .notify.context = job,
355 .client = job->kc->io_client,
356 };
345 357
346 if (job->rw == READ) 358 if (job->rw == READ)
347 r = dm_io_async(1, &job->source, job->rw, 359 r = dm_io(&io_req, 1, &job->source, NULL);
348 job->pages,
349 job->offset, complete_io, job);
350
351 else 360 else
352 r = dm_io_async(job->num_dests, job->dests, job->rw, 361 r = dm_io(&io_req, job->num_dests, job->dests, NULL);
353 job->pages,
354 job->offset, complete_io, job);
355 362
356 return r; 363 return r;
357} 364}
@@ -670,8 +677,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
670 return r; 677 return r;
671 } 678 }
672 679
673 r = dm_io_get(nr_pages); 680 kc->io_client = dm_io_client_create(nr_pages);
674 if (r) { 681 if (IS_ERR(kc->io_client)) {
682 r = PTR_ERR(kc->io_client);
675 client_free_pages(kc); 683 client_free_pages(kc);
676 kfree(kc); 684 kfree(kc);
677 kcopyd_exit(); 685 kcopyd_exit();
@@ -691,7 +699,7 @@ void kcopyd_client_destroy(struct kcopyd_client *kc)
691 /* Wait for completion of all jobs submitted by this client. */ 699 /* Wait for completion of all jobs submitted by this client. */
692 wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); 700 wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
693 701
694 dm_io_put(kc->nr_pages); 702 dm_io_client_destroy(kc->io_client);
695 client_free_pages(kc); 703 client_free_pages(kc);
696 client_del(kc); 704 client_del(kc);
697 kfree(kc); 705 kfree(kc);
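Note the changed factory convention: dm_io_client_create() reports failure through an ERR_PTR()-encoded pointer rather than an int plus out-parameter, so kcopyd and the other converted callers all follow the same shape. The pattern, with nr_pages standing in for whatever reserve the caller needs:

struct dm_io_client *c;
int r;

c = dm_io_client_create(nr_pages);	/* sizes the private mempool */
if (IS_ERR(c)) {
	r = PTR_ERR(c);			/* typically -ENOMEM */
	/* unwind any partial setup, then return r */
}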
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2b4315d7e5d6..2901d0c0ee9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,6 +33,7 @@
33*/ 33*/
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/kernel.h>
36#include <linux/kthread.h> 37#include <linux/kthread.h>
37#include <linux/linkage.h> 38#include <linux/linkage.h>
38#include <linux/raid/md.h> 39#include <linux/raid/md.h>
@@ -273,6 +274,7 @@ static mddev_t * mddev_find(dev_t unit)
273 atomic_set(&new->active, 1); 274 atomic_set(&new->active, 1);
274 spin_lock_init(&new->write_lock); 275 spin_lock_init(&new->write_lock);
275 init_waitqueue_head(&new->sb_wait); 276 init_waitqueue_head(&new->sb_wait);
277 new->reshape_position = MaxSector;
276 278
277 new->queue = blk_alloc_queue(GFP_KERNEL); 279 new->queue = blk_alloc_queue(GFP_KERNEL);
278 if (!new->queue) { 280 if (!new->queue) {
@@ -589,14 +591,41 @@ abort:
589 return ret; 591 return ret;
590} 592}
591 593
594
595static u32 md_csum_fold(u32 csum)
596{
597 csum = (csum & 0xffff) + (csum >> 16);
598 return (csum & 0xffff) + (csum >> 16);
599}
600
592static unsigned int calc_sb_csum(mdp_super_t * sb) 601static unsigned int calc_sb_csum(mdp_super_t * sb)
593{ 602{
603 u64 newcsum = 0;
604 u32 *sb32 = (u32*)sb;
605 int i;
594 unsigned int disk_csum, csum; 606 unsigned int disk_csum, csum;
595 607
596 disk_csum = sb->sb_csum; 608 disk_csum = sb->sb_csum;
597 sb->sb_csum = 0; 609 sb->sb_csum = 0;
598 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 610
611 for (i = 0; i < MD_SB_BYTES/4 ; i++)
612 newcsum += sb32[i];
613 csum = (newcsum & 0xffffffff) + (newcsum>>32);
614
615
616#ifdef CONFIG_ALPHA
617 /* This used to use csum_partial, which was wrong for several
618 * reasons including that different results are returned on
619 * different architectures. It isn't critical that we get exactly
620 * the same return value as before (we always csum_fold before
621 * testing, and that removes any differences). However as we
622 * know that csum_partial always returned a 16bit value on
623 * alphas, do a fold to maximise conformity to previous behaviour.
624 */
625 sb->sb_csum = md_csum_fold(disk_csum);
626#else
599 sb->sb_csum = disk_csum; 627 sb->sb_csum = disk_csum;
628#endif
600 return csum; 629 return csum;
601} 630}
602 631
@@ -684,7 +713,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
684 if (sb->raid_disks <= 0) 713 if (sb->raid_disks <= 0)
685 goto abort; 714 goto abort;
686 715
687 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 716 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
688 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 717 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
689 b); 718 b);
690 goto abort; 719 goto abort;
@@ -694,6 +723,17 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
694 rdev->data_offset = 0; 723 rdev->data_offset = 0;
695 rdev->sb_size = MD_SB_BYTES; 724 rdev->sb_size = MD_SB_BYTES;
696 725
726 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
727 if (sb->level != 1 && sb->level != 4
728 && sb->level != 5 && sb->level != 6
729 && sb->level != 10) {
730 /* FIXME use a better test */
731 printk(KERN_WARNING
732 "md: bitmaps not supported for this level.\n");
733 goto abort;
734 }
735 }
736
697 if (sb->level == LEVEL_MULTIPATH) 737 if (sb->level == LEVEL_MULTIPATH)
698 rdev->desc_nr = -1; 738 rdev->desc_nr = -1;
699 else 739 else
@@ -792,16 +832,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
792 mddev->max_disks = MD_SB_DISKS; 832 mddev->max_disks = MD_SB_DISKS;
793 833
794 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 834 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
795 mddev->bitmap_file == NULL) { 835 mddev->bitmap_file == NULL)
796 if (mddev->level != 1 && mddev->level != 4
797 && mddev->level != 5 && mddev->level != 6
798 && mddev->level != 10) {
799 /* FIXME use a better test */
800 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
801 return -EINVAL;
802 }
803 mddev->bitmap_offset = mddev->default_bitmap_offset; 836 mddev->bitmap_offset = mddev->default_bitmap_offset;
804 }
805 837
806 } else if (mddev->pers == NULL) { 838 } else if (mddev->pers == NULL) {
807 /* Insist on good event counter while assembling */ 839 /* Insist on good event counter while assembling */
@@ -1058,6 +1090,18 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1058 bdevname(rdev->bdev,b)); 1090 bdevname(rdev->bdev,b));
1059 return -EINVAL; 1091 return -EINVAL;
1060 } 1092 }
1093 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1094 if (sb->level != cpu_to_le32(1) &&
1095 sb->level != cpu_to_le32(4) &&
1096 sb->level != cpu_to_le32(5) &&
1097 sb->level != cpu_to_le32(6) &&
1098 sb->level != cpu_to_le32(10)) {
1099 printk(KERN_WARNING
1100 "md: bitmaps not supported for this level.\n");
1101 return -EINVAL;
1102 }
1103 }
1104
1061 rdev->preferred_minor = 0xffff; 1105 rdev->preferred_minor = 0xffff;
1062 rdev->data_offset = le64_to_cpu(sb->data_offset); 1106 rdev->data_offset = le64_to_cpu(sb->data_offset);
1063 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1107 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
@@ -1141,14 +1185,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1141 mddev->max_disks = (4096-256)/2; 1185 mddev->max_disks = (4096-256)/2;
1142 1186
1143 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1187 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1144 mddev->bitmap_file == NULL ) { 1188 mddev->bitmap_file == NULL )
1145 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1146 && mddev->level != 10) {
1147 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1148 return -EINVAL;
1149 }
1150 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1189 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1151 } 1190
1152 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1191 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1153 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1192 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1154 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1193 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
@@ -2204,6 +2243,10 @@ static ssize_t
2204layout_show(mddev_t *mddev, char *page) 2243layout_show(mddev_t *mddev, char *page)
2205{ 2244{
2206 /* just a number, not meaningful for all levels */ 2245 /* just a number, not meaningful for all levels */
2246 if (mddev->reshape_position != MaxSector &&
2247 mddev->layout != mddev->new_layout)
2248 return sprintf(page, "%d (%d)\n",
2249 mddev->new_layout, mddev->layout);
2207 return sprintf(page, "%d\n", mddev->layout); 2250 return sprintf(page, "%d\n", mddev->layout);
2208} 2251}
2209 2252
@@ -2212,13 +2255,16 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
2212{ 2255{
2213 char *e; 2256 char *e;
2214 unsigned long n = simple_strtoul(buf, &e, 10); 2257 unsigned long n = simple_strtoul(buf, &e, 10);
2215 if (mddev->pers)
2216 return -EBUSY;
2217 2258
2218 if (!*buf || (*e && *e != '\n')) 2259 if (!*buf || (*e && *e != '\n'))
2219 return -EINVAL; 2260 return -EINVAL;
2220 2261
2221 mddev->layout = n; 2262 if (mddev->pers)
2263 return -EBUSY;
2264 if (mddev->reshape_position != MaxSector)
2265 mddev->new_layout = n;
2266 else
2267 mddev->layout = n;
2222 return len; 2268 return len;
2223} 2269}
2224static struct md_sysfs_entry md_layout = 2270static struct md_sysfs_entry md_layout =
@@ -2230,6 +2276,10 @@ raid_disks_show(mddev_t *mddev, char *page)
2230{ 2276{
2231 if (mddev->raid_disks == 0) 2277 if (mddev->raid_disks == 0)
2232 return 0; 2278 return 0;
2279 if (mddev->reshape_position != MaxSector &&
2280 mddev->delta_disks != 0)
2281 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2282 mddev->raid_disks - mddev->delta_disks);
2233 return sprintf(page, "%d\n", mddev->raid_disks); 2283 return sprintf(page, "%d\n", mddev->raid_disks);
2234} 2284}
2235 2285
@@ -2247,7 +2297,11 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2247 2297
2248 if (mddev->pers) 2298 if (mddev->pers)
2249 rv = update_raid_disks(mddev, n); 2299 rv = update_raid_disks(mddev, n);
2250 else 2300 else if (mddev->reshape_position != MaxSector) {
2301 int olddisks = mddev->raid_disks - mddev->delta_disks;
2302 mddev->delta_disks = n - olddisks;
2303 mddev->raid_disks = n;
2304 } else
2251 mddev->raid_disks = n; 2305 mddev->raid_disks = n;
2252 return rv ? rv : len; 2306 return rv ? rv : len;
2253} 2307}
@@ -2257,6 +2311,10 @@ __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2257static ssize_t 2311static ssize_t
2258chunk_size_show(mddev_t *mddev, char *page) 2312chunk_size_show(mddev_t *mddev, char *page)
2259{ 2313{
2314 if (mddev->reshape_position != MaxSector &&
2315 mddev->chunk_size != mddev->new_chunk)
2316 return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2317 mddev->chunk_size);
2260 return sprintf(page, "%d\n", mddev->chunk_size); 2318 return sprintf(page, "%d\n", mddev->chunk_size);
2261} 2319}
2262 2320
@@ -2267,12 +2325,15 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2267 char *e; 2325 char *e;
2268 unsigned long n = simple_strtoul(buf, &e, 10); 2326 unsigned long n = simple_strtoul(buf, &e, 10);
2269 2327
2270 if (mddev->pers)
2271 return -EBUSY;
2272 if (!*buf || (*e && *e != '\n')) 2328 if (!*buf || (*e && *e != '\n'))
2273 return -EINVAL; 2329 return -EINVAL;
2274 2330
2275 mddev->chunk_size = n; 2331 if (mddev->pers)
2332 return -EBUSY;
2333 else if (mddev->reshape_position != MaxSector)
2334 mddev->new_chunk = n;
2335 else
2336 mddev->chunk_size = n;
2276 return len; 2337 return len;
2277} 2338}
2278static struct md_sysfs_entry md_chunk_size = 2339static struct md_sysfs_entry md_chunk_size =
@@ -2637,8 +2698,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
2637 minor = simple_strtoul(buf, &e, 10); 2698 minor = simple_strtoul(buf, &e, 10);
2638 if (e==buf || (*e && *e != '\n') ) 2699 if (e==buf || (*e && *e != '\n') )
2639 return -EINVAL; 2700 return -EINVAL;
2640 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2701 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2641 super_types[major].name == NULL)
2642 return -ENOENT; 2702 return -ENOENT;
2643 mddev->major_version = major; 2703 mddev->major_version = major;
2644 mddev->minor_version = minor; 2704 mddev->minor_version = minor;
@@ -2859,6 +2919,37 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2859static struct md_sysfs_entry md_suspend_hi = 2919static struct md_sysfs_entry md_suspend_hi =
2860__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2920__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2861 2921
2922static ssize_t
2923reshape_position_show(mddev_t *mddev, char *page)
2924{
2925 if (mddev->reshape_position != MaxSector)
2926 return sprintf(page, "%llu\n",
2927 (unsigned long long)mddev->reshape_position);
2928 strcpy(page, "none\n");
2929 return 5;
2930}
2931
2932static ssize_t
2933reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
2934{
2935 char *e;
2936 unsigned long long new = simple_strtoull(buf, &e, 10);
2937 if (mddev->pers)
2938 return -EBUSY;
2939 if (buf == e || (*e && *e != '\n'))
2940 return -EINVAL;
2941 mddev->reshape_position = new;
2942 mddev->delta_disks = 0;
2943 mddev->new_level = mddev->level;
2944 mddev->new_layout = mddev->layout;
2945 mddev->new_chunk = mddev->chunk_size;
2946 return len;
2947}
2948
2949static struct md_sysfs_entry md_reshape_position =
2950__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
2951 reshape_position_store);
2952
2862 2953
2863static struct attribute *md_default_attrs[] = { 2954static struct attribute *md_default_attrs[] = {
2864 &md_level.attr, 2955 &md_level.attr,
@@ -2871,6 +2962,7 @@ static struct attribute *md_default_attrs[] = {
2871 &md_new_device.attr, 2962 &md_new_device.attr,
2872 &md_safe_delay.attr, 2963 &md_safe_delay.attr,
2873 &md_array_state.attr, 2964 &md_array_state.attr,
2965 &md_reshape_position.attr,
2874 NULL, 2966 NULL,
2875}; 2967};
2876 2968
@@ -3012,6 +3104,7 @@ static int do_md_run(mddev_t * mddev)
3012 struct gendisk *disk; 3104 struct gendisk *disk;
3013 struct mdk_personality *pers; 3105 struct mdk_personality *pers;
3014 char b[BDEVNAME_SIZE]; 3106 char b[BDEVNAME_SIZE];
3107 struct block_device *bdev;
3015 3108
3016 if (list_empty(&mddev->disks)) 3109 if (list_empty(&mddev->disks))
3017 /* cannot run an array with no devices.. */ 3110 /* cannot run an array with no devices.. */
@@ -3239,7 +3332,13 @@ static int do_md_run(mddev_t * mddev)
3239 md_wakeup_thread(mddev->thread); 3332 md_wakeup_thread(mddev->thread);
3240 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3333 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3241 3334
3242 mddev->changed = 1; 3335 bdev = bdget_disk(mddev->gendisk, 0);
3336 if (bdev) {
3337 bd_set_size(bdev, mddev->array_size << 1);
3338 blkdev_ioctl(bdev->bd_inode, NULL, BLKRRPART, 0);
3339 bdput(bdev);
3340 }
3341
3243 md_new_event(mddev); 3342 md_new_event(mddev);
3244 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); 3343 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
3245 return 0; 3344 return 0;
@@ -3361,7 +3460,6 @@ static int do_md_stop(mddev_t * mddev, int mode)
3361 mddev->pers = NULL; 3460 mddev->pers = NULL;
3362 3461
3363 set_capacity(disk, 0); 3462 set_capacity(disk, 0);
3364 mddev->changed = 1;
3365 3463
3366 if (mddev->ro) 3464 if (mddev->ro)
3367 mddev->ro = 0; 3465 mddev->ro = 0;
@@ -3409,6 +3507,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
3409 mddev->size = 0; 3507 mddev->size = 0;
3410 mddev->raid_disks = 0; 3508 mddev->raid_disks = 0;
3411 mddev->recovery_cp = 0; 3509 mddev->recovery_cp = 0;
3510 mddev->reshape_position = MaxSector;
3412 3511
3413 } else if (mddev->pers) 3512 } else if (mddev->pers)
3414 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3513 printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -4019,7 +4118,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4019 if (info->raid_disks == 0) { 4118 if (info->raid_disks == 0) {
4020 /* just setting version number for superblock loading */ 4119 /* just setting version number for superblock loading */
4021 if (info->major_version < 0 || 4120 if (info->major_version < 0 ||
4022 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 4121 info->major_version >= ARRAY_SIZE(super_types) ||
4023 super_types[info->major_version].name == NULL) { 4122 super_types[info->major_version].name == NULL) {
4024 /* maybe try to auto-load a module? */ 4123 /* maybe try to auto-load a module? */
4025 printk(KERN_INFO 4124 printk(KERN_INFO
@@ -4500,20 +4599,6 @@ static int md_release(struct inode *inode, struct file * file)
4500 return 0; 4599 return 0;
4501} 4600}
4502 4601
4503static int md_media_changed(struct gendisk *disk)
4504{
4505 mddev_t *mddev = disk->private_data;
4506
4507 return mddev->changed;
4508}
4509
4510static int md_revalidate(struct gendisk *disk)
4511{
4512 mddev_t *mddev = disk->private_data;
4513
4514 mddev->changed = 0;
4515 return 0;
4516}
4517static struct block_device_operations md_fops = 4602static struct block_device_operations md_fops =
4518{ 4603{
4519 .owner = THIS_MODULE, 4604 .owner = THIS_MODULE,
@@ -4521,8 +4606,6 @@ static struct block_device_operations md_fops =
4521 .release = md_release, 4606 .release = md_release,
4522 .ioctl = md_ioctl, 4607 .ioctl = md_ioctl,
4523 .getgeo = md_getgeo, 4608 .getgeo = md_getgeo,
4524 .media_changed = md_media_changed,
4525 .revalidate_disk= md_revalidate,
4526}; 4609};
4527 4610
4528static int md_thread(void * arg) 4611static int md_thread(void * arg)
@@ -4941,15 +5024,6 @@ static int md_seq_open(struct inode *inode, struct file *file)
4941 return error; 5024 return error;
4942} 5025}
4943 5026
4944static int md_seq_release(struct inode *inode, struct file *file)
4945{
4946 struct seq_file *m = file->private_data;
4947 struct mdstat_info *mi = m->private;
4948 m->private = NULL;
4949 kfree(mi);
4950 return seq_release(inode, file);
4951}
4952
4953static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5027static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4954{ 5028{
4955 struct seq_file *m = filp->private_data; 5029 struct seq_file *m = filp->private_data;
@@ -4971,7 +5045,7 @@ static const struct file_operations md_seq_fops = {
4971 .open = md_seq_open, 5045 .open = md_seq_open,
4972 .read = seq_read, 5046 .read = seq_read,
4973 .llseek = seq_lseek, 5047 .llseek = seq_lseek,
4974 .release = md_seq_release, 5048 .release = seq_release_private,
4975 .poll = mdstat_poll, 5049 .poll = mdstat_poll,
4976}; 5050};
4977 5051
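md_csum_fold() needs both addition rounds because the first one can itself carry into bit 16. Working 0xffff0001 through by hand, assuming 32-bit arithmetic:

csum = (0xffff0001 & 0xffff) + (0xffff0001 >> 16);	/* 0x0001 + 0xffff = 0x10000 */
csum = (csum & 0xffff) + (csum >> 16);			/* 0x0000 + 0x0001 = 0x0001 */

A single pass would have returned 0x10000 with the carry still set; the second pass folds it back into the low 16 bits.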
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870b265d..1b7130cad21f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2063,7 +2063,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2063 */ 2063 */
2064 mddev->array_size = sectors>>1; 2064 mddev->array_size = sectors>>1;
2065 set_capacity(mddev->gendisk, mddev->array_size << 1); 2065 set_capacity(mddev->gendisk, mddev->array_size << 1);
2066 mddev->changed = 1;
2067 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { 2066 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
2068 mddev->recovery_cp = mddev->size << 1; 2067 mddev->recovery_cp = mddev->size << 1;
2069 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2068 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8d59914f2057..a72e70ad0975 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -353,8 +353,8 @@ static int grow_stripes(raid5_conf_t *conf, int num)
353 struct kmem_cache *sc; 353 struct kmem_cache *sc;
354 int devs = conf->raid_disks; 354 int devs = conf->raid_disks;
355 355
356 sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); 356 sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
357 sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev)); 357 sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
358 conf->active_name = 0; 358 conf->active_name = 0;
359 sc = kmem_cache_create(conf->cache_name[conf->active_name], 359 sc = kmem_cache_create(conf->cache_name[conf->active_name],
360 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 360 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -3864,7 +3864,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
3864 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 3864 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
3865 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; 3865 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
3866 set_capacity(mddev->gendisk, mddev->array_size << 1); 3866 set_capacity(mddev->gendisk, mddev->array_size << 1);
3867 mddev->changed = 1;
3868 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 3867 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
3869 mddev->recovery_cp = mddev->size << 1; 3868 mddev->recovery_cp = mddev->size << 1;
3870 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3869 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3999,7 +3998,6 @@ static void end_reshape(raid5_conf_t *conf)
3999 conf->mddev->array_size = conf->mddev->size * 3998 conf->mddev->array_size = conf->mddev->size *
4000 (conf->raid_disks - conf->max_degraded); 3999 (conf->raid_disks - conf->max_degraded);
4001 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 4000 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
4002 conf->mddev->changed = 1;
4003 4001
4004 bdev = bdget_disk(conf->mddev->gendisk, 0); 4002 bdev = bdget_disk(conf->mddev->gendisk, 0);
4005 if (bdev) { 4003 if (bdev) {
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index b6c16704aaab..7385acfa1dd9 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -501,9 +501,9 @@ void mmc_detect_change(struct mmc_host *host, unsigned long delay)
501{ 501{
502#ifdef CONFIG_MMC_DEBUG 502#ifdef CONFIG_MMC_DEBUG
503 unsigned long flags; 503 unsigned long flags;
504 spin_lock_irqsave(host->lock, flags); 504 spin_lock_irqsave(&host->lock, flags);
505 BUG_ON(host->removed); 505 BUG_ON(host->removed);
506 spin_unlock_irqrestore(host->lock, flags); 506 spin_unlock_irqrestore(&host->lock, flags);
507#endif 507#endif
508 508
509 mmc_schedule_delayed_work(&host->detect, delay); 509 mmc_schedule_delayed_work(&host->detect, delay);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 3a03a74c0609..637ae8f68791 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1214,7 +1214,7 @@ e1000_remove(struct pci_dev *pdev)
1214 int i; 1214 int i;
1215#endif 1215#endif
1216 1216
1217 flush_scheduled_work(); 1217 cancel_work_sync(&adapter->reset_task);
1218 1218
1219 e1000_release_manageability(adapter); 1219 e1000_release_manageability(adapter);
1220 1220
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index eed433d6056a..f71dab347667 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -662,10 +662,10 @@ int phy_stop_interrupts(struct phy_device *phydev)
662 phy_error(phydev); 662 phy_error(phydev);
663 663
664 /* 664 /*
665 * Finish any pending work; we might have been scheduled 665 * Finish any pending work; we might have been scheduled to be called
666 * to be called from keventd ourselves, though. 666 * from keventd ourselves, but cancel_work_sync() handles that.
667 */ 667 */
668 run_scheduled_work(&phydev->phy_queue); 668 cancel_work_sync(&phydev->phy_queue);
669 669
670 free_irq(phydev->irq, phydev); 670 free_irq(phydev->irq, phydev);
671 671
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index e5e901ecd808..923b9c725cc3 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3716,10 +3716,8 @@ static void tg3_reset_task(struct work_struct *work)
 	unsigned int restart_timer;
 
 	tg3_full_lock(tp, 0);
-	tp->tg3_flags |= TG3_FLAG_IN_RESET_TASK;
 
 	if (!netif_running(tp->dev)) {
-		tp->tg3_flags &= ~TG3_FLAG_IN_RESET_TASK;
 		tg3_full_unlock(tp);
 		return;
 	}
@@ -3750,8 +3748,6 @@ static void tg3_reset_task(struct work_struct *work)
 	mod_timer(&tp->timer, jiffies + 1);
 
 out:
-	tp->tg3_flags &= ~TG3_FLAG_IN_RESET_TASK;
-
 	tg3_full_unlock(tp);
 }
 
@@ -7390,12 +7386,7 @@ static int tg3_close(struct net_device *dev)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
-	/* Calling flush_scheduled_work() may deadlock because
-	 * linkwatch_event() may be on the workqueue and it will try to get
-	 * the rtnl_lock which we are holding.
-	 */
-	while (tp->tg3_flags & TG3_FLAG_IN_RESET_TASK)
-		msleep(1);
+	cancel_work_sync(&tp->reset_task);
 
 	netif_stop_queue(dev);
 
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 4d334cf5a243..bd9f4f428e5b 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2228,7 +2228,7 @@ struct tg3 {
 #define TG3_FLAG_JUMBO_RING_ENABLE	0x00800000
 #define TG3_FLAG_10_100_ONLY		0x01000000
 #define TG3_FLAG_PAUSE_AUTONEG		0x02000000
-#define TG3_FLAG_IN_RESET_TASK		0x04000000
+
 #define TG3_FLAG_40BIT_DMA_BUG		0x08000000
 #define TG3_FLAG_BROKEN_CHECKSUMS	0x10000000
 #define TG3_FLAG_SUPPORT_MSI		0x20000000
diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c
index 66e7bc985797..1d8a2f6bb8eb 100644
--- a/drivers/spi/atmel_spi.c
+++ b/drivers/spi/atmel_spi.c
@@ -22,10 +22,7 @@
 #include <asm/io.h>
 #include <asm/arch/board.h>
 #include <asm/arch/gpio.h>
-
-#ifdef CONFIG_ARCH_AT91
 #include <asm/arch/cpu.h>
-#endif
 
 #include "atmel_spi.h"
 
@@ -552,10 +549,8 @@ static int __init atmel_spi_probe(struct platform_device *pdev)
 		goto out_free_buffer;
 	as->irq = irq;
 	as->clk = clk;
-#ifdef CONFIG_ARCH_AT91
 	if (!cpu_is_at91rm9200())
 		as->new_1 = 1;
-#endif
 
 	ret = request_irq(irq, atmel_spi_interrupt, 0,
 			pdev->dev.bus_id, master);
diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c
index b082d95bbbaa..11e9b15ca45a 100644
--- a/drivers/usb/atm/usbatm.c
+++ b/drivers/usb/atm/usbatm.c
@@ -1033,7 +1033,7 @@ static int usbatm_do_heavy_init(void *arg)
 
 static int usbatm_heavy_init(struct usbatm_data *instance)
 {
-	int ret = kernel_thread(usbatm_do_heavy_init, instance, CLONE_KERNEL);
+	int ret = kernel_thread(usbatm_do_heavy_init, instance, CLONE_FS | CLONE_FILES);
 
 	if (ret < 0) {
 		usb_err(instance, "%s: failed to create kernel_thread (%d)!\n", __func__, ret);
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 1132ba5ff391..9a256d2ff9dc 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -1348,6 +1348,20 @@ config FB_VOODOO1
 	  Please read the <file:Documentation/fb/README-sstfb.txt> for supported
 	  options and other important info support.
 
+config FB_VT8623
+	tristate "VIA VT8623 support"
+	depends on FB && PCI
+	select FB_CFB_FILLRECT
+	select FB_CFB_COPYAREA
+	select FB_CFB_IMAGEBLIT
+	select FB_TILEBLITTING
+	select FB_SVGALIB
+	select VGASTATE
+	select FONT_8x16 if FRAMEBUFFER_CONSOLE
+	---help---
+	  Driver for CastleRock integrated graphics core in the
+	  VIA VT8623 [Apollo CLE266] chipset.
+
 config FB_CYBLA
 	tristate "Cyberblade/i1 support"
 	depends on FB && PCI && X86_32 && !64BIT
@@ -1401,6 +1415,20 @@ config FB_TRIDENT_ACCEL
 	  This will compile the Trident frame buffer device with
 	  acceleration functions.
 
+config FB_ARK
+	tristate "ARK 2000PV support"
+	depends on FB && PCI
+	select FB_CFB_FILLRECT
+	select FB_CFB_COPYAREA
+	select FB_CFB_IMAGEBLIT
+	select FB_TILEBLITTING
+	select FB_SVGALIB
+	select VGASTATE
+	select FONT_8x16 if FRAMEBUFFER_CONSOLE
+	---help---
+	  Driver for PCI graphics boards with ARK 2000PV chip
+	  and ICS 5342 RAMDAC.
+
 config FB_PM3
 	tristate "Permedia3 support"
 	depends on FB && PCI && BROKEN
diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index a916c204274f..0b70567458fb 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -54,10 +54,12 @@ obj-$(CONFIG_FB_VALKYRIE) += valkyriefb.o
 obj-$(CONFIG_FB_CT65550)          += chipsfb.o
 obj-$(CONFIG_FB_IMSTT)            += imsttfb.o
 obj-$(CONFIG_FB_FM2)              += fm2fb.o
+obj-$(CONFIG_FB_VT8623)           += vt8623fb.o
 obj-$(CONFIG_FB_CYBLA)            += cyblafb.o
 obj-$(CONFIG_FB_TRIDENT)          += tridentfb.o
 obj-$(CONFIG_FB_LE80578)          += vermilion/
 obj-$(CONFIG_FB_S3)               += s3fb.o
+obj-$(CONFIG_FB_ARK)              += arkfb.o
 obj-$(CONFIG_FB_STI)              += stifb.o
 obj-$(CONFIG_FB_FFB)              += ffb.o sbuslib.o
 obj-$(CONFIG_FB_CG6)              += cg6.o sbuslib.o
diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c
new file mode 100644
index 000000000000..ba6fede5c466
--- /dev/null
+++ b/drivers/video/arkfb.c
@@ -0,0 +1,1200 @@
1/*
2 * linux/drivers/video/arkfb.c -- Frame buffer device driver for ARK 2000PV
3 * with ICS 5342 dac (it is easy to add support for different dacs).
4 *
5 * Copyright (c) 2007 Ondrej Zajicek <santiago@crfreenet.org>
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file COPYING in the main directory of this archive for
9 * more details.
10 *
11 * Code is based on s3fb
12 */
13
14#include <linux/version.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/errno.h>
18#include <linux/string.h>
19#include <linux/mm.h>
20#include <linux/tty.h>
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/fb.h>
24#include <linux/svga.h>
25#include <linux/init.h>
26#include <linux/pci.h>
27#include <linux/console.h> /* Why should an fb driver call console functions? Because it needs acquire_console_sem() */
28#include <video/vga.h>
29
30#ifdef CONFIG_MTRR
31#include <asm/mtrr.h>
32#endif
33
34struct arkfb_info {
35 int mclk_freq;
36 int mtrr_reg;
37
38 struct dac_info *dac;
39 struct vgastate state;
40 struct mutex open_lock;
41 unsigned int ref_count;
42 u32 pseudo_palette[16];
43};
44
45
46/* ------------------------------------------------------------------------- */
47
48
49static const struct svga_fb_format arkfb_formats[] = {
50 { 0, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
51 FB_TYPE_TEXT, FB_AUX_TEXT_SVGA_STEP4, FB_VISUAL_PSEUDOCOLOR, 8, 8},
52 { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
53 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 8, 16},
54 { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 1,
55 FB_TYPE_INTERLEAVED_PLANES, 1, FB_VISUAL_PSEUDOCOLOR, 8, 16},
56 { 8, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
57 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 8, 8},
58 {16, {10, 5, 0}, {5, 5, 0}, {0, 5, 0}, {0, 0, 0}, 0,
59 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4},
60 {16, {11, 5, 0}, {5, 6, 0}, {0, 5, 0}, {0, 0, 0}, 0,
61 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4},
62 {24, {16, 8, 0}, {8, 8, 0}, {0, 8, 0}, {0, 0, 0}, 0,
63 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 8, 8},
64 {32, {16, 8, 0}, {8, 8, 0}, {0, 8, 0}, {0, 0, 0}, 0,
65 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 2, 2},
66 SVGA_FORMAT_END
67};
68
69
70/* CRT timing register sets */
71
72static const struct vga_regset ark_h_total_regs[] = {{0x00, 0, 7}, {0x41, 7, 7}, VGA_REGSET_END};
73static const struct vga_regset ark_h_display_regs[] = {{0x01, 0, 7}, {0x41, 6, 6}, VGA_REGSET_END};
74static const struct vga_regset ark_h_blank_start_regs[] = {{0x02, 0, 7}, {0x41, 5, 5}, VGA_REGSET_END};
75static const struct vga_regset ark_h_blank_end_regs[] = {{0x03, 0, 4}, {0x05, 7, 7 }, VGA_REGSET_END};
76static const struct vga_regset ark_h_sync_start_regs[] = {{0x04, 0, 7}, {0x41, 4, 4}, VGA_REGSET_END};
77static const struct vga_regset ark_h_sync_end_regs[] = {{0x05, 0, 4}, VGA_REGSET_END};
78
79static const struct vga_regset ark_v_total_regs[] = {{0x06, 0, 7}, {0x07, 0, 0}, {0x07, 5, 5}, {0x40, 7, 7}, VGA_REGSET_END};
80static const struct vga_regset ark_v_display_regs[] = {{0x12, 0, 7}, {0x07, 1, 1}, {0x07, 6, 6}, {0x40, 6, 6}, VGA_REGSET_END};
81static const struct vga_regset ark_v_blank_start_regs[] = {{0x15, 0, 7}, {0x07, 3, 3}, {0x09, 5, 5}, {0x40, 5, 5}, VGA_REGSET_END};
82// const struct vga_regset ark_v_blank_end_regs[] = {{0x16, 0, 6}, VGA_REGSET_END};
83static const struct vga_regset ark_v_blank_end_regs[] = {{0x16, 0, 7}, VGA_REGSET_END};
84static const struct vga_regset ark_v_sync_start_regs[] = {{0x10, 0, 7}, {0x07, 2, 2}, {0x07, 7, 7}, {0x40, 4, 4}, VGA_REGSET_END};
85static const struct vga_regset ark_v_sync_end_regs[] = {{0x11, 0, 3}, VGA_REGSET_END};
86
87static const struct vga_regset ark_line_compare_regs[] = {{0x18, 0, 7}, {0x07, 4, 4}, {0x09, 6, 6}, VGA_REGSET_END};
88static const struct vga_regset ark_start_address_regs[] = {{0x0d, 0, 7}, {0x0c, 0, 7}, {0x40, 0, 2}, VGA_REGSET_END};
89static const struct vga_regset ark_offset_regs[] = {{0x13, 0, 7}, {0x41, 3, 3}, VGA_REGSET_END};
90
91static const struct svga_timing_regs ark_timing_regs = {
92 ark_h_total_regs, ark_h_display_regs, ark_h_blank_start_regs,
93 ark_h_blank_end_regs, ark_h_sync_start_regs, ark_h_sync_end_regs,
94 ark_v_total_regs, ark_v_display_regs, ark_v_blank_start_regs,
95 ark_v_blank_end_regs, ark_v_sync_start_regs, ark_v_sync_end_regs,
96};
97
98
99/* ------------------------------------------------------------------------- */
100
101
102/* Module parameters */
103
104static char *mode = "640x480-8@60";
105
106#ifdef CONFIG_MTRR
107static int mtrr = 1;
108#endif
109
110MODULE_AUTHOR("(c) 2007 Ondrej Zajicek <santiago@crfreenet.org>");
111MODULE_LICENSE("GPL");
112MODULE_DESCRIPTION("fbdev driver for ARK 2000PV");
113
114module_param(mode, charp, 0444);
115MODULE_PARM_DESC(mode, "Default video mode ('640x480-8@60', etc)");
116
117#ifdef CONFIG_MTRR
118module_param(mtrr, int, 0444);
119MODULE_PARM_DESC(mtrr, "Enable write-combining with MTRR (1=enable, 0=disable, default=1)");
120#endif
121
122static int threshold = 4;
123
124module_param(threshold, int, 0644);
125MODULE_PARM_DESC(threshold, "FIFO threshold");
126
127
128/* ------------------------------------------------------------------------- */
129
130
131static void arkfb_settile(struct fb_info *info, struct fb_tilemap *map)
132{
133 const u8 *font = map->data;
134 u8 __iomem *fb = (u8 __iomem *)info->screen_base;
135 int i, c;
136
137 if ((map->width != 8) || (map->height != 16) ||
138 (map->depth != 1) || (map->length != 256)) {
139 printk(KERN_ERR "fb%d: unsupported font parameters: width %d, "
140 "height %d, depth %d, length %d\n", info->node,
141 map->width, map->height, map->depth, map->length);
142 return;
143 }
144
145 fb += 2;
146 for (c = 0; c < map->length; c++) {
147 for (i = 0; i < map->height; i++) {
148 fb_writeb(font[i], &fb[i * 4]);
149 fb_writeb(font[i], &fb[i * 4 + (128 * 8)]);
150 }
151 fb += 128;
152
153 if ((c % 8) == 7)
154 fb += 128*8;
155
156 font += map->height;
157 }
158}
159
160static struct fb_tile_ops arkfb_tile_ops = {
161 .fb_settile = arkfb_settile,
162 .fb_tilecopy = svga_tilecopy,
163 .fb_tilefill = svga_tilefill,
164 .fb_tileblit = svga_tileblit,
165 .fb_tilecursor = svga_tilecursor,
166 .fb_get_tilemax = svga_get_tilemax,
167};
168
169
170/* ------------------------------------------------------------------------- */
171
172
173/* image data is MSB-first, fb structure is MSB-first too */
174static inline u32 expand_color(u32 c)
175{
176 return ((c & 1) | ((c & 2) << 7) | ((c & 4) << 14) | ((c & 8) << 21)) * 0xFF;
177}
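/*
 * Editor's note (worked example, not part of the original patch): each of
 * the four plane bits of c is broadcast into one byte of the result, e.g.
 * expand_color(0x5) = (0x1 | (0x4 << 14)) * 0xFF = 0x00010001 * 0xFF
 *                   = 0x00FF00FF.
 */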
178
179/* arkfb_iplan_imageblit silently assumes that almost everything is 8-pixel aligned */
180static void arkfb_iplan_imageblit(struct fb_info *info, const struct fb_image *image)
181{
182 u32 fg = expand_color(image->fg_color);
183 u32 bg = expand_color(image->bg_color);
184 const u8 *src1, *src;
185 u8 __iomem *dst1;
186 u32 __iomem *dst;
187 u32 val;
188 int x, y;
189
190 src1 = image->data;
191 dst1 = info->screen_base + (image->dy * info->fix.line_length)
192 + ((image->dx / 8) * 4);
193
194 for (y = 0; y < image->height; y++) {
195 src = src1;
196 dst = (u32 __iomem *) dst1;
197 for (x = 0; x < image->width; x += 8) {
198 val = *(src++) * 0x01010101;
199 val = (val & fg) | (~val & bg);
200 fb_writel(val, dst++);
201 }
202 src1 += image->width / 8;
203 dst1 += info->fix.line_length;
204 }
205
206}
207
208/* arkfb_iplan_fillrect silently assumes that almost everything is 8-pixel aligned */
209static void arkfb_iplan_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
210{
211 u32 fg = expand_color(rect->color);
212 u8 __iomem *dst1;
213 u32 __iomem *dst;
214 int x, y;
215
216 dst1 = info->screen_base + (rect->dy * info->fix.line_length)
217 + ((rect->dx / 8) * 4);
218
219 for (y = 0; y < rect->height; y++) {
220 dst = (u32 __iomem *) dst1;
221 for (x = 0; x < rect->width; x += 8) {
222 fb_writel(fg, dst++);
223 }
224 dst1 += info->fix.line_length;
225 }
226
227}
228
229
230/* image data is MSB-first, fb structure is high-nibble-in-low-byte-first */
231static inline u32 expand_pixel(u32 c)
232{
233 return (((c & 1) << 24) | ((c & 2) << 27) | ((c & 4) << 14) | ((c & 8) << 17) |
234 ((c & 16) << 4) | ((c & 32) << 7) | ((c & 64) >> 6) | ((c & 128) >> 3)) * 0xF;
235}
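/*
 * Editor's note (worked example, not part of the original patch): each of
 * the eight mask bits of c selects one nibble of the result, e.g.
 * expand_pixel(0x01) = (0x1 << 24) * 0xF = 0x0F000000 and
 * expand_pixel(0x80) = (0x80 >> 3) * 0xF = 0x000000F0.
 */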
236
237/* arkfb_cfb4_imageblit silently assumes that almost everything is 8-pixel aligned */
238static void arkfb_cfb4_imageblit(struct fb_info *info, const struct fb_image *image)
239{
240 u32 fg = image->fg_color * 0x11111111;
241 u32 bg = image->bg_color * 0x11111111;
242 const u8 *src1, *src;
243 u8 __iomem *dst1;
244 u32 __iomem *dst;
245 u32 val;
246 int x, y;
247
248 src1 = image->data;
249 dst1 = info->screen_base + (image->dy * info->fix.line_length)
250 + ((image->dx / 8) * 4);
251
252 for (y = 0; y < image->height; y++) {
253 src = src1;
254 dst = (u32 __iomem *) dst1;
255 for (x = 0; x < image->width; x += 8) {
256 val = expand_pixel(*(src++));
257 val = (val & fg) | (~val & bg);
258 fb_writel(val, dst++);
259 }
260 src1 += image->width / 8;
261 dst1 += info->fix.line_length;
262 }
263
264}
265
266static void arkfb_imageblit(struct fb_info *info, const struct fb_image *image)
267{
268 if ((info->var.bits_per_pixel == 4) && (image->depth == 1)
269 && ((image->width % 8) == 0) && ((image->dx % 8) == 0)) {
270 if (info->fix.type == FB_TYPE_INTERLEAVED_PLANES)
271 arkfb_iplan_imageblit(info, image);
272 else
273 arkfb_cfb4_imageblit(info, image);
274 } else
275 cfb_imageblit(info, image);
276}
277
278static void arkfb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
279{
280 if ((info->var.bits_per_pixel == 4)
281 && ((rect->width % 8) == 0) && ((rect->dx % 8) == 0)
282 && (info->fix.type == FB_TYPE_INTERLEAVED_PLANES))
283 arkfb_iplan_fillrect(info, rect);
284 else
285 cfb_fillrect(info, rect);
286}
287
288
289/* ------------------------------------------------------------------------- */
290
291
292enum
293{
294 DAC_PSEUDO8_8,
295 DAC_RGB1555_8,
296 DAC_RGB0565_8,
297 DAC_RGB0888_8,
298 DAC_RGB8888_8,
299 DAC_PSEUDO8_16,
300 DAC_RGB1555_16,
301 DAC_RGB0565_16,
302 DAC_RGB0888_16,
303 DAC_RGB8888_16,
304 DAC_MAX
305};
306
307struct dac_ops {
308 int (*dac_get_mode)(struct dac_info *info);
309 int (*dac_set_mode)(struct dac_info *info, int mode);
310 int (*dac_get_freq)(struct dac_info *info, int channel);
311 int (*dac_set_freq)(struct dac_info *info, int channel, u32 freq);
312 void (*dac_release)(struct dac_info *info);
313};
314
315typedef void (*dac_read_regs_t)(void *data, u8 *code, int count);
316typedef void (*dac_write_regs_t)(void *data, u8 *code, int count);
317
318struct dac_info
319{
320 struct dac_ops *dacops;
321 dac_read_regs_t dac_read_regs;
322 dac_write_regs_t dac_write_regs;
323 void *data;
324};
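/*
 * Editor's note (inferred from dac_read_reg()/dac_write_reg() below; the
 * original patch does not spell this out): the u8 code[] arrays passed
 * through this interface are (register index, value) pairs, one pair per
 * register access.
 */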
325
326
327static inline u8 dac_read_reg(struct dac_info *info, u8 reg)
328{
329 u8 code[2] = {reg, 0};
330 info->dac_read_regs(info->data, code, 1);
331 return code[1];
332}
333
334static inline void dac_read_regs(struct dac_info *info, u8 *code, int count)
335{
336 info->dac_read_regs(info->data, code, count);
337}
338
339static inline void dac_write_reg(struct dac_info *info, u8 reg, u8 val)
340{
341 u8 code[2] = {reg, val};
342 info->dac_write_regs(info->data, code, 1);
343}
344
345static inline void dac_write_regs(struct dac_info *info, u8 *code, int count)
346{
347 info->dac_write_regs(info->data, code, count);
348}
349
350static inline int dac_set_mode(struct dac_info *info, int mode)
351{
352 return info->dacops->dac_set_mode(info, mode);
353}
354
355static inline int dac_set_freq(struct dac_info *info, int channel, u32 freq)
356{
357 return info->dacops->dac_set_freq(info, channel, freq);
358}
359
360static inline void dac_release(struct dac_info *info)
361{
362 info->dacops->dac_release(info);
363}
364
365
366/* ------------------------------------------------------------------------- */
367
368
369/* ICS5342 DAC */
370
371struct ics5342_info
372{
373 struct dac_info dac;
374 u8 mode;
375};
376
377#define DAC_PAR(info) ((struct ics5342_info *) info)
378
379/* LSB is set to distinguish unused slots */
380static const u8 ics5342_mode_table[DAC_MAX] = {
381 [DAC_PSEUDO8_8] = 0x01, [DAC_RGB1555_8] = 0x21, [DAC_RGB0565_8] = 0x61,
382 [DAC_RGB0888_8] = 0x41, [DAC_PSEUDO8_16] = 0x11, [DAC_RGB1555_16] = 0x31,
383 [DAC_RGB0565_16] = 0x51, [DAC_RGB0888_16] = 0x91, [DAC_RGB8888_16] = 0x71
384};
385
386static int ics5342_set_mode(struct dac_info *info, int mode)
387{
388 u8 code;
389
390 if (mode >= DAC_MAX)
391 return -EINVAL;
392
393 code = ics5342_mode_table[mode];
394
395 if (! code)
396 return -EINVAL;
397
398 dac_write_reg(info, 6, code & 0xF0);
399 DAC_PAR(info)->mode = mode;
400
401 return 0;
402}
403
404static const struct svga_pll ics5342_pll = {3, 129, 3, 33, 0, 3,
405 60000, 250000, 14318};
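/*
 * Editor's note (assumption based on how svga_compute_pll() is called
 * elsewhere in this patch): the initializer fields appear to be m_min,
 * m_max, n_min, n_max, r_min, r_max, then the allowed VCO range in kHz
 * and the 14318 kHz reference clock.
 */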
406
407/* pd4 - allow only posdivider 4 (r=2) */
408static const struct svga_pll ics5342_pll_pd4 = {3, 129, 3, 33, 2, 2,
409 60000, 335000, 14318};
410
411/* 270 MHz should be the upper bound for the VCO clock according to specs,
412 but that is too restrictive in the pd4 case */
413
414static int ics5342_set_freq(struct dac_info *info, int channel, u32 freq)
415{
416 u16 m, n, r;
417
418 /* only postdivider 4 (r=2) is valid in mode DAC_PSEUDO8_16 */
419 int rv = svga_compute_pll((DAC_PAR(info)->mode == DAC_PSEUDO8_16)
420 ? &ics5342_pll_pd4 : &ics5342_pll,
421 freq, &m, &n, &r, 0);
422
423 if (rv < 0) {
424 return -EINVAL;
425 } else {
426 u8 code[6] = {4, 3, 5, m-2, 5, (n-2) | (r << 5)};
427 dac_write_regs(info, code, 3);
428 return 0;
429 }
430}
431
432static void ics5342_release(struct dac_info *info)
433{
434 ics5342_set_mode(info, DAC_PSEUDO8_8);
435 kfree(info);
436}
437
438static struct dac_ops ics5342_ops = {
439 .dac_set_mode = ics5342_set_mode,
440 .dac_set_freq = ics5342_set_freq,
441 .dac_release = ics5342_release
442};
443
444
445static struct dac_info * ics5342_init(dac_read_regs_t drr, dac_write_regs_t dwr, void *data)
446{
447 struct dac_info *info = kzalloc(sizeof(struct ics5342_info), GFP_KERNEL);
448
449 if (! info)
450 return NULL;
451
452 info->dacops = &ics5342_ops;
453 info->dac_read_regs = drr;
454 info->dac_write_regs = dwr;
455 info->data = data;
456 DAC_PAR(info)->mode = DAC_PSEUDO8_8; /* estimation */
457 return info;
458}
459
460
461/* ------------------------------------------------------------------------- */
462
463
464static unsigned short dac_regs[4] = {0x3c8, 0x3c9, 0x3c6, 0x3c7};
465
466static void ark_dac_read_regs(void *data, u8 *code, int count)
467{
468 u8 regval = vga_rseq(NULL, 0x1C);
469
470 while (count != 0)
471 {
472 vga_wseq(NULL, 0x1C, regval | ((code[0] & 4) ? 0x80 : 0));
473 code[1] = vga_r(NULL, dac_regs[code[0] & 3]);
474 count--;
475 code += 2;
476 }
477
478 vga_wseq(NULL, 0x1C, regval);
479}
480
481static void ark_dac_write_regs(void *data, u8 *code, int count)
482{
483 u8 regval = vga_rseq(NULL, 0x1C);
484
485 while (count != 0)
486 {
487 vga_wseq(NULL, 0x1C, regval | ((code[0] & 4) ? 0x80 : 0));
488 vga_w(NULL, dac_regs[code[0] & 3], code[1]);
489 count--;
490 code += 2;
491 }
492
493 vga_wseq(NULL, 0x1C, regval);
494}
495
496
497static void ark_set_pixclock(struct fb_info *info, u32 pixclock)
498{
499 struct arkfb_info *par = info->par;
500 u8 regval;
501
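	/*
	 * Editor's note (assumption based on fbdev conventions): pixclock is
	 * given in picoseconds per pixel, so 10^9 / pixclock below yields the
	 * requested dot clock in kHz, the unit the svga PLL helpers use.
	 */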
502 int rv = dac_set_freq(par->dac, 0, 1000000000 / pixclock);
503 if (rv < 0) {
504 printk(KERN_ERR "fb%d: cannot set requested pixclock, keeping old value\n", info->node);
505 return;
506 }
507
508 /* Set VGA misc register */
509 regval = vga_r(NULL, VGA_MIS_R);
510 vga_w(NULL, VGA_MIS_W, regval | VGA_MIS_ENB_PLL_LOAD);
511}
512
513
514/* Open framebuffer */
515
516static int arkfb_open(struct fb_info *info, int user)
517{
518 struct arkfb_info *par = info->par;
519
520 mutex_lock(&(par->open_lock));
521 if (par->ref_count == 0) {
522 memset(&(par->state), 0, sizeof(struct vgastate));
523 par->state.flags = VGA_SAVE_MODE | VGA_SAVE_FONTS | VGA_SAVE_CMAP;
524 par->state.num_crtc = 0x60;
525 par->state.num_seq = 0x30;
526 save_vga(&(par->state));
527 }
528
529 par->ref_count++;
530 mutex_unlock(&(par->open_lock));
531
532 return 0;
533}
534
535/* Close framebuffer */
536
537static int arkfb_release(struct fb_info *info, int user)
538{
539 struct arkfb_info *par = info->par;
540
541 mutex_lock(&(par->open_lock));
542 if (par->ref_count == 0) {
543 mutex_unlock(&(par->open_lock));
544 return -EINVAL;
545 }
546
547 if (par->ref_count == 1) {
548 restore_vga(&(par->state));
549 dac_set_mode(par->dac, DAC_PSEUDO8_8);
550 }
551
552 par->ref_count--;
553 mutex_unlock(&(par->open_lock));
554
555 return 0;
556}
557
558/* Validate passed in var */
559
560static int arkfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
561{
562 int rv, mem, step;
563
564 /* Find appropriate format */
565 rv = svga_match_format (arkfb_formats, var, NULL);
566 if (rv < 0)
567 {
568 printk(KERN_ERR "fb%d: unsupported mode requested\n", info->node);
569 return rv;
570 }
571
572 /* Do not allow real resolution larger than virtual */
573 if (var->xres > var->xres_virtual)
574 var->xres_virtual = var->xres;
575
576 if (var->yres > var->yres_virtual)
577 var->yres_virtual = var->yres;
578
579 /* Round up xres_virtual to have proper alignment of lines */
580 step = arkfb_formats[rv].xresstep - 1;
581 var->xres_virtual = (var->xres_virtual+step) & ~step;
582
583
584 /* Check whether we have enough memory */
585 mem = ((var->bits_per_pixel * var->xres_virtual) >> 3) * var->yres_virtual;
586 if (mem > info->screen_size)
587 {
588 printk(KERN_ERR "fb%d: not enough framebuffer memory (%d kB requested, %d kB available)\n", info->node, mem >> 10, (unsigned int) (info->screen_size >> 10));
589 return -EINVAL;
590 }
591
592 rv = svga_check_timings (&ark_timing_regs, var, info->node);
593 if (rv < 0)
594 {
595 printk(KERN_ERR "fb%d: invalid timings requested\n", info->node);
596 return rv;
597 }
598
599 /* Interlaced mode is broken */
600 if (var->vmode & FB_VMODE_INTERLACED)
601 return -EINVAL;
602
603 return 0;
604}
605
606/* Set video mode from par */
607
608static int arkfb_set_par(struct fb_info *info)
609{
610 struct arkfb_info *par = info->par;
611 u32 value, mode, hmul, hdiv, offset_value, screen_size;
612 u32 bpp = info->var.bits_per_pixel;
613 u8 regval;
614
615 if (bpp != 0) {
616 info->fix.ypanstep = 1;
617 info->fix.line_length = (info->var.xres_virtual * bpp) / 8;
618
619 info->flags &= ~FBINFO_MISC_TILEBLITTING;
620 info->tileops = NULL;
621
622 /* in 4bpp only 8-pixel-wide blits are supported; any width otherwise */
623 info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0);
624 info->pixmap.blit_y = ~(u32)0;
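		/*
		 * Editor's note (fbdev convention, assumed here): blit_x and
		 * blit_y are bitmaps of supported blit sizes, bit n-1 set
		 * meaning size n works, so 1 << (8 - 1) allows 8-pixel-wide
		 * blits only and ~(u32)0 allows any size.
		 */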
625
626 offset_value = (info->var.xres_virtual * bpp) / 64;
627 screen_size = info->var.yres_virtual * info->fix.line_length;
628 } else {
629 info->fix.ypanstep = 16;
630 info->fix.line_length = 0;
631
632 info->flags |= FBINFO_MISC_TILEBLITTING;
633 info->tileops = &arkfb_tile_ops;
634
635 /* supports 8x16 tiles only */
636 info->pixmap.blit_x = 1 << (8 - 1);
637 info->pixmap.blit_y = 1 << (16 - 1);
638
639 offset_value = info->var.xres_virtual / 16;
640 screen_size = (info->var.xres_virtual * info->var.yres_virtual) / 64;
641 }
642
643 info->var.xoffset = 0;
644 info->var.yoffset = 0;
645 info->var.activate = FB_ACTIVATE_NOW;
646
647 /* Unlock registers */
648 svga_wcrt_mask(0x11, 0x00, 0x80);
649
650 /* Blank screen and turn off sync */
651 svga_wseq_mask(0x01, 0x20, 0x20);
652 svga_wcrt_mask(0x17, 0x00, 0x80);
653
654 /* Set default values */
655 svga_set_default_gfx_regs();
656 svga_set_default_atc_regs();
657 svga_set_default_seq_regs();
658 svga_set_default_crt_regs();
659 svga_wcrt_multi(ark_line_compare_regs, 0xFFFFFFFF);
660 svga_wcrt_multi(ark_start_address_regs, 0);
661
662 /* ARK specific initialization */
663 svga_wseq_mask(0x10, 0x1F, 0x1F); /* enable linear framebuffer and full memory access */
664 svga_wseq_mask(0x12, 0x03, 0x03); /* 4 MB linear framebuffer size */
665
666 vga_wseq(NULL, 0x13, info->fix.smem_start >> 16);
667 vga_wseq(NULL, 0x14, info->fix.smem_start >> 24);
668 vga_wseq(NULL, 0x15, 0);
669 vga_wseq(NULL, 0x16, 0);
670
671 /* Set the FIFO threshold register */
672 /* It is a fascinating way to store a 5-bit value in an 8-bit register */
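	/*
	 * Editor's note (worked example, not part of the original patch):
	 * threshold bit 0 lands in bit 7, bits 3..1 in bits 2..0, bit 4 in
	 * bit 5, and register bit 4 is always set; the default threshold of
	 * 4 therefore gives regval = 0x10 | (4 >> 1) = 0x12.
	 */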
673 regval = 0x10 | ((threshold & 0x0E) >> 1) | (threshold & 0x01) << 7 | (threshold & 0x10) << 1;
674 vga_wseq(NULL, 0x18, regval);
675
676 /* Set the offset register */
677 pr_debug("fb%d: offset register : %d\n", info->node, offset_value);
678 svga_wcrt_multi(ark_offset_regs, offset_value);
679
680 /* fix for hi-res textmode */
681 svga_wcrt_mask(0x40, 0x08, 0x08);
682
683 if (info->var.vmode & FB_VMODE_DOUBLE)
684 svga_wcrt_mask(0x09, 0x80, 0x80);
685 else
686 svga_wcrt_mask(0x09, 0x00, 0x80);
687
688 if (info->var.vmode & FB_VMODE_INTERLACED)
689 svga_wcrt_mask(0x44, 0x04, 0x04);
690 else
691 svga_wcrt_mask(0x44, 0x00, 0x04);
692
693 hmul = 1;
694 hdiv = 1;
695 mode = svga_match_format(arkfb_formats, &(info->var), &(info->fix));
696
697 /* Set mode-specific register values */
698 switch (mode) {
699 case 0:
700 pr_debug("fb%d: text mode\n", info->node);
701 svga_set_textmode_vga_regs();
702
703 vga_wseq(NULL, 0x11, 0x10); /* basic VGA mode */
704 svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
705 dac_set_mode(par->dac, DAC_PSEUDO8_8);
706
707 break;
708 case 1:
709 pr_debug("fb%d: 4 bit pseudocolor\n", info->node);
710 vga_wgfx(NULL, VGA_GFX_MODE, 0x40);
711
712 vga_wseq(NULL, 0x11, 0x10); /* basic VGA mode */
713 svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
714 dac_set_mode(par->dac, DAC_PSEUDO8_8);
715 break;
716 case 2:
717 pr_debug("fb%d: 4 bit pseudocolor, planar\n", info->node);
718
719 vga_wseq(NULL, 0x11, 0x10); /* basic VGA mode */
720 svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
721 dac_set_mode(par->dac, DAC_PSEUDO8_8);
722 break;
723 case 3:
724 pr_debug("fb%d: 8 bit pseudocolor\n", info->node);
725
726 vga_wseq(NULL, 0x11, 0x16); /* 8bpp accel mode */
727
728 if (info->var.pixclock > 20000) {
729 pr_debug("fb%d: not using multiplex\n", info->node);
730 svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
731 dac_set_mode(par->dac, DAC_PSEUDO8_8);
732 } else {
733 pr_debug("fb%d: using multiplex\n", info->node);
734 svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
735 dac_set_mode(par->dac, DAC_PSEUDO8_16);
736 hdiv = 2;
737 }
738 break;
739 case 4:
740 pr_debug("fb%d: 5/5/5 truecolor\n", info->node);
741
742 vga_wseq(NULL, 0x11, 0x1A); /* 16bpp accel mode */
743 svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
744 dac_set_mode(par->dac, DAC_RGB1555_16);
745 break;
746 case 5:
747 pr_debug("fb%d: 5/6/5 truecolor\n", info->node);
748
749 vga_wseq(NULL, 0x11, 0x1A); /* 16bpp accel mode */
750 svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
751 dac_set_mode(par->dac, DAC_RGB0565_16);
752 break;
753 case 6:
754 pr_debug("fb%d: 8/8/8 truecolor\n", info->node);
755
756 vga_wseq(NULL, 0x11, 0x16); /* 8bpp accel mode ??? */
757 svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
758 dac_set_mode(par->dac, DAC_RGB0888_16);
759 hmul = 3;
760 hdiv = 2;
761 break;
762 case 7:
763 pr_debug("fb%d: 8/8/8/8 truecolor\n", info->node);
764
765 vga_wseq(NULL, 0x11, 0x1E); /* 32bpp accel mode */
766 svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
767 dac_set_mode(par->dac, DAC_RGB8888_16);
768 hmul = 2;
769 break;
770 default:
771 printk(KERN_ERR "fb%d: unsupported mode - bug\n", info->node);
772 return -EINVAL;
773 }
774
775 ark_set_pixclock(info, (hdiv * info->var.pixclock) / hmul);
776 svga_set_timings(&ark_timing_regs, &(info->var), hmul, hdiv,
777 (info->var.vmode & FB_VMODE_DOUBLE) ? 2 : 1,
778 (info->var.vmode & FB_VMODE_INTERLACED) ? 2 : 1,
779 hmul, info->node);
780
781 /* Set interlaced mode start/end register */
782 value = info->var.xres + info->var.left_margin + info->var.right_margin + info->var.hsync_len;
783 value = ((value * hmul / hdiv) / 8) - 5;
784 vga_wcrt(NULL, 0x42, (value + 1) / 2);
785
786 memset_io(info->screen_base, 0x00, screen_size);
787 /* Device and screen back on */
788 svga_wcrt_mask(0x17, 0x80, 0x80);
789 svga_wseq_mask(0x01, 0x00, 0x20);
790
791 return 0;
792}
793
794/* Set a colour register */
795
796static int arkfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
797 u_int transp, struct fb_info *fb)
798{
799 switch (fb->var.bits_per_pixel) {
800 case 0:
801 case 4:
802 if (regno >= 16)
803 return -EINVAL;
804
805 if ((fb->var.bits_per_pixel == 4) &&
806 (fb->var.nonstd == 0)) {
807 outb(0xF0, VGA_PEL_MSK);
808 outb(regno*16, VGA_PEL_IW);
809 } else {
810 outb(0x0F, VGA_PEL_MSK);
811 outb(regno, VGA_PEL_IW);
812 }
813 outb(red >> 10, VGA_PEL_D);
814 outb(green >> 10, VGA_PEL_D);
815 outb(blue >> 10, VGA_PEL_D);
816 break;
817 case 8:
818 if (regno >= 256)
819 return -EINVAL;
820
821 outb(0xFF, VGA_PEL_MSK);
822 outb(regno, VGA_PEL_IW);
823 outb(red >> 10, VGA_PEL_D);
824 outb(green >> 10, VGA_PEL_D);
825 outb(blue >> 10, VGA_PEL_D);
826 break;
827 case 16:
828 if (regno >= 16)
829 return 0;
830
831 if (fb->var.green.length == 5)
832 ((u32*)fb->pseudo_palette)[regno] = ((red & 0xF800) >> 1) |
833 ((green & 0xF800) >> 6) | ((blue & 0xF800) >> 11);
834 else if (fb->var.green.length == 6)
835 ((u32*)fb->pseudo_palette)[regno] = (red & 0xF800) |
836 ((green & 0xFC00) >> 5) | ((blue & 0xF800) >> 11);
837 else
838 return -EINVAL;
839 break;
840 case 24:
841 case 32:
842 if (regno >= 16)
843 return 0;
844
845 ((u32*)fb->pseudo_palette)[regno] = ((red & 0xFF00) << 8) |
846 (green & 0xFF00) | ((blue & 0xFF00) >> 8);
847 break;
848 default:
849 return -EINVAL;
850 }
851
852 return 0;
853}
854
855/* Set the display blanking state */
856
857static int arkfb_blank(int blank_mode, struct fb_info *info)
858{
859 switch (blank_mode) {
860 case FB_BLANK_UNBLANK:
861 pr_debug("fb%d: unblank\n", info->node);
862 svga_wseq_mask(0x01, 0x00, 0x20);
863 svga_wcrt_mask(0x17, 0x80, 0x80);
864 break;
865 case FB_BLANK_NORMAL:
866 pr_debug("fb%d: blank\n", info->node);
867 svga_wseq_mask(0x01, 0x20, 0x20);
868 svga_wcrt_mask(0x17, 0x80, 0x80);
869 break;
870 case FB_BLANK_POWERDOWN:
871 case FB_BLANK_HSYNC_SUSPEND:
872 case FB_BLANK_VSYNC_SUSPEND:
873 pr_debug("fb%d: sync down\n", info->node);
874 svga_wseq_mask(0x01, 0x20, 0x20);
875 svga_wcrt_mask(0x17, 0x00, 0x80);
876 break;
877 }
878 return 0;
879}
880
881
882/* Pan the display */
883
884static int arkfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
885{
886 unsigned int offset;
887
888 /* Calculate the offset */
889 if (var->bits_per_pixel == 0) {
890 offset = (var->yoffset / 16) * (var->xres_virtual / 2) + (var->xoffset / 2);
891 offset = offset >> 2;
892 } else {
893 offset = (var->yoffset * info->fix.line_length) +
894 (var->xoffset * var->bits_per_pixel / 8);
895 offset = offset >> ((var->bits_per_pixel == 4) ? 2 : 3);
896 }
897
898 /* Set the offset */
899 svga_wcrt_multi(ark_start_address_regs, offset);
900
901 return 0;
902}
903
904
905/* ------------------------------------------------------------------------- */
906
907
908/* Frame buffer operations */
909
910static struct fb_ops arkfb_ops = {
911 .owner = THIS_MODULE,
912 .fb_open = arkfb_open,
913 .fb_release = arkfb_release,
914 .fb_check_var = arkfb_check_var,
915 .fb_set_par = arkfb_set_par,
916 .fb_setcolreg = arkfb_setcolreg,
917 .fb_blank = arkfb_blank,
918 .fb_pan_display = arkfb_pan_display,
919 .fb_fillrect = arkfb_fillrect,
920 .fb_copyarea = cfb_copyarea,
921 .fb_imageblit = arkfb_imageblit,
922 .fb_get_caps = svga_get_caps,
923};
924
925
926/* ------------------------------------------------------------------------- */
927
928
929/* PCI probe */
930static int __devinit ark_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
931{
932 struct fb_info *info;
933 struct arkfb_info *par;
934 int rc;
935 u8 regval;
936
937 /* Ignore secondary VGA device because there is no VGA arbitration */
938 if (! svga_primary_device(dev)) {
939 dev_info(&(dev->dev), "ignoring secondary device\n");
940 return -ENODEV;
941 }
942
943 /* Allocate and fill driver data structure */
944 info = framebuffer_alloc(sizeof(struct arkfb_info), NULL);
945 if (! info) {
946 dev_err(&(dev->dev), "cannot allocate memory\n");
947 return -ENOMEM;
948 }
949
950 par = info->par;
951 mutex_init(&par->open_lock);
952
953 info->flags = FBINFO_PARTIAL_PAN_OK | FBINFO_HWACCEL_YPAN;
954 info->fbops = &arkfb_ops;
955
956 /* Prepare PCI device */
957 rc = pci_enable_device(dev);
958 if (rc < 0) {
959 dev_err(&(dev->dev), "cannot enable PCI device\n");
960 goto err_enable_device;
961 }
962
963 rc = pci_request_regions(dev, "arkfb");
964 if (rc < 0) {
965 dev_err(&(dev->dev), "cannot reserve framebuffer region\n");
966 goto err_request_regions;
967 }
968
969 par->dac = ics5342_init(ark_dac_read_regs, ark_dac_write_regs, info);
970 if (! par->dac) {
971 rc = -ENOMEM;
972 dev_err(&(dev->dev), "RAMDAC initialization failed\n");
973 goto err_dac;
974 }
975
976 info->fix.smem_start = pci_resource_start(dev, 0);
977 info->fix.smem_len = pci_resource_len(dev, 0);
978
979 /* Map physical IO memory address into kernel space */
980 info->screen_base = pci_iomap(dev, 0, 0);
981 if (! info->screen_base) {
982 rc = -ENOMEM;
983 dev_err(&(dev->dev), "iomap for framebuffer failed\n");
984 goto err_iomap;
985 }
986
987 /* FIXME get memsize */
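	/*
	 * Editor's note (assumption, matching the computation below): the top
	 * two bits of sequencer register 0x10 seem to encode the memory size
	 * as 1 << SR10[7:6] megabytes, i.e. 1, 2, 4 or 8 MB.
	 */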
988 regval = vga_rseq(NULL, 0x10);
989 info->screen_size = (1 << (regval >> 6)) << 20;
990 info->fix.smem_len = info->screen_size;
991
992 strcpy(info->fix.id, "ARK 2000PV");
993 info->fix.mmio_start = 0;
994 info->fix.mmio_len = 0;
995 info->fix.type = FB_TYPE_PACKED_PIXELS;
996 info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
997 info->fix.ypanstep = 0;
998 info->fix.accel = FB_ACCEL_NONE;
999 info->pseudo_palette = (void*) (par->pseudo_palette);
1000
1001 /* Prepare startup mode */
1002 rc = fb_find_mode(&(info->var), info, mode, NULL, 0, NULL, 8);
1003 if (! ((rc == 1) || (rc == 2))) {
1004 rc = -EINVAL;
1005 dev_err(&(dev->dev), "mode %s not found\n", mode);
1006 goto err_find_mode;
1007 }
1008
1009 rc = fb_alloc_cmap(&info->cmap, 256, 0);
1010 if (rc < 0) {
1011 dev_err(&(dev->dev), "cannot allocate colormap\n");
1012 goto err_alloc_cmap;
1013 }
1014
1015 rc = register_framebuffer(info);
1016 if (rc < 0) {
1017 dev_err(&(dev->dev), "cannot register framebuffer\n");
1018 goto err_reg_fb;
1019 }
1020
1021 printk(KERN_INFO "fb%d: %s on %s, %d MB RAM\n", info->node, info->fix.id,
1022 pci_name(dev), info->fix.smem_len >> 20);
1023
1024 /* Record a reference to the driver data */
1025 pci_set_drvdata(dev, info);
1026
1027#ifdef CONFIG_MTRR
1028 if (mtrr) {
1029 par->mtrr_reg = -1;
1030 par->mtrr_reg = mtrr_add(info->fix.smem_start, info->fix.smem_len, MTRR_TYPE_WRCOMB, 1);
1031 }
1032#endif
1033
1034 return 0;
1035
1036 /* Error handling */
1037err_reg_fb:
1038 fb_dealloc_cmap(&info->cmap);
1039err_alloc_cmap:
1040err_find_mode:
1041 pci_iounmap(dev, info->screen_base);
1042err_iomap:
1043 dac_release(par->dac);
1044err_dac:
1045 pci_release_regions(dev);
1046err_request_regions:
1047/* pci_disable_device(dev); */
1048err_enable_device:
1049 framebuffer_release(info);
1050 return rc;
1051}
1052
1053/* PCI remove */
1054
1055static void __devexit ark_pci_remove(struct pci_dev *dev)
1056{
1057 struct fb_info *info = pci_get_drvdata(dev);
1058 struct arkfb_info *par = info->par;
1059
1060 if (info) {
1061#ifdef CONFIG_MTRR
1062 if (par->mtrr_reg >= 0) {
1063 mtrr_del(par->mtrr_reg, 0, 0);
1064 par->mtrr_reg = -1;
1065 }
1066#endif
1067
1068 dac_release(par->dac);
1069 unregister_framebuffer(info);
1070 fb_dealloc_cmap(&info->cmap);
1071
1072 pci_iounmap(dev, info->screen_base);
1073 pci_release_regions(dev);
1074/* pci_disable_device(dev); */
1075
1076 pci_set_drvdata(dev, NULL);
1077 framebuffer_release(info);
1078 }
1079}
1080
1081
1082#ifdef CONFIG_PM
1083/* PCI suspend */
1084
1085static int ark_pci_suspend (struct pci_dev* dev, pm_message_t state)
1086{
1087 struct fb_info *info = pci_get_drvdata(dev);
1088 struct arkfb_info *par = info->par;
1089
1090 dev_info(&(dev->dev), "suspend\n");
1091
1092 acquire_console_sem();
1093 mutex_lock(&(par->open_lock));
1094
1095 if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
1096 mutex_unlock(&(par->open_lock));
1097 release_console_sem();
1098 return 0;
1099 }
1100
1101 fb_set_suspend(info, 1);
1102
1103 pci_save_state(dev);
1104 pci_disable_device(dev);
1105 pci_set_power_state(dev, pci_choose_state(dev, state));
1106
1107 mutex_unlock(&(par->open_lock));
1108 release_console_sem();
1109
1110 return 0;
1111}
1112
1113
1114/* PCI resume */
1115
1116static int ark_pci_resume (struct pci_dev* dev)
1117{
1118 struct fb_info *info = pci_get_drvdata(dev);
1119 struct arkfb_info *par = info->par;
1120
1121 dev_info(&(dev->dev), "resume\n");
1122
1123 acquire_console_sem();
1124 mutex_lock(&(par->open_lock));
1125
1126 if (par->ref_count == 0) {
1127 mutex_unlock(&(par->open_lock));
1128 release_console_sem();
1129 return 0;
1130 }
1131
1132 pci_set_power_state(dev, PCI_D0);
1133 pci_restore_state(dev);
1134
1135 if (pci_enable_device(dev))
1136 goto fail;
1137
1138 pci_set_master(dev);
1139
1140 arkfb_set_par(info);
1141 fb_set_suspend(info, 0);
1142
1143 mutex_unlock(&(par->open_lock));
1144fail:
1145 release_console_sem();
1146 return 0;
1147}
1148#else
1149#define ark_pci_suspend NULL
1150#define ark_pci_resume NULL
1151#endif /* CONFIG_PM */
1152
1153/* List of boards that we are trying to support */
1154
1155static struct pci_device_id ark_devices[] __devinitdata = {
1156 {PCI_DEVICE(0xEDD8, 0xA099)},
1157 {0, 0, 0, 0, 0, 0, 0}
1158};
1159
1160
1161MODULE_DEVICE_TABLE(pci, ark_devices);
1162
1163static struct pci_driver arkfb_pci_driver = {
1164 .name = "arkfb",
1165 .id_table = ark_devices,
1166 .probe = ark_pci_probe,
1167 .remove = __devexit_p(ark_pci_remove),
1168 .suspend = ark_pci_suspend,
1169 .resume = ark_pci_resume,
1170};
1171
1172/* Cleanup */
1173
1174static void __exit arkfb_cleanup(void)
1175{
1176 pr_debug("arkfb: cleaning up\n");
1177 pci_unregister_driver(&arkfb_pci_driver);
1178}
1179
1180/* Driver Initialisation */
1181
1182static int __init arkfb_init(void)
1183{
1184
1185#ifndef MODULE
1186 char *option = NULL;
1187
1188 if (fb_get_options("arkfb", &option))
1189 return -ENODEV;
1190
1191 if (option && *option)
1192 mode = option;
1193#endif
1194
1195 pr_debug("arkfb: initializing\n");
1196 return pci_register_driver(&arkfb_pci_driver);
1197}
1198
1199module_init(arkfb_init);
1200module_exit(arkfb_cleanup);
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 08d4e11d9121..38c2e2558f5e 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1236,6 +1236,10 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
1236 pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; 1236 pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
1237#elif defined(__arm__) || defined(__sh__) || defined(__m32r__) 1237#elif defined(__arm__) || defined(__sh__) || defined(__m32r__)
1238 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 1238 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
1239#elif defined(__avr32__)
1240 vma->vm_page_prot = __pgprot((pgprot_val(vma->vm_page_prot)
1241 & ~_PAGE_CACHABLE)
1242 | (_PAGE_BUFFER | _PAGE_DIRTY));
1239#elif defined(__ia64__) 1243#elif defined(__ia64__)
1240 if (efi_range_is_wc(vma->vm_start, vma->vm_end - vma->vm_start)) 1244 if (efi_range_is_wc(vma->vm_start, vma->vm_end - vma->vm_start))
1241 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 1245 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
diff --git a/drivers/video/nvidia/nv_hw.c b/drivers/video/nvidia/nv_hw.c
index f297c7b14a41..c627955aa124 100644
--- a/drivers/video/nvidia/nv_hw.c
+++ b/drivers/video/nvidia/nv_hw.c
@@ -149,8 +149,7 @@ static void nvGetClocks(struct nvidia_par *par, unsigned int *MClk,
149 pll = NV_RD32(par->PMC, 0x4024); 149 pll = NV_RD32(par->PMC, 0x4024);
150 M = pll & 0xFF; 150 M = pll & 0xFF;
151 N = (pll >> 8) & 0xFF; 151 N = (pll >> 8) & 0xFF;
152 if (((par->Chipset & 0xfff0) == 0x0290) || 152 if (((par->Chipset & 0xfff0) == 0x0290) || ((par->Chipset & 0xfff0) == 0x0390) || ((par->Chipset & 0xfff0) == 0x02E0)) {
153 ((par->Chipset & 0xfff0) == 0x0390)) {
154 MB = 1; 153 MB = 1;
155 NB = 1; 154 NB = 1;
156 } else { 155 } else {
@@ -963,6 +962,7 @@ void NVLoadStateExt(struct nvidia_par *par, RIVA_HW_STATE * state)
963 962
964 if (((par->Chipset & 0xfff0) == 0x0090) || 963 if (((par->Chipset & 0xfff0) == 0x0090) ||
965 ((par->Chipset & 0xfff0) == 0x01D0) || 964 ((par->Chipset & 0xfff0) == 0x01D0) ||
965 ((par->Chipset & 0xfff0) == 0x02E0) ||
966 ((par->Chipset & 0xfff0) == 0x0290)) 966 ((par->Chipset & 0xfff0) == 0x0290))
967 regions = 15; 967 regions = 15;
968 for(i = 0; i < regions; i++) { 968 for(i = 0; i < regions; i++) {
@@ -1275,6 +1275,7 @@ void NVLoadStateExt(struct nvidia_par *par, RIVA_HW_STATE * state)
1275 0x00100000); 1275 0x00100000);
1276 break; 1276 break;
1277 case 0x0090: 1277 case 0x0090:
1278 case 0x02E0:
1278 case 0x0290: 1279 case 0x0290:
1279 NV_WR32(par->PRAMDAC, 0x0608, 1280 NV_WR32(par->PRAMDAC, 0x0608,
1280 NV_RD32(par->PRAMDAC, 0x0608) | 1281 NV_RD32(par->PRAMDAC, 0x0608) |
@@ -1352,6 +1353,7 @@ void NVLoadStateExt(struct nvidia_par *par, RIVA_HW_STATE * state)
1352 } else { 1353 } else {
1353 if (((par->Chipset & 0xfff0) == 0x0090) || 1354 if (((par->Chipset & 0xfff0) == 0x0090) ||
1354 ((par->Chipset & 0xfff0) == 0x01D0) || 1355 ((par->Chipset & 0xfff0) == 0x01D0) ||
1356 ((par->Chipset & 0xfff0) == 0x02E0) ||
1355 ((par->Chipset & 0xfff0) == 0x0290)) { 1357 ((par->Chipset & 0xfff0) == 0x0290)) {
1356 for (i = 0; i < 60; i++) { 1358 for (i = 0; i < 60; i++) {
1357 NV_WR32(par->PGRAPH, 1359 NV_WR32(par->PGRAPH,
@@ -1403,6 +1405,7 @@ void NVLoadStateExt(struct nvidia_par *par, RIVA_HW_STATE * state)
1403 } else { 1405 } else {
1404 if ((par->Chipset & 0xfff0) == 0x0090 || 1406 if ((par->Chipset & 0xfff0) == 0x0090 ||
1405 (par->Chipset & 0xfff0) == 0x01D0 || 1407 (par->Chipset & 0xfff0) == 0x01D0 ||
1408 (par->Chipset & 0xfff0) == 0x02E0 ||
1406 (par->Chipset & 0xfff0) == 0x0290) { 1409 (par->Chipset & 0xfff0) == 0x0290) {
1407 NV_WR32(par->PGRAPH, 0x0DF0, 1410 NV_WR32(par->PGRAPH, 0x0DF0,
1408 NV_RD32(par->PFB, 0x0200)); 1411 NV_RD32(par->PFB, 0x0200));
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 7c36b5fe582e..f85edf084da3 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -1243,6 +1243,7 @@ static u32 __devinit nvidia_get_arch(struct fb_info *info)
1243 case 0x0140: /* GeForce 6600 */ 1243 case 0x0140: /* GeForce 6600 */
1244 case 0x0160: /* GeForce 6200 */ 1244 case 0x0160: /* GeForce 6200 */
1245 case 0x01D0: /* GeForce 7200, 7300, 7400 */ 1245 case 0x01D0: /* GeForce 7200, 7300, 7400 */
1246 case 0x02E0: /* GeForce 7300 GT */
1246 case 0x0090: /* GeForce 7800 */ 1247 case 0x0090: /* GeForce 7800 */
1247 case 0x0210: /* GeForce 6800 */ 1248 case 0x0210: /* GeForce 6800 */
1248 case 0x0220: /* GeForce 6200 */ 1249 case 0x0220: /* GeForce 6200 */
diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c
index 756fafb41d78..d11735895a01 100644
--- a/drivers/video/s3fb.c
+++ b/drivers/video/s3fb.c
@@ -796,23 +796,6 @@ static int s3fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
 	return 0;
 }
 
-/* Get capabilities of accelerator based on the mode */
-
-static void s3fb_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
-			  struct fb_var_screeninfo *var)
-{
-	if (var->bits_per_pixel == 0) {
-		/* can only support 256 8x16 bitmap */
-		caps->x = 1 << (8 - 1);
-		caps->y = 1 << (16 - 1);
-		caps->len = 256;
-	} else {
-		caps->x = ~(u32)0;
-		caps->y = ~(u32)0;
-		caps->len = ~(u32)0;
-	}
-}
-
 /* ------------------------------------------------------------------------- */
 
 /* Frame buffer operations */
@@ -829,7 +812,7 @@ static struct fb_ops s3fb_ops = {
 	.fb_fillrect	= s3fb_fillrect,
 	.fb_copyarea	= cfb_copyarea,
 	.fb_imageblit	= s3fb_imageblit,
-	.fb_get_caps	= s3fb_get_caps,
+	.fb_get_caps	= svga_get_caps,
 };
 
 /* ------------------------------------------------------------------------- */
diff --git a/drivers/video/svgalib.c b/drivers/video/svgalib.c
index 079cdc911e48..25df928d37d8 100644
--- a/drivers/video/svgalib.c
+++ b/drivers/video/svgalib.c
@@ -347,6 +347,23 @@ int svga_get_tilemax(struct fb_info *info)
 	return 256;
 }
 
+/* Get capabilities of accelerator based on the mode */
+
+void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
+		   struct fb_var_screeninfo *var)
+{
+	if (var->bits_per_pixel == 0) {
+		/* can only support 256 8x16 bitmap */
+		caps->x = 1 << (8 - 1);
+		caps->y = 1 << (16 - 1);
+		caps->len = 256;
+	} else {
+		caps->x = (var->bits_per_pixel == 4) ? 1 << (8 - 1) : ~(u32)0;
+		caps->y = ~(u32)0;
+		caps->len = ~(u32)0;
+	}
+}
+EXPORT_SYMBOL(svga_get_caps);
 
 /* ------------------------------------------------------------------------- */
 
diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c
new file mode 100644
index 000000000000..5e9755e464a1
--- /dev/null
+++ b/drivers/video/vt8623fb.c
@@ -0,0 +1,927 @@
1/*
2 * linux/drivers/video/vt8623fb.c - fbdev driver for
3 * integrated graphic core in VIA VT8623 [CLE266] chipset
4 *
5 * Copyright (c) 2006-2007 Ondrej Zajicek <santiago@crfreenet.org>
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file COPYING in the main directory of this archive for
9 * more details.
10 *
11 * Code is based on s3fb, some parts are from David Boucher's viafb
12 * (http://davesdomain.org.uk/viafb/)
13 */
14
15#include <linux/version.h>
16#include <linux/module.h>
17#include <linux/kernel.h>
18#include <linux/errno.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/tty.h>
22#include <linux/slab.h>
23#include <linux/delay.h>
24#include <linux/fb.h>
25#include <linux/svga.h>
26#include <linux/init.h>
27#include <linux/pci.h>
28#include <linux/console.h> /* Why should an fb driver call console functions? Because it needs acquire_console_sem() */
29#include <video/vga.h>
30
31#ifdef CONFIG_MTRR
32#include <asm/mtrr.h>
33#endif
34
35struct vt8623fb_info {
36 char __iomem *mmio_base;
37 int mtrr_reg;
38 struct vgastate state;
39 struct mutex open_lock;
40 unsigned int ref_count;
41 u32 pseudo_palette[16];
42};
43
44
45
46/* ------------------------------------------------------------------------- */
47
48static const struct svga_fb_format vt8623fb_formats[] = {
49 { 0, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
50 FB_TYPE_TEXT, FB_AUX_TEXT_SVGA_STEP8, FB_VISUAL_PSEUDOCOLOR, 16, 16},
51 { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
52 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 16, 16},
53 { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 1,
54 FB_TYPE_INTERLEAVED_PLANES, 1, FB_VISUAL_PSEUDOCOLOR, 16, 16},
55 { 8, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
56 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 8, 8},
57/* {16, {10, 5, 0}, {5, 5, 0}, {0, 5, 0}, {0, 0, 0}, 0,
58 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4}, */
59 {16, {11, 5, 0}, {5, 6, 0}, {0, 5, 0}, {0, 0, 0}, 0,
60 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4},
61 {32, {16, 8, 0}, {8, 8, 0}, {0, 8, 0}, {0, 0, 0}, 0,
62 FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 2, 2},
63 SVGA_FORMAT_END
64};
65
66static const struct svga_pll vt8623_pll = {2, 127, 2, 7, 0, 3,
67 60000, 300000, 14318};
68
69/* CRT timing register sets */
70
71struct vga_regset vt8623_h_total_regs[] = {{0x00, 0, 7}, {0x36, 3, 3}, VGA_REGSET_END};
72struct vga_regset vt8623_h_display_regs[] = {{0x01, 0, 7}, VGA_REGSET_END};
73struct vga_regset vt8623_h_blank_start_regs[] = {{0x02, 0, 7}, VGA_REGSET_END};
74struct vga_regset vt8623_h_blank_end_regs[] = {{0x03, 0, 4}, {0x05, 7, 7}, {0x33, 5, 5}, VGA_REGSET_END};
75struct vga_regset vt8623_h_sync_start_regs[] = {{0x04, 0, 7}, {0x33, 4, 4}, VGA_REGSET_END};
76struct vga_regset vt8623_h_sync_end_regs[] = {{0x05, 0, 4}, VGA_REGSET_END};
77
78struct vga_regset vt8623_v_total_regs[] = {{0x06, 0, 7}, {0x07, 0, 0}, {0x07, 5, 5}, {0x35, 0, 0}, VGA_REGSET_END};
79struct vga_regset vt8623_v_display_regs[] = {{0x12, 0, 7}, {0x07, 1, 1}, {0x07, 6, 6}, {0x35, 2, 2}, VGA_REGSET_END};
80struct vga_regset vt8623_v_blank_start_regs[] = {{0x15, 0, 7}, {0x07, 3, 3}, {0x09, 5, 5}, {0x35, 3, 3}, VGA_REGSET_END};
81struct vga_regset vt8623_v_blank_end_regs[] = {{0x16, 0, 7}, VGA_REGSET_END};
82struct vga_regset vt8623_v_sync_start_regs[] = {{0x10, 0, 7}, {0x07, 2, 2}, {0x07, 7, 7}, {0x35, 1, 1}, VGA_REGSET_END};
83struct vga_regset vt8623_v_sync_end_regs[] = {{0x11, 0, 3}, VGA_REGSET_END};
84
85struct vga_regset vt8623_offset_regs[] = {{0x13, 0, 7}, {0x35, 5, 7}, VGA_REGSET_END};
86struct vga_regset vt8623_line_compare_regs[] = {{0x18, 0, 7}, {0x07, 4, 4}, {0x09, 6, 6}, {0x33, 0, 2}, {0x35, 4, 4}, VGA_REGSET_END};
87struct vga_regset vt8623_fetch_count_regs[] = {{0x1C, 0, 7}, {0x1D, 0, 1}, VGA_REGSET_END};
88struct vga_regset vt8623_start_address_regs[] = {{0x0d, 0, 7}, {0x0c, 0, 7}, {0x34, 0, 7}, {0x48, 0, 1}, VGA_REGSET_END};
89
90struct svga_timing_regs vt8623_timing_regs = {
91 vt8623_h_total_regs, vt8623_h_display_regs, vt8623_h_blank_start_regs,
92 vt8623_h_blank_end_regs, vt8623_h_sync_start_regs, vt8623_h_sync_end_regs,
93 vt8623_v_total_regs, vt8623_v_display_regs, vt8623_v_blank_start_regs,
94 vt8623_v_blank_end_regs, vt8623_v_sync_start_regs, vt8623_v_sync_end_regs,
95};
96
97
98/* ------------------------------------------------------------------------- */
99
100
101/* Module parameters */
102
103static char *mode = "640x480-8@60";
104
105#ifdef CONFIG_MTRR
106static int mtrr = 1;
107#endif
108
109MODULE_AUTHOR("(c) 2006 Ondrej Zajicek <santiago@crfreenet.org>");
110MODULE_LICENSE("GPL");
111MODULE_DESCRIPTION("fbdev driver for integrated graphics core in VIA VT8623 [CLE266]");
112
113module_param(mode, charp, 0644);
114MODULE_PARM_DESC(mode, "Default video mode ('640x480-8@60', etc)");
115
116#ifdef CONFIG_MTRR
117module_param(mtrr, int, 0444);
118MODULE_PARM_DESC(mtrr, "Enable write-combining with MTRR (1=enable, 0=disable, default=1)");
119#endif
120
121
122/* ------------------------------------------------------------------------- */
123
124
125static struct fb_tile_ops vt8623fb_tile_ops = {
126 .fb_settile = svga_settile,
127 .fb_tilecopy = svga_tilecopy,
128 .fb_tilefill = svga_tilefill,
129 .fb_tileblit = svga_tileblit,
130 .fb_tilecursor = svga_tilecursor,
131 .fb_get_tilemax = svga_get_tilemax,
132};
133
134
135/* ------------------------------------------------------------------------- */
136
137
138/* image data is MSB-first, fb structure is MSB-first too */
139static inline u32 expand_color(u32 c)
140{
141 return ((c & 1) | ((c & 2) << 7) | ((c & 4) << 14) | ((c & 8) << 21)) * 0xFF;
142}
143
144/* vt8623fb_iplan_imageblit silently assumes that almost everything is 8-pixel aligned */
145static void vt8623fb_iplan_imageblit(struct fb_info *info, const struct fb_image *image)
146{
147 u32 fg = expand_color(image->fg_color);
148 u32 bg = expand_color(image->bg_color);
149 const u8 *src1, *src;
150 u8 __iomem *dst1;
151 u32 __iomem *dst;
152 u32 val;
153 int x, y;
154
155 src1 = image->data;
156 dst1 = info->screen_base + (image->dy * info->fix.line_length)
157 + ((image->dx / 8) * 4);
158
159 for (y = 0; y < image->height; y++) {
160 src = src1;
161 dst = (u32 __iomem *) dst1;
162 for (x = 0; x < image->width; x += 8) {
163 val = *(src++) * 0x01010101;
164 val = (val & fg) | (~val & bg);
165 fb_writel(val, dst++);
166 }
167 src1 += image->width / 8;
168 dst1 += info->fix.line_length;
169 }
170}
171
172/* vt8623fb_iplan_fillrect silently assumes that almost everything is 8-pixel aligned */
173static void vt8623fb_iplan_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
174{
175 u32 fg = expand_color(rect->color);
176 u8 __iomem *dst1;
177 u32 __iomem *dst;
178 int x, y;
179
180 dst1 = info->screen_base + (rect->dy * info->fix.line_length)
181 + ((rect->dx / 8) * 4);
182
183 for (y = 0; y < rect->height; y++) {
184 dst = (u32 __iomem *) dst1;
185 for (x = 0; x < rect->width; x += 8) {
186 fb_writel(fg, dst++);
187 }
188 dst1 += info->fix.line_length;
189 }
190}
191
192
193/* image data is MSB-first, fb structure is high-nibble-in-low-byte-first */
194static inline u32 expand_pixel(u32 c)
195{
196 return (((c & 1) << 24) | ((c & 2) << 27) | ((c & 4) << 14) | ((c & 8) << 17) |
197 ((c & 16) << 4) | ((c & 32) << 7) | ((c & 64) >> 6) | ((c & 128) >> 3)) * 0xF;
198}
199
200/* vt8623fb_cfb4_imageblit silently assumes that almost everything is 8-pixel aligned */
201static void vt8623fb_cfb4_imageblit(struct fb_info *info, const struct fb_image *image)
202{
203 u32 fg = image->fg_color * 0x11111111;
204 u32 bg = image->bg_color * 0x11111111;
205 const u8 *src1, *src;
206 u8 __iomem *dst1;
207 u32 __iomem *dst;
208 u32 val;
209 int x, y;
210
211 src1 = image->data;
212 dst1 = info->screen_base + (image->dy * info->fix.line_length)
213 + ((image->dx / 8) * 4);
214
215 for (y = 0; y < image->height; y++) {
216 src = src1;
217 dst = (u32 __iomem *) dst1;
218 for (x = 0; x < image->width; x += 8) {
219 val = expand_pixel(*(src++));
220 val = (val & fg) | (~val & bg);
221 fb_writel(val, dst++);
222 }
223 src1 += image->width / 8;
224 dst1 += info->fix.line_length;
225 }
226}
227
228static void vt8623fb_imageblit(struct fb_info *info, const struct fb_image *image)
229{
230 if ((info->var.bits_per_pixel == 4) && (image->depth == 1)
231 && ((image->width % 8) == 0) && ((image->dx % 8) == 0)) {
232 if (info->fix.type == FB_TYPE_INTERLEAVED_PLANES)
233 vt8623fb_iplan_imageblit(info, image);
234 else
235 vt8623fb_cfb4_imageblit(info, image);
236 } else
237 cfb_imageblit(info, image);
238}
239
240static void vt8623fb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
241{
242 if ((info->var.bits_per_pixel == 4)
243 && ((rect->width % 8) == 0) && ((rect->dx % 8) == 0)
244 && (info->fix.type == FB_TYPE_INTERLEAVED_PLANES))
245 vt8623fb_iplan_fillrect(info, rect);
246 else
247 cfb_fillrect(info, rect);
248}
249
250
251/* ------------------------------------------------------------------------- */
252
253
254static void vt8623_set_pixclock(struct fb_info *info, u32 pixclock)
255{
256 u16 m, n, r;
257 u8 regval;
258 int rv;
259
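/* var->pixclock is in picoseconds per pixel, so 10^9 / pixclock is the
   dot clock in kHz (the unit svga_compute_pll() appears to expect) */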
260 rv = svga_compute_pll(&vt8623_pll, 1000000000 / pixclock, &m, &n, &r, info->node);
261 if (rv < 0) {
262 printk(KERN_ERR "fb%d: cannot set requested pixclock, keeping old value\n", info->node);
263 return;
264 }
265
266 /* Set VGA misc register */
267 regval = vga_r(NULL, VGA_MIS_R);
268 vga_w(NULL, VGA_MIS_W, regval | VGA_MIS_ENB_PLL_LOAD);
269
270 /* Set clock registers */
271 vga_wseq(NULL, 0x46, (n | (r << 6)));
272 vga_wseq(NULL, 0x47, m);
273
274 udelay(1000);
275
276 /* PLL reset */
277 svga_wseq_mask(0x40, 0x02, 0x02);
278 svga_wseq_mask(0x40, 0x00, 0x02);
279}
280
281
282static int vt8623fb_open(struct fb_info *info, int user)
283{
284 struct vt8623fb_info *par = info->par;
285
286 mutex_lock(&(par->open_lock));
287 if (par->ref_count == 0) {
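/* on first open, save the VGA state -- including the extended
   CRT/sequencer register ranges below -- so that restore_vga() can
   bring it back when the last user closes the device */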
288 memset(&(par->state), 0, sizeof(struct vgastate));
289 par->state.flags = VGA_SAVE_MODE | VGA_SAVE_FONTS | VGA_SAVE_CMAP;
290 par->state.num_crtc = 0xA2;
291 par->state.num_seq = 0x50;
292 save_vga(&(par->state));
293 }
294
295 par->ref_count++;
296 mutex_unlock(&(par->open_lock));
297
298 return 0;
299}
300
301static int vt8623fb_release(struct fb_info *info, int user)
302{
303 struct vt8623fb_info *par = info->par;
304
305 mutex_lock(&(par->open_lock));
306 if (par->ref_count == 0) {
307 mutex_unlock(&(par->open_lock));
308 return -EINVAL;
309 }
310
311 if (par->ref_count == 1)
312 restore_vga(&(par->state));
313
314 par->ref_count--;
315 mutex_unlock(&(par->open_lock));
316
317 return 0;
318}
319
320static int vt8623fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
321{
322 int rv, mem, step;
323
324 /* Find appropriate format */
325 rv = svga_match_format(vt8623fb_formats, var, NULL);
326 if (rv < 0)
327 {
328 printk(KERN_ERR "fb%d: unsupported mode requested\n", info->node);
329 return rv;
330 }
331
332 /* Do not allow the real resolution to be larger than the virtual one */
333 if (var->xres > var->xres_virtual)
334 var->xres_virtual = var->xres;
335
336 if (var->yres > var->yres_virtual)
337 var->yres_virtual = var->yres;
338
339 /* Round up xres_virtual to have proper alignment of lines */
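/* e.g. with an 8-pixel step: step = 7, and (1366 + 7) & ~7 rounds
   1366 up to 1368 */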
340 step = vt8623fb_formats[rv].xresstep - 1;
341 var->xres_virtual = (var->xres_virtual+step) & ~step;
342
343 /* Check whether there is enough memory */
344 mem = ((var->bits_per_pixel * var->xres_virtual) >> 3) * var->yres_virtual;
345 if (mem > info->screen_size)
346 {
347 printk(KERN_ERR "fb%d: not enough framebuffer memory (%d kB requested, %d kB available)\n", info->node, mem >> 10, (unsigned int) (info->screen_size >> 10));
348 return -EINVAL;
349 }
350
351 /* Text mode is limited to 256 kB of memory */
352 if ((var->bits_per_pixel == 0) && (mem > (256*1024)))
353 {
354 printk(KERN_ERR "fb%d: text framebuffer size too large (%d kB requested, 256 kB possible)\n", info->node, mem >> 10);
355 return -EINVAL;
356 }
357
358 rv = svga_check_timings(&vt8623_timing_regs, var, info->node);
359 if (rv < 0)
360 {
361 printk(KERN_ERR "fb%d: invalid timings requested\n", info->node);
362 return rv;
363 }
364
365 /* Interlaced mode not supported */
366 if (var->vmode & FB_VMODE_INTERLACED)
367 return -EINVAL;
368
369 return 0;
370}
371
372
373static int vt8623fb_set_par(struct fb_info *info)
374{
375 u32 mode, offset_value, fetch_value, screen_size;
376 u32 bpp = info->var.bits_per_pixel;
377
378 if (bpp != 0) {
379 info->fix.ypanstep = 1;
380 info->fix.line_length = (info->var.xres_virtual * bpp) / 8;
381
382 info->flags &= ~FBINFO_MISC_TILEBLITTING;
383 info->tileops = NULL;
384
385 /* 4 bpp supports 8-pixel wide blits only; any width otherwise */
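/* blit_x/blit_y are bitmaps of permitted blit sizes: bit n set means a
   width/height of n+1 pixels is allowed, so 1 << (8 - 1) permits
   8-pixel-wide blits only */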
386 info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0);
387 info->pixmap.blit_y = ~(u32)0;
388
389 offset_value = (info->var.xres_virtual * bpp) / 64;
390 fetch_value = ((info->var.xres * bpp) / 128) + 4;
391
392 if (bpp == 4)
393 fetch_value = (info->var.xres / 8) + 8; /* + 0 is OK */
394
395 screen_size = info->var.yres_virtual * info->fix.line_length;
396 } else {
397 info->fix.ypanstep = 16;
398 info->fix.line_length = 0;
399
400 info->flags |= FBINFO_MISC_TILEBLITTING;
401 info->tileops = &vt8623fb_tile_ops;
402
403 /* supports 8x16 tiles only */
404 info->pixmap.blit_x = 1 << (8 - 1);
405 info->pixmap.blit_y = 1 << (16 - 1);
406
407 offset_value = info->var.xres_virtual / 16;
408 fetch_value = (info->var.xres / 8) + 8;
409 screen_size = (info->var.xres_virtual * info->var.yres_virtual) / 64;
410 }
411
412 info->var.xoffset = 0;
413 info->var.yoffset = 0;
414 info->var.activate = FB_ACTIVATE_NOW;
415
416 /* Unlock registers */
417 svga_wseq_mask(0x10, 0x01, 0x01);
418 svga_wcrt_mask(0x11, 0x00, 0x80);
419 svga_wcrt_mask(0x47, 0x00, 0x01);
420
421 /* Device, screen and sync off */
422 svga_wseq_mask(0x01, 0x20, 0x20);
423 svga_wcrt_mask(0x36, 0x30, 0x30);
424 svga_wcrt_mask(0x17, 0x00, 0x80);
425
426 /* Set default values */
427 svga_set_default_gfx_regs();
428 svga_set_default_atc_regs();
429 svga_set_default_seq_regs();
430 svga_set_default_crt_regs();
431 svga_wcrt_multi(vt8623_line_compare_regs, 0xFFFFFFFF);
432 svga_wcrt_multi(vt8623_start_address_regs, 0);
433
434 svga_wcrt_multi(vt8623_offset_regs, offset_value);
435 svga_wseq_multi(vt8623_fetch_count_regs, fetch_value);
436
437 if (info->var.vmode & FB_VMODE_DOUBLE)
438 svga_wcrt_mask(0x09, 0x80, 0x80);
439 else
440 svga_wcrt_mask(0x09, 0x00, 0x80);
441
442 svga_wseq_mask(0x1E, 0xF0, 0xF0); // DI/DVP bus
443 svga_wseq_mask(0x2A, 0x0F, 0x0F); // DI/DVP bus
444 svga_wseq_mask(0x16, 0x08, 0xBF); // FIFO read threshold
445 vga_wseq(NULL, 0x17, 0x1F); // FIFO depth
446 vga_wseq(NULL, 0x18, 0x4E);
447 svga_wseq_mask(0x1A, 0x08, 0x08); // enable MMIO ?
448
449 vga_wcrt(NULL, 0x32, 0x00);
450 vga_wcrt(NULL, 0x34, 0x00);
451 vga_wcrt(NULL, 0x6A, 0x80);
452 vga_wcrt(NULL, 0x6A, 0xC0);
453
454 vga_wgfx(NULL, 0x20, 0x00);
455 vga_wgfx(NULL, 0x21, 0x00);
456 vga_wgfx(NULL, 0x22, 0x00);
457
458 /* Set SR15 according to number of bits per pixel */
459 mode = svga_match_format(vt8623fb_formats, &(info->var), &(info->fix));
460 switch (mode) {
461 case 0:
462 pr_debug("fb%d: text mode\n", info->node);
463 svga_set_textmode_vga_regs();
464 svga_wseq_mask(0x15, 0x00, 0xFE);
465 svga_wcrt_mask(0x11, 0x60, 0x70);
466 break;
467 case 1:
468 pr_debug("fb%d: 4 bit pseudocolor\n", info->node);
469 vga_wgfx(NULL, VGA_GFX_MODE, 0x40);
470 svga_wseq_mask(0x15, 0x20, 0xFE);
471 svga_wcrt_mask(0x11, 0x00, 0x70);
472 break;
473 case 2:
474 pr_debug("fb%d: 4 bit pseudocolor, planar\n", info->node);
475 svga_wseq_mask(0x15, 0x00, 0xFE);
476 svga_wcrt_mask(0x11, 0x00, 0x70);
477 break;
478 case 3:
479 pr_debug("fb%d: 8 bit pseudocolor\n", info->node);
480 svga_wseq_mask(0x15, 0x22, 0xFE);
481 break;
482 case 4:
483 pr_debug("fb%d: 5/6/5 truecolor\n", info->node);
484 svga_wseq_mask(0x15, 0xB6, 0xFE);
485 break;
486 case 5:
487 pr_debug("fb%d: 8/8/8 truecolor\n", info->node);
488 svga_wseq_mask(0x15, 0xAE, 0xFE);
489 break;
490 default:
491 printk(KERN_ERR "vt8623fb: unsupported mode - bug\n");
492 return -EINVAL;
493 }
494
495 vt8623_set_pixclock(info, info->var.pixclock);
496 svga_set_timings(&vt8623_timing_regs, &(info->var), 1, 1,
497 (info->var.vmode & FB_VMODE_DOUBLE) ? 2 : 1, 1,
498 1, info->node);
499
500 memset_io(info->screen_base, 0x00, screen_size);
501
502 /* Device and screen back on */
503 svga_wcrt_mask(0x17, 0x80, 0x80);
504 svga_wcrt_mask(0x36, 0x00, 0x30);
505 svga_wseq_mask(0x01, 0x00, 0x20);
506
507 return 0;
508}
509
510
511static int vt8623fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
512 u_int transp, struct fb_info *fb)
513{
514 switch (fb->var.bits_per_pixel) {
515 case 0:
516 case 4:
517 if (regno >= 16)
518 return -EINVAL;
519
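/* the VGA palette DAC takes 6-bit components; shifting the 16-bit
   values right by 10 keeps the top six bits */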
520 outb(0x0F, VGA_PEL_MSK);
521 outb(regno, VGA_PEL_IW);
522 outb(red >> 10, VGA_PEL_D);
523 outb(green >> 10, VGA_PEL_D);
524 outb(blue >> 10, VGA_PEL_D);
525 break;
526 case 8:
527 if (regno >= 256)
528 return -EINVAL;
529
530 outb(0xFF, VGA_PEL_MSK);
531 outb(regno, VGA_PEL_IW);
532 outb(red >> 10, VGA_PEL_D);
533 outb(green >> 10, VGA_PEL_D);
534 outb(blue >> 10, VGA_PEL_D);
535 break;
536 case 16:
537 if (regno >= 16)
538 return 0;
539
540 if (fb->var.green.length == 5)
541 ((u32*)fb->pseudo_palette)[regno] = ((red & 0xF800) >> 1) |
542 ((green & 0xF800) >> 6) | ((blue & 0xF800) >> 11);
543 else if (fb->var.green.length == 6)
544 ((u32*)fb->pseudo_palette)[regno] = (red & 0xF800) |
545 ((green & 0xFC00) >> 5) | ((blue & 0xF800) >> 11);
546 else
547 return -EINVAL;
548 break;
549 case 24:
550 case 32:
551 if (regno >= 16)
552 return 0;
553
554 /* ((transp & 0xFF00) << 16) */
555 ((u32*)fb->pseudo_palette)[regno] = ((red & 0xFF00) << 8) |
556 (green & 0xFF00) | ((blue & 0xFF00) >> 8);
557 break;
558 default:
559 return -EINVAL;
560 }
561
562 return 0;
563}
564
565
566static int vt8623fb_blank(int blank_mode, struct fb_info *info)
567{
568 switch (blank_mode) {
569 case FB_BLANK_UNBLANK:
570 pr_debug("fb%d: unblank\n", info->node);
571 svga_wcrt_mask(0x36, 0x00, 0x30);
572 svga_wseq_mask(0x01, 0x00, 0x20);
573 break;
574 case FB_BLANK_NORMAL:
575 pr_debug("fb%d: blank\n", info->node);
576 svga_wcrt_mask(0x36, 0x00, 0x30);
577 svga_wseq_mask(0x01, 0x20, 0x20);
578 break;
579 case FB_BLANK_HSYNC_SUSPEND:
580 pr_debug("fb%d: DPMS standby (hsync off)\n", info->node);
581 svga_wcrt_mask(0x36, 0x10, 0x30);
582 svga_wseq_mask(0x01, 0x20, 0x20);
583 break;
584 case FB_BLANK_VSYNC_SUSPEND:
585 pr_debug("fb%d: DPMS suspend (vsync off)\n", info->node);
586 svga_wcrt_mask(0x36, 0x20, 0x30);
587 svga_wseq_mask(0x01, 0x20, 0x20);
588 break;
589 case FB_BLANK_POWERDOWN:
590 pr_debug("fb%d: DPMS off (no sync)\n", info->node);
591 svga_wcrt_mask(0x36, 0x30, 0x30);
592 svga_wseq_mask(0x01, 0x20, 0x20);
593 break;
594 }
595
596 return 0;
597}
598
599
600static int vt8623fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
601{
602 unsigned int offset;
603
604 /* Calculate the offset */
605 if (var->bits_per_pixel == 0) {
606 offset = (var->yoffset / 16) * var->xres_virtual + var->xoffset;
607 offset = offset >> 3;
608 } else {
609 offset = (var->yoffset * info->fix.line_length) +
610 (var->xoffset * var->bits_per_pixel / 8);
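/* the CRTC start address appears to be counted in 4-byte units at
   4 bpp and 2-byte units otherwise, hence the extra shift */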
611 offset = offset >> ((var->bits_per_pixel == 4) ? 2 : 1);
612 }
613
614 /* Set the offset */
615 svga_wcrt_multi(vt8623_start_address_regs, offset);
616
617 return 0;
618}
619
620
621/* ------------------------------------------------------------------------- */
622
623
624/* Frame buffer operations */
625
626static struct fb_ops vt8623fb_ops = {
627 .owner = THIS_MODULE,
628 .fb_open = vt8623fb_open,
629 .fb_release = vt8623fb_release,
630 .fb_check_var = vt8623fb_check_var,
631 .fb_set_par = vt8623fb_set_par,
632 .fb_setcolreg = vt8623fb_setcolreg,
633 .fb_blank = vt8623fb_blank,
634 .fb_pan_display = vt8623fb_pan_display,
635 .fb_fillrect = vt8623fb_fillrect,
636 .fb_copyarea = cfb_copyarea,
637 .fb_imageblit = vt8623fb_imageblit,
638 .fb_get_caps = svga_get_caps,
639};
640
641
642/* PCI probe */
643
644static int __devinit vt8623_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
645{
646 struct fb_info *info;
647 struct vt8623fb_info *par;
648 unsigned int memsize1, memsize2;
649 int rc;
650
651 /* Ignore secondary VGA device because there is no VGA arbitration */
652 if (!svga_primary_device(dev)) {
653 dev_info(&(dev->dev), "ignoring secondary device\n");
654 return -ENODEV;
655 }
656
657 /* Allocate and fill driver data structure */
658 info = framebuffer_alloc(sizeof(struct vt8623fb_info), NULL);
659 if (!info) {
660 dev_err(&(dev->dev), "cannot allocate memory\n");
661 return -ENOMEM;
662 }
663
664 par = info->par;
665 mutex_init(&par->open_lock);
666
667 info->flags = FBINFO_PARTIAL_PAN_OK | FBINFO_HWACCEL_YPAN;
668 info->fbops = &vt8623fb_ops;
669
670 /* Prepare PCI device */
671
672 rc = pci_enable_device(dev);
673 if (rc < 0) {
674 dev_err(&(dev->dev), "cannot enable PCI device\n");
675 goto err_enable_device;
676 }
677
678 rc = pci_request_regions(dev, "vt8623fb");
679 if (rc < 0) {
680 dev_err(&(dev->dev), "cannot reserve framebuffer region\n");
681 goto err_request_regions;
682 }
683
684 info->fix.smem_start = pci_resource_start(dev, 0);
685 info->fix.smem_len = pci_resource_len(dev, 0);
686 info->fix.mmio_start = pci_resource_start(dev, 1);
687 info->fix.mmio_len = pci_resource_len(dev, 1);
688
689 /* Map physical IO memory address into kernel space */
690 info->screen_base = pci_iomap(dev, 0, 0);
691 if (!info->screen_base) {
692 rc = -ENOMEM;
693 dev_err(&(dev->dev), "iomap for framebuffer failed\n");
694 goto err_iomap_1;
695 }
696
697 par->mmio_base = pci_iomap(dev, 1, 0);
698 if (!par->mmio_base) {
699 rc = -ENOMEM;
700 dev_err(&(dev->dev), "iomap for MMIO failed\n");
701 goto err_iomap_2;
702 }
703
704 /* Find out how much physical memory there is on the card */
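/* SR34 and SR39 appear to encode the same size in different units
   (half-megabytes and 4 MB steps respectively); requiring them to
   agree guards against a bad readout */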
705 memsize1 = (vga_rseq(NULL, 0x34) + 1) >> 1;
706 memsize2 = vga_rseq(NULL, 0x39) << 2;
707
708 if ((16 <= memsize1) && (memsize1 <= 64) && (memsize1 == memsize2))
709 info->screen_size = memsize1 << 20;
710 else {
711 dev_err(&(dev->dev), "memory size detection failed (%x %x), suppose 16 MB\n", memsize1, memsize2);
712 info->screen_size = 16 << 20;
713 }
714
715 info->fix.smem_len = info->screen_size;
716 strcpy(info->fix.id, "VIA VT8623");
717 info->fix.type = FB_TYPE_PACKED_PIXELS;
718 info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
719 info->fix.ypanstep = 0;
720 info->fix.accel = FB_ACCEL_NONE;
721 info->pseudo_palette = (void*)par->pseudo_palette;
722
723 /* Prepare startup mode */
724
725 rc = fb_find_mode(&(info->var), info, mode, NULL, 0, NULL, 8);
726 if (!((rc == 1) || (rc == 2))) {
727 rc = -EINVAL;
728 dev_err(&(dev->dev), "mode %s not found\n", mode);
729 goto err_find_mode;
730 }
731
732 rc = fb_alloc_cmap(&info->cmap, 256, 0);
733 if (rc < 0) {
734 dev_err(&(dev->dev), "cannot allocate colormap\n");
735 goto err_alloc_cmap;
736 }
737
738 rc = register_framebuffer(info);
739 if (rc < 0) {
740 dev_err(&(dev->dev), "cannot register framebugger\n");
741 goto err_reg_fb;
742 }
743
744 printk(KERN_INFO "fb%d: %s on %s, %d MB RAM\n", info->node, info->fix.id,
745 pci_name(dev), info->fix.smem_len >> 20);
746
747 /* Record a reference to the driver data */
748 pci_set_drvdata(dev, info);
749
750#ifdef CONFIG_MTRR
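/* map the framebuffer write-combining to speed up drawing */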
751 par->mtrr_reg = -1;
752 if (mtrr)
753 par->mtrr_reg = mtrr_add(info->fix.smem_start,
754 info->fix.smem_len, MTRR_TYPE_WRCOMB, 1);
755#endif
756
757 return 0;
758
759 /* Error handling */
760err_reg_fb:
761 fb_dealloc_cmap(&info->cmap);
762err_alloc_cmap:
763err_find_mode:
764 pci_iounmap(dev, par->mmio_base);
765err_iomap_2:
766 pci_iounmap(dev, info->screen_base);
767err_iomap_1:
768 pci_release_regions(dev);
769err_request_regions:
770/* pci_disable_device(dev); */
771err_enable_device:
772 framebuffer_release(info);
773 return rc;
774}
775
776/* PCI remove */
777
778static void __devexit vt8623_pci_remove(struct pci_dev *dev)
779{
780 struct fb_info *info = pci_get_drvdata(dev);
781
782 if (info) {
783 struct vt8623fb_info *par = info->par;
784#ifdef CONFIG_MTRR
785 if (par->mtrr_reg >= 0) {
786 mtrr_del(par->mtrr_reg, 0, 0);
787 par->mtrr_reg = -1;
788 }
789#endif
790
791 unregister_framebuffer(info);
792 fb_dealloc_cmap(&info->cmap);
793
794 pci_iounmap(dev, info->screen_base);
795 pci_iounmap(dev, par->mmio_base);
796 pci_release_regions(dev);
797/* pci_disable_device(dev); */
798
799 pci_set_drvdata(dev, NULL);
800 framebuffer_release(info);
801 }
802}
803
804
805#ifdef CONFIG_PM
806/* PCI suspend */
807
808static int vt8623_pci_suspend(struct pci_dev* dev, pm_message_t state)
809{
810 struct fb_info *info = pci_get_drvdata(dev);
811 struct vt8623fb_info *par = info->par;
812
813 dev_info(&(dev->dev), "suspend\n");
814
815 acquire_console_sem();
816 mutex_lock(&(par->open_lock));
817
818 if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
819 mutex_unlock(&(par->open_lock));
820 release_console_sem();
821 return 0;
822 }
823
824 fb_set_suspend(info, 1);
825
826 pci_save_state(dev);
827 pci_disable_device(dev);
828 pci_set_power_state(dev, pci_choose_state(dev, state));
829
830 mutex_unlock(&(par->open_lock));
831 release_console_sem();
832
833 return 0;
834}
835
836
837/* PCI resume */
838
839static int vt8623_pci_resume(struct pci_dev* dev)
840{
841 struct fb_info *info = pci_get_drvdata(dev);
842 struct vt8623fb_info *par = info->par;
843
844 dev_info(&(dev->dev), "resume\n");
845
846 acquire_console_sem();
847 mutex_lock(&(par->open_lock));
848
849 if (par->ref_count == 0) {
850 mutex_unlock(&(par->open_lock));
851 release_console_sem();
852 return 0;
853 }
854
855 pci_set_power_state(dev, PCI_D0);
856 pci_restore_state(dev);
857
858 if (pci_enable_device(dev))
859 goto fail;
860
861 pci_set_master(dev);
862
863 vt8623fb_set_par(info);
864 fb_set_suspend(info, 0);
865
866fail:
867 mutex_unlock(&(par->open_lock));
868 release_console_sem();
869
870 return 0;
871}
872#else
873#define vt8623_pci_suspend NULL
874#define vt8623_pci_resume NULL
875#endif /* CONFIG_PM */
876
877/* List of boards that we are trying to support */
878
879static struct pci_device_id vt8623_devices[] __devinitdata = {
880 {PCI_DEVICE(PCI_VENDOR_ID_VIA, 0x3122)},
881 {0, 0, 0, 0, 0, 0, 0}
882};
883
884MODULE_DEVICE_TABLE(pci, vt8623_devices);
885
886static struct pci_driver vt8623fb_pci_driver = {
887 .name = "vt8623fb",
888 .id_table = vt8623_devices,
889 .probe = vt8623_pci_probe,
890 .remove = __devexit_p(vt8623_pci_remove),
891 .suspend = vt8623_pci_suspend,
892 .resume = vt8623_pci_resume,
893};
894
895/* Cleanup */
896
897static void __exit vt8623fb_cleanup(void)
898{
899 pr_debug("vt8623fb: cleaning up\n");
900 pci_unregister_driver(&vt8623fb_pci_driver);
901}
902
903/* Driver Initialisation */
904
905int __init vt8623fb_init(void)
906{
907
908#ifndef MODULE
909 char *option = NULL;
910
911 if (fb_get_options("vt8623fb", &option))
912 return -ENODEV;
913
914 if (option && *option)
915 mode = option;
916#endif
917
918 pr_debug("vt8623fb: initializing\n");
919 return pci_register_driver(&vt8623fb_pci_driver);
920}
921
922/* ------------------------------------------------------------------------- */
923
924/* Modularization */
925
926module_init(vt8623fb_init);
927module_exit(vt8623fb_cleanup);
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 4aa8079e71be..c8796906f584 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,11 +628,7 @@ static int affs_prepare_write_ofs(struct file *file, struct page *page, unsigned
628 return err; 628 return err;
629 } 629 }
630 if (to < PAGE_CACHE_SIZE) { 630 if (to < PAGE_CACHE_SIZE) {
631 char *kaddr = kmap_atomic(page, KM_USER0); 631 zero_user_page(page, to, PAGE_CACHE_SIZE - to, KM_USER0);
632
633 memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
634 flush_dcache_page(page);
635 kunmap_atomic(kaddr, KM_USER0);
636 if (size > offset + to) { 632 if (size > offset + to) {
637 if (size < offset + PAGE_CACHE_SIZE) 633 if (size < offset + PAGE_CACHE_SIZE)
638 tmp = size & ~PAGE_CACHE_MASK; 634 tmp = size & ~PAGE_CACHE_MASK;
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index cf83e5d63512..73ce561f3ea0 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -22,6 +22,7 @@ kafs-objs := \
22 vlclient.o \ 22 vlclient.o \
23 vlocation.o \ 23 vlocation.o \
24 vnode.o \ 24 vnode.o \
25 volume.o 25 volume.o \
26 write.o
26 27
27obj-$(CONFIG_AFS_FS) := kafs.o 28obj-$(CONFIG_AFS_FS) := kafs.o
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index 89e0d1650a72..2198006d2d03 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -18,6 +18,8 @@
18enum AFS_FS_Operations { 18enum AFS_FS_Operations {
19 FSFETCHDATA = 130, /* AFS Fetch file data */ 19 FSFETCHDATA = 130, /* AFS Fetch file data */
20 FSFETCHSTATUS = 132, /* AFS Fetch file status */ 20 FSFETCHSTATUS = 132, /* AFS Fetch file status */
21 FSSTOREDATA = 133, /* AFS Store file data */
22 FSSTORESTATUS = 135, /* AFS Store file status */
21 FSREMOVEFILE = 136, /* AFS Remove a file */ 23 FSREMOVEFILE = 136, /* AFS Remove a file */
22 FSCREATEFILE = 137, /* AFS Create a file */ 24 FSCREATEFILE = 137, /* AFS Create a file */
23 FSRENAME = 138, /* AFS Rename or move a file or directory */ 25 FSRENAME = 138, /* AFS Rename or move a file or directory */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9bdbf36a9aa9..f64e40fefc02 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -44,7 +44,7 @@ void afs_init_callback_state(struct afs_server *server)
44 while (!RB_EMPTY_ROOT(&server->cb_promises)) { 44 while (!RB_EMPTY_ROOT(&server->cb_promises)) {
45 vnode = rb_entry(server->cb_promises.rb_node, 45 vnode = rb_entry(server->cb_promises.rb_node,
46 struct afs_vnode, cb_promise); 46 struct afs_vnode, cb_promise);
47 _debug("UNPROMISE { vid=%x vn=%u uq=%u}", 47 _debug("UNPROMISE { vid=%x:%u uq=%u}",
48 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 48 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
49 rb_erase(&vnode->cb_promise, &server->cb_promises); 49 rb_erase(&vnode->cb_promise, &server->cb_promises);
50 vnode->cb_promised = false; 50 vnode->cb_promised = false;
@@ -84,11 +84,8 @@ void afs_broken_callback_work(struct work_struct *work)
84 84
85 /* if the vnode's data version number changed then its contents 85 /* if the vnode's data version number changed then its contents
86 * are different */ 86 * are different */
87 if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { 87 if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
88 _debug("zap data {%x:%u}", 88 afs_zap_data(vnode);
89 vnode->fid.vid, vnode->fid.vnode);
90 invalidate_remote_inode(&vnode->vfs_inode);
91 }
92 } 89 }
93 90
94out: 91out:
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0c1e902f17a3..2fb31276196b 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -55,7 +55,8 @@ const struct inode_operations afs_dir_inode_operations = {
55 .rmdir = afs_rmdir, 55 .rmdir = afs_rmdir,
56 .rename = afs_rename, 56 .rename = afs_rename,
57 .permission = afs_permission, 57 .permission = afs_permission,
58 .getattr = afs_inode_getattr, 58 .getattr = afs_getattr,
59 .setattr = afs_setattr,
59}; 60};
60 61
61static struct dentry_operations afs_fs_dentry_operations = { 62static struct dentry_operations afs_fs_dentry_operations = {
@@ -491,7 +492,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
491 492
492 vnode = AFS_FS_I(dir); 493 vnode = AFS_FS_I(dir);
493 494
494 _enter("{%x:%d},%p{%s},", 495 _enter("{%x:%u},%p{%s},",
495 vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name); 496 vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
496 497
497 ASSERTCMP(dentry->d_inode, ==, NULL); 498 ASSERTCMP(dentry->d_inode, ==, NULL);
@@ -731,7 +732,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
731 732
732 dvnode = AFS_FS_I(dir); 733 dvnode = AFS_FS_I(dir);
733 734
734 _enter("{%x:%d},{%s},%o", 735 _enter("{%x:%u},{%s},%o",
735 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); 736 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
736 737
737 ret = -ENAMETOOLONG; 738 ret = -ENAMETOOLONG;
@@ -796,7 +797,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
796 797
797 dvnode = AFS_FS_I(dir); 798 dvnode = AFS_FS_I(dir);
798 799
799 _enter("{%x:%d},{%s}", 800 _enter("{%x:%u},{%s}",
800 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); 801 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
801 802
802 ret = -ENAMETOOLONG; 803 ret = -ENAMETOOLONG;
@@ -842,7 +843,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
842 843
843 dvnode = AFS_FS_I(dir); 844 dvnode = AFS_FS_I(dir);
844 845
845 _enter("{%x:%d},{%s}", 846 _enter("{%x:%u},{%s}",
846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); 847 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
847 848
848 ret = -ENAMETOOLONG; 849 ret = -ENAMETOOLONG;
@@ -916,7 +917,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
916 917
917 dvnode = AFS_FS_I(dir); 918 dvnode = AFS_FS_I(dir);
918 919
919 _enter("{%x:%d},{%s},%o,", 920 _enter("{%x:%u},{%s},%o,",
920 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); 921 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
921 922
922 ret = -ENAMETOOLONG; 923 ret = -ENAMETOOLONG;
@@ -983,7 +984,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
983 vnode = AFS_FS_I(from->d_inode); 984 vnode = AFS_FS_I(from->d_inode);
984 dvnode = AFS_FS_I(dir); 985 dvnode = AFS_FS_I(dir);
985 986
986 _enter("{%x:%d},{%x:%d},{%s}", 987 _enter("{%x:%u},{%x:%u},{%s}",
987 vnode->fid.vid, vnode->fid.vnode, 988 vnode->fid.vid, vnode->fid.vnode,
988 dvnode->fid.vid, dvnode->fid.vnode, 989 dvnode->fid.vid, dvnode->fid.vnode,
989 dentry->d_name.name); 990 dentry->d_name.name);
@@ -1032,7 +1033,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
1032 1033
1033 dvnode = AFS_FS_I(dir); 1034 dvnode = AFS_FS_I(dir);
1034 1035
1035 _enter("{%x:%d},{%s},%s", 1036 _enter("{%x:%u},{%s},%s",
1036 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, 1037 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
1037 content); 1038 content);
1038 1039
@@ -1104,7 +1105,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
1104 orig_dvnode = AFS_FS_I(old_dir); 1105 orig_dvnode = AFS_FS_I(old_dir);
1105 new_dvnode = AFS_FS_I(new_dir); 1106 new_dvnode = AFS_FS_I(new_dir);
1106 1107
1107 _enter("{%x:%d},{%x:%d},{%x:%d},{%s}", 1108 _enter("{%x:%u},{%x:%u},{%x:%u},{%s}",
1108 orig_dvnode->fid.vid, orig_dvnode->fid.vnode, 1109 orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
1109 vnode->fid.vid, vnode->fid.vnode, 1110 vnode->fid.vid, vnode->fid.vnode,
1110 new_dvnode->fid.vid, new_dvnode->fid.vnode, 1111 new_dvnode->fid.vid, new_dvnode->fid.vnode,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index ae256498f4f7..3e25795e5a42 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -15,32 +15,43 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/writeback.h>
18#include "internal.h" 19#include "internal.h"
19 20
20static int afs_file_readpage(struct file *file, struct page *page); 21static int afs_readpage(struct file *file, struct page *page);
21static void afs_file_invalidatepage(struct page *page, unsigned long offset); 22static void afs_invalidatepage(struct page *page, unsigned long offset);
22static int afs_file_releasepage(struct page *page, gfp_t gfp_flags); 23static int afs_releasepage(struct page *page, gfp_t gfp_flags);
24static int afs_launder_page(struct page *page);
23 25
24const struct file_operations afs_file_operations = { 26const struct file_operations afs_file_operations = {
25 .open = afs_open, 27 .open = afs_open,
26 .release = afs_release, 28 .release = afs_release,
27 .llseek = generic_file_llseek, 29 .llseek = generic_file_llseek,
28 .read = do_sync_read, 30 .read = do_sync_read,
31 .write = do_sync_write,
29 .aio_read = generic_file_aio_read, 32 .aio_read = generic_file_aio_read,
33 .aio_write = afs_file_write,
30 .mmap = generic_file_readonly_mmap, 34 .mmap = generic_file_readonly_mmap,
31 .sendfile = generic_file_sendfile, 35 .sendfile = generic_file_sendfile,
36 .fsync = afs_fsync,
32}; 37};
33 38
34const struct inode_operations afs_file_inode_operations = { 39const struct inode_operations afs_file_inode_operations = {
35 .getattr = afs_inode_getattr, 40 .getattr = afs_getattr,
41 .setattr = afs_setattr,
36 .permission = afs_permission, 42 .permission = afs_permission,
37}; 43};
38 44
39const struct address_space_operations afs_fs_aops = { 45const struct address_space_operations afs_fs_aops = {
40 .readpage = afs_file_readpage, 46 .readpage = afs_readpage,
41 .set_page_dirty = __set_page_dirty_nobuffers, 47 .set_page_dirty = afs_set_page_dirty,
42 .releasepage = afs_file_releasepage, 48 .launder_page = afs_launder_page,
43 .invalidatepage = afs_file_invalidatepage, 49 .releasepage = afs_releasepage,
50 .invalidatepage = afs_invalidatepage,
51 .prepare_write = afs_prepare_write,
52 .commit_write = afs_commit_write,
53 .writepage = afs_writepage,
54 .writepages = afs_writepages,
44}; 55};
45 56
46/* 57/*
@@ -52,7 +63,7 @@ int afs_open(struct inode *inode, struct file *file)
52 struct key *key; 63 struct key *key;
53 int ret; 64 int ret;
54 65
55 _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode); 66 _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
56 67
57 key = afs_request_key(vnode->volume->cell); 68 key = afs_request_key(vnode->volume->cell);
58 if (IS_ERR(key)) { 69 if (IS_ERR(key)) {
@@ -78,7 +89,7 @@ int afs_release(struct inode *inode, struct file *file)
78{ 89{
79 struct afs_vnode *vnode = AFS_FS_I(inode); 90 struct afs_vnode *vnode = AFS_FS_I(inode);
80 91
81 _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode); 92 _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
82 93
83 key_put(file->private_data); 94 key_put(file->private_data);
84 _leave(" = 0"); 95 _leave(" = 0");
@@ -89,10 +100,10 @@ int afs_release(struct inode *inode, struct file *file)
89 * deal with notification that a page was read from the cache 100 * deal with notification that a page was read from the cache
90 */ 101 */
91#ifdef AFS_CACHING_SUPPORT 102#ifdef AFS_CACHING_SUPPORT
92static void afs_file_readpage_read_complete(void *cookie_data, 103static void afs_readpage_read_complete(void *cookie_data,
93 struct page *page, 104 struct page *page,
94 void *data, 105 void *data,
95 int error) 106 int error)
96{ 107{
97 _enter("%p,%p,%p,%d", cookie_data, page, data, error); 108 _enter("%p,%p,%p,%d", cookie_data, page, data, error);
98 109
@@ -109,10 +120,10 @@ static void afs_file_readpage_read_complete(void *cookie_data,
109 * deal with notification that a page was written to the cache 120 * deal with notification that a page was written to the cache
110 */ 121 */
111#ifdef AFS_CACHING_SUPPORT 122#ifdef AFS_CACHING_SUPPORT
112static void afs_file_readpage_write_complete(void *cookie_data, 123static void afs_readpage_write_complete(void *cookie_data,
113 struct page *page, 124 struct page *page,
114 void *data, 125 void *data,
115 int error) 126 int error)
116{ 127{
117 _enter("%p,%p,%p,%d", cookie_data, page, data, error); 128 _enter("%p,%p,%p,%d", cookie_data, page, data, error);
118 129
@@ -121,9 +132,9 @@ static void afs_file_readpage_write_complete(void *cookie_data,
121#endif 132#endif
122 133
123/* 134/*
124 * AFS read page from file (or symlink) 135 * AFS read page from file, directory or symlink
125 */ 136 */
126static int afs_file_readpage(struct file *file, struct page *page) 137static int afs_readpage(struct file *file, struct page *page)
127{ 138{
128 struct afs_vnode *vnode; 139 struct afs_vnode *vnode;
129 struct inode *inode; 140 struct inode *inode;
@@ -219,39 +230,17 @@ error:
219} 230}
220 231
221/* 232/*
222 * get a page cookie for the specified page
223 */
224#ifdef AFS_CACHING_SUPPORT
225int afs_cache_get_page_cookie(struct page *page,
226 struct cachefs_page **_page_cookie)
227{
228 int ret;
229
230 _enter("");
231 ret = cachefs_page_get_private(page,_page_cookie, GFP_NOIO);
232
233 _leave(" = %d", ret);
234 return ret;
235}
236#endif
237
238/*
239 * invalidate part or all of a page 233 * invalidate part or all of a page
240 */ 234 */
241static void afs_file_invalidatepage(struct page *page, unsigned long offset) 235static void afs_invalidatepage(struct page *page, unsigned long offset)
242{ 236{
243 int ret = 1; 237 int ret = 1;
244 238
245 _enter("{%lu},%lu", page->index, offset); 239 kenter("{%lu},%lu", page->index, offset);
246 240
247 BUG_ON(!PageLocked(page)); 241 BUG_ON(!PageLocked(page));
248 242
249 if (PagePrivate(page)) { 243 if (PagePrivate(page)) {
250#ifdef AFS_CACHING_SUPPORT
251 struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
252 cachefs_uncache_page(vnode->cache,page);
253#endif
254
255 /* We release buffers only if the entire page is being 244 /* We release buffers only if the entire page is being
256 * invalidated. 245 * invalidated.
257 * The get_block cached value has been unconditionally 246 * The get_block cached value has been unconditionally
@@ -272,25 +261,33 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
272} 261}
273 262
274/* 263/*
264 * write back a dirty page
265 */
266static int afs_launder_page(struct page *page)
267{
268 _enter("{%lu}", page->index);
269
270 return 0;
271}
272
273/*
275 * release a page and cleanup its private data 274 * release a page and cleanup its private data
276 */ 275 */
277static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) 276static int afs_releasepage(struct page *page, gfp_t gfp_flags)
278{ 277{
279 struct cachefs_page *pageio; 278 struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
279 struct afs_writeback *wb;
280 280
281 _enter("{%lu},%x", page->index, gfp_flags); 281 _enter("{{%x:%u}[%lu],%lx},%x",
282 vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
283 gfp_flags);
282 284
283 if (PagePrivate(page)) { 285 if (PagePrivate(page)) {
284#ifdef AFS_CACHING_SUPPORT 286 wb = (struct afs_writeback *) page_private(page);
285 struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); 287 ASSERT(wb != NULL);
286 cachefs_uncache_page(vnode->cache, page);
287#endif
288
289 pageio = (struct cachefs_page *) page_private(page);
290 set_page_private(page, 0); 288 set_page_private(page, 0);
291 ClearPagePrivate(page); 289 ClearPagePrivate(page);
292 290 afs_put_writeback(wb);
293 kfree(pageio);
294 } 291 }
295 292
296 _leave(" = 0"); 293 _leave(" = 0");
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index e54e6c2ad343..025b1903d9e1 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -33,8 +33,10 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
33 */ 33 */
34static void xdr_decode_AFSFetchStatus(const __be32 **_bp, 34static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
35 struct afs_file_status *status, 35 struct afs_file_status *status,
36 struct afs_vnode *vnode) 36 struct afs_vnode *vnode,
37 afs_dataversion_t *store_version)
37{ 38{
39 afs_dataversion_t expected_version;
38 const __be32 *bp = *_bp; 40 const __be32 *bp = *_bp;
39 umode_t mode; 41 umode_t mode;
40 u64 data_version, size; 42 u64 data_version, size;
@@ -101,7 +103,11 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
101 vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; 103 vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
102 } 104 }
103 105
104 if (status->data_version != data_version) { 106 expected_version = status->data_version;
107 if (store_version)
108 expected_version = *store_version;
109
110 if (expected_version != data_version) {
105 status->data_version = data_version; 111 status->data_version = data_version;
106 if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { 112 if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
107 _debug("vnode modified %llx on {%x:%u}", 113 _debug("vnode modified %llx on {%x:%u}",
@@ -110,6 +116,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
110 set_bit(AFS_VNODE_MODIFIED, &vnode->flags); 116 set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
111 set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); 117 set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
112 } 118 }
119 } else if (store_version) {
120 status->data_version = data_version;
113 } 121 }
114} 122}
115 123
@@ -156,6 +164,44 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp,
156} 164}
157 165
158/* 166/*
167 * encode the requested attributes into an AFSStoreStatus block
168 */
169static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
170{
171 __be32 *bp = *_bp;
172 u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0;
173
174 mask = 0;
175 if (attr->ia_valid & ATTR_MTIME) {
176 mask |= AFS_SET_MTIME;
177 mtime = attr->ia_mtime.tv_sec;
178 }
179
180 if (attr->ia_valid & ATTR_UID) {
181 mask |= AFS_SET_OWNER;
182 owner = attr->ia_uid;
183 }
184
185 if (attr->ia_valid & ATTR_GID) {
186 mask |= AFS_SET_GROUP;
187 group = attr->ia_gid;
188 }
189
190 if (attr->ia_valid & ATTR_MODE) {
191 mask |= AFS_SET_MODE;
192 mode = attr->ia_mode & S_IALLUGO;
193 }
194
195 *bp++ = htonl(mask);
196 *bp++ = htonl(mtime);
197 *bp++ = htonl(owner);
198 *bp++ = htonl(group);
199 *bp++ = htonl(mode);
200 *bp++ = 0; /* segment size */
201 *_bp = bp;
202}
203
204/*
159 * deliver reply data to an FS.FetchStatus 205 * deliver reply data to an FS.FetchStatus
160 */ 206 */
161static int afs_deliver_fs_fetch_status(struct afs_call *call, 207static int afs_deliver_fs_fetch_status(struct afs_call *call,
@@ -175,7 +221,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
175 221
176 /* unmarshall the reply once we've received all of it */ 222 /* unmarshall the reply once we've received all of it */
177 bp = call->buffer; 223 bp = call->buffer;
178 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode); 224 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
179 xdr_decode_AFSCallBack(&bp, vnode); 225 xdr_decode_AFSCallBack(&bp, vnode);
180 if (call->reply2) 226 if (call->reply2)
181 xdr_decode_AFSVolSync(&bp, call->reply2); 227 xdr_decode_AFSVolSync(&bp, call->reply2);
@@ -206,7 +252,7 @@ int afs_fs_fetch_file_status(struct afs_server *server,
206 struct afs_call *call; 252 struct afs_call *call;
207 __be32 *bp; 253 __be32 *bp;
208 254
209 _enter(",%x,{%x:%d},,", 255 _enter(",%x,{%x:%u},,",
210 key_serial(key), vnode->fid.vid, vnode->fid.vnode); 256 key_serial(key), vnode->fid.vid, vnode->fid.vnode);
211 257
212 call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); 258 call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
@@ -265,25 +311,20 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
265 call->offset = 0; 311 call->offset = 0;
266 call->unmarshall++; 312 call->unmarshall++;
267 313
268 if (call->count < PAGE_SIZE) {
269 page = call->reply3;
270 buffer = kmap_atomic(page, KM_USER0);
271 memset(buffer + PAGE_SIZE - call->count, 0,
272 call->count);
273 kunmap_atomic(buffer, KM_USER0);
274 }
275
276 /* extract the returned data */ 314 /* extract the returned data */
277 case 2: 315 case 2:
278 _debug("extract data"); 316 _debug("extract data");
279 page = call->reply3; 317 if (call->count > 0) {
280 buffer = kmap_atomic(page, KM_USER0); 318 page = call->reply3;
281 ret = afs_extract_data(call, skb, last, buffer, call->count); 319 buffer = kmap_atomic(page, KM_USER0);
282 kunmap_atomic(buffer, KM_USER0); 320 ret = afs_extract_data(call, skb, last, buffer,
283 switch (ret) { 321 call->count);
284 case 0: break; 322 kunmap_atomic(buffer, KM_USER0);
285 case -EAGAIN: return 0; 323 switch (ret) {
286 default: return ret; 324 case 0: break;
325 case -EAGAIN: return 0;
326 default: return ret;
327 }
287 } 328 }
288 329
289 call->offset = 0; 330 call->offset = 0;
@@ -300,7 +341,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
300 } 341 }
301 342
302 bp = call->buffer; 343 bp = call->buffer;
303 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode); 344 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
304 xdr_decode_AFSCallBack(&bp, vnode); 345 xdr_decode_AFSCallBack(&bp, vnode);
305 if (call->reply2) 346 if (call->reply2)
306 xdr_decode_AFSVolSync(&bp, call->reply2); 347 xdr_decode_AFSVolSync(&bp, call->reply2);
@@ -318,6 +359,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
318 if (!last) 359 if (!last)
319 return 0; 360 return 0;
320 361
362 if (call->count < PAGE_SIZE) {
363 _debug("clear");
364 page = call->reply3;
365 buffer = kmap_atomic(page, KM_USER0);
366 memset(buffer + call->count, 0, PAGE_SIZE - call->count);
367 kunmap_atomic(buffer, KM_USER0);
368 }
369
321 _leave(" = 0 [done]"); 370 _leave(" = 0 [done]");
322 return 0; 371 return 0;
323} 372}
@@ -476,8 +525,8 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
476 /* unmarshall the reply once we've received all of it */ 525 /* unmarshall the reply once we've received all of it */
477 bp = call->buffer; 526 bp = call->buffer;
478 xdr_decode_AFSFid(&bp, call->reply2); 527 xdr_decode_AFSFid(&bp, call->reply2);
479 xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL); 528 xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
480 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode); 529 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
481 xdr_decode_AFSCallBack_raw(&bp, call->reply4); 530 xdr_decode_AFSCallBack_raw(&bp, call->reply4);
482 /* xdr_decode_AFSVolSync(&bp, call->replyX); */ 531 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
483 532
@@ -574,7 +623,7 @@ static int afs_deliver_fs_remove(struct afs_call *call,
574 623
575 /* unmarshall the reply once we've received all of it */ 624 /* unmarshall the reply once we've received all of it */
576 bp = call->buffer; 625 bp = call->buffer;
577 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode); 626 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
578 /* xdr_decode_AFSVolSync(&bp, call->replyX); */ 627 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
579 628
580 _leave(" = 0 [done]"); 629 _leave(" = 0 [done]");
@@ -657,8 +706,8 @@ static int afs_deliver_fs_link(struct afs_call *call,
657 706
658 /* unmarshall the reply once we've received all of it */ 707 /* unmarshall the reply once we've received all of it */
659 bp = call->buffer; 708 bp = call->buffer;
660 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode); 709 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
661 xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode); 710 xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
662 /* xdr_decode_AFSVolSync(&bp, call->replyX); */ 711 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
663 712
664 _leave(" = 0 [done]"); 713 _leave(" = 0 [done]");
@@ -746,8 +795,8 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
746 /* unmarshall the reply once we've received all of it */ 795 /* unmarshall the reply once we've received all of it */
747 bp = call->buffer; 796 bp = call->buffer;
748 xdr_decode_AFSFid(&bp, call->reply2); 797 xdr_decode_AFSFid(&bp, call->reply2);
749 xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL); 798 xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
750 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode); 799 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
751 /* xdr_decode_AFSVolSync(&bp, call->replyX); */ 800 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
752 801
753 _leave(" = 0 [done]"); 802 _leave(" = 0 [done]");
@@ -852,9 +901,10 @@ static int afs_deliver_fs_rename(struct afs_call *call,
852 901
853 /* unmarshall the reply once we've received all of it */ 902 /* unmarshall the reply once we've received all of it */
854 bp = call->buffer; 903 bp = call->buffer;
855 xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode); 904 xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
856 if (new_dvnode != orig_dvnode) 905 if (new_dvnode != orig_dvnode)
857 xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode); 906 xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
907 NULL);
858 /* xdr_decode_AFSVolSync(&bp, call->replyX); */ 908 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
859 909
860 _leave(" = 0 [done]"); 910 _leave(" = 0 [done]");
@@ -936,3 +986,262 @@ int afs_fs_rename(struct afs_server *server,
936 986
937 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); 987 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
938} 988}
989
990/*
991 * deliver reply data to an FS.StoreData
992 */
993static int afs_deliver_fs_store_data(struct afs_call *call,
994 struct sk_buff *skb, bool last)
995{
996 struct afs_vnode *vnode = call->reply;
997 const __be32 *bp;
998
999 _enter(",,%u", last);
1000
1001 afs_transfer_reply(call, skb);
1002 if (!last) {
1003 _leave(" = 0 [more]");
1004 return 0;
1005 }
1006
1007 if (call->reply_size != call->reply_max) {
1008 _leave(" = -EBADMSG [%u != %u]",
1009 call->reply_size, call->reply_max);
1010 return -EBADMSG;
1011 }
1012
1013 /* unmarshall the reply once we've received all of it */
1014 bp = call->buffer;
1015 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
1016 &call->store_version);
1017 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
1018
1019 afs_pages_written_back(vnode, call);
1020
1021 _leave(" = 0 [done]");
1022 return 0;
1023}
1024
1025/*
1026 * FS.StoreData operation type
1027 */
1028static const struct afs_call_type afs_RXFSStoreData = {
1029 .name = "FS.StoreData",
1030 .deliver = afs_deliver_fs_store_data,
1031 .abort_to_error = afs_abort_to_error,
1032 .destructor = afs_flat_call_destructor,
1033};
1034
1035/*
1036 * store a set of pages
1037 */
1038int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
1039 pgoff_t first, pgoff_t last,
1040 unsigned offset, unsigned to,
1041 const struct afs_wait_mode *wait_mode)
1042{
1043 struct afs_vnode *vnode = wb->vnode;
1044 struct afs_call *call;
1045 loff_t size, pos, i_size;
1046 __be32 *bp;
1047
1048 _enter(",%x,{%x:%u},,",
1049 key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
1050
1051 size = to - offset;
1052 if (first != last)
1053 size += (loff_t)(last - first) << PAGE_SHIFT;
1054 pos = (loff_t)first << PAGE_SHIFT;
1055 pos += offset;
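/* illustrative example: storing pages 3..5 with offset 100 and
   to 200 gives size = (200 - 100) + 2 * PAGE_SIZE bytes starting at
   pos = 3 * PAGE_SIZE + 100 -- exactly the dirty byte span */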
1056
1057 i_size = i_size_read(&vnode->vfs_inode);
1058 if (pos + size > i_size)
1059 i_size = size + pos;
1060
1061 _debug("size %llx, at %llx, i_size %llx",
1062 (unsigned long long) size, (unsigned long long) pos,
1063 (unsigned long long) i_size);
1064
1065 BUG_ON(i_size > 0xffffffff); // TODO: use 64-bit store
1066
1067 call = afs_alloc_flat_call(&afs_RXFSStoreData,
1068 (4 + 6 + 3) * 4,
1069 (21 + 6) * 4);
1070 if (!call)
1071 return -ENOMEM;
1072
1073 call->wb = wb;
1074 call->key = wb->key;
1075 call->reply = vnode;
1076 call->service_id = FS_SERVICE;
1077 call->port = htons(AFS_FS_PORT);
1078 call->mapping = vnode->vfs_inode.i_mapping;
1079 call->first = first;
1080 call->last = last;
1081 call->first_offset = offset;
1082 call->last_to = to;
1083 call->send_pages = true;
1084 call->store_version = vnode->status.data_version + 1;
1085
1086 /* marshall the parameters */
1087 bp = call->request;
1088 *bp++ = htonl(FSSTOREDATA);
1089 *bp++ = htonl(vnode->fid.vid);
1090 *bp++ = htonl(vnode->fid.vnode);
1091 *bp++ = htonl(vnode->fid.unique);
1092
1093 *bp++ = 0; /* mask */
1094 *bp++ = 0; /* mtime */
1095 *bp++ = 0; /* owner */
1096 *bp++ = 0; /* group */
1097 *bp++ = 0; /* unix mode */
1098 *bp++ = 0; /* segment size */
1099
1100 *bp++ = htonl(pos);
1101 *bp++ = htonl(size);
1102 *bp++ = htonl(i_size);
1103
1104 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1105}
1106
1107/*
1108 * deliver reply data to an FS.StoreStatus
1109 */
1110static int afs_deliver_fs_store_status(struct afs_call *call,
1111 struct sk_buff *skb, bool last)
1112{
1113 afs_dataversion_t *store_version;
1114 struct afs_vnode *vnode = call->reply;
1115 const __be32 *bp;
1116
1117 _enter(",,%u", last);
1118
1119 afs_transfer_reply(call, skb);
1120 if (!last) {
1121 _leave(" = 0 [more]");
1122 return 0;
1123 }
1124
1125 if (call->reply_size != call->reply_max) {
1126 _leave(" = -EBADMSG [%u != %u]",
1127 call->reply_size, call->reply_max);
1128 return -EBADMSG;
1129 }
1130
1131 /* unmarshall the reply once we've received all of it */
1132 store_version = NULL;
1133 if (call->operation_ID == FSSTOREDATA)
1134 store_version = &call->store_version;
1135
1136 bp = call->buffer;
1137 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
1138 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
1139
1140 _leave(" = 0 [done]");
1141 return 0;
1142}
1143
1144/*
1145 * FS.StoreStatus operation type
1146 */
1147static const struct afs_call_type afs_RXFSStoreStatus = {
1148 .name = "FS.StoreStatus",
1149 .deliver = afs_deliver_fs_store_status,
1150 .abort_to_error = afs_abort_to_error,
1151 .destructor = afs_flat_call_destructor,
1152};
1153
1154static const struct afs_call_type afs_RXFSStoreData_as_Status = {
1155 .name = "FS.StoreData",
1156 .deliver = afs_deliver_fs_store_status,
1157 .abort_to_error = afs_abort_to_error,
1158 .destructor = afs_flat_call_destructor,
1159};
1160
1161/*
1162 * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
1163 * so as to alter the file size also
1164 */
1165static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
1166 struct afs_vnode *vnode, struct iattr *attr,
1167 const struct afs_wait_mode *wait_mode)
1168{
1169 struct afs_call *call;
1170 __be32 *bp;
1171
1172 _enter(",%x,{%x:%u},,",
1173 key_serial(key), vnode->fid.vid, vnode->fid.vnode);
1174
1175 ASSERT(attr->ia_valid & ATTR_SIZE);
1176 ASSERTCMP(attr->ia_size, <=, 0xffffffff); // TODO: use 64-bit store
1177
1178 call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
1179 (4 + 6 + 3) * 4,
1180 (21 + 6) * 4);
1181 if (!call)
1182 return -ENOMEM;
1183
1184 call->key = key;
1185 call->reply = vnode;
1186 call->service_id = FS_SERVICE;
1187 call->port = htons(AFS_FS_PORT);
1188 call->store_version = vnode->status.data_version + 1;
1189 call->operation_ID = FSSTOREDATA;
1190
1191 /* marshall the parameters */
1192 bp = call->request;
1193 *bp++ = htonl(FSSTOREDATA);
1194 *bp++ = htonl(vnode->fid.vid);
1195 *bp++ = htonl(vnode->fid.vnode);
1196 *bp++ = htonl(vnode->fid.unique);
1197
1198 xdr_encode_AFS_StoreStatus(&bp, attr);
1199
1200 *bp++ = 0; /* position of start of write */
1201 *bp++ = 0; /* size of write */
1202 *bp++ = htonl(attr->ia_size); /* new file length */
1203
1204 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1205}
1206
1207/*
1208 * set the attributes on a file, using FS.StoreData if there's a change in file
1209 * size, and FS.StoreStatus otherwise
1210 */
1211int afs_fs_setattr(struct afs_server *server, struct key *key,
1212 struct afs_vnode *vnode, struct iattr *attr,
1213 const struct afs_wait_mode *wait_mode)
1214{
1215 struct afs_call *call;
1216 __be32 *bp;
1217
1218 if (attr->ia_valid & ATTR_SIZE)
1219 return afs_fs_setattr_size(server, key, vnode, attr,
1220 wait_mode);
1221
1222 _enter(",%x,{%x:%u},,",
1223 key_serial(key), vnode->fid.vid, vnode->fid.vnode);
1224
1225 call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
1226 (4 + 6) * 4,
1227 (21 + 6) * 4);
1228 if (!call)
1229 return -ENOMEM;
1230
1231 call->key = key;
1232 call->reply = vnode;
1233 call->service_id = FS_SERVICE;
1234 call->port = htons(AFS_FS_PORT);
1235 call->operation_ID = FSSTORESTATUS;
1236
1237 /* marshall the parameters */
1238 bp = call->request;
1239 *bp++ = htonl(FSSTORESTATUS);
1240 *bp++ = htonl(vnode->fid.vid);
1241 *bp++ = htonl(vnode->fid.vnode);
1242 *bp++ = htonl(vnode->fid.unique);
1243
1244 xdr_encode_AFS_StoreStatus(&bp, attr);
1245
1246 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1247}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c184a4ee5995..515a5d12d8fb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -125,7 +125,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
125 struct inode *inode; 125 struct inode *inode;
126 int ret; 126 int ret;
127 127
128 _enter(",{%u,%u,%u},,", fid->vid, fid->vnode, fid->unique); 128 _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
129 129
130 as = sb->s_fs_info; 130 as = sb->s_fs_info;
131 data.volume = as->volume; 131 data.volume = as->volume;
@@ -204,6 +204,19 @@ bad_inode:
204} 204}
205 205
206/* 206/*
207 * mark the data attached to an inode as obsolete due to a write on the server
208 * - might also want to ditch all the outstanding writes and dirty pages
209 */
210void afs_zap_data(struct afs_vnode *vnode)
211{
212 _enter("zap data {%x:%u}", vnode->fid.vid, vnode->fid.vnode);
213
214 /* nuke all the non-dirty pages that aren't locked, mapped or being
215 * written back */
216 invalidate_remote_inode(&vnode->vfs_inode);
217}
218
219/*
207 * validate a vnode/inode 220 * validate a vnode/inode
208 * - there are several things we need to check 221 * - there are several things we need to check
209 * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, 222 * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -258,10 +271,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
258 271
259 /* if the vnode's data version number changed then its contents are 272 /* if the vnode's data version number changed then its contents are
260 * different */ 273 * different */
261 if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { 274 if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
262 _debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode); 275 afs_zap_data(vnode);
263 invalidate_remote_inode(&vnode->vfs_inode);
264 }
265 276
266 clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); 277 clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
267 mutex_unlock(&vnode->validate_lock); 278 mutex_unlock(&vnode->validate_lock);
@@ -278,7 +289,7 @@ error_unlock:
278/* 289/*
279 * read the attributes of an inode 290 * read the attributes of an inode
280 */ 291 */
281int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, 292int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
282 struct kstat *stat) 293 struct kstat *stat)
283{ 294{
284 struct inode *inode; 295 struct inode *inode;
@@ -301,7 +312,7 @@ void afs_clear_inode(struct inode *inode)
301 312
302 vnode = AFS_FS_I(inode); 313 vnode = AFS_FS_I(inode);
303 314
304 _enter("{%x:%d.%d} v=%u x=%u t=%u }", 315 _enter("{%x:%u.%d} v=%u x=%u t=%u }",
305 vnode->fid.vid, 316 vnode->fid.vid,
306 vnode->fid.vnode, 317 vnode->fid.vnode,
307 vnode->fid.unique, 318 vnode->fid.unique,
@@ -323,6 +334,7 @@ void afs_clear_inode(struct inode *inode)
323 vnode->server = NULL; 334 vnode->server = NULL;
324 } 335 }
325 336
337 ASSERT(list_empty(&vnode->writebacks));
326 ASSERT(!vnode->cb_promised); 338 ASSERT(!vnode->cb_promised);
327 339
328#ifdef AFS_CACHING_SUPPORT 340#ifdef AFS_CACHING_SUPPORT
@@ -339,3 +351,47 @@ void afs_clear_inode(struct inode *inode)
339 351
340 _leave(""); 352 _leave("");
341} 353}
354
355/*
356 * set the attributes of an inode
357 */
358int afs_setattr(struct dentry *dentry, struct iattr *attr)
359{
360 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
361 struct key *key;
362 int ret;
363
364 _enter("{%x:%u},{n=%s},%x",
365 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
366 attr->ia_valid);
367
368 if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
369 ATTR_MTIME))) {
370 _leave(" = 0 [unsupported]");
371 return 0;
372 }
373
374 /* flush any dirty data outstanding on a regular file */
375 if (S_ISREG(vnode->vfs_inode.i_mode)) {
376 filemap_write_and_wait(vnode->vfs_inode.i_mapping);
377 afs_writeback_all(vnode);
378 }
379
380 if (attr->ia_valid & ATTR_FILE) {
381 key = attr->ia_file->private_data;
382 } else {
383 key = afs_request_key(vnode->volume->cell);
384 if (IS_ERR(key)) {
385 ret = PTR_ERR(key);
386 goto error;
387 }
388 }
389
390 ret = afs_vnode_setattr(vnode, key, attr);
391 if (!(attr->ia_valid & ATTR_FILE))
392 key_put(key);
393
394error:
395 _leave(" = %d", ret);
396 return ret;
397}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index d90c158cd934..a30d4fa768e3 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
21 21
22#define AFS_CELL_MAX_ADDRS 15 22#define AFS_CELL_MAX_ADDRS 15
23 23
24struct pagevec;
24struct afs_call; 25struct afs_call;
25 26
26typedef enum { 27typedef enum {
@@ -75,12 +76,15 @@ struct afs_call {
75 struct key *key; /* security for this call */ 76 struct key *key; /* security for this call */
76 struct afs_server *server; /* server affected by incoming CM call */ 77 struct afs_server *server; /* server affected by incoming CM call */
77 void *request; /* request data (first part) */ 78 void *request; /* request data (first part) */
78 void *request2; /* request data (second part) */ 79 struct address_space *mapping; /* page set */
80 struct afs_writeback *wb; /* writeback being performed */
79 void *buffer; /* reply receive buffer */ 81 void *buffer; /* reply receive buffer */
80 void *reply; /* reply buffer (first part) */ 82 void *reply; /* reply buffer (first part) */
81 void *reply2; /* reply buffer (second part) */ 83 void *reply2; /* reply buffer (second part) */
82 void *reply3; /* reply buffer (third part) */ 84 void *reply3; /* reply buffer (third part) */
83 void *reply4; /* reply buffer (fourth part) */ 85 void *reply4; /* reply buffer (fourth part) */
86 pgoff_t first; /* first page in mapping to deal with */
87 pgoff_t last; /* last page in mapping to deal with */
84 enum { /* call state */ 88 enum { /* call state */
85 AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ 89 AFS_CALL_REQUESTING, /* request is being sent for outgoing call */
86 AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ 90 AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */
@@ -97,14 +101,18 @@ struct afs_call {
97 unsigned request_size; /* size of request data */ 101 unsigned request_size; /* size of request data */
98 unsigned reply_max; /* maximum size of reply */ 102 unsigned reply_max; /* maximum size of reply */
99 unsigned reply_size; /* current size of reply */ 103 unsigned reply_size; /* current size of reply */
104 unsigned first_offset; /* offset into mapping[first] */
105 unsigned last_to; /* amount of mapping[last] */
100 unsigned short offset; /* offset into received data store */ 106 unsigned short offset; /* offset into received data store */
101 unsigned char unmarshall; /* unmarshalling phase */ 107 unsigned char unmarshall; /* unmarshalling phase */
102 bool incoming; /* T if incoming call */ 108 bool incoming; /* T if incoming call */
109 bool send_pages; /* T if data from mapping should be sent */
103 u16 service_id; /* RxRPC service ID to call */ 110 u16 service_id; /* RxRPC service ID to call */
104 __be16 port; /* target UDP port */ 111 __be16 port; /* target UDP port */
105 __be32 operation_ID; /* operation ID for an incoming call */ 112 __be32 operation_ID; /* operation ID for an incoming call */
106 u32 count; /* count for use in unmarshalling */ 113 u32 count; /* count for use in unmarshalling */
107 __be32 tmp; /* place to extract temporary data */ 114 __be32 tmp; /* place to extract temporary data */
115 afs_dataversion_t store_version; /* updated version expected from store */
108}; 116};
109 117
110struct afs_call_type { 118struct afs_call_type {
@@ -124,6 +132,32 @@ struct afs_call_type {
124}; 132};
125 133
126/* 134/*
135 * record of an outstanding writeback on a vnode
136 */
137struct afs_writeback {
138 struct list_head link; /* link in vnode->writebacks */
139 struct work_struct writer; /* work item to perform the writeback */
140 struct afs_vnode *vnode; /* vnode to which this write applies */
141 struct key *key; /* owner of this write */
142 wait_queue_head_t waitq; /* completion and ready wait queue */
143 pgoff_t first; /* first page in batch */
144 pgoff_t point; /* last page in current store op */
145 pgoff_t last; /* last page in batch (inclusive) */
146 unsigned offset_first; /* offset into first page of start of write */
147 unsigned to_last; /* offset into last page of end of write */
148 int num_conflicts; /* count of conflicting writes in list */
149 int usage;
150 bool conflicts; /* T if has dependent conflicts */
151 enum {
152 AFS_WBACK_SYNCING, /* synchronisation being performed */
153 AFS_WBACK_PENDING, /* write pending */
154 AFS_WBACK_CONFLICTING, /* conflicting writes posted */
155 AFS_WBACK_WRITING, /* writing back */
156 AFS_WBACK_COMPLETE /* the writeback record has been unlinked */
157 } state __attribute__((packed));
158};
159
160/*
127 * AFS superblock private data 161 * AFS superblock private data
128 * - there's one superblock per volume 162 * - there's one superblock per volume
129 */ 163 */
@@ -305,6 +339,7 @@ struct afs_vnode {
305 wait_queue_head_t update_waitq; /* status fetch waitqueue */ 339 wait_queue_head_t update_waitq; /* status fetch waitqueue */
306 int update_cnt; /* number of outstanding ops that will update the 340 int update_cnt; /* number of outstanding ops that will update the
307 * status */ 341 * status */
342 spinlock_t writeback_lock; /* lock for writebacks */
308 spinlock_t lock; /* waitqueue/flags lock */ 343 spinlock_t lock; /* waitqueue/flags lock */
309 unsigned long flags; 344 unsigned long flags;
310#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ 345#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */
@@ -316,6 +351,8 @@ struct afs_vnode {
316 351
317 long acl_order; /* ACL check count (callback break count) */ 352 long acl_order; /* ACL check count (callback break count) */
318 353
354 struct list_head writebacks; /* alterations in pagecache that need writing */
355
319 /* outstanding callback notification on this file */ 356 /* outstanding callback notification on this file */
320 struct rb_node server_rb; /* link in server->fs_vnodes */ 357 struct rb_node server_rb; /* link in server->fs_vnodes */
321 struct rb_node cb_promise; /* link in server->cb_promises */ 358 struct rb_node cb_promise; /* link in server->cb_promises */
@@ -433,10 +470,6 @@ extern const struct file_operations afs_file_operations;
433extern int afs_open(struct inode *, struct file *); 470extern int afs_open(struct inode *, struct file *);
434extern int afs_release(struct inode *, struct file *); 471extern int afs_release(struct inode *, struct file *);
435 472
436#ifdef AFS_CACHING_SUPPORT
437extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
438#endif
439
440/* 473/*
441 * fsclient.c 474 * fsclient.c
442 */ 475 */
@@ -467,6 +500,12 @@ extern int afs_fs_rename(struct afs_server *, struct key *,
467 struct afs_vnode *, const char *, 500 struct afs_vnode *, const char *,
468 struct afs_vnode *, const char *, 501 struct afs_vnode *, const char *,
469 const struct afs_wait_mode *); 502 const struct afs_wait_mode *);
503extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
504 pgoff_t, pgoff_t, unsigned, unsigned,
505 const struct afs_wait_mode *);
506extern int afs_fs_setattr(struct afs_server *, struct key *,
507 struct afs_vnode *, struct iattr *,
508 const struct afs_wait_mode *);
470 509
471/* 510/*
472 * inode.c 511 * inode.c
@@ -474,10 +513,10 @@ extern int afs_fs_rename(struct afs_server *, struct key *,
474extern struct inode *afs_iget(struct super_block *, struct key *, 513extern struct inode *afs_iget(struct super_block *, struct key *,
475 struct afs_fid *, struct afs_file_status *, 514 struct afs_fid *, struct afs_file_status *,
476 struct afs_callback *); 515 struct afs_callback *);
516extern void afs_zap_data(struct afs_vnode *);
477extern int afs_validate(struct afs_vnode *, struct key *); 517extern int afs_validate(struct afs_vnode *, struct key *);
478extern int afs_inode_getattr(struct vfsmount *, struct dentry *, 518extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
479 struct kstat *); 519extern int afs_setattr(struct dentry *, struct iattr *);
480extern void afs_zap_permits(struct rcu_head *);
481extern void afs_clear_inode(struct inode *); 520extern void afs_clear_inode(struct inode *);
482 521
483/* 522/*
@@ -533,6 +572,7 @@ extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
533 */ 572 */
534extern void afs_clear_permits(struct afs_vnode *); 573extern void afs_clear_permits(struct afs_vnode *);
535extern void afs_cache_permit(struct afs_vnode *, struct key *, long); 574extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
575extern void afs_zap_permits(struct rcu_head *);
536extern struct key *afs_request_key(struct afs_cell *); 576extern struct key *afs_request_key(struct afs_cell *);
537extern int afs_permission(struct inode *, int, struct nameidata *); 577extern int afs_permission(struct inode *, int, struct nameidata *);
538 578
@@ -629,6 +669,9 @@ extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
629 struct afs_file_status *, struct afs_server **); 669 struct afs_file_status *, struct afs_server **);
630extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, 670extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
631 struct key *, const char *, const char *); 671 struct key *, const char *, const char *);
672extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
673 unsigned, unsigned);
674extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
632 675
633/* 676/*
634 * volume.c 677 * volume.c
@@ -645,6 +688,23 @@ extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
645extern int afs_volume_release_fileserver(struct afs_vnode *, 688extern int afs_volume_release_fileserver(struct afs_vnode *,
646 struct afs_server *, int); 689 struct afs_server *, int);
647 690
691/*
692 * write.c
693 */
694extern int afs_set_page_dirty(struct page *);
695extern void afs_put_writeback(struct afs_writeback *);
696extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned);
697extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned);
698extern int afs_writepage(struct page *, struct writeback_control *);
699extern int afs_writepages(struct address_space *, struct writeback_control *);
700extern int afs_write_inode(struct inode *, int);
701extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
702extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
703 unsigned long, loff_t);
704extern int afs_writeback_all(struct afs_vnode *);
705extern int afs_fsync(struct file *, struct dentry *, int);
706
707
648/*****************************************************************************/ 708/*****************************************************************************/
649/* 709/*
650 * debug tracing 710 * debug tracing
@@ -726,6 +786,21 @@ do { \
726 } \ 786 } \
727} while(0) 787} while(0)
728 788
789#define ASSERTRANGE(L, OP1, N, OP2, H) \
790do { \
791 if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \
792 printk(KERN_ERR "\n"); \
793 printk(KERN_ERR "AFS: Assertion failed\n"); \
794 printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \
795 (unsigned long)(L), (unsigned long)(N), \
796 (unsigned long)(H)); \
797 printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \
798 (unsigned long)(L), (unsigned long)(N), \
799 (unsigned long)(H)); \
800 BUG(); \
801 } \
802} while(0)
803
729#define ASSERTIF(C, X) \ 804#define ASSERTIF(C, X) \
730do { \ 805do { \
731 if (unlikely((C) && !(X))) { \ 806 if (unlikely((C) && !(X))) { \
@@ -758,6 +833,10 @@ do { \
758do { \ 833do { \
759} while(0) 834} while(0)
760 835
836#define ASSERTRANGE(L, OP1, N, OP2, H) \
837do { \
838} while(0)
839
761#define ASSERTIF(C, X) \ 840#define ASSERTIF(C, X) \
762do { \ 841do { \
763} while(0) 842} while(0)
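The ASSERTRANGE() macro added above checks that N lies between L and H under the given comparison operators, dumping the three values in both decimal and hex before BUG()ing; like the other assertion macros it compiles away to the empty definition when the checking variants are configured out. A typical use from the new write code:

	/* require wb->first <= index <= wb->last, BUG() otherwise */
	ASSERTRANGE(wb->first, <=, index, <=, wb->last);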
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 80ec6fd19a73..f1f71ff7d5c6 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -149,6 +149,7 @@ error_cache:
149 afs_vlocation_purge(); 149 afs_vlocation_purge();
150 afs_cell_purge(); 150 afs_cell_purge();
151 afs_proc_cleanup(); 151 afs_proc_cleanup();
152 rcu_barrier();
152 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 153 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
153 return ret; 154 return ret;
154} 155}
@@ -176,6 +177,7 @@ static void __exit afs_exit(void)
176 cachefs_unregister_netfs(&afs_cache_netfs); 177 cachefs_unregister_netfs(&afs_cache_netfs);
177#endif 178#endif
178 afs_proc_cleanup(); 179 afs_proc_cleanup();
180 rcu_barrier();
179} 181}
180 182
181module_exit(afs_exit); 183module_exit(afs_exit);
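The rcu_barrier() calls added here follow the standard module-unload rule: any call_rcu() callbacks still in flight (AFS frees its cached permit lists that way via afs_zap_permits()) must run to completion before the module text and its slab caches go away. The generic teardown pattern, assuming a hypothetical cache whose objects are freed by call_rcu():

	static void __exit my_exit(void)
	{
		/* stop queueing new call_rcu() callbacks first ... */
		rcu_barrier();			/* ... then wait for pending ones */
		kmem_cache_destroy(my_cache);	/* now safe to destroy */
	}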
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index cdb9792d8161..d1a889c40742 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -22,6 +22,7 @@ int afs_abort_to_error(u32 abort_code)
22{ 22{
23 switch (abort_code) { 23 switch (abort_code) {
24 case 13: return -EACCES; 24 case 13: return -EACCES;
25 case 27: return -EFBIG;
25 case 30: return -EROFS; 26 case 30: return -EROFS;
26 case VSALVAGE: return -EIO; 27 case VSALVAGE: return -EIO;
27 case VNOVNODE: return -ENOENT; 28 case VNOVNODE: return -ENOENT;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 034fcfd4e330..a3684dcc76e7 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -36,7 +36,7 @@ const struct inode_operations afs_mntpt_inode_operations = {
36 .lookup = afs_mntpt_lookup, 36 .lookup = afs_mntpt_lookup,
37 .follow_link = afs_mntpt_follow_link, 37 .follow_link = afs_mntpt_follow_link,
38 .readlink = page_readlink, 38 .readlink = page_readlink,
39 .getattr = afs_inode_getattr, 39 .getattr = afs_getattr,
40}; 40};
41 41
42static LIST_HEAD(afs_vfsmounts); 42static LIST_HEAD(afs_vfsmounts);
@@ -58,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
58 char *buf; 58 char *buf;
59 int ret; 59 int ret;
60 60
61 _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique); 61 _enter("{%x:%u,%u}",
62 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 63
63 /* read the contents of the symlink into the pagecache */ 64 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 65 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 222c1a3abbb8..04189c47d6a0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -237,6 +237,70 @@ void afs_flat_call_destructor(struct afs_call *call)
237} 237}
238 238
239/* 239/*
240 * attach the data from a bunch of pages on an inode to a call
241 */
242int afs_send_pages(struct afs_call *call, struct msghdr *msg, struct kvec *iov)
243{
244 struct page *pages[8];
245 unsigned count, n, loop, offset, to;
246 pgoff_t first = call->first, last = call->last;
247 int ret;
248
249 _enter("");
250
251 offset = call->first_offset;
252 call->first_offset = 0;
253
254 do {
255 _debug("attach %lx-%lx", first, last);
256
257 count = last - first + 1;
258 if (count > ARRAY_SIZE(pages))
259 count = ARRAY_SIZE(pages);
260 n = find_get_pages_contig(call->mapping, first, count, pages);
261 ASSERTCMP(n, ==, count);
262
263 loop = 0;
264 do {
265 msg->msg_flags = 0;
266 to = PAGE_SIZE;
267 if (first + loop >= last)
268 to = call->last_to;
269 else
270 msg->msg_flags = MSG_MORE;
271 iov->iov_base = kmap(pages[loop]) + offset;
272 iov->iov_len = to - offset;
273
274 _debug("- range %u-%u%s",
275 offset, to, msg->msg_flags ? " [more]" : "");
276 offset = 0;
277 msg->msg_iov = (struct iovec *) iov;
278 msg->msg_iovlen = 1;
279
280 /* have to change the state *before* sending the last
281 * packet as RxRPC might give us the reply before it
282 * returns from sending the request */
283 if (first + loop >= last)
284 call->state = AFS_CALL_AWAIT_REPLY;
285 ret = rxrpc_kernel_send_data(call->rxcall, msg,
286 iov->iov_len);
287 kunmap(pages[loop]);
288 if (ret < 0)
289 break;
290 } while (++loop < count);
291 first += count;
292
293 for (loop = 0; loop < count; loop++)
294 put_page(pages[loop]);
295 if (ret < 0)
296 break;
297 } while (first <= last);
298
299 _leave(" = %d", ret);
300 return ret;
301}
302
303/*
240 * initiate a call 304 * initiate a call
241 */ 305 */
242int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, 306int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
@@ -253,8 +317,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
253 ASSERT(call->type != NULL); 317 ASSERT(call->type != NULL);
254 ASSERT(call->type->name != NULL); 318 ASSERT(call->type->name != NULL);
255 319
256 _debug("MAKE %p{%s} [%d]", 320 _debug("____MAKE %p{%s,%x} [%d]____",
257 call, call->type->name, atomic_read(&afs_outstanding_calls)); 321 call, call->type->name, key_serial(call->key),
322 atomic_read(&afs_outstanding_calls));
258 323
259 call->wait_mode = wait_mode; 324 call->wait_mode = wait_mode;
260 INIT_WORK(&call->async_work, afs_process_async_call); 325 INIT_WORK(&call->async_work, afs_process_async_call);
@@ -289,16 +354,23 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
289 msg.msg_iovlen = 1; 354 msg.msg_iovlen = 1;
290 msg.msg_control = NULL; 355 msg.msg_control = NULL;
291 msg.msg_controllen = 0; 356 msg.msg_controllen = 0;
292 msg.msg_flags = 0; 357 msg.msg_flags = (call->send_pages ? MSG_MORE : 0);
293 358
294 /* have to change the state *before* sending the last packet as RxRPC 359 /* have to change the state *before* sending the last packet as RxRPC
295 * might give us the reply before it returns from sending the 360 * might give us the reply before it returns from sending the
296 * request */ 361 * request */
297 call->state = AFS_CALL_AWAIT_REPLY; 362 if (!call->send_pages)
363 call->state = AFS_CALL_AWAIT_REPLY;
298 ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); 364 ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
299 if (ret < 0) 365 if (ret < 0)
300 goto error_do_abort; 366 goto error_do_abort;
301 367
368 if (call->send_pages) {
369 ret = afs_send_pages(call, &msg, iov);
370 if (ret < 0)
371 goto error_do_abort;
372 }
373
302 /* at this point, an async call may no longer exist as it may have 374 /* at this point, an async call may no longer exist as it may have
303 * already completed */ 375 * already completed */
304 return wait_mode->wait(call); 376 return wait_mode->wait(call);
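afs_send_pages() leaves MSG_MORE set on every chunk except the final one, telling RxRPC that more request data follows so it can coalesce packets, and afs_make_call() now does the same for the fixed part of a request that has pages attached. The flag has the same meaning on ordinary sockets; a user-space analogue (illustration only, not the kernel API):

	#include <sys/socket.h>

	/* send buf[0..n) in two pieces; the first advertises more to come */
	static ssize_t send_in_two(int fd, const char *buf, size_t n,
				   size_t split)
	{
		ssize_t ret = send(fd, buf, split, MSG_MORE);
		if (ret < 0)
			return ret;
		return send(fd, buf + split, n - split, 0);
	}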
diff --git a/fs/afs/security.c b/fs/afs/security.c
index f9f424d80458..e0ea88b63ebf 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -109,7 +109,7 @@ void afs_clear_permits(struct afs_vnode *vnode)
109{ 109{
110 struct afs_permits *permits; 110 struct afs_permits *permits;
111 111
112 _enter("{%x}", vnode->fid.vnode); 112 _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
113 113
114 mutex_lock(&vnode->permits_lock); 114 mutex_lock(&vnode->permits_lock);
115 permits = vnode->permits; 115 permits = vnode->permits;
@@ -132,7 +132,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
132 struct afs_vnode *auth_vnode; 132 struct afs_vnode *auth_vnode;
133 int count, loop; 133 int count, loop;
134 134
135 _enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order); 135 _enter("{%x:%u},%x,%lx",
136 vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order);
136 137
137 auth_vnode = afs_get_auth_inode(vnode, key); 138 auth_vnode = afs_get_auth_inode(vnode, key);
138 if (IS_ERR(auth_vnode)) { 139 if (IS_ERR(auth_vnode)) {
@@ -220,7 +221,8 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
220 bool valid; 221 bool valid;
221 int loop, ret; 222 int loop, ret;
222 223
223 _enter(""); 224 _enter("{%x:%u},%x",
225 vnode->fid.vid, vnode->fid.vnode, key_serial(key));
224 226
225 auth_vnode = afs_get_auth_inode(vnode, key); 227 auth_vnode = afs_get_auth_inode(vnode, key);
226 if (IS_ERR(auth_vnode)) { 228 if (IS_ERR(auth_vnode)) {
@@ -268,9 +270,9 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
268 _leave(" = %d", ret); 270 _leave(" = %d", ret);
269 return ret; 271 return ret;
270 } 272 }
273 *_access = vnode->status.caller_access;
271 } 274 }
272 275
273 *_access = vnode->status.caller_access;
274 iput(&auth_vnode->vfs_inode); 276 iput(&auth_vnode->vfs_inode);
275 _leave(" = 0 [access %x]", *_access); 277 _leave(" = 0 [access %x]", *_access);
276 return 0; 278 return 0;
@@ -288,7 +290,7 @@ int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
288 struct key *key; 290 struct key *key;
289 int ret; 291 int ret;
290 292
291 _enter("{{%x:%x},%lx},%x,", 293 _enter("{{%x:%u},%lx},%x,",
292 vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); 294 vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
293 295
294 key = afs_request_key(vnode->volume->cell); 296 key = afs_request_key(vnode->volume->cell);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 96bb23b476a2..231ae4150279 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -252,6 +252,9 @@ static void afs_destroy_server(struct afs_server *server)
252{ 252{
253 _enter("%p", server); 253 _enter("%p", server);
254 254
255 ASSERTIF(server->cb_break_head != server->cb_break_tail,
256 delayed_work_pending(&server->cb_break_work));
257
255 ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); 258 ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
256 ASSERTCMP(server->cb_promises.rb_node, ==, NULL); 259 ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
257 ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); 260 ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7030d76155fc..d24be334b608 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -50,6 +50,7 @@ static const struct super_operations afs_super_ops = {
50 .statfs = simple_statfs, 50 .statfs = simple_statfs,
51 .alloc_inode = afs_alloc_inode, 51 .alloc_inode = afs_alloc_inode,
52 .drop_inode = generic_delete_inode, 52 .drop_inode = generic_delete_inode,
53 .write_inode = afs_write_inode,
53 .destroy_inode = afs_destroy_inode, 54 .destroy_inode = afs_destroy_inode,
54 .clear_inode = afs_clear_inode, 55 .clear_inode = afs_clear_inode,
55 .umount_begin = afs_umount_begin, 56 .umount_begin = afs_umount_begin,
@@ -66,7 +67,7 @@ enum {
66 afs_opt_vol, 67 afs_opt_vol,
67}; 68};
68 69
69static const match_table_t afs_options_list = { 70static match_table_t afs_options_list = {
70 { afs_opt_cell, "cell=%s" }, 71 { afs_opt_cell, "cell=%s" },
71 { afs_opt_rwpath, "rwpath" }, 72 { afs_opt_rwpath, "rwpath" },
72 { afs_opt_vol, "vol=%s" }, 73 { afs_opt_vol, "vol=%s" },
@@ -459,7 +460,9 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
459 init_waitqueue_head(&vnode->update_waitq); 460 init_waitqueue_head(&vnode->update_waitq);
460 mutex_init(&vnode->permits_lock); 461 mutex_init(&vnode->permits_lock);
461 mutex_init(&vnode->validate_lock); 462 mutex_init(&vnode->validate_lock);
463 spin_lock_init(&vnode->writeback_lock);
462 spin_lock_init(&vnode->lock); 464 spin_lock_init(&vnode->lock);
465 INIT_LIST_HEAD(&vnode->writebacks);
463 INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); 466 INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
464 } 467 }
465} 468}
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index a1904ab8426a..ec814660209f 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -261,7 +261,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,
261 261
262 DECLARE_WAITQUEUE(myself, current); 262 DECLARE_WAITQUEUE(myself, current);
263 263
264 _enter("%s,{%u,%u,%u}", 264 _enter("%s,{%x:%u.%u}",
265 vnode->volume->vlocation->vldb.name, 265 vnode->volume->vlocation->vldb.name,
266 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 266 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
267 267
@@ -389,7 +389,7 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
389 struct afs_server *server; 389 struct afs_server *server;
390 int ret; 390 int ret;
391 391
392 _enter("%s{%u,%u,%u},%x,,,", 392 _enter("%s{%x:%u.%u},%x,,,",
393 vnode->volume->vlocation->vldb.name, 393 vnode->volume->vlocation->vldb.name,
394 vnode->fid.vid, 394 vnode->fid.vid,
395 vnode->fid.vnode, 395 vnode->fid.vnode,
@@ -446,7 +446,7 @@ int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
446 struct afs_server *server; 446 struct afs_server *server;
447 int ret; 447 int ret;
448 448
449 _enter("%s{%u,%u,%u},%x,%s,,", 449 _enter("%s{%x:%u.%u},%x,%s,,",
450 vnode->volume->vlocation->vldb.name, 450 vnode->volume->vlocation->vldb.name,
451 vnode->fid.vid, 451 vnode->fid.vid,
452 vnode->fid.vnode, 452 vnode->fid.vnode,
@@ -502,7 +502,7 @@ int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
502 struct afs_server *server; 502 struct afs_server *server;
503 int ret; 503 int ret;
504 504
505 _enter("%s{%u,%u,%u},%x,%s", 505 _enter("%s{%x:%u.%u},%x,%s",
506 vnode->volume->vlocation->vldb.name, 506 vnode->volume->vlocation->vldb.name,
507 vnode->fid.vid, 507 vnode->fid.vid,
508 vnode->fid.vnode, 508 vnode->fid.vnode,
@@ -557,7 +557,7 @@ extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
557 struct afs_server *server; 557 struct afs_server *server;
558 int ret; 558 int ret;
559 559
560 _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s", 560 _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s",
561 dvnode->volume->vlocation->vldb.name, 561 dvnode->volume->vlocation->vldb.name,
562 dvnode->fid.vid, 562 dvnode->fid.vid,
563 dvnode->fid.vnode, 563 dvnode->fid.vnode,
@@ -628,7 +628,7 @@ int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
628 struct afs_server *server; 628 struct afs_server *server;
629 int ret; 629 int ret;
630 630
631 _enter("%s{%u,%u,%u},%x,%s,%s,,,", 631 _enter("%s{%x:%u.%u},%x,%s,%s,,,",
632 vnode->volume->vlocation->vldb.name, 632 vnode->volume->vlocation->vldb.name,
633 vnode->fid.vid, 633 vnode->fid.vid,
634 vnode->fid.vnode, 634 vnode->fid.vnode,
@@ -687,7 +687,7 @@ int afs_vnode_rename(struct afs_vnode *orig_dvnode,
687 struct afs_server *server; 687 struct afs_server *server;
688 int ret; 688 int ret;
689 689
690 _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s", 690 _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s,%s",
691 orig_dvnode->volume->vlocation->vldb.name, 691 orig_dvnode->volume->vlocation->vldb.name,
692 orig_dvnode->fid.vid, 692 orig_dvnode->fid.vid,
693 orig_dvnode->fid.vnode, 693 orig_dvnode->fid.vnode,
@@ -753,3 +753,110 @@ no_server:
753 _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); 753 _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
754 return PTR_ERR(server); 754 return PTR_ERR(server);
755} 755}
756
757/*
758 * write to a file
759 */
760int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
761 unsigned offset, unsigned to)
762{
763 struct afs_server *server;
764 struct afs_vnode *vnode = wb->vnode;
765 int ret;
766
767 _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x",
768 vnode->volume->vlocation->vldb.name,
769 vnode->fid.vid,
770 vnode->fid.vnode,
771 vnode->fid.unique,
772 key_serial(wb->key),
773 first, last, offset, to);
774
775 /* this op will fetch the status */
776 spin_lock(&vnode->lock);
777 vnode->update_cnt++;
778 spin_unlock(&vnode->lock);
779
780 do {
781 /* pick a server to query */
782 server = afs_volume_pick_fileserver(vnode);
783 if (IS_ERR(server))
784 goto no_server;
785
786 _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
787
788 ret = afs_fs_store_data(server, wb, first, last, offset, to,
789 &afs_sync_call);
790
791 } while (!afs_volume_release_fileserver(vnode, server, ret));
792
793 /* adjust the flags */
794 if (ret == 0) {
795 afs_vnode_finalise_status_update(vnode, server);
796 afs_put_server(server);
797 } else {
798 afs_vnode_status_update_failed(vnode, ret);
799 }
800
801 _leave(" = %d", ret);
802 return ret;
803
804no_server:
805 spin_lock(&vnode->lock);
806 vnode->update_cnt--;
807 ASSERTCMP(vnode->update_cnt, >=, 0);
808 spin_unlock(&vnode->lock);
809 return PTR_ERR(server);
810}
811
812/*
813 * set the attributes on a file
814 */
815int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
816 struct iattr *attr)
817{
818 struct afs_server *server;
819 int ret;
820
821 _enter("%s{%x:%u.%u},%x",
822 vnode->volume->vlocation->vldb.name,
823 vnode->fid.vid,
824 vnode->fid.vnode,
825 vnode->fid.unique,
826 key_serial(key));
827
828 /* this op will fetch the status */
829 spin_lock(&vnode->lock);
830 vnode->update_cnt++;
831 spin_unlock(&vnode->lock);
832
833 do {
834 /* pick a server to query */
835 server = afs_volume_pick_fileserver(vnode);
836 if (IS_ERR(server))
837 goto no_server;
838
839 _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
840
841 ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call);
842
843 } while (!afs_volume_release_fileserver(vnode, server, ret));
844
845 /* adjust the flags */
846 if (ret == 0) {
847 afs_vnode_finalise_status_update(vnode, server);
848 afs_put_server(server);
849 } else {
850 afs_vnode_status_update_failed(vnode, ret);
851 }
852
853 _leave(" = %d", ret);
854 return ret;
855
856no_server:
857 spin_lock(&vnode->lock);
858 vnode->update_cnt--;
859 ASSERTCMP(vnode->update_cnt, >=, 0);
860 spin_unlock(&vnode->lock);
861 return PTR_ERR(server);
862}
diff --git a/fs/afs/write.c b/fs/afs/write.c
new file mode 100644
index 000000000000..83ff29262816
--- /dev/null
+++ b/fs/afs/write.c
@@ -0,0 +1,835 @@
1/* handling of writes to regular files and writing back to the server
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include <linux/fs.h>
14#include <linux/pagemap.h>
15#include <linux/writeback.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19static int afs_write_back_from_locked_page(struct afs_writeback *wb,
20 struct page *page);
21
22/*
23 * mark a page as having been made dirty and thus needing writeback
24 */
25int afs_set_page_dirty(struct page *page)
26{
27 _enter("");
28 return __set_page_dirty_nobuffers(page);
29}
30
31/*
32 * unlink a writeback record because its usage has reached zero
33 * - must be called with the wb->vnode->writeback_lock held
34 */
35static void afs_unlink_writeback(struct afs_writeback *wb)
36{
37 struct afs_writeback *front;
38 struct afs_vnode *vnode = wb->vnode;
39
40 list_del_init(&wb->link);
41 if (!list_empty(&vnode->writebacks)) {
42 /* if an fsync rises to the front of the queue then wake it
43 * up */
44 front = list_entry(vnode->writebacks.next,
45 struct afs_writeback, link);
46 if (front->state == AFS_WBACK_SYNCING) {
47 _debug("wake up sync");
48 front->state = AFS_WBACK_COMPLETE;
49 wake_up(&front->waitq);
50 }
51 }
52}
53
54/*
55 * free a writeback record
56 */
57static void afs_free_writeback(struct afs_writeback *wb)
58{
59 _enter("");
60 key_put(wb->key);
61 kfree(wb);
62}
63
64/*
65 * dispose of a reference to a writeback record
66 */
67void afs_put_writeback(struct afs_writeback *wb)
68{
69 struct afs_vnode *vnode = wb->vnode;
70
71 _enter("{%d}", wb->usage);
72
73 spin_lock(&vnode->writeback_lock);
74 if (--wb->usage == 0)
75 afs_unlink_writeback(wb);
76 else
77 wb = NULL;
78 spin_unlock(&vnode->writeback_lock);
79 if (wb)
80 afs_free_writeback(wb);
81}
82
83/*
84 * partly or wholly fill a page that's under preparation for writing
85 */
86static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
87 unsigned start, unsigned len, struct page *page)
88{
89 int ret;
90
91 _enter(",,%u,%u", start, len);
92
93 ASSERTCMP(start + len, <=, PAGE_SIZE);
94
95 ret = afs_vnode_fetch_data(vnode, key, start, len, page);
96 if (ret < 0) {
97 if (ret == -ENOENT) {
98 _debug("got NOENT from server"
99 " - marking file deleted and stale");
100 set_bit(AFS_VNODE_DELETED, &vnode->flags);
101 ret = -ESTALE;
102 }
103 }
104
105 _leave(" = %d", ret);
106 return ret;
107}
108
109/*
110 * prepare a page for being written to
111 */
112static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
113 struct key *key, unsigned offset, unsigned to)
114{
115 unsigned eof, tail, start, stop, len;
116 loff_t i_size, pos;
117 void *p;
118 int ret;
119
120 _enter("");
121
122 if (offset == 0 && to == PAGE_SIZE)
123 return 0;
124
125 p = kmap(page);
126
127 i_size = i_size_read(&vnode->vfs_inode);
128 pos = (loff_t) page->index << PAGE_SHIFT;
129 if (pos >= i_size) {
130 /* partial write, page beyond EOF */
131 _debug("beyond");
132 if (offset > 0)
133 memset(p, 0, offset);
134 if (to < PAGE_SIZE)
135 memset(p + to, 0, PAGE_SIZE - to);
136 kunmap(page);
137 return 0;
138 }
139
140 if (i_size - pos >= PAGE_SIZE) {
141 /* partial write, page entirely before EOF */
142 _debug("before");
143 tail = eof = PAGE_SIZE;
144 } else {
145 /* partial write, page overlaps EOF */
146 eof = i_size - pos;
147 _debug("overlap %u", eof);
148 tail = max(eof, to);
149 if (tail < PAGE_SIZE)
150 memset(p + tail, 0, PAGE_SIZE - tail);
151 if (offset > eof)
152 memset(p + eof, 0, PAGE_SIZE - eof);
153 }
154
155 kunmap(page);
156
157 ret = 0;
158 if (offset > 0 || eof > to) {
159 /* need to fill one or two bits that aren't going to be written
160 * (cover both fillers in one read if there are two) */
161 start = (offset > 0) ? 0 : to;
162 stop = (eof > to) ? eof : offset;
163 len = stop - start;
164 _debug("wr=%u-%u av=0-%u rd=%u@%u",
165 offset, to, eof, start, len);
166 ret = afs_fill_page(vnode, key, start, len, page);
167 }
168
169 _leave(" = %d", ret);
170 return ret;
171}
172
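To make the gap arithmetic in afs_prepare_page() concrete (illustrative numbers, 4096-byte pages): suppose i_size = 10000 and page->index = 2, so pos = 8192 and eof = i_size - pos = 1808; a write of bytes 100-299 of the page gives offset = 100 and to = 300. Then tail = max(1808, 300) = 1808, so bytes 1808-4095 are zeroed in memory. Since offset > 0 and eof > to, start = 0, stop = 1808 and len = 1808: a single fetch from the server fills bytes 0-1807, covering both the gap before the write (0-99) and the gap after it (300-1807) in one read, as the comment in the function promises.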
173/*
174 * prepare to perform part of a write to a page
175 * - the caller holds the page locked, preventing it from being written out or
176 * modified by anyone else
177 */
178int afs_prepare_write(struct file *file, struct page *page,
179 unsigned offset, unsigned to)
180{
181 struct afs_writeback *candidate, *wb;
182 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
183 struct key *key = file->private_data;
184 pgoff_t index;
185 int ret;
186
187 _enter("{%x:%u},{%lx},%u,%u",
188 vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
189
190 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
191 if (!candidate)
192 return -ENOMEM;
193 candidate->vnode = vnode;
194 candidate->first = candidate->last = page->index;
195 candidate->offset_first = offset;
196 candidate->to_last = to;
197 candidate->usage = 1;
198 candidate->state = AFS_WBACK_PENDING;
199 init_waitqueue_head(&candidate->waitq);
200
201 if (!PageUptodate(page)) {
202 _debug("not up to date");
203 ret = afs_prepare_page(vnode, page, key, offset, to);
204 if (ret < 0) {
205 kfree(candidate);
206 _leave(" = %d [prep]", ret);
207 return ret;
208 }
209 SetPageUptodate(page);
210 }
211
212try_again:
213 index = page->index;
214 spin_lock(&vnode->writeback_lock);
215
216 /* see if this page is already pending a writeback under a suitable key
217 * - if so we can just join onto that one */
218 wb = (struct afs_writeback *) page_private(page);
219 if (wb) {
220 if (wb->key == key && wb->state == AFS_WBACK_PENDING)
221 goto subsume_in_current_wb;
222 goto flush_conflicting_wb;
223 }
224
225 if (index > 0) {
226 /* see if we can find an already pending writeback that we can
227 * append this page to */
228 list_for_each_entry(wb, &vnode->writebacks, link) {
229 if (wb->last == index - 1 && wb->key == key &&
230 wb->state == AFS_WBACK_PENDING)
231 goto append_to_previous_wb;
232 }
233 }
234
235 list_add_tail(&candidate->link, &vnode->writebacks);
236 candidate->key = key_get(key);
237 spin_unlock(&vnode->writeback_lock);
238 SetPagePrivate(page);
239 set_page_private(page, (unsigned long) candidate);
240 _leave(" = 0 [new]");
241 return 0;
242
243subsume_in_current_wb:
244 _debug("subsume");
245 ASSERTRANGE(wb->first, <=, index, <=, wb->last);
246 if (index == wb->first && offset < wb->offset_first)
247 wb->offset_first = offset;
248 if (index == wb->last && to > wb->to_last)
249 wb->to_last = to;
250 spin_unlock(&vnode->writeback_lock);
251 kfree(candidate);
252 _leave(" = 0 [sub]");
253 return 0;
254
255append_to_previous_wb:
256 _debug("append into %lx-%lx", wb->first, wb->last);
257 wb->usage++;
258 wb->last++;
259 wb->to_last = to;
260 spin_unlock(&vnode->writeback_lock);
261 SetPagePrivate(page);
262 set_page_private(page, (unsigned long) wb);
263 kfree(candidate);
264 _leave(" = 0 [app]");
265 return 0;
266
267 /* the page is currently bound to another context, so if it's dirty we
268 * need to flush it before we can use the new context */
269flush_conflicting_wb:
270 _debug("flush conflict");
271 if (wb->state == AFS_WBACK_PENDING)
272 wb->state = AFS_WBACK_CONFLICTING;
273 spin_unlock(&vnode->writeback_lock);
274 if (PageDirty(page)) {
275 ret = afs_write_back_from_locked_page(wb, page);
276 if (ret < 0) {
277 afs_put_writeback(candidate);
278 _leave(" = %d", ret);
279 return ret;
280 }
281 }
282
283 /* the page holds a ref on the writeback record */
284 afs_put_writeback(wb);
285 set_page_private(page, 0);
286 ClearPagePrivate(page);
287 goto try_again;
288}
289
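afs_prepare_write() above threads its writeback records through the VM's per-page private word, so any dirty page can be traced back to the record that owns it. The attach/detach idiom, shown in isolation:

	/* attach: page->private carries a filesystem-owned cookie */
	SetPagePrivate(page);
	set_page_private(page, (unsigned long) wb);

	/* detach, once the page no longer references the record */
	set_page_private(page, 0);
	ClearPagePrivate(page);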
290/*
291 * finalise part of a write to a page
292 */
293int afs_commit_write(struct file *file, struct page *page,
294 unsigned offset, unsigned to)
295{
296 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
297 loff_t i_size, maybe_i_size;
298
299 _enter("{%x:%u},{%lx},%u,%u",
300 vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
301
302 maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
303 maybe_i_size += to;
304
305 i_size = i_size_read(&vnode->vfs_inode);
306 if (maybe_i_size > i_size) {
307 spin_lock(&vnode->writeback_lock);
308 i_size = i_size_read(&vnode->vfs_inode);
309 if (maybe_i_size > i_size)
310 i_size_write(&vnode->vfs_inode, maybe_i_size);
311 spin_unlock(&vnode->writeback_lock);
312 }
313
314 set_page_dirty(page);
315
316 if (PageDirty(page))
317 _debug("dirtied");
318
319 return 0;
320}
321
322/*
323 * kill all the pages in the given range
324 */
325static void afs_kill_pages(struct afs_vnode *vnode, bool error,
326 pgoff_t first, pgoff_t last)
327{
328 struct pagevec pv;
329 unsigned count, loop;
330
331 _enter("{%x:%u},%lx-%lx",
332 vnode->fid.vid, vnode->fid.vnode, first, last);
333
334 pagevec_init(&pv, 0);
335
336 do {
337 _debug("kill %lx-%lx", first, last);
338
339 count = last - first + 1;
340 if (count > PAGEVEC_SIZE)
341 count = PAGEVEC_SIZE;
342 pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
343 first, count, pv.pages);
344 ASSERTCMP(pv.nr, ==, count);
345
346 for (loop = 0; loop < count; loop++) {
347 ClearPageUptodate(pv.pages[loop]);
348 if (error)
349 SetPageError(pv.pages[loop]);
350 end_page_writeback(pv.pages[loop]);
351 }
352 first += count;
353 __pagevec_release(&pv);
354 } while (first <= last);
355
356 _leave("");
357}
358
359/*
360 * synchronously write back the locked page and any subsequent non-locked dirty
361 * pages also covered by the same writeback record
362 */
363static int afs_write_back_from_locked_page(struct afs_writeback *wb,
364 struct page *primary_page)
365{
366 struct page *pages[8], *page;
367 unsigned long count;
368 unsigned n, offset, to;
369 pgoff_t start, first, last;
370 int loop, ret;
371
372 _enter(",%lx", primary_page->index);
373
374 count = 1;
375 if (!clear_page_dirty_for_io(primary_page))
376 BUG();
377 if (test_set_page_writeback(primary_page))
378 BUG();
379
380 /* find all consecutive lockable dirty pages, stopping when we find a
381 * page that is not immediately lockable, is not dirty or is missing,
382 * or we reach the end of the range */
383 start = primary_page->index;
384 if (start >= wb->last)
385 goto no_more;
386 start++;
387 do {
388 _debug("more %lx [%lx]", start, count);
389 n = wb->last - start + 1;
390 if (n > ARRAY_SIZE(pages))
391 n = ARRAY_SIZE(pages);
392 n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping,
393 start, n, pages);
394 _debug("fgpc %u", n);
395 if (n == 0)
396 goto no_more;
397 if (pages[0]->index != start) {
398 while (n > 0)
399 put_page(pages[--n]);
400 goto no_more;
401 }
402
403 for (loop = 0; loop < n; loop++) {
404 page = pages[loop];
405 if (page->index > wb->last)
406 break;
407 if (TestSetPageLocked(page))
408 break;
409 if (!PageDirty(page) ||
410 page_private(page) != (unsigned long) wb) {
411 unlock_page(page);
412 break;
413 }
414 if (!clear_page_dirty_for_io(page))
415 BUG();
416 if (test_set_page_writeback(page))
417 BUG();
418 unlock_page(page);
419 put_page(page);
420 }
421 count += loop;
422 if (loop < n) {
423 for (; loop < n; loop++)
424 put_page(pages[loop]);
425 goto no_more;
426 }
427
428 start += loop;
429 } while (start <= wb->last && count < 65536);
430
431no_more:
432 /* we now have a contiguous set of dirty pages, each with writeback set
433 * and the dirty mark cleared; the first page is locked and must remain
434 * so, all the rest are unlocked */
435 first = primary_page->index;
436 last = first + count - 1;
437
438 offset = (first == wb->first) ? wb->offset_first : 0;
439 to = (last == wb->last) ? wb->to_last : PAGE_SIZE;
440
441 _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
442
443 ret = afs_vnode_store_data(wb, first, last, offset, to);
444 if (ret < 0) {
445 switch (ret) {
446 case -EDQUOT:
447 case -ENOSPC:
448 set_bit(AS_ENOSPC,
449 &wb->vnode->vfs_inode.i_mapping->flags);
450 break;
451 case -EROFS:
452 case -EIO:
453 case -EREMOTEIO:
454 case -EFBIG:
455 case -ENOENT:
456 case -ENOMEDIUM:
457 case -ENXIO:
458 afs_kill_pages(wb->vnode, true, first, last);
459 set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags);
460 break;
461 case -EACCES:
462 case -EPERM:
463 case -ENOKEY:
464 case -EKEYEXPIRED:
465 case -EKEYREJECTED:
466 case -EKEYREVOKED:
467 afs_kill_pages(wb->vnode, false, first, last);
468 break;
469 default:
470 break;
471 }
472 } else {
473 ret = count;
474 }
475
476 _leave(" = %d", ret);
477 return ret;
478}
479
480/*
481 * write a page back to the server
482 * - the caller locked the page for us
483 */
484int afs_writepage(struct page *page, struct writeback_control *wbc)
485{
486 struct backing_dev_info *bdi = page->mapping->backing_dev_info;
487 struct afs_writeback *wb;
488 int ret;
489
490 _enter("{%lx},", page->index);
491
492 if (wbc->sync_mode != WB_SYNC_NONE)
493 wait_on_page_writeback(page);
494
495 if (PageWriteback(page) || !PageDirty(page)) {
496 unlock_page(page);
497 return 0;
498 }
499
500 wb = (struct afs_writeback *) page_private(page);
501 ASSERT(wb != NULL);
502
503 ret = afs_write_back_from_locked_page(wb, page);
504 unlock_page(page);
505 if (ret < 0) {
506 _leave(" = %d", ret);
507 return 0;
508 }
509
510 wbc->nr_to_write -= ret;
511 if (wbc->nonblocking && bdi_write_congested(bdi))
512 wbc->encountered_congestion = 1;
513
514 _leave(" = 0");
515 return 0;
516}
517
518/*
519 * write a region of pages back to the server
520 */
521int afs_writepages_region(struct address_space *mapping,
522 struct writeback_control *wbc,
523 pgoff_t index, pgoff_t end, pgoff_t *_next)
524{
525 struct backing_dev_info *bdi = mapping->backing_dev_info;
526 struct afs_writeback *wb;
527 struct page *page;
528 int ret, n;
529
530 _enter(",,%lx,%lx,", index, end);
531
532 do {
533 n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
534 1, &page);
535 if (!n)
536 break;
537
538 _debug("wback %lx", page->index);
539
540 if (page->index > end) {
541 *_next = index;
542 page_cache_release(page);
543 _leave(" = 0 [%lx]", *_next);
544 return 0;
545 }
546
547 /* at this point we hold neither mapping->tree_lock nor lock on
548 * the page itself: the page may be truncated or invalidated
549 * (changing page->mapping to NULL), or even swizzled back from
550 * swapper_space to tmpfs file mapping
551 */
552 lock_page(page);
553
554 if (page->mapping != mapping) {
555 unlock_page(page);
556 page_cache_release(page);
557 continue;
558 }
559
560 if (wbc->sync_mode != WB_SYNC_NONE)
561 wait_on_page_writeback(page);
562
563 if (PageWriteback(page) || !PageDirty(page)) {
564 unlock_page(page);
565 continue;
566 }
567
568 wb = (struct afs_writeback *) page_private(page);
569 ASSERT(wb != NULL);
570
571 spin_lock(&wb->vnode->writeback_lock);
572 wb->state = AFS_WBACK_WRITING;
573 spin_unlock(&wb->vnode->writeback_lock);
574
575 ret = afs_write_back_from_locked_page(wb, page);
576 unlock_page(page);
577 page_cache_release(page);
578 if (ret < 0) {
579 _leave(" = %d", ret);
580 return ret;
581 }
582
583 wbc->nr_to_write -= ret;
584
585 if (wbc->nonblocking && bdi_write_congested(bdi)) {
586 wbc->encountered_congestion = 1;
587 break;
588 }
589
590 cond_resched();
591 } while (index < end && wbc->nr_to_write > 0);
592
593 *_next = index;
594 _leave(" = 0 [%lx]", *_next);
595 return 0;
596}
597
598/*
599 * write some of the pending data back to the server
600 */
601int afs_writepages(struct address_space *mapping,
602 struct writeback_control *wbc)
603{
604 struct backing_dev_info *bdi = mapping->backing_dev_info;
605 pgoff_t start, end, next;
606 int ret;
607
608 _enter("");
609
610 if (wbc->nonblocking && bdi_write_congested(bdi)) {
611 wbc->encountered_congestion = 1;
612 _leave(" = 0 [congest]");
613 return 0;
614 }
615
616 if (wbc->range_cyclic) {
617 start = mapping->writeback_index;
618 end = -1;
619 ret = afs_writepages_region(mapping, wbc, start, end, &next);
620 if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
621 !(wbc->nonblocking && wbc->encountered_congestion))
622 ret = afs_writepages_region(mapping, wbc, 0, start,
623 &next);
624 mapping->writeback_index = next;
625 } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
626 end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
627 ret = afs_writepages_region(mapping, wbc, 0, end, &next);
628 if (wbc->nr_to_write > 0)
629 mapping->writeback_index = next;
630 } else {
631 start = wbc->range_start >> PAGE_CACHE_SHIFT;
632 end = wbc->range_end >> PAGE_CACHE_SHIFT;
633 ret = afs_writepages_region(mapping, wbc, start, end, &next);
634 }
635
636 _leave(" = %d", ret);
637 return ret;
638}
639
640/*
641 * write an inode back
642 */
643int afs_write_inode(struct inode *inode, int sync)
644{
645 struct afs_vnode *vnode = AFS_FS_I(inode);
646 int ret;
647
648 _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
649
650 ret = 0;
651 if (sync) {
652 ret = filemap_fdatawait(inode->i_mapping);
653 if (ret < 0)
654 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
655 }
656
657 _leave(" = %d", ret);
658 return ret;
659}
660
661/*
662 * completion of write to server
663 */
664void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
665{
666 struct afs_writeback *wb = call->wb;
667 struct pagevec pv;
668 unsigned count, loop;
669 pgoff_t first = call->first, last = call->last;
670 bool free_wb;
671
672 _enter("{%x:%u},{%lx-%lx}",
673 vnode->fid.vid, vnode->fid.vnode, first, last);
674
675 ASSERT(wb != NULL);
676
677 pagevec_init(&pv, 0);
678
679 do {
680 _debug("attach %lx-%lx", first, last);
681
682 count = last - first + 1;
683 if (count > PAGEVEC_SIZE)
684 count = PAGEVEC_SIZE;
685 pv.nr = find_get_pages_contig(call->mapping, first, count,
686 pv.pages);
687 ASSERTCMP(pv.nr, ==, count);
688
689 spin_lock(&vnode->writeback_lock);
690 for (loop = 0; loop < count; loop++) {
691 struct page *page = pv.pages[loop];
692 end_page_writeback(page);
693 if (page_private(page) == (unsigned long) wb) {
694 set_page_private(page, 0);
695 ClearPagePrivate(page);
696 wb->usage--;
697 }
698 }
699 free_wb = false;
700 if (wb->usage == 0) {
701 afs_unlink_writeback(wb);
702 free_wb = true;
703 }
704 spin_unlock(&vnode->writeback_lock);
705 first += count;
706 if (free_wb) {
707 afs_free_writeback(wb);
708 wb = NULL;
709 }
710
711 __pagevec_release(&pv);
712 } while (first <= last);
713
714 _leave("");
715}
716
717/*
718 * write to an AFS file
719 */
720ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
721 unsigned long nr_segs, loff_t pos)
722{
723 struct dentry *dentry = iocb->ki_filp->f_path.dentry;
724 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
725 ssize_t result;
726 size_t count = iov_length(iov, nr_segs);
727 int ret;
728
729 _enter("{%x:%u},{%zu},%lu,",
730 vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
731
732 if (IS_SWAPFILE(&vnode->vfs_inode)) {
733 printk(KERN_INFO
734 "AFS: Attempt to write to active swap file!\n");
735 return -EBUSY;
736 }
737
738 if (!count)
739 return 0;
740
741 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
742 if (IS_ERR_VALUE(result)) {
743 _leave(" = %zd", result);
744 return result;
745 }
746
747 /* return error values for O_SYNC and IS_SYNC() */
748 if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
749 ret = afs_fsync(iocb->ki_filp, dentry, 1);
750 if (ret < 0)
751 result = ret;
752 }
753
754 _leave(" = %zd", result);
755 return result;
756}
757
758/*
759 * flush the vnode to the fileserver
760 */
761int afs_writeback_all(struct afs_vnode *vnode)
762{
763 struct address_space *mapping = vnode->vfs_inode.i_mapping;
764 struct writeback_control wbc = {
765 .bdi = mapping->backing_dev_info,
766 .sync_mode = WB_SYNC_ALL,
767 .nr_to_write = LONG_MAX,
768 .for_writepages = 1,
769 .range_cyclic = 1,
770 };
771 int ret;
772
773 _enter("");
774
775 ret = mapping->a_ops->writepages(mapping, &wbc);
776 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
777
778 _leave(" = %d", ret);
779 return ret;
780}
781
782/*
783 * flush any dirty pages for this process, and check for write errors.
784 * - the return status from this call provides a reliable indication of
785 * whether any write errors occurred for this process.
786 */
787int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
788{
789 struct afs_writeback *wb, *xwb;
790 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
791 int ret;
792
793 _enter("{%x:%u},{n=%s},%d",
794 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
795 datasync);
796
797 /* use a writeback record as a marker in the queue - when this reaches
798 * the front of the queue, all the outstanding writes are either
799 * completed or rejected */
800 wb = kzalloc(sizeof(*wb), GFP_KERNEL);
801 if (!wb)
802 return -ENOMEM;
803 wb->vnode = vnode;
804 wb->first = 0;
805 wb->last = -1;
806 wb->offset_first = 0;
807 wb->to_last = PAGE_SIZE;
808 wb->usage = 1;
809 wb->state = AFS_WBACK_SYNCING;
810 init_waitqueue_head(&wb->waitq);
811
812 spin_lock(&vnode->writeback_lock);
813 list_for_each_entry(xwb, &vnode->writebacks, link) {
814 if (xwb->state == AFS_WBACK_PENDING)
815 xwb->state = AFS_WBACK_CONFLICTING;
816 }
817 list_add_tail(&wb->link, &vnode->writebacks);
818 spin_unlock(&vnode->writeback_lock);
819
820 /* push all the outstanding writebacks to the server */
821 ret = afs_writeback_all(vnode);
822 if (ret < 0) {
823 afs_put_writeback(wb);
824 _leave(" = %d [wb]", ret);
825 return ret;
826 }
827
828 /* wait for the preceding writes to actually complete */
829 ret = wait_event_interruptible(wb->waitq,
830 wb->state == AFS_WBACK_COMPLETE ||
831 vnode->writebacks.next == &wb->link);
832 afs_put_writeback(wb);
833 _leave(" = %d", ret);
834 return ret;
835}
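afs_fsync() gets its ordering guarantee from the writebacks list itself: the zero-length record it queues acts as a marker, and because records are unlinked from the head of the list as they drain, the marker reaching the front (or being completed outright by afs_unlink_writeback()) means everything queued before it has been stored or rejected. Condensed, with names simplified from the code above, the idiom is:

	list_add_tail(&marker->link, &queue);	/* behind all pending work */
	wait_event_interruptible(marker->waitq,
				 marker->state == COMPLETE ||
				 queue.next == &marker->link);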
diff --git a/fs/aio.c b/fs/aio.c
index b97ab8028b6d..ac1c1587aa02 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -346,10 +346,9 @@ void fastcall exit_aio(struct mm_struct *mm)
346 346
347 wait_for_all_aios(ctx); 347 wait_for_all_aios(ctx);
348 /* 348 /*
349 * this is an overkill, but ensures we don't leave 349 * Ensure we don't leave the ctx on the aio_wq
350 * the ctx on the aio_wq
351 */ 350 */
352 flush_workqueue(aio_wq); 351 cancel_work_sync(&ctx->wq.work);
353 352
354 if (1 != atomic_read(&ctx->users)) 353 if (1 != atomic_read(&ctx->users))
355 printk(KERN_DEBUG 354 printk(KERN_DEBUG
@@ -372,7 +371,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
372 BUG_ON(ctx->reqs_active); 371 BUG_ON(ctx->reqs_active);
373 372
374 cancel_delayed_work(&ctx->wq); 373 cancel_delayed_work(&ctx->wq);
375 flush_workqueue(aio_wq); 374 cancel_work_sync(&ctx->wq.work);
376 aio_free_ring(ctx); 375 aio_free_ring(ctx);
377 mmdrop(ctx->mm); 376 mmdrop(ctx->mm);
378 ctx->mm = NULL; 377 ctx->mm = NULL;
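The aio conversion swaps flush_workqueue() for cancel_work_sync(): rather than draining every item on the shared aio_wq, it cancels just the one work item belonging to this ctx and waits for any running instance of it to finish, which is cheaper and avoids the lock-dependency problems of flushing a shared workqueue. The general teardown pattern for an object with an embedded delayed work item (obj here is a hypothetical example):

	cancel_delayed_work(&obj->dwork);	/* stop the timer if still armed */
	cancel_work_sync(&obj->dwork.work);	/* wait out a running instance */
	kfree(obj);				/* no work can touch obj now */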
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 18657f001b43..72d0b412c376 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -675,19 +675,8 @@ static ssize_t
675bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) 675bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
676{ 676{
677 char *s = enabled ? "enabled" : "disabled"; 677 char *s = enabled ? "enabled" : "disabled";
678 int len = strlen(s);
679 loff_t pos = *ppos;
680 678
681 if (pos < 0) 679 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
682 return -EINVAL;
683 if (pos >= len)
684 return 0;
685 if (len < pos + nbytes)
686 nbytes = len - pos;
687 if (copy_to_user(buf, s + pos, nbytes))
688 return -EFAULT;
689 *ppos = pos + nbytes;
690 return nbytes;
691} 680}
692 681
693static ssize_t bm_status_write(struct file * file, const char __user * buffer, 682static ssize_t bm_status_write(struct file * file, const char __user * buffer,
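simple_read_from_buffer() performs exactly the bounds checking the deleted lines did by hand: reject a negative offset, return 0 at EOF, clamp the count, copy out and advance *ppos. Roughly (a sketch of the helper's behaviour, not its verbatim source):

	ssize_t simple_read_from_buffer(void __user *to, size_t count,
					loff_t *ppos, const void *from,
					size_t available)
	{
		loff_t pos = *ppos;

		if (pos < 0)
			return -EINVAL;
		if (pos >= available)
			return 0;
		if (count > available - pos)
			count = available - pos;
		if (copy_to_user(to, (const char *)from + pos, count))
			return -EFAULT;
		*ppos = pos + count;
		return count;
	}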
diff --git a/fs/buffer.c b/fs/buffer.c
index eb820b82a636..aecd057cd0e0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1846,13 +1846,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1846 if (block_start >= to) 1846 if (block_start >= to)
1847 break; 1847 break;
1848 if (buffer_new(bh)) { 1848 if (buffer_new(bh)) {
1849 void *kaddr;
1850
1851 clear_buffer_new(bh); 1849 clear_buffer_new(bh);
1852 kaddr = kmap_atomic(page, KM_USER0); 1850 zero_user_page(page, block_start, bh->b_size, KM_USER0);
1853 memset(kaddr+block_start, 0, bh->b_size);
1854 flush_dcache_page(page);
1855 kunmap_atomic(kaddr, KM_USER0);
1856 set_buffer_uptodate(bh); 1851 set_buffer_uptodate(bh);
1857 mark_buffer_dirty(bh); 1852 mark_buffer_dirty(bh);
1858 } 1853 }
@@ -1940,10 +1935,8 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
1940 SetPageError(page); 1935 SetPageError(page);
1941 } 1936 }
1942 if (!buffer_mapped(bh)) { 1937 if (!buffer_mapped(bh)) {
1943 void *kaddr = kmap_atomic(page, KM_USER0); 1938 zero_user_page(page, i * blocksize, blocksize,
1944 memset(kaddr + i * blocksize, 0, blocksize); 1939 KM_USER0);
1945 flush_dcache_page(page);
1946 kunmap_atomic(kaddr, KM_USER0);
1947 if (!err) 1940 if (!err)
1948 set_buffer_uptodate(bh); 1941 set_buffer_uptodate(bh);
1949 continue; 1942 continue;
@@ -2086,7 +2079,6 @@ int cont_prepare_write(struct page *page, unsigned offset,
2086 long status; 2079 long status;
2087 unsigned zerofrom; 2080 unsigned zerofrom;
2088 unsigned blocksize = 1 << inode->i_blkbits; 2081 unsigned blocksize = 1 << inode->i_blkbits;
2089 void *kaddr;
2090 2082
2091 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { 2083 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2092 status = -ENOMEM; 2084 status = -ENOMEM;
@@ -2108,10 +2100,8 @@ int cont_prepare_write(struct page *page, unsigned offset,
2108 PAGE_CACHE_SIZE, get_block); 2100 PAGE_CACHE_SIZE, get_block);
2109 if (status) 2101 if (status)
2110 goto out_unmap; 2102 goto out_unmap;
2111 kaddr = kmap_atomic(new_page, KM_USER0); 2103 zero_user_page(new_page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
2112 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); 2104 KM_USER0);
2113 flush_dcache_page(new_page);
2114 kunmap_atomic(kaddr, KM_USER0);
2115 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); 2105 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2116 unlock_page(new_page); 2106 unlock_page(new_page);
2117 page_cache_release(new_page); 2107 page_cache_release(new_page);
@@ -2138,10 +2128,7 @@ int cont_prepare_write(struct page *page, unsigned offset,
2138 if (status) 2128 if (status)
2139 goto out1; 2129 goto out1;
2140 if (zerofrom < offset) { 2130 if (zerofrom < offset) {
2141 kaddr = kmap_atomic(page, KM_USER0); 2131 zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
2142 memset(kaddr+zerofrom, 0, offset-zerofrom);
2143 flush_dcache_page(page);
2144 kunmap_atomic(kaddr, KM_USER0);
2145 __block_commit_write(inode, page, zerofrom, offset); 2132 __block_commit_write(inode, page, zerofrom, offset);
2146 } 2133 }
2147 return 0; 2134 return 0;
@@ -2340,10 +2327,7 @@ failed:
2340 * Error recovery is pretty slack. Clear the page and mark it dirty 2327 * Error recovery is pretty slack. Clear the page and mark it dirty
2341 * so we'll later zero out any blocks which _were_ allocated. 2328 * so we'll later zero out any blocks which _were_ allocated.
2342 */ 2329 */
2343 kaddr = kmap_atomic(page, KM_USER0); 2330 zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
2344 memset(kaddr, 0, PAGE_CACHE_SIZE);
2345 flush_dcache_page(page);
2346 kunmap_atomic(kaddr, KM_USER0);
2347 SetPageUptodate(page); 2331 SetPageUptodate(page);
2348 set_page_dirty(page); 2332 set_page_dirty(page);
2349 return ret; 2333 return ret;
@@ -2382,7 +2366,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
2382 loff_t i_size = i_size_read(inode); 2366 loff_t i_size = i_size_read(inode);
2383 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2367 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2384 unsigned offset; 2368 unsigned offset;
2385 void *kaddr;
2386 int ret; 2369 int ret;
2387 2370
2388 /* Is the page fully inside i_size? */ 2371 /* Is the page fully inside i_size? */
@@ -2413,10 +2396,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
2413 * the page size, the remaining memory is zeroed when mapped, and 2396 * the page size, the remaining memory is zeroed when mapped, and
2414 * writes to that region are not written out to the file." 2397 * writes to that region are not written out to the file."
2415 */ 2398 */
2416 kaddr = kmap_atomic(page, KM_USER0); 2399 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2417 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2418 flush_dcache_page(page);
2419 kunmap_atomic(kaddr, KM_USER0);
2420out: 2400out:
2421 ret = mpage_writepage(page, get_block, wbc); 2401 ret = mpage_writepage(page, get_block, wbc);
2422 if (ret == -EAGAIN) 2402 if (ret == -EAGAIN)
@@ -2437,7 +2417,6 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
2437 unsigned to; 2417 unsigned to;
2438 struct page *page; 2418 struct page *page;
2439 const struct address_space_operations *a_ops = mapping->a_ops; 2419 const struct address_space_operations *a_ops = mapping->a_ops;
2440 char *kaddr;
2441 int ret = 0; 2420 int ret = 0;
2442 2421
2443 if ((offset & (blocksize - 1)) == 0) 2422 if ((offset & (blocksize - 1)) == 0)
@@ -2451,10 +2430,8 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
2451 to = (offset + blocksize) & ~(blocksize - 1); 2430 to = (offset + blocksize) & ~(blocksize - 1);
2452 ret = a_ops->prepare_write(NULL, page, offset, to); 2431 ret = a_ops->prepare_write(NULL, page, offset, to);
2453 if (ret == 0) { 2432 if (ret == 0) {
2454 kaddr = kmap_atomic(page, KM_USER0); 2433 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
2455 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 2434 KM_USER0);
2456 flush_dcache_page(page);
2457 kunmap_atomic(kaddr, KM_USER0);
2458 /* 2435 /*
2459 * It would be more correct to call aops->commit_write() 2436 * It would be more correct to call aops->commit_write()
2460 * here, but this is more efficient. 2437 * here, but this is more efficient.
@@ -2480,7 +2457,6 @@ int block_truncate_page(struct address_space *mapping,
2480 struct inode *inode = mapping->host; 2457 struct inode *inode = mapping->host;
2481 struct page *page; 2458 struct page *page;
2482 struct buffer_head *bh; 2459 struct buffer_head *bh;
2483 void *kaddr;
2484 int err; 2460 int err;
2485 2461
2486 blocksize = 1 << inode->i_blkbits; 2462 blocksize = 1 << inode->i_blkbits;
@@ -2534,11 +2510,7 @@ int block_truncate_page(struct address_space *mapping,
2534 goto unlock; 2510 goto unlock;
2535 } 2511 }
2536 2512
2537 kaddr = kmap_atomic(page, KM_USER0); 2513 zero_user_page(page, offset, length, KM_USER0);
2538 memset(kaddr + offset, 0, length);
2539 flush_dcache_page(page);
2540 kunmap_atomic(kaddr, KM_USER0);
2541
2542 mark_buffer_dirty(bh); 2514 mark_buffer_dirty(bh);
2543 err = 0; 2515 err = 0;
2544 2516
@@ -2559,7 +2531,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2559 loff_t i_size = i_size_read(inode); 2531 loff_t i_size = i_size_read(inode);
2560 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2532 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2561 unsigned offset; 2533 unsigned offset;
2562 void *kaddr;
2563 2534
2564 /* Is the page fully inside i_size? */ 2535 /* Is the page fully inside i_size? */
2565 if (page->index < end_index) 2536 if (page->index < end_index)
@@ -2585,10 +2556,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2585 * the page size, the remaining memory is zeroed when mapped, and 2556 * the page size, the remaining memory is zeroed when mapped, and
2586 * writes to that region are not written out to the file." 2557 * writes to that region are not written out to the file."
2587 */ 2558 */
2588 kaddr = kmap_atomic(page, KM_USER0); 2559 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2589 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2590 flush_dcache_page(page);
2591 kunmap_atomic(kaddr, KM_USER0);
2592 return __block_write_full_page(inode, page, get_block, wbc); 2560 return __block_write_full_page(inode, page, get_block, wbc);
2593} 2561}
2594 2562
@@ -2978,7 +2946,7 @@ static void buffer_exit_cpu(int cpu)
2978static int buffer_cpu_notify(struct notifier_block *self, 2946static int buffer_cpu_notify(struct notifier_block *self,
2979 unsigned long action, void *hcpu) 2947 unsigned long action, void *hcpu)
2980{ 2948{
2981 if (action == CPU_DEAD) 2949 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
2982 buffer_exit_cpu((unsigned long)hcpu); 2950 buffer_exit_cpu((unsigned long)hcpu);
2983 return NOTIFY_OK; 2951 return NOTIFY_OK;
2984} 2952}
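
A note on the conversion that recurs throughout the fs/buffer.c hunks
above and in direct-io, ext3, mpage, and reiserfs below: each hunk
collapses the open-coded four-step sequence (kmap_atomic, memset,
flush_dcache_page, kunmap_atomic) into a single zero_user_page() call.
A minimal sketch of the helper, reconstructed from exactly the sequence
being replaced (the real definition lives in <linux/highmem.h>, so
treat this as illustrative, not authoritative):

	static inline void zero_user_page(struct page *page, unsigned int offset,
					  unsigned int size, enum km_type km)
	{
		void *kaddr = kmap_atomic(page, km);	/* map (possibly highmem) page */

		memset(kaddr + offset, 0, size);	/* zero the byte range */
		flush_dcache_page(page);		/* keep d-cache coherent */
		kunmap_atomic(kaddr, km);		/* drop the temporary mapping */
	}

Besides shrinking every call site, the helper makes it impossible to
forget the flush_dcache_page() step, which several of the removed
open-coded copies ordered inconsistently.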
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index d98be5e01328..3527c7c6def8 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -77,36 +77,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
77 return ret; 77 return ret;
78} 78}
79 79
80
81/**
82 * flush_read_buffer - push buffer to userspace.
83 * @buffer: data buffer for file.
84 * @userbuf: user-passed buffer.
85 * @count: number of bytes requested.
86 * @ppos: file position.
87 *
88 * Copy the buffer we filled in fill_read_buffer() to userspace.
89 * This is done at the reader's leisure, copying and advancing
90 * the amount they specify each time.
91 * This may be called continuously until the buffer is empty.
92 */
93static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
94 size_t count, loff_t * ppos)
95{
96 int error;
97
98 if (*ppos > buffer->count)
99 return 0;
100
101 if (count > (buffer->count - *ppos))
102 count = buffer->count - *ppos;
103
104 error = copy_to_user(buf,buffer->page + *ppos,count);
105 if (!error)
106 *ppos += count;
107 return error ? -EFAULT : count;
108}
109
110/** 80/**
111 * configfs_read_file - read an attribute. 81 * configfs_read_file - read an attribute.
112 * @file: file pointer. 82 * @file: file pointer.
@@ -139,7 +109,8 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
139 } 109 }
140 pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", 110 pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
141 __FUNCTION__, count, *ppos, buffer->page); 111 __FUNCTION__, count, *ppos, buffer->page);
142 retval = flush_read_buffer(buffer,buf,count,ppos); 112 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
113 buffer->count);
143out: 114out:
144 up(&buffer->sem); 115 up(&buffer->sem);
145 return retval; 116 return retval;
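
The deleted flush_read_buffer() here, and its verbatim twin removed
from fs/sysfs/file.c further below, duplicated an existing generic
helper. A simplified sketch of simple_read_from_buffer() from
fs/libfs.c, close to but not guaranteed verbatim the real code:

	ssize_t simple_read_from_buffer(void __user *to, size_t count,
					loff_t *ppos, const void *from,
					size_t available)
	{
		loff_t pos = *ppos;

		if (pos >= available)			/* past end of buffer */
			return 0;
		if (count > available - pos)		/* clamp to what is left */
			count = available - pos;
		if (copy_to_user(to, from + pos, count))
			return -EFAULT;
		*ppos = pos + count;			/* advance file position */
		return count;
	}

Note one behavioral nicety the open-coded copies lacked: a partial
copy_to_user() failure returns -EFAULT without advancing *ppos.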
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1e88d8d1d2a9..8593f3dfd299 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -867,7 +867,6 @@ static int do_direct_IO(struct dio *dio)
867do_holes: 867do_holes:
868 /* Handle holes */ 868 /* Handle holes */
869 if (!buffer_mapped(map_bh)) { 869 if (!buffer_mapped(map_bh)) {
870 char *kaddr;
871 loff_t i_size_aligned; 870 loff_t i_size_aligned;
872 871
873 /* AKPM: eargh, -ENOTBLK is a hack */ 872 /* AKPM: eargh, -ENOTBLK is a hack */
@@ -888,11 +887,8 @@ do_holes:
888 page_cache_release(page); 887 page_cache_release(page);
889 goto out; 888 goto out;
890 } 889 }
891 kaddr = kmap_atomic(page, KM_USER0); 890 zero_user_page(page, block_in_page << blkbits,
892 memset(kaddr + (block_in_page << blkbits), 891 1 << blkbits, KM_USER0);
893 0, 1 << blkbits);
894 flush_dcache_page(page);
895 kunmap_atomic(kaddr, KM_USER0);
896 dio->block_in_file++; 892 dio->block_in_file++;
897 block_in_page++; 893 block_in_page++;
898 goto next_block; 894 goto next_block;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index e1bb03171986..a6cb6171c3af 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1767,7 +1767,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1767 struct inode *inode = mapping->host; 1767 struct inode *inode = mapping->host;
1768 struct buffer_head *bh; 1768 struct buffer_head *bh;
1769 int err = 0; 1769 int err = 0;
1770 void *kaddr;
1771 1770
1772 blocksize = inode->i_sb->s_blocksize; 1771 blocksize = inode->i_sb->s_blocksize;
1773 length = blocksize - (offset & (blocksize - 1)); 1772 length = blocksize - (offset & (blocksize - 1));
@@ -1779,10 +1778,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1779 */ 1778 */
1780 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 1779 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1781 ext3_should_writeback_data(inode) && PageUptodate(page)) { 1780 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1782 kaddr = kmap_atomic(page, KM_USER0); 1781 zero_user_page(page, offset, length, KM_USER0);
1783 memset(kaddr + offset, 0, length);
1784 flush_dcache_page(page);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 set_page_dirty(page); 1782 set_page_dirty(page);
1787 goto unlock; 1783 goto unlock;
1788 } 1784 }
@@ -1835,11 +1831,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1835 goto unlock; 1831 goto unlock;
1836 } 1832 }
1837 1833
1838 kaddr = kmap_atomic(page, KM_USER0); 1834 zero_user_page(page, offset, length, KM_USER0);
1839 memset(kaddr + offset, 0, length);
1840 flush_dcache_page(page);
1841 kunmap_atomic(kaddr, KM_USER0);
1842
1843 BUFFER_TRACE(bh, "zeroed end of block"); 1835 BUFFER_TRACE(bh, "zeroed end of block");
1844 1836
1845 err = 0; 1837 err = 0;
diff --git a/fs/mpage.c b/fs/mpage.c
index fa2441f57b41..0fb914fc2ee0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -284,11 +284,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
284 } 284 }
285 285
286 if (first_hole != blocks_per_page) { 286 if (first_hole != blocks_per_page) {
287 char *kaddr = kmap_atomic(page, KM_USER0); 287 zero_user_page(page, first_hole << blkbits,
288 memset(kaddr + (first_hole << blkbits), 0, 288 PAGE_CACHE_SIZE - (first_hole << blkbits),
289 PAGE_CACHE_SIZE - (first_hole << blkbits)); 289 KM_USER0);
290 flush_dcache_page(page);
291 kunmap_atomic(kaddr, KM_USER0);
292 if (first_hole == 0) { 290 if (first_hole == 0) {
293 SetPageUptodate(page); 291 SetPageUptodate(page);
294 unlock_page(page); 292 unlock_page(page);
@@ -576,14 +574,11 @@ page_is_mapped:
576 * written out to the file." 574 * written out to the file."
577 */ 575 */
578 unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); 576 unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
579 char *kaddr;
580 577
581 if (page->index > end_index || !offset) 578 if (page->index > end_index || !offset)
582 goto confused; 579 goto confused;
583 kaddr = kmap_atomic(page, KM_USER0); 580 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
584 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 581 KM_USER0);
585 flush_dcache_page(page);
586 kunmap_atomic(kaddr, KM_USER0);
587 } 582 }
588 583
589 /* 584 /*
diff --git a/fs/namei.c b/fs/namei.c
index 856b2f5da51d..b3780e3fc88e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1152,14 +1152,12 @@ static int fastcall do_path_lookup(int dfd, const char *name,
1152 1152
1153 fput_light(file, fput_needed); 1153 fput_light(file, fput_needed);
1154 } 1154 }
1155 current->total_link_count = 0; 1155
1156 retval = link_path_walk(name, nd); 1156 retval = path_walk(name, nd);
1157out: 1157out:
1158 if (likely(retval == 0)) { 1158 if (unlikely(!retval && !audit_dummy_context() && nd->dentry &&
1159 if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
1160 nd->dentry->d_inode)) 1159 nd->dentry->d_inode))
1161 audit_inode(name, nd->dentry->d_inode); 1160 audit_inode(name, nd->dentry->d_inode);
1162 }
1163out_fail: 1161out_fail:
1164 return retval; 1162 return retval;
1165 1163
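
For context on the do_path_lookup() change: resetting total_link_count
and then calling link_path_walk() is exactly what the path_walk()
wrapper does, so the call site simplifies to one call. A sketch of that
wrapper as it is assumed to exist in fs/namei.c of this era:

	static int fastcall path_walk(const char *name, struct nameidata *nd)
	{
		current->total_link_count = 0;	/* fresh symlink-recursion budget */
		return link_path_walk(name, nd);
	}

The audit_inode() condition is also flattened from a nested likely/
unlikely pair into a single unlikely() test with the same meaning.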
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index ce341dc76d5e..9b118ee20193 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -11,4 +11,3 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
11nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 11nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
12nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 12nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
13 nfs4acl.o nfs4callback.o nfs4recover.o 13 nfs4acl.o nfs4callback.o nfs4recover.o
14nfsd-objs := $(nfsd-y)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 6f24768272a1..79bd03b8bbf8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -469,6 +469,13 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
469 nd.dentry = NULL; 469 nd.dentry = NULL;
470 exp.ex_path = NULL; 470 exp.ex_path = NULL;
471 471
472 /* fs locations */
473 exp.ex_fslocs.locations = NULL;
474 exp.ex_fslocs.locations_count = 0;
475 exp.ex_fslocs.migrated = 0;
476
477 exp.ex_uuid = NULL;
478
472 if (mesg[mlen-1] != '\n') 479 if (mesg[mlen-1] != '\n')
473 return -EINVAL; 480 return -EINVAL;
474 mesg[mlen-1] = 0; 481 mesg[mlen-1] = 0;
@@ -509,13 +516,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
509 if (exp.h.expiry_time == 0) 516 if (exp.h.expiry_time == 0)
510 goto out; 517 goto out;
511 518
512 /* fs locations */
513 exp.ex_fslocs.locations = NULL;
514 exp.ex_fslocs.locations_count = 0;
515 exp.ex_fslocs.migrated = 0;
516
517 exp.ex_uuid = NULL;
518
519 /* flags */ 519 /* flags */
520 err = get_int(&mesg, &an_int); 520 err = get_int(&mesg, &an_int);
521 if (err == -ENOENT) 521 if (err == -ENOENT)
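
The svc_export_parse() hunk is purely a reordering: the fs-locations
and uuid fields now get initialized before the first check that can
fail, so the shared error path never sees them uninitialized. The
general shape of the fix, as a self-contained sketch (all names here
are illustrative, not from this patch):

	#include <linux/slab.h>

	static int parse_example(const char *mesg, int mlen)
	{
		char *buf = NULL;		/* initialized before any early exit */
		int err = -EINVAL;

		if (mlen < 1 || mesg[mlen - 1] != '\n')
			goto out;		/* cleanup below is already safe */
		buf = kstrdup(mesg, GFP_KERNEL);
		if (!buf) {
			err = -ENOMEM;
			goto out;
		}
		err = 0;
	out:
		kfree(buf);			/* kfree(NULL) is a no-op */
		return err;
	}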
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7f5bad0393b1..eac82830bfd7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -177,7 +177,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
177 if (max_blocksize < resp->count) 177 if (max_blocksize < resp->count)
178 resp->count = max_blocksize; 178 resp->count = max_blocksize;
179 179
180 svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 180 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
181 181
182 fh_copy(&resp->fh, &argp->fh); 182 fh_copy(&resp->fh, &argp->fh);
183 nfserr = nfsd_read(rqstp, &resp->fh, NULL, 183 nfserr = nfsd_read(rqstp, &resp->fh, NULL,
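
svc_reserve_auth() replaces plain svc_reserve() in the READ handlers
here and in nfsproc.c below. A hedged guess at its shape: the name
suggests a wrapper that pads the reply-space reservation with
worst-case room for the authentication flavour's verifier.
RPC_MAX_AUTH_SIZE is an existing sunrpc constant; the body below is an
assumption, not taken from this patch:

	static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space)
	{
		/* leave headroom for e.g. an RPCSEC_GSS reply verifier */
		svc_reserve(rqstp, space + RPC_MAX_AUTH_SIZE);
	}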
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7e4bb0af24d7..10f6e7dcf633 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -239,7 +239,7 @@ static __be32 *
239encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) 239encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
240{ 240{
241 struct dentry *dentry = fhp->fh_dentry; 241 struct dentry *dentry = fhp->fh_dentry;
242 if (dentry && dentry->d_inode != NULL) { 242 if (dentry && dentry->d_inode) {
243 int err; 243 int err;
244 struct kstat stat; 244 struct kstat stat;
245 245
@@ -300,9 +300,9 @@ int
300nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, 300nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
301 struct nfsd3_sattrargs *args) 301 struct nfsd3_sattrargs *args)
302{ 302{
303 if (!(p = decode_fh(p, &args->fh)) 303 if (!(p = decode_fh(p, &args->fh)))
304 || !(p = decode_sattr3(p, &args->attrs)))
305 return 0; 304 return 0;
305 p = decode_sattr3(p, &args->attrs);
306 306
307 if ((args->check_guard = ntohl(*p++)) != 0) { 307 if ((args->check_guard = ntohl(*p++)) != 0) {
308 struct timespec time; 308 struct timespec time;
@@ -343,9 +343,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
343 int v,pn; 343 int v,pn;
344 u32 max_blocksize = svc_max_payload(rqstp); 344 u32 max_blocksize = svc_max_payload(rqstp);
345 345
346 if (!(p = decode_fh(p, &args->fh)) 346 if (!(p = decode_fh(p, &args->fh)))
347 || !(p = xdr_decode_hyper(p, &args->offset)))
348 return 0; 347 return 0;
348 p = xdr_decode_hyper(p, &args->offset);
349 349
350 len = args->count = ntohl(*p++); 350 len = args->count = ntohl(*p++);
351 351
@@ -369,28 +369,44 @@ int
369nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, 369nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
370 struct nfsd3_writeargs *args) 370 struct nfsd3_writeargs *args)
371{ 371{
372 unsigned int len, v, hdr; 372 unsigned int len, v, hdr, dlen;
373 u32 max_blocksize = svc_max_payload(rqstp); 373 u32 max_blocksize = svc_max_payload(rqstp);
374 374
375 if (!(p = decode_fh(p, &args->fh)) 375 if (!(p = decode_fh(p, &args->fh)))
376 || !(p = xdr_decode_hyper(p, &args->offset)))
377 return 0; 376 return 0;
377 p = xdr_decode_hyper(p, &args->offset);
378 378
379 args->count = ntohl(*p++); 379 args->count = ntohl(*p++);
380 args->stable = ntohl(*p++); 380 args->stable = ntohl(*p++);
381 len = args->len = ntohl(*p++); 381 len = args->len = ntohl(*p++);
382 /*
383 * The count must equal the amount of data passed.
384 */
385 if (args->count != args->len)
386 return 0;
382 387
388 /*
389 * Check to make sure that we got the right number of
390 * bytes.
391 */
383 hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; 392 hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
384 if (rqstp->rq_arg.len < hdr || 393 dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
385 rqstp->rq_arg.len - hdr < len) 394 - hdr;
395 /*
396 * Round the length of the data which was specified up to
397 * the next multiple of XDR units and then compare that
398 * against the length which was actually received.
399 */
400 if (dlen != XDR_QUADLEN(len)*4)
386 return 0; 401 return 0;
387 402
403 if (args->count > max_blocksize) {
404 args->count = max_blocksize;
405 len = args->len = max_blocksize;
406 }
388 rqstp->rq_vec[0].iov_base = (void*)p; 407 rqstp->rq_vec[0].iov_base = (void*)p;
389 rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; 408 rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
390 409 v = 0;
391 if (len > max_blocksize)
392 len = max_blocksize;
393 v= 0;
394 while (len > rqstp->rq_vec[v].iov_len) { 410 while (len > rqstp->rq_vec[v].iov_len) {
395 len -= rqstp->rq_vec[v].iov_len; 411 len -= rqstp->rq_vec[v].iov_len;
396 v++; 412 v++;
@@ -398,9 +414,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
398 rqstp->rq_vec[v].iov_len = PAGE_SIZE; 414 rqstp->rq_vec[v].iov_len = PAGE_SIZE;
399 } 415 }
400 rqstp->rq_vec[v].iov_len = len; 416 rqstp->rq_vec[v].iov_len = len;
401 args->vlen = v+1; 417 args->vlen = v + 1;
402 418 return 1;
403 return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
404} 419}
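
The new write-args validation, pulled out of the diff noise: the client
claims a byte length, XDR pads opaque data to 4-byte quads, and the
transport reports how many bytes actually arrived past the header, so
the three must agree exactly. The same check is added to the NFSv2
decoder in nfsxdr.c below. Standalone illustration (XDR_QUADLEN(n) is
the sunrpc round-up macro, ((n) + 3) >> 2):

	static int write_payload_sane(unsigned int claimed_len,
				      unsigned int received_len)
	{
		/* round the claimed length up to whole XDR quads ... */
		unsigned int expected = ((claimed_len + 3) >> 2) * 4;

		/* ... and insist the wire carried exactly that much */
		return received_len == expected;
	}

Rejecting mismatches up front is what lets the function end in a plain
"return 1" instead of re-deriving consistency at the bottom.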
405 420
406int 421int
@@ -414,8 +429,7 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
414 switch (args->createmode = ntohl(*p++)) { 429 switch (args->createmode = ntohl(*p++)) {
415 case NFS3_CREATE_UNCHECKED: 430 case NFS3_CREATE_UNCHECKED:
416 case NFS3_CREATE_GUARDED: 431 case NFS3_CREATE_GUARDED:
417 if (!(p = decode_sattr3(p, &args->attrs))) 432 p = decode_sattr3(p, &args->attrs);
418 return 0;
419 break; 433 break;
420 case NFS3_CREATE_EXCLUSIVE: 434 case NFS3_CREATE_EXCLUSIVE:
421 args->verf = p; 435 args->verf = p;
@@ -431,10 +445,10 @@ int
431nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p, 445nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
432 struct nfsd3_createargs *args) 446 struct nfsd3_createargs *args)
433{ 447{
434 if (!(p = decode_fh(p, &args->fh)) 448 if (!(p = decode_fh(p, &args->fh)) ||
435 || !(p = decode_filename(p, &args->name, &args->len)) 449 !(p = decode_filename(p, &args->name, &args->len)))
436 || !(p = decode_sattr3(p, &args->attrs)))
437 return 0; 450 return 0;
451 p = decode_sattr3(p, &args->attrs);
438 452
439 return xdr_argsize_check(rqstp, p); 453 return xdr_argsize_check(rqstp, p);
440} 454}
@@ -448,11 +462,12 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
448 char *old, *new; 462 char *old, *new;
449 struct kvec *vec; 463 struct kvec *vec;
450 464
451 if (!(p = decode_fh(p, &args->ffh)) 465 if (!(p = decode_fh(p, &args->ffh)) ||
452 || !(p = decode_filename(p, &args->fname, &args->flen)) 466 !(p = decode_filename(p, &args->fname, &args->flen))
453 || !(p = decode_sattr3(p, &args->attrs))
454 ) 467 )
455 return 0; 468 return 0;
469 p = decode_sattr3(p, &args->attrs);
470
456 /* now decode the pathname, which might be larger than the first page. 471 /* now decode the pathname, which might be larger than the first page.
457 * As we have to check for nul's anyway, we copy it into a new page 472 * As we have to check for nul's anyway, we copy it into a new page
458 * This page appears in the rq_res.pages list, but as pages_len is always 473 * This page appears in the rq_res.pages list, but as pages_len is always
@@ -502,10 +517,8 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
502 args->ftype = ntohl(*p++); 517 args->ftype = ntohl(*p++);
503 518
504 if (args->ftype == NF3BLK || args->ftype == NF3CHR 519 if (args->ftype == NF3BLK || args->ftype == NF3CHR
505 || args->ftype == NF3SOCK || args->ftype == NF3FIFO) { 520 || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
506 if (!(p = decode_sattr3(p, &args->attrs))) 521 p = decode_sattr3(p, &args->attrs);
507 return 0;
508 }
509 522
510 if (args->ftype == NF3BLK || args->ftype == NF3CHR) { 523 if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
511 args->major = ntohl(*p++); 524 args->major = ntohl(*p++);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 673a53c014a3..cc3b7badd486 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -137,7 +137,6 @@ struct ace_container {
137static short ace2type(struct nfs4_ace *); 137static short ace2type(struct nfs4_ace *);
138static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, 138static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
139 unsigned int); 139 unsigned int);
140void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
141 140
142struct nfs4_acl * 141struct nfs4_acl *
143nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, 142nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
@@ -785,21 +784,6 @@ nfs4_acl_new(int n)
785 return acl; 784 return acl;
786} 785}
787 786
788void
789nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
790 int whotype, uid_t who)
791{
792 struct nfs4_ace *ace = acl->aces + acl->naces;
793
794 ace->type = type;
795 ace->flag = flag;
796 ace->access_mask = access_mask;
797 ace->whotype = whotype;
798 ace->who = who;
799
800 acl->naces++;
801}
802
803static struct { 787static struct {
804 char *string; 788 char *string;
805 int stringlen; 789 int stringlen;
@@ -851,6 +835,5 @@ nfs4_acl_write_who(int who, char *p)
851} 835}
852 836
853EXPORT_SYMBOL(nfs4_acl_new); 837EXPORT_SYMBOL(nfs4_acl_new);
854EXPORT_SYMBOL(nfs4_acl_add_ace);
855EXPORT_SYMBOL(nfs4_acl_get_whotype); 838EXPORT_SYMBOL(nfs4_acl_get_whotype);
856EXPORT_SYMBOL(nfs4_acl_write_who); 839EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 678f3be88ac0..3cc8ce422ab1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1326,8 +1326,6 @@ do_recall(void *__dp)
1326{ 1326{
1327 struct nfs4_delegation *dp = __dp; 1327 struct nfs4_delegation *dp = __dp;
1328 1328
1329 daemonize("nfsv4-recall");
1330
1331 nfsd4_cb_recall(dp); 1329 nfsd4_cb_recall(dp);
1332 return 0; 1330 return 0;
1333} 1331}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 739dd3c5c3b2..6ca2d24fc216 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -323,7 +323,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
323 * 323 *
324 */ 324 */
325 325
326 u8 version = 1; 326 u8 version;
327 u8 fsid_type = 0; 327 u8 fsid_type = 0;
328 struct inode * inode = dentry->d_inode; 328 struct inode * inode = dentry->d_inode;
329 struct dentry *parent = dentry->d_parent; 329 struct dentry *parent = dentry->d_parent;
@@ -341,15 +341,59 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
341 * the reference filehandle (if it is in the same export) 341 * the reference filehandle (if it is in the same export)
342 * or the export options. 342 * or the export options.
343 */ 343 */
344 retry:
345 version = 1;
344 if (ref_fh && ref_fh->fh_export == exp) { 346 if (ref_fh && ref_fh->fh_export == exp) {
345 version = ref_fh->fh_handle.fh_version; 347 version = ref_fh->fh_handle.fh_version;
346 if (version == 0xca) 348 fsid_type = ref_fh->fh_handle.fh_fsid_type;
349
350 if (ref_fh == fhp)
351 fh_put(ref_fh);
352 ref_fh = NULL;
353
354 switch (version) {
355 case 0xca:
347 fsid_type = FSID_DEV; 356 fsid_type = FSID_DEV;
348 else 357 break;
349 fsid_type = ref_fh->fh_handle.fh_fsid_type; 358 case 1:
350 /* We know this version/type works for this export 359 break;
351 * so there is no need for further checks. 360 default:
361 goto retry;
362 }
363
364 /* Need to check that this type works for this
365 * export point. As the fsid -> filesystem mapping
366 * was guided by user-space, there is no guarantee
367 * that the filesystem actually supports that fsid
368 * type. If it doesn't we loop around again without
369 * ref_fh set.
352 */ 370 */
371 switch(fsid_type) {
372 case FSID_DEV:
373 if (!old_valid_dev(ex_dev))
374 goto retry;
375 /* FALL THROUGH */
376 case FSID_MAJOR_MINOR:
377 case FSID_ENCODE_DEV:
378 if (!(exp->ex_dentry->d_inode->i_sb->s_type->fs_flags
379 & FS_REQUIRES_DEV))
380 goto retry;
381 break;
382 case FSID_NUM:
383 if (! (exp->ex_flags & NFSEXP_FSID))
384 goto retry;
385 break;
386 case FSID_UUID8:
387 case FSID_UUID16:
388 if (!root_export)
389 goto retry;
390 /* fall through */
391 case FSID_UUID4_INUM:
392 case FSID_UUID16_INUM:
393 if (exp->ex_uuid == NULL)
394 goto retry;
395 break;
396 }
353 } else if (exp->ex_uuid) { 397 } else if (exp->ex_uuid) {
354 if (fhp->fh_maxsize >= 64) { 398 if (fhp->fh_maxsize >= 64) {
355 if (root_export) 399 if (root_export)
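
The fh_compose() rework is a hint-validation loop: the filehandle
version and fsid type taken from ref_fh are now checked against what
this export can actually support, and on mismatch the code jumps back
to "retry" with ref_fh dropped, falling through to the default
selection logic. The idiom in miniature, as a generic sketch (not
kernel code):

	static int pick_version(int hint, int have_hint,
				int (*supported)(int version))
	{
		int version;
	retry:
		version = have_hint ? hint : 1;		/* 1 == safe default */
		if (have_hint && !supported(version)) {
			have_hint = 0;			/* drop the hint, retry */
			goto retry;
		}
		return version;
	}

The loop terminates after at most one retry because the second pass no
longer trusts the hint.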
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cc2eec981b8..b2c7147aa921 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -155,7 +155,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
155 argp->count); 155 argp->count);
156 argp->count = NFSSVC_MAXBLKSIZE_V2; 156 argp->count = NFSSVC_MAXBLKSIZE_V2;
157 } 157 }
158 svc_reserve(rqstp, (19<<2) + argp->count + 4); 158 svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
159 159
160 resp->count = argp->count; 160 resp->count = argp->count;
161 nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 161 nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 0c24b9e24fe8..cb3e7fadb772 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -231,9 +231,10 @@ int
231nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, 231nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
232 struct nfsd_sattrargs *args) 232 struct nfsd_sattrargs *args)
233{ 233{
234 if (!(p = decode_fh(p, &args->fh)) 234 p = decode_fh(p, &args->fh);
235 || !(p = decode_sattr(p, &args->attrs))) 235 if (!p)
236 return 0; 236 return 0;
237 p = decode_sattr(p, &args->attrs);
237 238
238 return xdr_argsize_check(rqstp, p); 239 return xdr_argsize_check(rqstp, p);
239} 240}
@@ -284,8 +285,9 @@ int
284nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, 285nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
285 struct nfsd_writeargs *args) 286 struct nfsd_writeargs *args)
286{ 287{
287 unsigned int len; 288 unsigned int len, hdr, dlen;
288 int v; 289 int v;
290
289 if (!(p = decode_fh(p, &args->fh))) 291 if (!(p = decode_fh(p, &args->fh)))
290 return 0; 292 return 0;
291 293
@@ -293,11 +295,30 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
293 args->offset = ntohl(*p++); /* offset */ 295 args->offset = ntohl(*p++); /* offset */
294 p++; /* totalcount */ 296 p++; /* totalcount */
295 len = args->len = ntohl(*p++); 297 len = args->len = ntohl(*p++);
296 rqstp->rq_vec[0].iov_base = (void*)p; 298 /*
297 rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - 299 * The protocol specifies a maximum of 8192 bytes.
298 (((void*)p) - rqstp->rq_arg.head[0].iov_base); 300 */
299 if (len > NFSSVC_MAXBLKSIZE_V2) 301 if (len > NFSSVC_MAXBLKSIZE_V2)
300 len = NFSSVC_MAXBLKSIZE_V2; 302 return 0;
303
304 /*
305 * Check to make sure that we got the right number of
306 * bytes.
307 */
308 hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
309 dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
310 - hdr;
311
312 /*
313 * Round the length of the data which was specified up to
314 * the next multiple of XDR units and then compare that
315 * against the length which was actually received.
316 */
317 if (dlen != XDR_QUADLEN(len)*4)
318 return 0;
319
320 rqstp->rq_vec[0].iov_base = (void*)p;
321 rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
301 v = 0; 322 v = 0;
302 while (len > rqstp->rq_vec[v].iov_len) { 323 while (len > rqstp->rq_vec[v].iov_len) {
303 len -= rqstp->rq_vec[v].iov_len; 324 len -= rqstp->rq_vec[v].iov_len;
@@ -306,18 +327,18 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
306 rqstp->rq_vec[v].iov_len = PAGE_SIZE; 327 rqstp->rq_vec[v].iov_len = PAGE_SIZE;
307 } 328 }
308 rqstp->rq_vec[v].iov_len = len; 329 rqstp->rq_vec[v].iov_len = len;
309 args->vlen = v+1; 330 args->vlen = v + 1;
310 return rqstp->rq_vec[0].iov_len > 0; 331 return 1;
311} 332}
312 333
313int 334int
314nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, 335nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
315 struct nfsd_createargs *args) 336 struct nfsd_createargs *args)
316{ 337{
317 if (!(p = decode_fh(p, &args->fh)) 338 if ( !(p = decode_fh(p, &args->fh))
318 || !(p = decode_filename(p, &args->name, &args->len)) 339 || !(p = decode_filename(p, &args->name, &args->len)))
319 || !(p = decode_sattr(p, &args->attrs)))
320 return 0; 340 return 0;
341 p = decode_sattr(p, &args->attrs);
321 342
322 return xdr_argsize_check(rqstp, p); 343 return xdr_argsize_check(rqstp, p);
323} 344}
@@ -361,11 +382,11 @@ int
361nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, 382nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
362 struct nfsd_symlinkargs *args) 383 struct nfsd_symlinkargs *args)
363{ 384{
364 if (!(p = decode_fh(p, &args->ffh)) 385 if ( !(p = decode_fh(p, &args->ffh))
365 || !(p = decode_filename(p, &args->fname, &args->flen)) 386 || !(p = decode_filename(p, &args->fname, &args->flen))
366 || !(p = decode_pathname(p, &args->tname, &args->tlen)) 387 || !(p = decode_pathname(p, &args->tname, &args->tlen)))
367 || !(p = decode_sattr(p, &args->attrs)))
368 return 0; 388 return 0;
389 p = decode_sattr(p, &args->attrs);
369 390
370 return xdr_argsize_check(rqstp, p); 391 return xdr_argsize_check(rqstp, p);
371} 392}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ab45db529c80..9e451a68580f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1059,20 +1059,12 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
1059 mapping blocks, since there is none, so we just zero out remaining 1059 mapping blocks, since there is none, so we just zero out remaining
1060 parts of first and last pages in write area (if needed) */ 1060 parts of first and last pages in write area (if needed) */
1061 if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) { 1061 if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
1062 if (from != 0) { /* First page needs to be partially zeroed */ 1062 if (from != 0) /* First page needs to be partially zeroed */
1063 char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); 1063 zero_user_page(prepared_pages[0], 0, from, KM_USER0);
1064 memset(kaddr, 0, from); 1064
1065 kunmap_atomic(kaddr, KM_USER0); 1065 if (to != PAGE_CACHE_SIZE) /* Last page needs to be partially zeroed */
1066 flush_dcache_page(prepared_pages[0]); 1066 zero_user_page(prepared_pages[num_pages-1], to,
1067 } 1067 PAGE_CACHE_SIZE - to, KM_USER0);
1068 if (to != PAGE_CACHE_SIZE) { /* Last page needs to be partially zeroed */
1069 char *kaddr =
1070 kmap_atomic(prepared_pages[num_pages - 1],
1071 KM_USER0);
1072 memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
1073 kunmap_atomic(kaddr, KM_USER0);
1074 flush_dcache_page(prepared_pages[num_pages - 1]);
1075 }
1076 1068
1077 /* Since all blocks are new - use already calculated value */ 1069 /* Since all blocks are new - use already calculated value */
1078 return blocks; 1070 return blocks;
@@ -1199,13 +1191,9 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
1199 ll_rw_block(READ, 1, &bh); 1191 ll_rw_block(READ, 1, &bh);
1200 *wait_bh++ = bh; 1192 *wait_bh++ = bh;
1201 } else { /* Not mapped, zero it */ 1193 } else { /* Not mapped, zero it */
1202 char *kaddr = 1194 zero_user_page(prepared_pages[0],
1203 kmap_atomic(prepared_pages[0], 1195 block_start,
1204 KM_USER0); 1196 from - block_start, KM_USER0);
1205 memset(kaddr + block_start, 0,
1206 from - block_start);
1207 kunmap_atomic(kaddr, KM_USER0);
1208 flush_dcache_page(prepared_pages[0]);
1209 set_buffer_uptodate(bh); 1197 set_buffer_uptodate(bh);
1210 } 1198 }
1211 } 1199 }
@@ -1237,13 +1225,8 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
1237 ll_rw_block(READ, 1, &bh); 1225 ll_rw_block(READ, 1, &bh);
1238 *wait_bh++ = bh; 1226 *wait_bh++ = bh;
1239 } else { /* Not mapped, zero it */ 1227 } else { /* Not mapped, zero it */
1240 char *kaddr = 1228 zero_user_page(prepared_pages[num_pages-1],
1241 kmap_atomic(prepared_pages 1229 to, block_end - to, KM_USER0);
1242 [num_pages - 1],
1243 KM_USER0);
1244 memset(kaddr + to, 0, block_end - to);
1245 kunmap_atomic(kaddr, KM_USER0);
1246 flush_dcache_page(prepared_pages[num_pages - 1]);
1247 set_buffer_uptodate(bh); 1230 set_buffer_uptodate(bh);
1248 } 1231 }
1249 } 1232 }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9fcbfe316977..1272d11399fb 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2148,13 +2148,8 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2148 length = offset & (blocksize - 1); 2148 length = offset & (blocksize - 1);
2149 /* if we are not on a block boundary */ 2149 /* if we are not on a block boundary */
2150 if (length) { 2150 if (length) {
2151 char *kaddr;
2152
2153 length = blocksize - length; 2151 length = blocksize - length;
2154 kaddr = kmap_atomic(page, KM_USER0); 2152 zero_user_page(page, offset, length, KM_USER0);
2155 memset(kaddr + offset, 0, length);
2156 flush_dcache_page(page);
2157 kunmap_atomic(kaddr, KM_USER0);
2158 if (buffer_mapped(bh) && bh->b_blocknr != 0) { 2153 if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2159 mark_buffer_dirty(bh); 2154 mark_buffer_dirty(bh);
2160 } 2155 }
@@ -2370,7 +2365,6 @@ static int reiserfs_write_full_page(struct page *page,
2370 ** last byte in the file 2365 ** last byte in the file
2371 */ 2366 */
2372 if (page->index >= end_index) { 2367 if (page->index >= end_index) {
2373 char *kaddr;
2374 unsigned last_offset; 2368 unsigned last_offset;
2375 2369
2376 last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); 2370 last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
@@ -2379,10 +2373,7 @@ static int reiserfs_write_full_page(struct page *page,
2379 unlock_page(page); 2373 unlock_page(page);
2380 return 0; 2374 return 0;
2381 } 2375 }
2382 kaddr = kmap_atomic(page, KM_USER0); 2376 zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0);
2383 memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset);
2384 flush_dcache_page(page);
2385 kunmap_atomic(kaddr, KM_USER0);
2386 } 2377 }
2387 bh = head; 2378 bh = head;
2388 block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); 2379 block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 0e637adc2b87..b502c7197ec0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -111,36 +111,6 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
111 return ret; 111 return ret;
112} 112}
113 113
114
115/**
116 * flush_read_buffer - push buffer to userspace.
117 * @buffer: data buffer for file.
118 * @buf: user-passed buffer.
119 * @count: number of bytes requested.
120 * @ppos: file position.
121 *
122 * Copy the buffer we filled in fill_read_buffer() to userspace.
123 * This is done at the reader's leisure, copying and advancing
124 * the amount they specify each time.
125 * This may be called continuously until the buffer is empty.
126 */
127static int flush_read_buffer(struct sysfs_buffer * buffer, char __user * buf,
128 size_t count, loff_t * ppos)
129{
130 int error;
131
132 if (*ppos > buffer->count)
133 return 0;
134
135 if (count > (buffer->count - *ppos))
136 count = buffer->count - *ppos;
137
138 error = copy_to_user(buf,buffer->page + *ppos,count);
139 if (!error)
140 *ppos += count;
141 return error ? -EFAULT : count;
142}
143
144/** 114/**
145 * sysfs_read_file - read an attribute. 115 * sysfs_read_file - read an attribute.
146 * @file: file pointer. 116 * @file: file pointer.
@@ -177,7 +147,8 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
177 } 147 }
178 pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", 148 pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
179 __FUNCTION__, count, *ppos, buffer->page); 149 __FUNCTION__, count, *ppos, buffer->page);
180 retval = flush_read_buffer(buffer,buf,count,ppos); 150 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
151 buffer->count);
181out: 152out:
182 up(&buffer->sem); 153 up(&buffer->sem);
183 return retval; 154 return retval;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f5aa3ef855fb..a96bde6df96d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1734,11 +1734,13 @@ xfs_icsb_cpu_notify(
1734 per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); 1734 per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
1735 switch (action) { 1735 switch (action) {
1736 case CPU_UP_PREPARE: 1736 case CPU_UP_PREPARE:
1737 case CPU_UP_PREPARE_FROZEN:
1737 /* Easy Case - initialize the area and locks, and 1738 /* Easy Case - initialize the area and locks, and
1738 * then rebalance when online does everything else for us. */ 1739 * then rebalance when online does everything else for us. */
1739 memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); 1740 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1740 break; 1741 break;
1741 case CPU_ONLINE: 1742 case CPU_ONLINE:
1743 case CPU_ONLINE_FROZEN:
1742 xfs_icsb_lock(mp); 1744 xfs_icsb_lock(mp);
1743 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); 1745 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
1744 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); 1746 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
@@ -1746,6 +1748,7 @@ xfs_icsb_cpu_notify(
1746 xfs_icsb_unlock(mp); 1748 xfs_icsb_unlock(mp);
1747 break; 1749 break;
1748 case CPU_DEAD: 1750 case CPU_DEAD:
1751 case CPU_DEAD_FROZEN:
1749 /* Disable all the counters, then fold the dead cpu's 1752 /* Disable all the counters, then fold the dead cpu's
1750 * count into the total on the global superblock and 1753 * count into the total on the global superblock and
1751 * re-enable the counters. */ 1754 * re-enable the counters. */
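
The *_FROZEN additions here, like the CPU_DEAD_FROZEN case added to the
fs/buffer.c notifier earlier, all follow one rule: with the
suspend/resume rework, hotplug events raised while userspace is frozen
arrive as distinct action codes, and a notifier that ignores them fails
to set up or tear down its per-cpu state across suspend. The resulting
pattern, with an illustrative body:

	static int example_cpu_notify(struct notifier_block *self,
				      unsigned long action, void *hcpu)
	{
		switch (action) {
		case CPU_UP_PREPARE:
		case CPU_UP_PREPARE_FROZEN:
			/* allocate/initialize per-cpu state */
			break;
		case CPU_DEAD:
		case CPU_DEAD_FROZEN:
			/* fold counts back in, free per-cpu state */
			break;
		}
		return NOTIFY_OK;
	}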
diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index a1a1eca6be45..286e1d844f63 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -51,6 +51,7 @@ int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, in
51 51
52#else /* CONFIG_SMP */ 52#else /* CONFIG_SMP */
53 53
54#define hard_smp_processor_id() 0
54#define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; }) 55#define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; })
55 56
56#endif /* CONFIG_SMP */ 57#endif /* CONFIG_SMP */
diff --git a/include/asm-alpha/thread_info.h b/include/asm-alpha/thread_info.h
index eeb3bef91e11..f4defc2bd3fb 100644
--- a/include/asm-alpha/thread_info.h
+++ b/include/asm-alpha/thread_info.h
@@ -97,7 +97,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
97 1 << TIF_UAC_SIGBUS) 97 1 << TIF_UAC_SIGBUS)
98 98
99#define SET_UNALIGN_CTL(task,value) ({ \ 99#define SET_UNALIGN_CTL(task,value) ({ \
100 (task)->thread_info->flags = (((task)->thread_info->flags & \ 100 task_thread_info(task)->flags = ((task_thread_info(task)->flags & \
101 ~ALPHA_UAC_MASK) \ 101 ~ALPHA_UAC_MASK) \
102 | (((value) << ALPHA_UAC_SHIFT) & (1<<TIF_UAC_NOPRINT))\ 102 | (((value) << ALPHA_UAC_SHIFT) & (1<<TIF_UAC_NOPRINT))\
103 | (((value) << (ALPHA_UAC_SHIFT + 1)) & (1<<TIF_UAC_SIGBUS)) \ 103 | (((value) << (ALPHA_UAC_SHIFT + 1)) & (1<<TIF_UAC_SIGBUS)) \
@@ -105,11 +105,11 @@ register struct thread_info *__current_thread_info __asm__("$8");
105 0; }) 105 0; })
106 106
107#define GET_UNALIGN_CTL(task,value) ({ \ 107#define GET_UNALIGN_CTL(task,value) ({ \
108 put_user(((task)->thread_info->flags & (1 << TIF_UAC_NOPRINT)) \ 108 put_user((task_thread_info(task)->flags & (1 << TIF_UAC_NOPRINT))\
109 >> ALPHA_UAC_SHIFT \ 109 >> ALPHA_UAC_SHIFT \
110 | ((task)->thread_info->flags & (1 << TIF_UAC_SIGBUS)) \ 110 | (task_thread_info(task)->flags & (1 << TIF_UAC_SIGBUS))\
111 >> (ALPHA_UAC_SHIFT + 1) \ 111 >> (ALPHA_UAC_SHIFT + 1) \
112 | ((task)->thread_info->flags & (1 << TIF_UAC_NOFIX)) \ 112 | (task_thread_info(task)->flags & (1 << TIF_UAC_NOFIX))\
113 >> (ALPHA_UAC_SHIFT - 1), \ 113 >> (ALPHA_UAC_SHIFT - 1), \
114 (int __user *)(value)); \ 114 (int __user *)(value)); \
115 }) 115 })
diff --git a/include/asm-arm/arch-at91/cpu.h b/include/asm-arm/arch-at91/cpu.h
index d464ca58cdbc..7ef4eebe9f8e 100644
--- a/include/asm-arm/arch-at91/cpu.h
+++ b/include/asm-arm/arch-at91/cpu.h
@@ -68,4 +68,10 @@ static inline unsigned long at91_arch_identify(void)
68#define cpu_is_at91sam9263() (0) 68#define cpu_is_at91sam9263() (0)
69#endif 69#endif
70 70
71/*
72 * Since this is ARM, we will never run on any AVR32 CPU. But these
73 * definitions may reduce clutter in common drivers.
74 */
75#define cpu_is_at32ap7000() (0)
76
71#endif 77#endif
diff --git a/include/asm-avr32/arch-at32ap/cpu.h b/include/asm-avr32/arch-at32ap/cpu.h
new file mode 100644
index 000000000000..2bdc5bd6f793
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/cpu.h
@@ -0,0 +1,33 @@
1/*
2 * AVR32 and (fake) AT91 CPU identification
3 *
4 * Copyright (C) 2007 Atmel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#ifndef __ASM_ARCH_CPU_H
11#define __ASM_ARCH_CPU_H
12
13/*
14 * Only AT32AP7000 is defined for now. We can identify the specific
15 * chip at runtime, but I'm not sure if it's really worth it.
16 */
17#ifdef CONFIG_CPU_AT32AP7000
18# define cpu_is_at32ap7000() (1)
19#else
20# define cpu_is_at32ap7000() (0)
21#endif
22
23/*
24 * Since this is AVR32, we will never run on any AT91 CPU. But these
25 * definitions may reduce clutter in common drivers.
26 */
27#define cpu_is_at91rm9200() (0)
28#define cpu_is_at91sam9xe() (0)
29#define cpu_is_at91sam9260() (0)
30#define cpu_is_at91sam9261() (0)
31#define cpu_is_at91sam9263() (0)
32
33#endif /* __ASM_ARCH_CPU_H */
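
What these cross-architecture stubs buy, together with the matching
cpu_is_at32ap7000() stub added to the ARM at91 header above: a driver
shared between AT91 (ARM) and AT32AP (AVR32) parts can branch on CPU
type without per-arch #ifdefs, because every cpu_is_*() macro is
defined on both sides and the impossible ones constant-fold to 0.
Illustrative caller (the rates are made up, not from this patch):

	static unsigned long pick_bus_rate(void)
	{
		if (cpu_is_at32ap7000())
			return 60000000;	/* hypothetical AVR32 bus clock */
		if (cpu_is_at91rm9200())
			return 50000000;	/* hypothetical AT91 bus clock */
		return 0;
	}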
diff --git a/include/asm-avr32/setup.h b/include/asm-avr32/setup.h
index 1ff1a217015d..b0828d43e110 100644
--- a/include/asm-avr32/setup.h
+++ b/include/asm-avr32/setup.h
@@ -110,7 +110,7 @@ struct tagtable {
110 int (*parse)(struct tag *); 110 int (*parse)(struct tag *);
111}; 111};
112 112
113#define __tag __attribute_used__ __attribute__((__section__(".taglist"))) 113#define __tag __attribute_used__ __attribute__((__section__(".taglist.init")))
114#define __tagtable(tag, fn) \ 114#define __tagtable(tag, fn) \
115 static struct tagtable __tagtable_##fn __tag = { tag, fn } 115 static struct tagtable __tagtable_##fn __tag = { tag, fn }
116 116
diff --git a/include/asm-avr32/unistd.h b/include/asm-avr32/unistd.h
index 8f5120471819..2418cce624cc 100644
--- a/include/asm-avr32/unistd.h
+++ b/include/asm-avr32/unistd.h
@@ -295,8 +295,10 @@
295#define __NR_shmdt 276 295#define __NR_shmdt 276
296#define __NR_shmctl 277 296#define __NR_shmctl 277
297 297
298#define __NR_utimensat 278
299
298#ifdef __KERNEL__ 300#ifdef __KERNEL__
299#define NR_syscalls 278 301#define NR_syscalls 279
300 302
301 303
302#define __ARCH_WANT_IPC_PARSE_VERSION 304#define __ARCH_WANT_IPC_PARSE_VERSION
diff --git a/include/asm-blackfin/processor.h b/include/asm-blackfin/processor.h
index 997465c93e82..0336ff132c16 100644
--- a/include/asm-blackfin/processor.h
+++ b/include/asm-blackfin/processor.h
@@ -58,10 +58,10 @@ do { \
58 (_regs)->pc = (_pc); \ 58 (_regs)->pc = (_pc); \
59 if (current->mm) \ 59 if (current->mm) \
60 (_regs)->p5 = current->mm->start_data; \ 60 (_regs)->p5 = current->mm->start_data; \
61 current->thread_info->l1_task_info.stack_start \ 61 task_thread_info(current)->l1_task_info.stack_start \
62 = (void *)current->mm->context.stack_start; \ 62 = (void *)current->mm->context.stack_start; \
63 current->thread_info->l1_task_info.lowest_sp = (void *)(_usp); \ 63 task_thread_info(current)->l1_task_info.lowest_sp = (void *)(_usp); \
64 memcpy(L1_SCRATCH_TASK_INFO, &current->thread_info->l1_task_info, \ 64 memcpy(L1_SCRATCH_TASK_INFO, &task_thread_info(current)->l1_task_info, \
65 sizeof(*L1_SCRATCH_TASK_INFO)); \ 65 sizeof(*L1_SCRATCH_TASK_INFO)); \
66 wrusp(_usp); \ 66 wrusp(_usp); \
67} while(0) 67} while(0)
diff --git a/include/asm-blackfin/system.h b/include/asm-blackfin/system.h
index b5bf6e7cb5e8..5e5f1a0566c0 100644
--- a/include/asm-blackfin/system.h
+++ b/include/asm-blackfin/system.h
@@ -239,9 +239,9 @@ asmlinkage struct task_struct *resume(struct task_struct *prev, struct task_stru
239 239
240#define switch_to(prev,next,last) \ 240#define switch_to(prev,next,last) \
241do { \ 241do { \
242 memcpy (&prev->thread_info->l1_task_info, L1_SCRATCH_TASK_INFO, \ 242 memcpy (&task_thread_info(prev)->l1_task_info, L1_SCRATCH_TASK_INFO, \
243 sizeof *L1_SCRATCH_TASK_INFO); \ 243 sizeof *L1_SCRATCH_TASK_INFO); \
244 memcpy (L1_SCRATCH_TASK_INFO, &next->thread_info->l1_task_info, \ 244 memcpy (L1_SCRATCH_TASK_INFO, &task_thread_info(next)->l1_task_info, \
245 sizeof *L1_SCRATCH_TASK_INFO); \ 245 sizeof *L1_SCRATCH_TASK_INFO); \
246 (last) = resume (prev, next); \ 246 (last) = resume (prev, next); \
247} while (0) 247} while (0)
diff --git a/include/asm-frv/tlb.h b/include/asm-frv/tlb.h
index f94fe5cb9b3a..cd458eb6d75e 100644
--- a/include/asm-frv/tlb.h
+++ b/include/asm-frv/tlb.h
@@ -3,7 +3,11 @@
3 3
4#include <asm/tlbflush.h> 4#include <asm/tlbflush.h>
5 5
6#ifdef CONFIG_MMU
7extern void check_pgt_cache(void);
8#else
6#define check_pgt_cache() do {} while(0) 9#define check_pgt_cache() do {} while(0)
10#endif
7 11
8/* 12/*
9 * we don't need any special per-pte or per-vma handling... 13 * we don't need any special per-pte or per-vma handling...
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 3503ad66945e..118e9812778f 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -122,21 +122,21 @@ static inline int pfn_valid(int pfn)
122 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) 122 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
123#define alloc_bootmem_node(pgdat, x) \ 123#define alloc_bootmem_node(pgdat, x) \
124({ \ 124({ \
125 struct pglist_data __attribute__ ((unused)) \ 125 struct pglist_data __maybe_unused \
126 *__alloc_bootmem_node__pgdat = (pgdat); \ 126 *__alloc_bootmem_node__pgdat = (pgdat); \
127 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ 127 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
128 __pa(MAX_DMA_ADDRESS)); \ 128 __pa(MAX_DMA_ADDRESS)); \
129}) 129})
130#define alloc_bootmem_pages_node(pgdat, x) \ 130#define alloc_bootmem_pages_node(pgdat, x) \
131({ \ 131({ \
132 struct pglist_data __attribute__ ((unused)) \ 132 struct pglist_data __maybe_unused \
133 *__alloc_bootmem_node__pgdat = (pgdat); \ 133 *__alloc_bootmem_node__pgdat = (pgdat); \
134 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ 134 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
135 __pa(MAX_DMA_ADDRESS)) \ 135 __pa(MAX_DMA_ADDRESS)) \
136}) 136})
137#define alloc_bootmem_low_pages_node(pgdat, x) \ 137#define alloc_bootmem_low_pages_node(pgdat, x) \
138({ \ 138({ \
139 struct pglist_data __attribute__ ((unused)) \ 139 struct pglist_data __maybe_unused \
140 *__alloc_bootmem_node__pgdat = (pgdat); \ 140 *__alloc_bootmem_node__pgdat = (pgdat); \
141 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ 141 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
142}) 142})
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 26861df52cc4..df21ea049369 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -86,62 +86,50 @@ static inline unsigned long long native_read_pmc(void)
86 86
87#define rdmsr(msr,val1,val2) \ 87#define rdmsr(msr,val1,val2) \
88 do { \ 88 do { \
89 unsigned long long __val = native_read_msr(msr); \ 89 u64 __val = native_read_msr(msr); \
90 val1 = __val; \ 90 (val1) = (u32)__val; \
91 val2 = __val >> 32; \ 91 (val2) = (u32)(__val >> 32); \
92 } while(0) 92 } while(0)
93 93
94#define wrmsr(msr,val1,val2) \ 94static inline void wrmsr(u32 __msr, u32 __low, u32 __high)
95 native_write_msr(msr, ((unsigned long long)val2 << 32) | val1)
96
97#define rdmsrl(msr,val) \
98 do { \
99 (val) = native_read_msr(msr); \
100 } while(0)
101
102static inline void wrmsrl (unsigned long msr, unsigned long long val)
103{ 95{
104 unsigned long lo, hi; 96 native_write_msr(__msr, ((u64)__high << 32) | __low);
105 lo = (unsigned long) val;
106 hi = val >> 32;
107 wrmsr (msr, lo, hi);
108} 97}
109 98
99#define rdmsrl(msr,val) \
100 ((val) = native_read_msr(msr))
101
102#define wrmsrl(msr,val) native_write_msr(msr, val)
103
110/* wrmsr with exception handling */ 104/* wrmsr with exception handling */
111#define wrmsr_safe(msr,val1,val2) \ 105static inline int wrmsr_safe(u32 __msr, u32 __low, u32 __high)
112 (native_write_msr_safe(msr, ((unsigned long long)val2 << 32) | val1)) 106{
107 return native_write_msr_safe(__msr, ((u64)__high << 32) | __low);
108}
113 109
114/* rdmsr with exception handling */ 110/* rdmsr with exception handling */
115#define rdmsr_safe(msr,p1,p2) \ 111#define rdmsr_safe(msr,p1,p2) \
116 ({ \ 112 ({ \
117 int __err; \ 113 int __err; \
118 unsigned long long __val = native_read_msr_safe(msr, &__err);\ 114 u64 __val = native_read_msr_safe(msr, &__err); \
119 (*p1) = __val; \ 115 (*p1) = (u32)__val; \
120 (*p2) = __val >> 32; \ 116 (*p2) = (u32)(__val >> 32); \
121 __err; \ 117 __err; \
122 }) 118 })
123 119
124#define rdtsc(low,high) \
125 do { \
126 u64 _l = native_read_tsc(); \
127 (low) = (u32)_l; \
128 (high) = _l >> 32; \
129 } while(0)
130
131#define rdtscl(low) \ 120#define rdtscl(low) \
132 do { \ 121 ((low) = (u32)native_read_tsc())
133 (low) = native_read_tsc(); \
134 } while(0)
135 122
136#define rdtscll(val) ((val) = native_read_tsc()) 123#define rdtscll(val) \
124 ((val) = native_read_tsc())
137 125
138#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) 126#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
139 127
140#define rdpmc(counter,low,high) \ 128#define rdpmc(counter,low,high) \
141 do { \ 129 do { \
142 u64 _l = native_read_pmc(); \ 130 u64 _l = native_read_pmc(); \
143 low = (u32)_l; \ 131 (low) = (u32)_l; \
144 high = _l >> 32; \ 132 (high) = (u32)(_l >> 32); \
145 } while(0) 133 } while(0)
146#endif /* !CONFIG_PARAVIRT */ 134#endif /* !CONFIG_PARAVIRT */
147 135
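
Summary of the msr.h rework: wrmsr(), wrmsr_safe(), and wrmsrl() become
typed inline functions, so callers get real argument checking, while
rdmsr()/rdmsr_safe() stay macros because they must assign two output
lvalues; the never-used rdtsc(low, high) macro is deleted outright
(including its paravirt twin below). A usage sketch under those
assumptions (0x1b is the architectural IA32_APIC_BASE MSR, used purely
for illustration):

	static void msr_example(void)
	{
		u32 lo, hi;
		u64 tsc;

		rdmsr(0x1b, lo, hi);	/* macro: writes two u32 lvalues */
		wrmsr(0x1b, lo, hi);	/* now a typed inline (u32, u32, u32) */
		rdtscll(tsc);		/* whole 64-bit TSC in one assignment */
	}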
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index e2e7f98723c5..bc5c12c13581 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -560,11 +560,6 @@ static inline u64 paravirt_read_tsc(void)
560{ 560{
561 return PVOP_CALL0(u64, read_tsc); 561 return PVOP_CALL0(u64, read_tsc);
562} 562}
563#define rdtsc(low,high) do { \
564 u64 _l = paravirt_read_tsc(); \
565 low = (u32)_l; \
566 high = _l >> 32; \
567} while(0)
568 563
569#define rdtscl(low) do { \ 564#define rdtscl(low) do { \
570 u64 _l = paravirt_read_tsc(); \ 565 u64 _l = paravirt_read_tsc(); \
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 090abc1da32a..0c7132787062 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -124,20 +124,6 @@ static inline int num_booting_cpus(void)
124 return cpus_weight(cpu_callout_map); 124 return cpus_weight(cpu_callout_map);
125} 125}
126 126
127#ifdef CONFIG_X86_LOCAL_APIC
128
129#ifdef APIC_DEFINITION
130extern int hard_smp_processor_id(void);
131#else
132#include <mach_apicdef.h>
133static inline int hard_smp_processor_id(void)
134{
135 /* we don't want to mark this access volatile - bad code generation */
136 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
137}
138#endif
139#endif
140
141extern int safe_smp_processor_id(void); 127extern int safe_smp_processor_id(void);
142extern int __cpu_disable(void); 128extern int __cpu_disable(void);
143extern void __cpu_die(unsigned int cpu); 129extern void __cpu_die(unsigned int cpu);
@@ -152,10 +138,31 @@ extern unsigned int num_processors;
152 138
153#define NO_PROC_ID 0xFF /* No processor magic marker */ 139#define NO_PROC_ID 0xFF /* No processor magic marker */
154 140
155#endif 141#endif /* CONFIG_SMP */
156 142
157#ifndef __ASSEMBLY__ 143#ifndef __ASSEMBLY__
158 144
145#ifdef CONFIG_X86_LOCAL_APIC
146
147#ifdef APIC_DEFINITION
148extern int hard_smp_processor_id(void);
149#else
150#include <mach_apicdef.h>
151static inline int hard_smp_processor_id(void)
152{
153 /* we don't want to mark this access volatile - bad code generation */
154 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
155}
156#endif /* APIC_DEFINITION */
157
158#else /* CONFIG_X86_LOCAL_APIC */
159
160#ifndef CONFIG_SMP
161#define hard_smp_processor_id() 0
162#endif
163
164#endif /* CONFIG_X86_LOCAL_APIC */
165
159extern u8 apicid_2_node[]; 166extern u8 apicid_2_node[];
160 167
161#ifdef CONFIG_X86_LOCAL_APIC 168#ifdef CONFIG_X86_LOCAL_APIC
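
The hard_smp_processor_id() movement here, plus the UP stubs added to
the alpha, ia64, m32r, powerpc, and s390 headers in this same patch,
serve one goal: the symbol is now defined on every configuration, so
generic code can call it unconditionally. Illustrative caller (not from
this patch):

	static void report_boot_cpu(void)
	{
		/* compiles on !CONFIG_SMP too, where the stub folds to 0 */
		printk(KERN_INFO "booting on physical CPU %d\n",
		       hard_smp_processor_id());
	}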
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index bf01d4b342bd..4cb0f91ae64f 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -172,7 +172,7 @@ static inline struct thread_info *current_thread_info(void)
172#define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */ 172#define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */
173#define TS_POLLING 0x0002 /* True if in idle loop and not sleeping */ 173#define TS_POLLING 0x0002 /* True if in idle loop and not sleeping */
174 174
175#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING) 175#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
176 176
177#endif /* __KERNEL__ */ 177#endif /* __KERNEL__ */
178 178
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index 60fd4ae014f6..c60024989ebd 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -38,6 +38,8 @@ ia64_get_lid (void)
38 return lid.f.id << 8 | lid.f.eid; 38 return lid.f.id << 8 | lid.f.eid;
39} 39}
40 40
41#define hard_smp_processor_id() ia64_get_lid()
42
41#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
42 44
43#define XTP_OFFSET 0x1e0008 45#define XTP_OFFSET 0x1e0008
@@ -110,8 +112,6 @@ max_xtp (void)
110 writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */ 112 writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */
111} 113}
112 114
113#define hard_smp_processor_id() ia64_get_lid()
114
115/* Upping and downing of CPUs */ 115/* Upping and downing of CPUs */
116extern int __cpu_disable (void); 116extern int __cpu_disable (void);
117extern void __cpu_die (unsigned int cpu); 117extern void __cpu_die (unsigned int cpu);
@@ -128,7 +128,7 @@ extern void unlock_ipi_calllock(void);
128extern void identify_siblings (struct cpuinfo_ia64 *); 128extern void identify_siblings (struct cpuinfo_ia64 *);
129extern int is_multithreading_enabled(void); 129extern int is_multithreading_enabled(void);
130 130
131#else 131#else /* CONFIG_SMP */
132 132
133#define cpu_logical_id(i) 0 133#define cpu_logical_id(i) 0
134#define cpu_physical_id(i) ia64_get_lid() 134#define cpu_physical_id(i) ia64_get_lid()
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index 91698599f918..d28147506585 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -110,6 +110,6 @@ struct thread_info {
110 110
111#define TS_POLLING 1 /* true if in idle loop and not sleeping */ 111#define TS_POLLING 1 /* true if in idle loop and not sleeping */
112 112
113#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING) 113#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
114 114
115#endif /* _ASM_IA64_THREAD_INFO_H */ 115#endif /* _ASM_IA64_THREAD_INFO_H */
diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h
index abd937ac5239..078e1a51a042 100644
--- a/include/asm-m32r/smp.h
+++ b/include/asm-m32r/smp.h
@@ -108,6 +108,10 @@ extern unsigned long send_IPI_mask_phys(cpumask_t, int, int);
108#define IPI_SHIFT (0) 108#define IPI_SHIFT (0)
109#define NR_IPIS (8) 109#define NR_IPIS (8)
110 110
111#endif /* CONFIG_SMP */ 111#else /* CONFIG_SMP */
112
113#define hard_smp_processor_id() 0
114
115#endif /* CONFIG_SMP */
112 116
113#endif /* _ASM_M32R_SMP_H */ 117#endif /* _ASM_M32R_SMP_H */
diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h
index c4d622a57dfb..d635a3752488 100644
--- a/include/asm-m68k/thread_info.h
+++ b/include/asm-m68k/thread_info.h
@@ -37,17 +37,17 @@ struct thread_info {
37#define init_stack (init_thread_union.stack) 37#define init_stack (init_thread_union.stack)
38 38
39#define task_thread_info(tsk) (&(tsk)->thread.info) 39#define task_thread_info(tsk) (&(tsk)->thread.info)
40#define task_stack_page(tsk) ((void *)(tsk)->thread_info) 40#define task_stack_page(tsk) ((tsk)->stack)
41#define current_thread_info() task_thread_info(current) 41#define current_thread_info() task_thread_info(current)
42 42
43#define __HAVE_THREAD_FUNCTIONS 43#define __HAVE_THREAD_FUNCTIONS
44 44
45#define setup_thread_stack(p, org) ({ \ 45#define setup_thread_stack(p, org) ({ \
46 *(struct task_struct **)(p)->thread_info = (p); \ 46 *(struct task_struct **)(p)->stack = (p); \
47 task_thread_info(p)->task = (p); \ 47 task_thread_info(p)->task = (p); \
48}) 48})
49 49
50#define end_of_stack(p) ((unsigned long *)(p)->thread_info + 1) 50#define end_of_stack(p) ((unsigned long *)(p)->stack + 1)
51 51
52/* entry.S relies on these definitions! 52/* entry.S relies on these definitions!
53 * bits 0-7 are tested at every exception exit 53 * bits 0-7 are tested at every exception exit
diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h
index 30f23a2b46ca..3713d256d369 100644
--- a/include/asm-mips/system.h
+++ b/include/asm-mips/system.h
@@ -55,7 +55,7 @@ do { \
55 if (cpu_has_dsp) \ 55 if (cpu_has_dsp) \
56 __save_dsp(prev); \ 56 __save_dsp(prev); \
57 next->thread.emulated_fp = 0; \ 57 next->thread.emulated_fp = 0; \
58 (last) = resume(prev, next, next->thread_info); \ 58 (last) = resume(prev, next, task_thread_info(next)); \
59 if (cpu_has_dsp) \ 59 if (cpu_has_dsp) \
60 __restore_dsp(current); \ 60 __restore_dsp(current); \
61} while(0) 61} while(0)
diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h
index fe8579023531..11f4222597a0 100644
--- a/include/asm-parisc/compat.h
+++ b/include/asm-parisc/compat.h
@@ -152,7 +152,7 @@ static __inline__ void __user *compat_alloc_user_space(long len)
152 152
153static inline int __is_compat_task(struct task_struct *t) 153static inline int __is_compat_task(struct task_struct *t)
154{ 154{
155 return test_ti_thread_flag(t->thread_info, TIF_32BIT); 155 return test_ti_thread_flag(task_thread_info(t), TIF_32BIT);
156} 156}
157 157
158static inline int is_compat_task(void) 158static inline int is_compat_task(void)
diff --git a/include/asm-powerpc/smp.h b/include/asm-powerpc/smp.h
index 01717f266dc9..d037f50580e2 100644
--- a/include/asm-powerpc/smp.h
+++ b/include/asm-powerpc/smp.h
@@ -83,6 +83,7 @@ extern void __cpu_die(unsigned int cpu);
83 83
84#else 84#else
85/* for UP */ 85/* for UP */
86#define hard_smp_processor_id() 0
86#define smp_setup_cpu_maps() 87#define smp_setup_cpu_maps()
87 88
88#endif /* CONFIG_SMP */ 89#endif /* CONFIG_SMP */
diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h
index 0a28e6d6ef40..76e424f718c6 100644
--- a/include/asm-s390/smp.h
+++ b/include/asm-s390/smp.h
@@ -110,6 +110,7 @@ static inline void smp_send_stop(void)
110 __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK); 110 __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK);
111} 111}
112 112
113#define hard_smp_processor_id() 0
113#define smp_cpu_not_running(cpu) 1 114#define smp_cpu_not_running(cpu) 1
114#define smp_setup_cpu_possible_map() do { } while (0) 115#define smp_setup_cpu_possible_map() do { } while (0)
115#endif 116#endif
diff --git a/include/asm-sh/cpu-sh3/dma.h b/include/asm-sh/cpu-sh3/dma.h
index 954801b46022..3a66dc458023 100644
--- a/include/asm-sh/cpu-sh3/dma.h
+++ b/include/asm-sh/cpu-sh3/dma.h
@@ -26,7 +26,7 @@ enum {
26 XMIT_SZ_128BIT, 26 XMIT_SZ_128BIT,
27}; 27};
28 28
29static unsigned int ts_shift[] __attribute__ ((used)) = { 29static unsigned int ts_shift[] __maybe_unused = {
30 [XMIT_SZ_8BIT] = 0, 30 [XMIT_SZ_8BIT] = 0,
31 [XMIT_SZ_16BIT] = 1, 31 [XMIT_SZ_16BIT] = 1,
32 [XMIT_SZ_32BIT] = 2, 32 [XMIT_SZ_32BIT] = 2,
diff --git a/include/asm-sh/cpu-sh4/dma-sh7780.h b/include/asm-sh/cpu-sh4/dma-sh7780.h
index 6c90d28331b2..71b426a6e482 100644
--- a/include/asm-sh/cpu-sh4/dma-sh7780.h
+++ b/include/asm-sh/cpu-sh4/dma-sh7780.h
@@ -28,7 +28,7 @@ enum {
28/* 28/*
29 * The DMA count is defined as the number of bytes to transfer. 29 * The DMA count is defined as the number of bytes to transfer.
30 */ 30 */
31static unsigned int __attribute__ ((used)) ts_shift[] = { 31static unsigned int ts_shift[] __maybe_unused = {
32 [XMIT_SZ_8BIT] = 0, 32 [XMIT_SZ_8BIT] = 0,
33 [XMIT_SZ_16BIT] = 1, 33 [XMIT_SZ_16BIT] = 1,
34 [XMIT_SZ_32BIT] = 2, 34 [XMIT_SZ_32BIT] = 2,
diff --git a/include/asm-sh/cpu-sh4/dma.h b/include/asm-sh/cpu-sh4/dma.h
index c135e9cebd9c..36e26a964765 100644
--- a/include/asm-sh/cpu-sh4/dma.h
+++ b/include/asm-sh/cpu-sh4/dma.h
@@ -53,7 +53,7 @@ enum {
53/* 53/*
54 * The DMA count is defined as the number of bytes to transfer. 54 * The DMA count is defined as the number of bytes to transfer.
55 */ 55 */
56static unsigned int ts_shift[] __attribute__ ((used)) = { 56static unsigned int ts_shift[] __maybe_unused = {
57 [XMIT_SZ_64BIT] = 3, 57 [XMIT_SZ_64BIT] = 3,
58 [XMIT_SZ_8BIT] = 0, 58 [XMIT_SZ_8BIT] = 0,
59 [XMIT_SZ_16BIT] = 1, 59 [XMIT_SZ_16BIT] = 1,
diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h
index b9da9a600e35..b3f492208fd2 100644
--- a/include/asm-sparc/smp.h
+++ b/include/asm-sparc/smp.h
@@ -165,6 +165,7 @@ void smp_setup_cpu_possible_map(void);
165 165
166#else /* SMP */ 166#else /* SMP */
167 167
168#define hard_smp_processor_id() 0
168#define smp_setup_cpu_possible_map() do { } while (0) 169#define smp_setup_cpu_possible_map() do { } while (0)
169 170
170#endif /* !(SMP) */ 171#endif /* !(SMP) */
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index cca54804b722..869d16fb907b 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -48,6 +48,7 @@ extern unsigned char boot_cpu_id;
48 48
49#else 49#else
50 50
51#define hard_smp_processor_id() 0
51#define smp_setup_cpu_possible_map() do { } while (0) 52#define smp_setup_cpu_possible_map() do { } while (0)
52#define boot_cpu_id (0) 53#define boot_cpu_id (0)
53 54
diff --git a/include/asm-um/required-features.h b/include/asm-um/required-features.h
new file mode 100644
index 000000000000..dfb967b2d2f3
--- /dev/null
+++ b/include/asm-um/required-features.h
@@ -0,0 +1,9 @@
1#ifndef __UM_REQUIRED_FEATURES_H
2#define __UM_REQUIRED_FEATURES_H
3
4/*
5 * Nothing to see, just need something for the i386 and x86_64 asm
6 * headers to include.
7 */
8
9#endif
diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h
index ca552261ed1f..84f8cf29324e 100644
--- a/include/asm-um/smp.h
+++ b/include/asm-um/smp.h
@@ -24,6 +24,10 @@ extern inline void smp_cpus_done(unsigned int maxcpus)
24 24
25extern struct task_struct *idle_threads[NR_CPUS]; 25extern struct task_struct *idle_threads[NR_CPUS];
26 26
27#else
28
29#define hard_smp_processor_id() 0
30
27#endif 31#endif
28 32
29#endif 33#endif
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index d5704421456b..3f303d2365ed 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -57,12 +57,6 @@ static inline int num_booting_cpus(void)
57 57
58#define raw_smp_processor_id() read_pda(cpunumber) 58#define raw_smp_processor_id() read_pda(cpunumber)
59 59
60static inline int hard_smp_processor_id(void)
61{
62 /* we don't want to mark this access volatile - bad code generation */
63 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
64}
65
66extern int __cpu_disable(void); 60extern int __cpu_disable(void);
67extern void __cpu_die(unsigned int cpu); 61extern void __cpu_die(unsigned int cpu);
68extern void prefill_possible_map(void); 62extern void prefill_possible_map(void);
@@ -71,7 +65,13 @@ extern unsigned __cpuinitdata disabled_cpus;
71 65
72#define NO_PROC_ID 0xFF /* No processor magic marker */ 66#define NO_PROC_ID 0xFF /* No processor magic marker */
73 67
74#endif 68#endif /* CONFIG_SMP */
69
70static inline int hard_smp_processor_id(void)
71{
72 /* we don't want to mark this access volatile - bad code generation */
73 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
74}
75 75
76/* 76/*
77 * Some lowlevel functions might want to know about 77 * Some lowlevel functions might want to know about
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index b7b8021e8c43..ead9f9a56234 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -39,7 +39,7 @@
39 [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ 39 [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
40 [ti_flags] "i" (offsetof(struct thread_info, flags)),\ 40 [ti_flags] "i" (offsetof(struct thread_info, flags)),\
41 [tif_fork] "i" (TIF_FORK), \ 41 [tif_fork] "i" (TIF_FORK), \
42 [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ 42 [thread_info] "i" (offsetof(struct task_struct, stack)), \
43 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ 43 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
44 : "memory", "cc" __EXTRA_CLOBBER) 44 : "memory", "cc" __EXTRA_CLOBBER)
45 45
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index 74a6c74397f7..10bb5a8ed688 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -162,7 +162,7 @@ static inline struct thread_info *stack_thread_info(void)
162#define TS_COMPAT 0x0002 /* 32bit syscall active */ 162#define TS_COMPAT 0x0002 /* 32bit syscall active */
163#define TS_POLLING 0x0004 /* true if in idle loop and not sleeping */ 163#define TS_POLLING 0x0004 /* true if in idle loop and not sleeping */
164 164
165#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING) 165#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
166 166
167#endif /* __KERNEL__ */ 167#endif /* __KERNEL__ */
168 168
diff --git a/include/linux/aio.h b/include/linux/aio.h
index a30ef13c9e62..43dc2ebfaa0e 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -226,7 +226,8 @@ int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
226 __put_ioctx(kioctx); \ 226 __put_ioctx(kioctx); \
227} while (0) 227} while (0)
228 228
229#define in_aio() !is_sync_wait(current->io_wait) 229#define in_aio() (unlikely(!is_sync_wait(current->io_wait)))
230
230/* may be used for debugging */ 231/* may be used for debugging */
231#define warn_if_async() \ 232#define warn_if_async() \
232do { \ 233do { \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a686eabe22d6..db5b00a792f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -854,7 +854,7 @@ static inline void put_dev_sector(Sector p)
854 854
855struct work_struct; 855struct work_struct;
856int kblockd_schedule_work(struct work_struct *work); 856int kblockd_schedule_work(struct work_struct *work);
857void kblockd_flush(void); 857void kblockd_flush_work(struct work_struct *work);
858 858
859#define MODULE_ALIAS_BLOCKDEV(major,minor) \ 859#define MODULE_ALIAS_BLOCKDEV(major,minor) \
860 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 860 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 2665ca04cf8f..bf297b03a4e4 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -49,6 +49,7 @@ struct clocksource;
49 * @shift: cycle to nanosecond divisor (power of two) 49 * @shift: cycle to nanosecond divisor (power of two)
50 * @flags: flags describing special properties 50 * @flags: flags describing special properties
51 * @vread: vsyscall based read 51 * @vread: vsyscall based read
52 * @resume: resume function for the clocksource, if necessary
52 * @cycle_interval: Used internally by timekeeping core, please ignore. 53 * @cycle_interval: Used internally by timekeeping core, please ignore.
53 * @xtime_interval: Used internally by timekeeping core, please ignore. 54 * @xtime_interval: Used internally by timekeeping core, please ignore.
54 */ 55 */
@@ -65,6 +66,7 @@ struct clocksource {
65 u32 shift; 66 u32 shift;
66 unsigned long flags; 67 unsigned long flags;
67 cycle_t (*vread)(void); 68 cycle_t (*vread)(void);
69 void (*resume)(void);
68 70
69 /* timekeeping specific data, ignore */ 71 /* timekeeping specific data, ignore */
70 cycle_t cycle_interval; 72 cycle_t cycle_interval;
@@ -209,6 +211,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
209extern int clocksource_register(struct clocksource*); 211extern int clocksource_register(struct clocksource*);
210extern struct clocksource* clocksource_get_next(void); 212extern struct clocksource* clocksource_get_next(void);
211extern void clocksource_change_rating(struct clocksource *cs, int rating); 213extern void clocksource_change_rating(struct clocksource *cs, int rating);
214extern void clocksource_resume(void);
212 215
213#ifdef CONFIG_GENERIC_TIME_VSYSCALL 216#ifdef CONFIG_GENERIC_TIME_VSYSCALL
214extern void update_vsyscall(struct timespec *ts, struct clocksource *c); 217extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
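
The new @resume callback gives a clocksource a hook that the timekeeping core invokes through clocksource_resume() after a suspend/resume cycle. A hedged sketch of how a driver might wire it up; the example names are illustrative and not taken from this diff:

    static void example_cs_resume(void)
    {
            /* re-sync or reprogram counter state lost across suspend */
    }

    static struct clocksource example_clocksource = {
            .name   = "example",
            .rating = 300,
            .resume = example_cs_resume,
    };
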
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ccd863dd77fa..70a157a130bb 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -253,5 +253,8 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
253 const compat_sigset_t __user *sigmask, 253 const compat_sigset_t __user *sigmask,
254 compat_size_t sigsetsize); 254 compat_size_t sigsetsize);
255 255
256asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename,
257 struct compat_timespec __user *t, int flags);
258
256#endif /* CONFIG_COMPAT */ 259#endif /* CONFIG_COMPAT */
257#endif /* _LINUX_COMPAT_H */ 260#endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a9f794716a81..03ec2311fb29 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -40,3 +40,4 @@
40#define noinline __attribute__((noinline)) 40#define noinline __attribute__((noinline))
41#define __attribute_pure__ __attribute__((pure)) 41#define __attribute_pure__ __attribute__((pure))
42#define __attribute_const__ __attribute__((__const__)) 42#define __attribute_const__ __attribute__((__const__))
43#define __maybe_unused __attribute__((unused))
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index ecd621fd27d2..a9e2863c2dbf 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -4,9 +4,11 @@
4#include <linux/compiler-gcc.h> 4#include <linux/compiler-gcc.h>
5 5
6#if __GNUC_MINOR__ >= 3 6#if __GNUC_MINOR__ >= 3
7# define __attribute_used__ __attribute__((__used__)) 7# define __used __attribute__((__used__))
8# define __attribute_used__ __used /* deprecated */
8#else 9#else
9# define __attribute_used__ __attribute__((__unused__)) 10# define __used __attribute__((__unused__))
11# define __attribute_used__ __used /* deprecated */
10#endif 12#endif
11 13
12#if __GNUC_MINOR__ >= 4 14#if __GNUC_MINOR__ >= 4
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index fd0cc7c4a636..a03e9398a6c2 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -12,7 +12,8 @@
12# define __inline __inline __attribute__((always_inline)) 12# define __inline __inline __attribute__((always_inline))
13#endif 13#endif
14 14
15#define __attribute_used__ __attribute__((__used__)) 15#define __used __attribute__((__used__))
16#define __attribute_used__ __used /* deprecated */
16#define __must_check __attribute__((warn_unused_result)) 17#define __must_check __attribute__((warn_unused_result))
17#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) 18#define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
18#define __always_inline inline __attribute__((always_inline)) 19#define __always_inline inline __attribute__((always_inline))
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 3b6949b41745..498c35920762 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -108,15 +108,30 @@ extern void __chk_io_ptr(const void __iomem *);
108 * Allow us to avoid 'defined but not used' warnings on functions and data, 108 * Allow us to avoid 'defined but not used' warnings on functions and data,
109 * as well as force them to be emitted to the assembly file. 109 * as well as force them to be emitted to the assembly file.
110 * 110 *
111 * As of gcc 3.3, static functions that are not marked with attribute((used)) 111 * As of gcc 3.4, static functions that are not marked with attribute((used))
112 * may be elided from the assembly file. As of gcc 3.3, static data not so 112 * may be elided from the assembly file. As of gcc 3.4, static data not so
113 * marked will not be elided, but this may change in a future gcc version. 113 * marked will not be elided, but this may change in a future gcc version.
114 * 114 *
 115 * NOTE: Because some distributions shipped gcc 3.3 with unit-at-a-time
 116 * mode backported, we must define __used as __attribute__((used))
 117 * for gcc >= 3.3 instead of 3.4.
118 *
115 * In prior versions of gcc, such functions and data would be emitted, but 119 * In prior versions of gcc, such functions and data would be emitted, but
116 * would be warned about except with attribute((unused)). 120 * would be warned about except with attribute((unused)).
121 *
122 * Mark functions that are referenced only in inline assembly as __used so
123 * the code is emitted even though it appears to be unreferenced.
117 */ 124 */
118#ifndef __attribute_used__ 125#ifndef __attribute_used__
119# define __attribute_used__ /* unimplemented */ 126# define __attribute_used__ /* deprecated */
127#endif
128
129#ifndef __used
130# define __used /* unimplemented */
131#endif
132
133#ifndef __maybe_unused
134# define __maybe_unused /* unimplemented */
120#endif 135#endif
121 136
122/* 137/*
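
The split replaces the single __attribute_used__ with two distinct annotations: __used forces the compiler to emit a symbol that is referenced only from inline assembly, while __maybe_unused merely suppresses the "defined but not used" warning and still lets gcc discard the object. A short sketch of the intended usage; both declarations are illustrative:

    /* must survive: referenced only from an asm() statement elsewhere */
    static void __used asm_helper(void)
    {
            /* body called via inline assembly */
    }

    /* harmless if unreferenced in some configurations */
    static unsigned int debug_table[] __maybe_unused = { 1, 2, 3 };
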
diff --git a/include/linux/fb.h b/include/linux/fb.h
index dff7a728948c..c654d0e9ce33 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -868,7 +868,7 @@ struct fb_info {
868#define fb_writeq sbus_writeq 868#define fb_writeq sbus_writeq
869#define fb_memset sbus_memset_io 869#define fb_memset sbus_memset_io
870 870
871#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) 871#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) || defined(__avr32__)
872 872
873#define fb_readb __raw_readb 873#define fb_readb __raw_readb
874#define fb_readw __raw_readw 874#define fb_readw __raw_readw
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 820125c628c1..899fc7f20edd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -3,6 +3,8 @@
3 3
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6union ktime;
7
6/* Second argument to futex syscall */ 8/* Second argument to futex syscall */
7 9
8 10
@@ -15,6 +17,19 @@
15#define FUTEX_LOCK_PI 6 17#define FUTEX_LOCK_PI 6
16#define FUTEX_UNLOCK_PI 7 18#define FUTEX_UNLOCK_PI 7
17#define FUTEX_TRYLOCK_PI 8 19#define FUTEX_TRYLOCK_PI 8
20#define FUTEX_CMP_REQUEUE_PI 9
21
22#define FUTEX_PRIVATE_FLAG 128
23#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG
24
25#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
26#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
27#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
28#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
29#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
30#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
31#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
32#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
18 33
19/* 34/*
20 * Support for robust futexes: the kernel cleans up held futexes at 35 * Support for robust futexes: the kernel cleans up held futexes at
@@ -83,9 +98,14 @@ struct robust_list_head {
83#define FUTEX_OWNER_DIED 0x40000000 98#define FUTEX_OWNER_DIED 0x40000000
84 99
85/* 100/*
101 * Some processes have been requeued on this PI-futex
102 */
103#define FUTEX_WAITER_REQUEUED 0x20000000
104
105/*
86 * The rest of the robust-futex field is for the TID: 106 * The rest of the robust-futex field is for the TID:
87 */ 107 */
88#define FUTEX_TID_MASK 0x3fffffff 108#define FUTEX_TID_MASK 0x0fffffff
89 109
90/* 110/*
91 * This limit protects against a deliberately circular list. 111 * This limit protects against a deliberately circular list.
@@ -94,7 +114,7 @@ struct robust_list_head {
94#define ROBUST_LIST_LIMIT 2048 114#define ROBUST_LIST_LIMIT 2048
95 115
96#ifdef __KERNEL__ 116#ifdef __KERNEL__
97long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, 117long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
98 u32 __user *uaddr2, u32 val2, u32 val3); 118 u32 __user *uaddr2, u32 val2, u32 val3);
99 119
100extern int 120extern int
@@ -106,9 +126,20 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
106 * Don't rearrange members without looking at hash_futex(). 126 * Don't rearrange members without looking at hash_futex().
107 * 127 *
108 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. 128 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
 109 * We set bit 0 to indicate if it's an inode-based key. 129 * We use the two low-order bits of offset to tell what kind of key it is:
 110 */ 130 * 00: Private process futex (PTHREAD_PROCESS_PRIVATE)
 131 * (no reference on an inode or mm)
 132 * 01: Shared futex (PTHREAD_PROCESS_SHARED)
 133 * mapped on a file (reference on the underlying inode)
 134 * 10: Shared futex (PTHREAD_PROCESS_SHARED)
 135 * (but private mapping on an mm, and reference taken on it)
 136 */
137
138#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
139#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
140
111union futex_key { 141union futex_key {
142 u32 __user *uaddr;
112 struct { 143 struct {
113 unsigned long pgoff; 144 unsigned long pgoff;
114 struct inode *inode; 145 struct inode *inode;
@@ -125,7 +156,8 @@ union futex_key {
125 int offset; 156 int offset;
126 } both; 157 } both;
127}; 158};
128int get_futex_key(u32 __user *uaddr, union futex_key *key); 159int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared,
160 union futex_key *key);
129void get_futex_key_refs(union futex_key *key); 161void get_futex_key_refs(union futex_key *key);
130void drop_futex_key_refs(union futex_key *key); 162void drop_futex_key_refs(union futex_key *key);
131 163
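
Taken together, the new constants encode both the command space and the key space: userspace ORs FUTEX_PRIVATE_FLAG into the futex op to request the fast process-private path, the new FUTEX_WAITER_REQUEUED bit in the futex value is why FUTEX_TID_MASK shrinks above, and FUT_OFF_INODE/FUT_OFF_MMSHARED occupy the two low bits of key->both.offset. A sketch of the key decoding described in the comment, using a hypothetical helper:

    enum futex_key_kind { KEY_PRIVATE, KEY_INODE, KEY_MMSHARED };

    static inline enum futex_key_kind futex_key_kind(int offset)
    {
            if (offset & FUT_OFF_INODE)     /* 01: reference on an inode */
                    return KEY_INODE;
            if (offset & FUT_OFF_MMSHARED)  /* 10: reference on an mm */
                    return KEY_MMSHARED;
            return KEY_PRIVATE;             /* 00: process-private */
    }
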
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2c65da7cabb2..f589559cf070 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -413,6 +413,7 @@ char *disk_name (struct gendisk *hd, int part, char *buf);
413extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); 413extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
414extern void add_partition(struct gendisk *, int, sector_t, sector_t, int); 414extern void add_partition(struct gendisk *, int, sector_t, sector_t, int);
415extern void delete_partition(struct gendisk *, int); 415extern void delete_partition(struct gendisk *, int);
416extern void printk_all_partitions(void);
416 417
417extern struct gendisk *alloc_disk_node(int minors, int node_id); 418extern struct gendisk *alloc_disk_node(int minors, int node_id);
418extern struct gendisk *alloc_disk(int minors); 419extern struct gendisk *alloc_disk(int minors);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a36c3d96e2..0d2ef0b082a6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
176#define free_page(addr) free_pages((addr),0) 176#define free_page(addr) free_pages((addr),0)
177 177
178void page_alloc_init(void); 178void page_alloc_init(void);
179#ifdef CONFIG_NUMA 179void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
180void drain_node_pages(int node);
181#else
182static inline void drain_node_pages(int node) { };
183#endif
184 180
185#endif /* __LINUX_GFP_H */ 181#endif /* __LINUX_GFP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index a515eb0afdfb..98e2cce996a4 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -94,17 +94,26 @@ static inline void clear_highpage(struct page *page)
94 94
95/* 95/*
96 * Same but also flushes aliased cache contents to RAM. 96 * Same but also flushes aliased cache contents to RAM.
97 *
98 * This must be a macro because KM_USER0 and friends aren't defined if
99 * !CONFIG_HIGHMEM
97 */ 100 */
98static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size) 101#define zero_user_page(page, offset, size, km_type) \
102 do { \
103 void *kaddr; \
104 \
105 BUG_ON((offset) + (size) > PAGE_SIZE); \
106 \
107 kaddr = kmap_atomic(page, km_type); \
108 memset((char *)kaddr + (offset), 0, (size)); \
109 flush_dcache_page(page); \
110 kunmap_atomic(kaddr, (km_type)); \
111 } while (0)
112
113static inline void __deprecated memclear_highpage_flush(struct page *page,
114 unsigned int offset, unsigned int size)
99{ 115{
100 void *kaddr; 116 zero_user_page(page, offset, size, KM_USER0);
101
102 BUG_ON(offset + size > PAGE_SIZE);
103
104 kaddr = kmap_atomic(page, KM_USER0);
105 memset((char *)kaddr + offset, 0, size);
106 flush_dcache_page(page);
107 kunmap_atomic(kaddr, KM_USER0);
108} 117}
109 118
110#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE 119#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
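
zero_user_page() packages the kmap/memset/flush/kunmap sequence that callers previously open-coded, and memclear_highpage_flush() survives only as a deprecated wrapper around it. A sketch of a typical conversion; zero_tail() is a hypothetical caller:

    /* zero the partial block from 'offset' to the end of the page */
    static void zero_tail(struct page *page, unsigned int offset)
    {
            zero_user_page(page, offset, PAGE_SIZE - offset, KM_USER0);
    }
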
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 795102309bf1..45170b2fa253 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -95,7 +95,7 @@ extern struct group_info init_groups;
95#define INIT_TASK(tsk) \ 95#define INIT_TASK(tsk) \
96{ \ 96{ \
97 .state = 0, \ 97 .state = 0, \
98 .thread_info = &init_thread_info, \ 98 .stack = &init_thread_info, \
99 .usage = ATOMIC_INIT(2), \ 99 .usage = ATOMIC_INIT(2), \
100 .flags = 0, \ 100 .flags = 0, \
101 .lock_depth = -1, \ 101 .lock_depth = -1, \
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 1c65e7a9f186..00dd957e245b 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -30,4 +30,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu);
30int kthread_stop(struct task_struct *k); 30int kthread_stop(struct task_struct *k);
31int kthread_should_stop(void); 31int kthread_should_stop(void);
32 32
33int kthreadd(void *unused);
34extern struct task_struct *kthreadd_task;
35
33#endif /* _LINUX_KTHREAD_H */ 36#endif /* _LINUX_KTHREAD_H */
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 81bb9c7a4eb3..c762954bda14 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -43,7 +43,7 @@
43 * plain scalar nanosecond based representation can be selected by the 43 * plain scalar nanosecond based representation can be selected by the
44 * config switch CONFIG_KTIME_SCALAR. 44 * config switch CONFIG_KTIME_SCALAR.
45 */ 45 */
46typedef union { 46union ktime {
47 s64 tv64; 47 s64 tv64;
48#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR) 48#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
49 struct { 49 struct {
@@ -54,7 +54,9 @@ typedef union {
54# endif 54# endif
55 } tv; 55 } tv;
56#endif 56#endif
57} ktime_t; 57};
58
59typedef union ktime ktime_t; /* Kill this */
58 60
59#define KTIME_MAX ((s64)~((u64)1 << 63)) 61#define KTIME_MAX ((s64)~((u64)1 << 63))
60#if (BITS_PER_LONG == 64) 62#if (BITS_PER_LONG == 64)
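
Giving the union a tag is what makes the futex.h change above possible: a typedef of an anonymous union cannot be forward-declared, but "union ktime" can, so a header may now take a ktime pointer without including ktime.h. A two-line sketch of the pattern; the prototype name is hypothetical:

    union ktime;                                 /* opaque forward declaration */
    long do_futex_sketch(union ktime *timeout);  /* no #include <linux/ktime.h> needed */
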
diff --git a/include/linux/mca.h b/include/linux/mca.h
index 5cff2923092b..37972704617f 100644
--- a/include/linux/mca.h
+++ b/include/linux/mca.h
@@ -94,6 +94,7 @@ struct mca_bus {
94struct mca_driver { 94struct mca_driver {
95 const short *id_table; 95 const short *id_table;
96 void *driver_data; 96 void *driver_data;
97 int integrated_id;
97 struct device_driver driver; 98 struct device_driver driver;
98}; 99};
99#define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver) 100#define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver)
@@ -125,6 +126,7 @@ extern enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev);
125extern struct bus_type mca_bus_type; 126extern struct bus_type mca_bus_type;
126 127
127extern int mca_register_driver(struct mca_driver *drv); 128extern int mca_register_driver(struct mca_driver *drv);
129extern int mca_register_driver_integrated(struct mca_driver *, int);
128extern void mca_unregister_driver(struct mca_driver *drv); 130extern void mca_unregister_driver(struct mca_driver *drv);
129 131
130/* WARNING: only called by the boot time device setup */ 132/* WARNING: only called by the boot time device setup */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f1544e83042..d09b1345a3a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
83 83
84struct per_cpu_pageset { 84struct per_cpu_pageset {
85 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 85 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
86#ifdef CONFIG_NUMA
87 s8 expire;
88#endif
86#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
87 s8 stat_threshold; 90 s8 stat_threshold;
88 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; 91 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/include/linux/module.h b/include/linux/module.h
index 6d3dc9c4ff96..792d483c9af7 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -356,6 +356,9 @@ struct module
356 keeping pointers to this stuff */ 356 keeping pointers to this stuff */
357 char *args; 357 char *args;
358}; 358};
359#ifndef MODULE_ARCH_INIT
360#define MODULE_ARCH_INIT {}
361#endif
359 362
360/* FIXME: It'd be nice to isolate modules during init, too, so they 363/* FIXME: It'd be nice to isolate modules during init, too, so they
361 aren't used before they (may) fail. But presently too much code 364 aren't used before they (may) fail. But presently too much code
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b81bc2adaeff..0d50ea3df689 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -121,11 +121,12 @@ static inline int fastcall mutex_is_locked(struct mutex *lock)
121 * Also see Documentation/mutex-design.txt. 121 * Also see Documentation/mutex-design.txt.
122 */ 122 */
123extern void fastcall mutex_lock(struct mutex *lock); 123extern void fastcall mutex_lock(struct mutex *lock);
124extern int fastcall mutex_lock_interruptible(struct mutex *lock); 124extern int __must_check fastcall mutex_lock_interruptible(struct mutex *lock);
125 125
126#ifdef CONFIG_DEBUG_LOCK_ALLOC 126#ifdef CONFIG_DEBUG_LOCK_ALLOC
127extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); 127extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
128extern int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); 128extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
129 unsigned int subclass);
129#else 130#else
130# define mutex_lock_nested(lock, subclass) mutex_lock(lock) 131# define mutex_lock_nested(lock, subclass) mutex_lock(lock)
131# define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) 132# define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h
index 409b6e02f337..c9c05a78e9bb 100644
--- a/include/linux/nfs4_acl.h
+++ b/include/linux/nfs4_acl.h
@@ -44,7 +44,6 @@
44#define NFS4_ACL_MAX 170 44#define NFS4_ACL_MAX 170
45 45
46struct nfs4_acl *nfs4_acl_new(int); 46struct nfs4_acl *nfs4_acl_new(int);
47void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
48int nfs4_acl_get_whotype(char *, u32); 47int nfs4_acl_get_whotype(char *, u32);
49int nfs4_acl_write_who(int who, char *p); 48int nfs4_acl_write_who(int who, char *p);
50int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, 49int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 10a43ed0527e..9431101bf876 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -112,32 +112,40 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
112 112
113#ifdef __KERNEL__ 113#ifdef __KERNEL__
114 114
115extern int atomic_notifier_chain_register(struct atomic_notifier_head *, 115extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
116 struct notifier_block *); 116 struct notifier_block *nb);
117extern int blocking_notifier_chain_register(struct blocking_notifier_head *, 117extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
118 struct notifier_block *); 118 struct notifier_block *nb);
119extern int raw_notifier_chain_register(struct raw_notifier_head *, 119extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
120 struct notifier_block *); 120 struct notifier_block *nb);
121extern int srcu_notifier_chain_register(struct srcu_notifier_head *, 121extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
122 struct notifier_block *); 122 struct notifier_block *nb);
123 123
124extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *, 124extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
125 struct notifier_block *); 125 struct notifier_block *nb);
126extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *, 126extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
127 struct notifier_block *); 127 struct notifier_block *nb);
128extern int raw_notifier_chain_unregister(struct raw_notifier_head *, 128extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
129 struct notifier_block *); 129 struct notifier_block *nb);
130extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *, 130extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
131 struct notifier_block *); 131 struct notifier_block *nb);
132 132
133extern int atomic_notifier_call_chain(struct atomic_notifier_head *, 133extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
134 unsigned long val, void *v); 134 unsigned long val, void *v);
135extern int blocking_notifier_call_chain(struct blocking_notifier_head *, 135extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
136 unsigned long val, void *v, int nr_to_call, int *nr_calls);
137extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
136 unsigned long val, void *v); 138 unsigned long val, void *v);
137extern int raw_notifier_call_chain(struct raw_notifier_head *, 139extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
140 unsigned long val, void *v, int nr_to_call, int *nr_calls);
141extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
138 unsigned long val, void *v); 142 unsigned long val, void *v);
139extern int srcu_notifier_call_chain(struct srcu_notifier_head *, 143extern int __raw_notifier_call_chain(struct raw_notifier_head *nh,
144 unsigned long val, void *v, int nr_to_call, int *nr_calls);
145extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
140 unsigned long val, void *v); 146 unsigned long val, void *v);
147extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
148 unsigned long val, void *v, int nr_to_call, int *nr_calls);
141 149
142#define NOTIFY_DONE 0x0000 /* Don't care */ 150#define NOTIFY_DONE 0x0000 /* Don't care */
143#define NOTIFY_OK 0x0001 /* Suits me */ 151#define NOTIFY_OK 0x0001 /* Suits me */
@@ -186,6 +194,20 @@ extern int srcu_notifier_call_chain(struct srcu_notifier_head *,
186#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ 194#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */
187#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ 195#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
188#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ 196#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
197#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */
198#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */
199
 200/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
201 * operation in progress
202 */
203#define CPU_TASKS_FROZEN 0x0010
204
205#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)
206#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN)
207#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN)
208#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
209#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
210#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
189 211
190#endif /* __KERNEL__ */ 212#endif /* __KERNEL__ */
191#endif /* _LINUX_NOTIFIER_H */ 213#endif /* _LINUX_NOTIFIER_H */
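
The *_FROZEN event variants let a single callback distinguish ordinary hotplug from hotplug performed while userspace is frozen for suspend, and the new __*_call_chain() variants report via nr_calls how many notifiers actually ran. A hedged sketch of a hotplug callback handling both flavors; my_online_cpu() is a hypothetical helper:

    static void my_online_cpu(unsigned int cpu)
    {
            /* bring per-cpu state up for 'cpu' */
    }

    static int my_cpu_callback(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
    {
            unsigned int cpu = (unsigned long)hcpu;

            switch (action) {
            case CPU_ONLINE:
            case CPU_ONLINE_FROZEN:   /* same event, tasks frozen */
                    my_online_cpu(cpu);
                    break;
            }
            return NOTIFY_OK;
    }
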
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 6e8fa3049e5d..87545e0f0b58 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -107,26 +107,11 @@ typedef int __bitwise suspend_state_t;
107#define PM_SUSPEND_ON ((__force suspend_state_t) 0) 107#define PM_SUSPEND_ON ((__force suspend_state_t) 0)
108#define PM_SUSPEND_STANDBY ((__force suspend_state_t) 1) 108#define PM_SUSPEND_STANDBY ((__force suspend_state_t) 1)
109#define PM_SUSPEND_MEM ((__force suspend_state_t) 3) 109#define PM_SUSPEND_MEM ((__force suspend_state_t) 3)
110#define PM_SUSPEND_DISK ((__force suspend_state_t) 4) 110#define PM_SUSPEND_MAX ((__force suspend_state_t) 4)
111#define PM_SUSPEND_MAX ((__force suspend_state_t) 5)
112
113typedef int __bitwise suspend_disk_method_t;
114
115/* invalid must be 0 so struct pm_ops initialisers can leave it out */
116#define PM_DISK_INVALID ((__force suspend_disk_method_t) 0)
117#define PM_DISK_PLATFORM ((__force suspend_disk_method_t) 1)
118#define PM_DISK_SHUTDOWN ((__force suspend_disk_method_t) 2)
119#define PM_DISK_REBOOT ((__force suspend_disk_method_t) 3)
120#define PM_DISK_TEST ((__force suspend_disk_method_t) 4)
121#define PM_DISK_TESTPROC ((__force suspend_disk_method_t) 5)
122#define PM_DISK_MAX ((__force suspend_disk_method_t) 6)
123 111
124/** 112/**
125 * struct pm_ops - Callbacks for managing platform dependent suspend states. 113 * struct pm_ops - Callbacks for managing platform dependent suspend states.
126 * @valid: Callback to determine whether the given state can be entered. 114 * @valid: Callback to determine whether the given state can be entered.
127 * If %CONFIG_SOFTWARE_SUSPEND is set then %PM_SUSPEND_DISK is
128 * always valid and never passed to this call. If not assigned,
129 * no suspend states are valid.
130 * Valid states are advertised in /sys/power/state but can still 115 * Valid states are advertised in /sys/power/state but can still
131 * be rejected by prepare or enter if the conditions aren't right. 116 * be rejected by prepare or enter if the conditions aren't right.
132 * There is a %pm_valid_only_mem function available that can be assigned 117 * There is a %pm_valid_only_mem function available that can be assigned
@@ -140,24 +125,12 @@ typedef int __bitwise suspend_disk_method_t;
140 * 125 *
141 * @finish: Called when the system has left the given state and all devices 126 * @finish: Called when the system has left the given state and all devices
142 * are resumed. The return value is ignored. 127 * are resumed. The return value is ignored.
143 *
144 * @pm_disk_mode: The generic code always allows one of the shutdown methods
145 * %PM_DISK_SHUTDOWN, %PM_DISK_REBOOT, %PM_DISK_TEST and
146 * %PM_DISK_TESTPROC. If this variable is set, the mode it is set
147 * to is allowed in addition to those modes and is also made default.
148 * When this mode is sent selected, the @prepare call will be called
149 * before suspending to disk (if present), the @enter call should be
150 * present and will be called after all state has been saved and the
151 * machine is ready to be powered off; the @finish callback is called
152 * after state has been restored. All these calls are called with
153 * %PM_SUSPEND_DISK as the state.
154 */ 128 */
155struct pm_ops { 129struct pm_ops {
156 int (*valid)(suspend_state_t state); 130 int (*valid)(suspend_state_t state);
157 int (*prepare)(suspend_state_t state); 131 int (*prepare)(suspend_state_t state);
158 int (*enter)(suspend_state_t state); 132 int (*enter)(suspend_state_t state);
159 int (*finish)(suspend_state_t state); 133 int (*finish)(suspend_state_t state);
160 suspend_disk_method_t pm_disk_mode;
161}; 134};
162 135
163/** 136/**
@@ -276,8 +249,6 @@ extern void device_power_up(void);
276extern void device_resume(void); 249extern void device_resume(void);
277 250
278#ifdef CONFIG_PM 251#ifdef CONFIG_PM
279extern suspend_disk_method_t pm_disk_mode;
280
281extern int device_suspend(pm_message_t state); 252extern int device_suspend(pm_message_t state);
282extern int device_prepare_suspend(pm_message_t state); 253extern int device_prepare_suspend(pm_message_t state);
283 254
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index de72c49747c8..a121f36f4437 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -201,7 +201,6 @@ struct mddev_s
201 struct mutex reconfig_mutex; 201 struct mutex reconfig_mutex;
202 atomic_t active; 202 atomic_t active;
203 203
204 int changed; /* true if we might need to reread partition info */
205 int degraded; /* whether md should consider 204 int degraded; /* whether md should consider
206 * adding a spare 205 * adding a spare
207 */ 206 */
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 759a0f97bec2..6cd8c4425fc7 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -12,6 +12,7 @@
12 12
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/timer.h>
15#include <linux/wait.h> 16#include <linux/wait.h>
16#include <linux/list.h> 17#include <linux/list.h>
17#include <linux/fs.h> 18#include <linux/fs.h>
@@ -38,7 +39,7 @@ struct rchan_buf
38 size_t subbufs_consumed; /* count of sub-buffers consumed */ 39 size_t subbufs_consumed; /* count of sub-buffers consumed */
39 struct rchan *chan; /* associated channel */ 40 struct rchan *chan; /* associated channel */
40 wait_queue_head_t read_wait; /* reader wait queue */ 41 wait_queue_head_t read_wait; /* reader wait queue */
41 struct delayed_work wake_readers; /* reader wake-up work struct */ 42 struct timer_list timer; /* reader wake-up timer */
42 struct dentry *dentry; /* channel file dentry */ 43 struct dentry *dentry; /* channel file dentry */
43 struct kref kref; /* channel buffer refcount */ 44 struct kref kref; /* channel buffer refcount */
44 struct page **page_array; /* array of current buffer pages */ 45 struct page **page_array; /* array of current buffer pages */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d95c480f58d..17b72d88c4cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -817,7 +817,7 @@ struct prio_array;
817 817
818struct task_struct { 818struct task_struct {
819 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 819 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
820 struct thread_info *thread_info; 820 void *stack;
821 atomic_t usage; 821 atomic_t usage;
822 unsigned int flags; /* per process flags, defined below */ 822 unsigned int flags; /* per process flags, defined below */
823 unsigned int ptrace; 823 unsigned int ptrace;
@@ -1317,6 +1317,7 @@ extern int in_egroup_p(gid_t);
1317 1317
1318extern void proc_caches_init(void); 1318extern void proc_caches_init(void);
1319extern void flush_signals(struct task_struct *); 1319extern void flush_signals(struct task_struct *);
1320extern void ignore_signals(struct task_struct *);
1320extern void flush_signal_handlers(struct task_struct *, int force_default); 1321extern void flush_signal_handlers(struct task_struct *, int force_default);
1321extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 1322extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
1322 1323
@@ -1512,8 +1513,8 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
1512 1513
1513#ifndef __HAVE_THREAD_FUNCTIONS 1514#ifndef __HAVE_THREAD_FUNCTIONS
1514 1515
1515#define task_thread_info(task) (task)->thread_info 1516#define task_thread_info(task) ((struct thread_info *)(task)->stack)
1516#define task_stack_page(task) ((void*)((task)->thread_info)) 1517#define task_stack_page(task) ((task)->stack)
1517 1518
1518static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 1519static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
1519{ 1520{
@@ -1523,7 +1524,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
1523 1524
1524static inline unsigned long *end_of_stack(struct task_struct *p) 1525static inline unsigned long *end_of_stack(struct task_struct *p)
1525{ 1526{
1526 return (unsigned long *)(p->thread_info + 1); 1527 return (unsigned long *)(task_thread_info(p) + 1);
1527} 1528}
1528 1529
1529#endif 1530#endif
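
With ->thread_info renamed to the untyped ->stack, all direct dereferences go through task_thread_info(), which supplies the cast; the thread_info still sits at the base of the stack area, so end_of_stack() points just past it. A small sketch, assuming the usual downward-growing stack; stack_left() is hypothetical:

    /* bytes of stack remaining below 'sp' for task 'p' (sketch only) */
    static inline unsigned long stack_left(struct task_struct *p,
                                           unsigned long sp)
    {
            return sp - (unsigned long)end_of_stack(p);
    }
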
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 14749056dd63..3fa0fab4a04b 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -243,6 +243,131 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
243 243
244extern struct kmem_cache *sighand_cachep; 244extern struct kmem_cache *sighand_cachep;
245 245
246/*
247 * In POSIX a signal is sent either to a specific thread (Linux task)
248 * or to the process as a whole (Linux thread group). How the signal
249 * is sent determines whether it's to one thread or the whole group,
250 * which determines which signal mask(s) are involved in blocking it
251 * from being delivered until later. When the signal is delivered,
252 * either it's caught or ignored by a user handler or it has a default
253 * effect that applies to the whole thread group (POSIX process).
254 *
255 * The possible effects an unblocked signal set to SIG_DFL can have are:
256 * ignore - Nothing Happens
257 * terminate - kill the process, i.e. all threads in the group,
258 * similar to exit_group. The group leader (only) reports
259 * WIFSIGNALED status to its parent.
260 * coredump - write a core dump file describing all threads using
261 * the same mm and then kill all those threads
262 * stop - stop all the threads in the group, i.e. TASK_STOPPED state
263 *
264 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 265 * Other signals when not blocked and set to SIG_DFL behave as follows.
266 * The job control signals also have other special effects.
267 *
268 * +--------------------+------------------+
269 * | POSIX signal | default action |
270 * +--------------------+------------------+
271 * | SIGHUP | terminate |
272 * | SIGINT | terminate |
273 * | SIGQUIT | coredump |
274 * | SIGILL | coredump |
275 * | SIGTRAP | coredump |
276 * | SIGABRT/SIGIOT | coredump |
277 * | SIGBUS | coredump |
278 * | SIGFPE | coredump |
279 * | SIGKILL | terminate(+) |
280 * | SIGUSR1 | terminate |
281 * | SIGSEGV | coredump |
282 * | SIGUSR2 | terminate |
283 * | SIGPIPE | terminate |
284 * | SIGALRM | terminate |
285 * | SIGTERM | terminate |
286 * | SIGCHLD | ignore |
287 * | SIGCONT | ignore(*) |
288 * | SIGSTOP | stop(*)(+) |
289 * | SIGTSTP | stop(*) |
290 * | SIGTTIN | stop(*) |
291 * | SIGTTOU | stop(*) |
292 * | SIGURG | ignore |
293 * | SIGXCPU | coredump |
294 * | SIGXFSZ | coredump |
295 * | SIGVTALRM | terminate |
296 * | SIGPROF | terminate |
297 * | SIGPOLL/SIGIO | terminate |
298 * | SIGSYS/SIGUNUSED | coredump |
299 * | SIGSTKFLT | terminate |
300 * | SIGWINCH | ignore |
301 * | SIGPWR | terminate |
302 * | SIGRTMIN-SIGRTMAX | terminate |
303 * +--------------------+------------------+
304 * | non-POSIX signal | default action |
305 * +--------------------+------------------+
306 * | SIGEMT | coredump |
307 * +--------------------+------------------+
308 *
309 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
310 * (*) Special job control effects:
311 * When SIGCONT is sent, it resumes the process (all threads in the group)
312 * from TASK_STOPPED state and also clears any pending/queued stop signals
313 * (any of those marked with "stop(*)"). This happens regardless of blocking,
314 * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
315 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 316 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
317 * default action of stopping the process may happen later or never.
318 */
319
320#ifdef SIGEMT
321#define SIGEMT_MASK rt_sigmask(SIGEMT)
322#else
323#define SIGEMT_MASK 0
324#endif
325
326#if SIGRTMIN > BITS_PER_LONG
327#define rt_sigmask(sig) (1ULL << ((sig)-1))
328#else
329#define rt_sigmask(sig) sigmask(sig)
330#endif
331#define siginmask(sig, mask) (rt_sigmask(sig) & (mask))
332
333#define SIG_KERNEL_ONLY_MASK (\
334 rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP))
335
336#define SIG_KERNEL_STOP_MASK (\
337 rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \
338 rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) )
339
340#define SIG_KERNEL_COREDUMP_MASK (\
341 rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \
342 rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \
343 rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \
344 rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \
345 rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \
346 SIGEMT_MASK )
347
348#define SIG_KERNEL_IGNORE_MASK (\
349 rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \
350 rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) )
351
352#define sig_kernel_only(sig) \
353 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK))
354#define sig_kernel_coredump(sig) \
355 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))
356#define sig_kernel_ignore(sig) \
357 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK))
358#define sig_kernel_stop(sig) \
359 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK))
360
361#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
362
363#define sig_user_defined(t, signr) \
364 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
365 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
366
367#define sig_fatal(t, signr) \
368 (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
369 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
370
246#endif /* __KERNEL__ */ 371#endif /* __KERNEL__ */
247 372
248#endif /* _LINUX_SIGNAL_H */ 373#endif /* _LINUX_SIGNAL_H */
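
The mask helpers mirror the table above, so classifying a non-realtime signal's default action reduces to three tests. A sketch using the new macros; default_action() is a hypothetical helper, not part of this diff:

    static const char *default_action(int sig)
    {
            if (sig_kernel_ignore(sig))
                    return "ignore";
            if (sig_kernel_stop(sig))
                    return "stop";
            if (sig_kernel_coredump(sig))
                    return "coredump";
            return "terminate";    /* everything else, incl. SIGKILL */
    }
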
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7ba23ec8211b..3f70149eabbb 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -83,7 +83,6 @@ void smp_prepare_boot_cpu(void);
83 * These macros fold the SMP functionality into a single CPU system 83 * These macros fold the SMP functionality into a single CPU system
84 */ 84 */
85#define raw_smp_processor_id() 0 85#define raw_smp_processor_id() 0
86#define hard_smp_processor_id() 0
87static inline int up_smp_call_function(void) 86static inline int up_smp_call_function(void)
88{ 87{
89 return 0; 88 return 0;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 35fa4d5aadd0..4a7ae8ab6eb8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -396,4 +396,23 @@ char * svc_print_addr(struct svc_rqst *, char *, size_t);
396 396
397#define RPC_MAX_ADDRBUFLEN (63U) 397#define RPC_MAX_ADDRBUFLEN (63U)
398 398
399/*
400 * When we want to reduce the size of the reserved space in the response
401 * buffer, we need to take into account the size of any checksum data that
402 * may be at the end of the packet. This is difficult to determine exactly
403 * for all cases without actually generating the checksum, so we just use a
404 * static value.
405 */
406static inline void
407svc_reserve_auth(struct svc_rqst *rqstp, int space)
408{
409 int added_space = 0;
410
411 switch(rqstp->rq_authop->flavour) {
412 case RPC_AUTH_GSS:
413 added_space = RPC_MAX_AUTH_SIZE;
414 }
415 return svc_reserve(rqstp, space + added_space);
416}
417
399#endif /* SUNRPC_SVC_H */ 418#endif /* SUNRPC_SVC_H */
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 7909687557bf..e21dd93ac4b7 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -37,7 +37,8 @@ struct svc_sock {
37 37
38 atomic_t sk_reserved; /* space on outq that is reserved */ 38 atomic_t sk_reserved; /* space on outq that is reserved */
39 39
40 spinlock_t sk_defer_lock; /* protects sk_deferred */ 40 spinlock_t sk_lock; /* protects sk_deferred and
41 * sk_info_authunix */
41 struct list_head sk_deferred; /* deferred requests that need to 42 struct list_head sk_deferred; /* deferred requests that need to
 42 * be revisited */ 43 * be revisited */
43 struct mutex sk_mutex; /* to serialize sending data */ 44 struct mutex sk_mutex; /* to serialize sending data */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 9d2aa1a12aa0..d74da9122b60 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -32,6 +32,24 @@ static inline int pm_prepare_console(void) { return 0; }
32static inline void pm_restore_console(void) {} 32static inline void pm_restore_console(void) {}
33#endif 33#endif
34 34
35/**
36 * struct hibernation_ops - hibernation platform support
37 *
38 * The methods in this structure allow a platform to override the default
39 * mechanism of shutting down the machine during a hibernation transition.
40 *
41 * All three methods must be assigned.
42 *
43 * @prepare: prepare system for hibernation
44 * @enter: shut down system after state has been saved to disk
45 * @finish: finish/clean up after state has been reloaded
46 */
47struct hibernation_ops {
48 int (*prepare)(void);
49 int (*enter)(void);
50 void (*finish)(void);
51};
52
35#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) 53#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND)
36/* kernel/power/snapshot.c */ 54/* kernel/power/snapshot.c */
37extern void __init register_nosave_region(unsigned long, unsigned long); 55extern void __init register_nosave_region(unsigned long, unsigned long);
@@ -39,11 +57,17 @@ extern int swsusp_page_is_forbidden(struct page *);
39extern void swsusp_set_page_free(struct page *); 57extern void swsusp_set_page_free(struct page *);
40extern void swsusp_unset_page_free(struct page *); 58extern void swsusp_unset_page_free(struct page *);
41extern unsigned long get_safe_page(gfp_t gfp_mask); 59extern unsigned long get_safe_page(gfp_t gfp_mask);
60
61extern void hibernation_set_ops(struct hibernation_ops *ops);
62extern int hibernate(void);
42#else 63#else
43static inline void register_nosave_region(unsigned long b, unsigned long e) {} 64static inline void register_nosave_region(unsigned long b, unsigned long e) {}
44static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } 65static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
45static inline void swsusp_set_page_free(struct page *p) {} 66static inline void swsusp_set_page_free(struct page *p) {}
46static inline void swsusp_unset_page_free(struct page *p) {} 67static inline void swsusp_unset_page_free(struct page *p) {}
68
69static inline void hibernation_set_ops(struct hibernation_ops *ops) {}
70static inline int hibernate(void) { return -ENOSYS; }
47#endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */ 71#endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */
48 72
49void save_processor_state(void); 73void save_processor_state(void);
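
A platform opts in by filling all three hibernation_ops methods and registering the structure once at init time. A hedged sketch of the registration pattern; the my_* names are illustrative stand-ins, not taken from this diff:

    static int my_hibernation_prepare(void) { return 0; }
    static int my_hibernation_enter(void)   { return 0; }
    static void my_hibernation_finish(void) { }

    static struct hibernation_ops my_hibernation_ops = {
            .prepare = my_hibernation_prepare,
            .enter   = my_hibernation_enter,
            .finish  = my_hibernation_finish,
    };

    static int __init my_pm_init(void)
    {
            hibernation_set_ops(&my_hibernation_ops);
            return 0;
    }
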
diff --git a/include/linux/svga.h b/include/linux/svga.h
index e1cc552e04fe..13ad0b82ac28 100644
--- a/include/linux/svga.h
+++ b/include/linux/svga.h
@@ -113,6 +113,8 @@ void svga_tilefill(struct fb_info *info, struct fb_tilerect *rect);
113void svga_tileblit(struct fb_info *info, struct fb_tileblit *blit); 113void svga_tileblit(struct fb_info *info, struct fb_tileblit *blit);
114void svga_tilecursor(struct fb_info *info, struct fb_tilecursor *cursor); 114void svga_tilecursor(struct fb_info *info, struct fb_tilecursor *cursor);
115int svga_get_tilemax(struct fb_info *info); 115int svga_get_tilemax(struct fb_info *info);
116void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
117 struct fb_var_screeninfo *var);
116 118
117int svga_compute_pll(const struct svga_pll *pll, u32 f_wanted, u16 *m, u16 *n, u16 *r, int node); 119int svga_compute_pll(const struct svga_pll *pll, u32 f_wanted, u16 *m, u16 *n, u16 *r, int node);
118int svga_check_timings(const struct svga_timing_regs *tm, struct fb_var_screeninfo *var, int node); 120int svga_check_timings(const struct svga_timing_regs *tm, struct fb_var_screeninfo *var, int node);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6cbef55..3139f4412297 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -576,6 +576,8 @@ asmlinkage long sys_fstatat64(int dfd, char __user *filename,
576 struct stat64 __user *statbuf, int flag); 576 struct stat64 __user *statbuf, int flag);
577asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, 577asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
578 int bufsiz); 578 int bufsiz);
579asmlinkage long sys_utimensat(int dfd, char __user *filename,
580 struct timespec __user *utimes, int flags);
579asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, 581asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
580 struct compat_timeval __user *t); 582 struct compat_timeval __user *t);
581asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, 583asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
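
For reference, the new syscall can be exercised from userspace through syscall(2) -- a sketch assuming the installed kernel headers define __NR_utimensat (glibc had no wrapper at the time). The two timespecs are the access and modification times:

#include <fcntl.h>          /* AT_FDCWD */
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        struct timespec times[2] = {
                { .tv_sec = 0, .tv_nsec = 0 },  /* atime */
                { .tv_sec = 0, .tv_nsec = 0 },  /* mtime */
        };

        /* set both timestamps of "file" to the epoch */
        return syscall(__NR_utimensat, AT_FDCWD, "file", times, 0) ? 1 : 0;
}
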
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index acb1f105870c..d9325cf8a134 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -212,8 +212,6 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
212extern void __dec_zone_state(struct zone *, enum zone_stat_item); 212extern void __dec_zone_state(struct zone *, enum zone_stat_item);
213 213
214void refresh_cpu_vm_stats(int); 214void refresh_cpu_vm_stats(int);
215void refresh_vm_stats(void);
216
217#else /* CONFIG_SMP */ 215#else /* CONFIG_SMP */
218 216
219/* 217/*
@@ -260,7 +258,6 @@ static inline void __dec_zone_page_state(struct page *page,
260#define mod_zone_page_state __mod_zone_page_state 258#define mod_zone_page_state __mod_zone_page_state
261 259
262static inline void refresh_cpu_vm_stats(int cpu) { } 260static inline void refresh_cpu_vm_stats(int cpu) { }
263static inline void refresh_vm_stats(void) { }
264#endif 261#endif
265 262
266#endif /* _LINUX_VMSTAT_H */ 263#endif /* _LINUX_VMSTAT_H */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f16ba1e0687d..d555f31c0746 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -24,15 +24,13 @@ typedef void (*work_func_t)(struct work_struct *work);
24struct work_struct { 24struct work_struct {
25 atomic_long_t data; 25 atomic_long_t data;
26#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */ 26#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
27#define WORK_STRUCT_NOAUTOREL 1 /* F if work item automatically released on exec */
28#define WORK_STRUCT_FLAG_MASK (3UL) 27#define WORK_STRUCT_FLAG_MASK (3UL)
29#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) 28#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
30 struct list_head entry; 29 struct list_head entry;
31 work_func_t func; 30 work_func_t func;
32}; 31};
33 32
34#define WORK_DATA_INIT(autorelease) \ 33#define WORK_DATA_INIT() ATOMIC_LONG_INIT(0)
35 ATOMIC_LONG_INIT((autorelease) << WORK_STRUCT_NOAUTOREL)
36 34
37struct delayed_work { 35struct delayed_work {
38 struct work_struct work; 36 struct work_struct work;
@@ -44,14 +42,8 @@ struct execute_work {
44}; 42};
45 43
46#define __WORK_INITIALIZER(n, f) { \ 44#define __WORK_INITIALIZER(n, f) { \
47 .data = WORK_DATA_INIT(0), \ 45 .data = WORK_DATA_INIT(), \
48 .entry = { &(n).entry, &(n).entry }, \ 46 .entry = { &(n).entry, &(n).entry }, \
49 .func = (f), \
50 }
51
52#define __WORK_INITIALIZER_NAR(n, f) { \
53 .data = WORK_DATA_INIT(1), \
54 .entry = { &(n).entry, &(n).entry }, \
55 .func = (f), \ 47 .func = (f), \
56 } 48 }
57 49
@@ -60,23 +52,12 @@ struct execute_work {
60 .timer = TIMER_INITIALIZER(NULL, 0, 0), \ 52 .timer = TIMER_INITIALIZER(NULL, 0, 0), \
61 } 53 }
62 54
63#define __DELAYED_WORK_INITIALIZER_NAR(n, f) { \
64 .work = __WORK_INITIALIZER_NAR((n).work, (f)), \
65 .timer = TIMER_INITIALIZER(NULL, 0, 0), \
66 }
67
68#define DECLARE_WORK(n, f) \ 55#define DECLARE_WORK(n, f) \
69 struct work_struct n = __WORK_INITIALIZER(n, f) 56 struct work_struct n = __WORK_INITIALIZER(n, f)
70 57
71#define DECLARE_WORK_NAR(n, f) \
72 struct work_struct n = __WORK_INITIALIZER_NAR(n, f)
73
74#define DECLARE_DELAYED_WORK(n, f) \ 58#define DECLARE_DELAYED_WORK(n, f) \
75 struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) 59 struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
76 60
77#define DECLARE_DELAYED_WORK_NAR(n, f) \
78 struct dwork_struct n = __DELAYED_WORK_INITIALIZER_NAR(n, f)
79
80/* 61/*
81 * initialize a work item's function pointer 62 * initialize a work item's function pointer
82 */ 63 */
@@ -95,16 +76,9 @@ struct execute_work {
95 * assignment of the work data initializer allows the compiler 76 * assignment of the work data initializer allows the compiler
96 * to generate better code. 77 * to generate better code.
97 */ 78 */
98#define INIT_WORK(_work, _func) \ 79#define INIT_WORK(_work, _func) \
99 do { \
100 (_work)->data = (atomic_long_t) WORK_DATA_INIT(0); \
101 INIT_LIST_HEAD(&(_work)->entry); \
102 PREPARE_WORK((_work), (_func)); \
103 } while (0)
104
105#define INIT_WORK_NAR(_work, _func) \
106 do { \ 80 do { \
107 (_work)->data = (atomic_long_t) WORK_DATA_INIT(1); \ 81 (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
108 INIT_LIST_HEAD(&(_work)->entry); \ 82 INIT_LIST_HEAD(&(_work)->entry); \
109 PREPARE_WORK((_work), (_func)); \ 83 PREPARE_WORK((_work), (_func)); \
110 } while (0) 84 } while (0)
@@ -115,12 +89,6 @@ struct execute_work {
115 init_timer(&(_work)->timer); \ 89 init_timer(&(_work)->timer); \
116 } while (0) 90 } while (0)
117 91
118#define INIT_DELAYED_WORK_NAR(_work, _func) \
119 do { \
120 INIT_WORK_NAR(&(_work)->work, (_func)); \
121 init_timer(&(_work)->timer); \
122 } while (0)
123
124#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ 92#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \
125 do { \ 93 do { \
126 INIT_WORK(&(_work)->work, (_func)); \ 94 INIT_WORK(&(_work)->work, (_func)); \
@@ -143,24 +111,10 @@ struct execute_work {
143 work_pending(&(w)->work) 111 work_pending(&(w)->work)
144 112
145/** 113/**
146 * work_release - Release a work item under execution 114 * work_clear_pending - for internal use only, mark a work item as not pending
147 * @work: The work item to release 115 * @work: The work item in question
148 *
149 * This is used to release a work item that has been initialised with automatic
150 * release mode disabled (WORK_STRUCT_NOAUTOREL is set). This gives the work
151 * function the opportunity to grab auxiliary data from the container of the
152 * work_struct before clearing the pending bit as the work_struct may be
153 * subject to deallocation the moment the pending bit is cleared.
154 *
155 * In such a case, this should be called in the work function after it has
156 * fetched any data it may require from the containter of the work_struct.
157 * After this function has been called, the work_struct may be scheduled for
158 * further execution or it may be deallocated unless other precautions are
159 * taken.
160 *
161 * This should also be used to release a delayed work item.
162 */ 116 */
163#define work_release(work) \ 117#define work_clear_pending(work) \
164 clear_bit(WORK_STRUCT_PENDING, work_data_bits(work)) 118 clear_bit(WORK_STRUCT_PENDING, work_data_bits(work))
165 119
166 120
@@ -174,27 +128,28 @@ extern struct workqueue_struct *__create_workqueue(const char *name,
174extern void destroy_workqueue(struct workqueue_struct *wq); 128extern void destroy_workqueue(struct workqueue_struct *wq);
175 129
176extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); 130extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
177extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay)); 131extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
132 struct delayed_work *work, unsigned long delay));
178extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 133extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
179 struct delayed_work *work, unsigned long delay); 134 struct delayed_work *work, unsigned long delay);
135
180extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq)); 136extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
137extern void flush_scheduled_work(void);
181 138
182extern int FASTCALL(schedule_work(struct work_struct *work)); 139extern int FASTCALL(schedule_work(struct work_struct *work));
183extern int FASTCALL(run_scheduled_work(struct work_struct *work)); 140extern int FASTCALL(schedule_delayed_work(struct delayed_work *work,
184extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay)); 141 unsigned long delay));
185 142extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
186extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); 143 unsigned long delay);
187extern int schedule_on_each_cpu(work_func_t func); 144extern int schedule_on_each_cpu(work_func_t func);
188extern void flush_scheduled_work(void);
189extern int current_is_keventd(void); 145extern int current_is_keventd(void);
190extern int keventd_up(void); 146extern int keventd_up(void);
191 147
192extern void init_workqueues(void); 148extern void init_workqueues(void);
193void cancel_rearming_delayed_work(struct delayed_work *work);
194void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
195 struct delayed_work *);
196int execute_in_process_context(work_func_t fn, struct execute_work *); 149int execute_in_process_context(work_func_t fn, struct execute_work *);
197 150
151extern void cancel_work_sync(struct work_struct *work);
152
198/* 153/*
199 * Kill off a pending schedule_delayed_work(). Note that the work callback 154 * Kill off a pending schedule_delayed_work(). Note that the work callback
200 * function may still be running on return from cancel_delayed_work(), unless 155 * function may still be running on return from cancel_delayed_work(), unless
@@ -207,8 +162,18 @@ static inline int cancel_delayed_work(struct delayed_work *work)
207 162
208 ret = del_timer(&work->timer); 163 ret = del_timer(&work->timer);
209 if (ret) 164 if (ret)
210 work_release(&work->work); 165 work_clear_pending(&work->work);
211 return ret; 166 return ret;
212} 167}
213 168
169extern void cancel_rearming_delayed_work(struct delayed_work *work);
170
171/* Obsolete. use cancel_rearming_delayed_work() */
172static inline
173void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
174 struct delayed_work *work)
175{
176 cancel_rearming_delayed_work(work);
177}
178
214#endif 179#endif
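
With the *_NAR variants and work_release() gone, a typical user now declares the work item plainly and relies on the new cancel_work_sync() at teardown. A minimal sketch of the resulting pattern (my_work_fn, my_init and my_exit are illustrative names):

#include <linux/module.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
        /* runs in keventd context; the pending bit is already cleared,
         * so the item may be rescheduled from here if desired */
}

static DECLARE_WORK(my_work, my_work_fn);

static int __init my_init(void)
{
        schedule_work(&my_work);
        return 0;
}

static void __exit my_exit(void)
{
        /* blocks until a running instance finishes, then kills any
         * pending one -- the replacement for the old NAR dance */
        cancel_work_sync(&my_work);
}

module_init(my_init);
module_exit(my_exit);
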
diff --git a/init/Kconfig b/init/Kconfig
index a7e48796d571..e63a017c391e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -502,6 +502,15 @@ config VM_EVENT_COUNTERS
502 on EMBEDDED systems. /proc/vmstat will only show page counts 502 on EMBEDDED systems. /proc/vmstat will only show page counts
503 if VM event counters are disabled. 503 if VM event counters are disabled.
504 504
505config SLUB_DEBUG
506 default y
507 bool "Enable SLUB debugging support" if EMBEDDED
508 help
509 SLUB has extensive debug support features. Disabling these can
510 result in significant savings in code size. This also disables
511 SLUB sysfs support. /sys/slab will not exist and there will be
512 no support for cache validation etc.
513
505choice 514choice
506 prompt "Choose SLAB allocator" 515 prompt "Choose SLAB allocator"
507 default SLAB 516 default SLAB
@@ -512,9 +521,9 @@ config SLAB
512 bool "SLAB" 521 bool "SLAB"
513 help 522 help
514 The regular slab allocator that is established and known to work 523 The regular slab allocator that is established and known to work
515 well in all environments. It organizes chache hot objects in 524 well in all environments. It organizes cache hot objects in
516 per cpu and per node queues. SLAB is the default choice for 525 per cpu and per node queues. SLAB is the default choice for
517 slab allocator. 526 a slab allocator.
518 527
519config SLUB 528config SLUB
520 depends on EXPERIMENTAL && !ARCH_USES_SLAB_PAGE_STRUCT 529 depends on EXPERIMENTAL && !ARCH_USES_SLAB_PAGE_STRUCT
@@ -524,21 +533,20 @@ config SLUB
524 instead of managing queues of cached objects (SLAB approach). 533 instead of managing queues of cached objects (SLAB approach).
525 Per cpu caching is realized using slabs of objects instead 534 Per cpu caching is realized using slabs of objects instead
526 of queues of objects. SLUB can use memory efficiently 535 of queues of objects. SLUB can use memory efficiently
527 way and has enhanced diagnostics. 536 and has enhanced diagnostics.
528 537
529config SLOB 538config SLOB
530# 539#
531# SLOB cannot support SMP because SLAB_DESTROY_BY_RCU does not work 540# SLOB does not support SMP because SLAB_DESTROY_BY_RCU is unsupported
532# properly.
533# 541#
534 depends on EMBEDDED && !SMP && !SPARSEMEM 542 depends on EMBEDDED && !SMP && !SPARSEMEM
535 bool "SLOB (Simple Allocator)" 543 bool "SLOB (Simple Allocator)"
536 help 544 help
537 SLOB replaces the SLAB allocator with a drastically simpler 545 SLOB replaces the SLAB allocator with a drastically simpler
538 allocator. SLOB is more space efficient than SLAB but does not 546 allocator. SLOB is more space efficient than SLAB but does not
539 scale well (single lock for all operations) and is more susceptible 547 scale well (single lock for all operations) and is also highly
540 to fragmentation. SLOB it is a great choice to reduce 548 susceptible to fragmentation. SLUB can accomplish a higher object
541 memory usage and code size for embedded systems. 549 density. It is usually better to use SLUB instead of SLOB.
542 550
543endchoice 551endchoice
544 552
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 3f57ed4599d6..46fe407fb03e 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -7,6 +7,7 @@
7#include <linux/root_dev.h> 7#include <linux/root_dev.h>
8#include <linux/security.h> 8#include <linux/security.h>
9#include <linux/delay.h> 9#include <linux/delay.h>
10#include <linux/genhd.h>
10#include <linux/mount.h> 11#include <linux/mount.h>
11#include <linux/device.h> 12#include <linux/device.h>
12#include <linux/init.h> 13#include <linux/init.h>
@@ -308,17 +309,21 @@ retry:
308 /* 309 /*
309 * Allow the user to distinguish between failed sys_open 310 * Allow the user to distinguish between failed sys_open
310 * and bad superblock on root device. 311 * and bad superblock on root device.
312 * and give them a list of the available devices
311 */ 313 */
312#ifdef CONFIG_BLOCK 314#ifdef CONFIG_BLOCK
313 __bdevname(ROOT_DEV, b); 315 __bdevname(ROOT_DEV, b);
314#endif 316#endif
315 printk("VFS: Cannot open root device \"%s\" or %s\n", 317 printk("VFS: Cannot open root device \"%s\" or %s\n",
316 root_device_name, b); 318 root_device_name, b);
317 printk("Please append a correct \"root=\" boot option\n"); 319 printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
318 320
321 printk_all_partitions();
319 panic("VFS: Unable to mount root fs on %s", b); 322 panic("VFS: Unable to mount root fs on %s", b);
320 } 323 }
321 324
325 printk("List of all partitions:\n");
326 printk_all_partitions();
322 printk("No filesystem could mount root, tried: "); 327 printk("No filesystem could mount root, tried: ");
323 for (p = fs_names; *p; p += strlen(p)+1) 328 for (p = fs_names; *p; p += strlen(p)+1)
324 printk(" %s", p); 329 printk(" %s", p);
diff --git a/init/main.c b/init/main.c
index c1537e0ddceb..e8d080cab443 100644
--- a/init/main.c
+++ b/init/main.c
@@ -54,6 +54,7 @@
54#include <linux/lockdep.h> 54#include <linux/lockdep.h>
55#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
56#include <linux/device.h> 56#include <linux/device.h>
57#include <linux/kthread.h>
57 58
58#include <asm/io.h> 59#include <asm/io.h>
59#include <asm/bugs.h> 60#include <asm/bugs.h>
@@ -425,8 +426,12 @@ static void __init setup_command_line(char *command_line)
425static void noinline rest_init(void) 426static void noinline rest_init(void)
426 __releases(kernel_lock) 427 __releases(kernel_lock)
427{ 428{
429 int pid;
430
428 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); 431 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
429 numa_default_policy(); 432 numa_default_policy();
433 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
434 kthreadd_task = find_task_by_pid(pid);
430 unlock_kernel(); 435 unlock_kernel();
431 436
432 /* 437 /*
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb28f8a7..e84d3f9c6c7b 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@ static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 61ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 62 size_t len, loff_t * offset)
63{ 63{
64 loff_t pos = *offset; 64 return simple_read_from_buffer(buf, len, offset,
65 ssize_t count; 65 kernel_config_data + MAGIC_SIZE,
66 66 kernel_config_data_size);
67 if (pos >= kernel_config_data_size)
68 return 0;
69
70 count = min(len, (size_t)(kernel_config_data_size - pos));
71 if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
72 return -EFAULT;
73
74 *offset += count;
75 return count;
76} 67}
77 68
78static const struct file_operations ikconfig_file_ops = { 69static const struct file_operations ikconfig_file_ops = {
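
This hunk (and the cpuset.c one further down) replaces an open-coded bounds check plus copy_to_user() with simple_read_from_buffer(), which performs the EOF test, length clamping, copy and *ppos update itself. The pattern in isolation, with a hypothetical backing buffer:

#include <linux/fs.h>

static char my_data[128];       /* hypothetical backing store */
static size_t my_data_len;

static ssize_t my_read(struct file *file, char __user *buf,
                       size_t len, loff_t *ppos)
{
        return simple_read_from_buffer(buf, len, ppos,
                                       my_data, my_data_len);
}
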
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e70845cfc3..208cf3497c10 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)
97 (!cputime_eq(p->utime, cputime_zero) || 97 (!cputime_eq(p->utime, cputime_zero) ||
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %lx) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, p->pid, cpu, p->state, p->flags);
102 } 102 }
103 write_unlock_irq(&tasklist_lock); 103 write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)
120} 120}
121 121
122/* Requires cpu_add_remove_lock to be held */ 122/* Requires cpu_add_remove_lock to be held */
123static int _cpu_down(unsigned int cpu) 123static int _cpu_down(unsigned int cpu, int tasks_frozen)
124{ 124{
125 int err; 125 int err, nr_calls = 0;
126 struct task_struct *p; 126 struct task_struct *p;
127 cpumask_t old_allowed, tmp; 127 cpumask_t old_allowed, tmp;
128 void *hcpu = (void *)(long)cpu;
129 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
128 130
129 if (num_online_cpus() == 1) 131 if (num_online_cpus() == 1)
130 return -EBUSY; 132 return -EBUSY;
@@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)
132 if (!cpu_online(cpu)) 134 if (!cpu_online(cpu))
133 return -EINVAL; 135 return -EINVAL;
134 136
135 err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 137 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
136 (void *)(long)cpu); 138 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
139 hcpu, -1, &nr_calls);
137 if (err == NOTIFY_BAD) { 140 if (err == NOTIFY_BAD) {
141 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
142 hcpu, nr_calls, NULL);
138 printk("%s: attempt to take down CPU %u failed\n", 143 printk("%s: attempt to take down CPU %u failed\n",
139 __FUNCTION__, cpu); 144 __FUNCTION__, cpu);
140 return -EINVAL; 145 err = -EINVAL;
146 goto out_release;
141 } 147 }
142 148
143 /* Ensure that we are not runnable on dying cpu */ 149 /* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)
152 158
153 if (IS_ERR(p) || cpu_online(cpu)) { 159 if (IS_ERR(p) || cpu_online(cpu)) {
154 /* CPU didn't die: tell everyone. Can't complain. */ 160 /* CPU didn't die: tell everyone. Can't complain. */
155 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 161 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
156 (void *)(long)cpu) == NOTIFY_BAD) 162 hcpu) == NOTIFY_BAD)
157 BUG(); 163 BUG();
158 164
159 if (IS_ERR(p)) { 165 if (IS_ERR(p)) {
@@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)
170 /* This actually kills the CPU. */ 176 /* This actually kills the CPU. */
171 __cpu_die(cpu); 177 __cpu_die(cpu);
172 178
173 /* Move it here so it can run. */
174 kthread_bind(p, get_cpu());
175 put_cpu();
176
177 /* CPU is completely dead: tell everyone. Too late to complain. */ 179 /* CPU is completely dead: tell everyone. Too late to complain. */
178 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, 180 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
179 (void *)(long)cpu) == NOTIFY_BAD) 181 hcpu) == NOTIFY_BAD)
180 BUG(); 182 BUG();
181 183
182 check_for_tasks(cpu); 184 check_for_tasks(cpu);
@@ -185,6 +187,8 @@ out_thread:
185 err = kthread_stop(p); 187 err = kthread_stop(p);
186out_allowed: 188out_allowed:
187 set_cpus_allowed(current, old_allowed); 189 set_cpus_allowed(current, old_allowed);
190out_release:
191 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
188 return err; 192 return err;
189} 193}
190 194
@@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)
196 if (cpu_hotplug_disabled) 200 if (cpu_hotplug_disabled)
197 err = -EBUSY; 201 err = -EBUSY;
198 else 202 else
199 err = _cpu_down(cpu); 203 err = _cpu_down(cpu, 0);
200 204
201 mutex_unlock(&cpu_add_remove_lock); 205 mutex_unlock(&cpu_add_remove_lock);
202 return err; 206 return err;
@@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)
204#endif /*CONFIG_HOTPLUG_CPU*/ 208#endif /*CONFIG_HOTPLUG_CPU*/
205 209
206/* Requires cpu_add_remove_lock to be held */ 210/* Requires cpu_add_remove_lock to be held */
207static int __cpuinit _cpu_up(unsigned int cpu) 211static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
208{ 212{
209 int ret; 213 int ret, nr_calls = 0;
210 void *hcpu = (void *)(long)cpu; 214 void *hcpu = (void *)(long)cpu;
215 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
211 216
212 if (cpu_online(cpu) || !cpu_present(cpu)) 217 if (cpu_online(cpu) || !cpu_present(cpu))
213 return -EINVAL; 218 return -EINVAL;
214 219
215 ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 220 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
221 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
222 -1, &nr_calls);
216 if (ret == NOTIFY_BAD) { 223 if (ret == NOTIFY_BAD) {
217 printk("%s: attempt to bring up CPU %u failed\n", 224 printk("%s: attempt to bring up CPU %u failed\n",
218 __FUNCTION__, cpu); 225 __FUNCTION__, cpu);
@@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)
229 BUG_ON(!cpu_online(cpu)); 236 BUG_ON(!cpu_online(cpu));
230 237
231 /* Now call notifier in preparation. */ 238 /* Now call notifier in preparation. */
232 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 239 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
233 240
234out_notify: 241out_notify:
235 if (ret != 0) 242 if (ret != 0)
236 raw_notifier_call_chain(&cpu_chain, 243 __raw_notifier_call_chain(&cpu_chain,
237 CPU_UP_CANCELED, hcpu); 244 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
245 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
238 246
239 return ret; 247 return ret;
240} 248}
@@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)
247 if (cpu_hotplug_disabled) 255 if (cpu_hotplug_disabled)
248 err = -EBUSY; 256 err = -EBUSY;
249 else 257 else
250 err = _cpu_up(cpu); 258 err = _cpu_up(cpu, 0);
251 259
252 mutex_unlock(&cpu_add_remove_lock); 260 mutex_unlock(&cpu_add_remove_lock);
253 return err; 261 return err;
254} 262}
255 263
256#ifdef CONFIG_SUSPEND_SMP 264#ifdef CONFIG_SUSPEND_SMP
257/* Needed to prevent the microcode driver from requesting firmware in its CPU
258 * hotplug notifier during the suspend/resume.
259 */
260int suspend_cpu_hotplug;
261EXPORT_SYMBOL(suspend_cpu_hotplug);
262
263static cpumask_t frozen_cpus; 265static cpumask_t frozen_cpus;
264 266
265int disable_nonboot_cpus(void) 267int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)
267 int cpu, first_cpu, error = 0; 269 int cpu, first_cpu, error = 0;
268 270
269 mutex_lock(&cpu_add_remove_lock); 271 mutex_lock(&cpu_add_remove_lock);
270 suspend_cpu_hotplug = 1;
271 first_cpu = first_cpu(cpu_online_map); 272 first_cpu = first_cpu(cpu_online_map);
272 /* We take down all of the non-boot CPUs in one shot to avoid races 273 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time 274 * with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)
277 for_each_online_cpu(cpu) { 278 for_each_online_cpu(cpu) {
278 if (cpu == first_cpu) 279 if (cpu == first_cpu)
279 continue; 280 continue;
280 error = _cpu_down(cpu); 281 error = _cpu_down(cpu, 1);
281 if (!error) { 282 if (!error) {
282 cpu_set(cpu, frozen_cpus); 283 cpu_set(cpu, frozen_cpus);
283 printk("CPU%d is down\n", cpu); 284 printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)
294 } else { 295 } else {
295 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 296 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
296 } 297 }
297 suspend_cpu_hotplug = 0;
298 mutex_unlock(&cpu_add_remove_lock); 298 mutex_unlock(&cpu_add_remove_lock);
299 return error; 299 return error;
300} 300}
@@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)
309 if (cpus_empty(frozen_cpus)) 309 if (cpus_empty(frozen_cpus))
310 goto out; 310 goto out;
311 311
312 suspend_cpu_hotplug = 1;
313 printk("Enabling non-boot CPUs ...\n"); 312 printk("Enabling non-boot CPUs ...\n");
314 for_each_cpu_mask(cpu, frozen_cpus) { 313 for_each_cpu_mask(cpu, frozen_cpus) {
315 error = _cpu_up(cpu); 314 error = _cpu_up(cpu, 1);
316 if (!error) { 315 if (!error) {
317 printk("CPU%d is up\n", cpu); 316 printk("CPU%d is up\n", cpu);
318 continue; 317 continue;
@@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)
320 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 319 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
321 } 320 }
322 cpus_clear(frozen_cpus); 321 cpus_clear(frozen_cpus);
323 suspend_cpu_hotplug = 0;
324out: 322out:
325 mutex_unlock(&cpu_add_remove_lock); 323 mutex_unlock(&cpu_add_remove_lock);
326} 324}
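
The exported suspend_cpu_hotplug flag is gone; subsystems now learn that an event comes from the suspend/resume path because CPU_TASKS_FROZEN is ORed into the notifier action. A hedged sketch of a callback handling both variants of each event the same way:

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nb,
                           unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        /* strip the suspend/resume marker so the hot-plug and
         * frozen-tasks flavours of each event share one path */
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                printk(KERN_DEBUG "allocating state for cpu %u\n", cpu);
                break;
        case CPU_DEAD:
                printk(KERN_DEBUG "releasing state for cpu %u\n", cpu);
                break;
        }
        return NOTIFY_OK;
}
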
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 88b416dfbc72..f57854b08922 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1772{ 1772{
1773 struct ctr_struct *ctr = file->private_data; 1773 struct ctr_struct *ctr = file->private_data;
1774 1774
1775 if (*ppos + nbytes > ctr->bufsz) 1775 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1776 nbytes = ctr->bufsz - *ppos;
1777 if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
1778 return -EFAULT;
1779 *ppos += nbytes;
1780 return nbytes;
1781} 1776}
1782 1777
1783static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) 1778static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
diff --git a/kernel/exit.c b/kernel/exit.c
index f5a7abb621f3..b0c6f0c3a2df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
26#include <linux/profile.h> 26#include <linux/profile.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
29#include <linux/kthread.h>
29#include <linux/mempolicy.h> 30#include <linux/mempolicy.h>
30#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
31#include <linux/delayacct.h> 32#include <linux/delayacct.h>
@@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)
254} 255}
255 256
256/** 257/**
257 * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. 258 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
258 * 259 *
259 * If a kernel thread is launched as a result of a system call, or if 260 * If a kernel thread is launched as a result of a system call, or if
260 * it ever exits, it should generally reparent itself to init so that 261 * it ever exits, it should generally reparent itself to kthreadd so it
261 * it is correctly cleaned up on exit. 262 * isn't in the way of other processes and is correctly cleaned up on exit.
262 * 263 *
263 * The various task state such as scheduling policy and priority may have 264 * The various task state such as scheduling policy and priority may have
264 * been inherited from a user process, so we reset them to sane values here. 265 * been inherited from a user process, so we reset them to sane values here.
265 * 266 *
266 * NOTE that reparent_to_init() gives the caller full capabilities. 267 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
267 */ 268 */
268static void reparent_to_init(void) 269static void reparent_to_kthreadd(void)
269{ 270{
270 write_lock_irq(&tasklist_lock); 271 write_lock_irq(&tasklist_lock);
271 272
272 ptrace_unlink(current); 273 ptrace_unlink(current);
273 /* Reparent to init */ 274 /* Reparent to init */
274 remove_parent(current); 275 remove_parent(current);
275 current->parent = child_reaper(current); 276 current->real_parent = current->parent = kthreadd_task;
276 current->real_parent = child_reaper(current);
277 add_parent(current); 277 add_parent(current);
278 278
279 /* Set the exit signal to SIGCHLD so we signal init on exit */ 279 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -347,7 +347,7 @@ int disallow_signal(int sig)
347 return -EINVAL; 347 return -EINVAL;
348 348
349 spin_lock_irq(&current->sighand->siglock); 349 spin_lock_irq(&current->sighand->siglock);
350 sigaddset(&current->blocked, sig); 350 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
351 recalc_sigpending(); 351 recalc_sigpending();
352 spin_unlock_irq(&current->sighand->siglock); 352 spin_unlock_irq(&current->sighand->siglock);
353 return 0; 353 return 0;
@@ -400,7 +400,7 @@ void daemonize(const char *name, ...)
400 current->files = init_task.files; 400 current->files = init_task.files;
401 atomic_inc(&current->files->count); 401 atomic_inc(&current->files->count);
402 402
403 reparent_to_init(); 403 reparent_to_kthreadd();
404} 404}
405 405
406EXPORT_SYMBOL(daemonize); 406EXPORT_SYMBOL(daemonize);
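
The rename matters mostly for the legacy kernel_thread()-plus-daemonize() pattern, whose threads are now reparented to kthreadd rather than init. An illustrative sketch (my_thread_fn is hypothetical):

#include <linux/sched.h>

static int my_thread_fn(void *unused)
{
        daemonize("my-thread");         /* detach; now a child of kthreadd */
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(HZ);   /* periodic work would go here */
        }
        return 0;
}

/* started elsewhere with: kernel_thread(my_thread_fn, NULL, CLONE_FS); */
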
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d4992b..5dd3979747f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;
105 105
106void free_task(struct task_struct *tsk) 106void free_task(struct task_struct *tsk)
107{ 107{
108 free_thread_info(tsk->thread_info); 108 free_thread_info(tsk->stack);
109 rt_mutex_debug_task_free(tsk); 109 rt_mutex_debug_task_free(tsk);
110 free_task_struct(tsk); 110 free_task_struct(tsk);
111} 111}
@@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
175 } 175 }
176 176
177 *tsk = *orig; 177 *tsk = *orig;
178 tsk->thread_info = ti; 178 tsk->stack = ti;
179 setup_thread_stack(tsk, orig); 179 setup_thread_stack(tsk, orig);
180 180
181#ifdef CONFIG_CC_STACKPROTECTOR 181#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d801f2..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
20 * enough at me, Linus for the original (flawed) idea, Matthew 23 * enough at me, Linus for the original (flawed) idea, Matthew
21 * Kirkwood for proof-of-concept implementation. 24 * Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
53 56
54#include "rtmutex_common.h" 57#include "rtmutex_common.h"
55 58
59#ifdef CONFIG_DEBUG_RT_MUTEXES
60# include "rtmutex-debug.h"
61#else
62# include "rtmutex.h"
63#endif
64
56#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
57 66
58/* 67/*
@@ -81,12 +90,12 @@ struct futex_pi_state {
81 * we can wake only the relevant ones (hashed queues may be shared). 90 * we can wake only the relevant ones (hashed queues may be shared).
82 * 91 *
83 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
84 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
85 * The order of wakeup is always to make the first condition true, then 94 * The order of wakeup is always to make the first condition true, then
86 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiters, then make the second condition true.
87 */ 96 */
88struct futex_q { 97struct futex_q {
89 struct list_head list; 98 struct plist_node list;
90 wait_queue_head_t waiters; 99 wait_queue_head_t waiters;
91 100
92 /* Which hash list lock to use: */ 101 /* Which hash list lock to use: */
@@ -102,14 +111,20 @@ struct futex_q {
102 /* Optional priority inheritance state: */ 111 /* Optional priority inheritance state: */
103 struct futex_pi_state *pi_state; 112 struct futex_pi_state *pi_state;
104 struct task_struct *task; 113 struct task_struct *task;
114
115 /*
116 * This waiter is used in case of requeue from a
117 * normal futex to a PI-futex
118 */
119 struct rt_mutex_waiter waiter;
105}; 120};
106 121
107/* 122/*
108 * Split the global futex_lock into every hash list lock. 123 * Split the global futex_lock into every hash list lock.
109 */ 124 */
110struct futex_hash_bucket { 125struct futex_hash_bucket {
111 spinlock_t lock; 126 spinlock_t lock;
112 struct list_head chain; 127 struct plist_head chain;
113}; 128};
114 129
115static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 130static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
138 && key1->both.offset == key2->both.offset); 153 && key1->both.offset == key2->both.offset);
139} 154}
140 155
141/* 156/**
142 * Get parameters which are the keys for a futex. 157 * get_futex_key - Get parameters which are the keys for a futex.
158 * @uaddr: virtual address of the futex
159 * @fshared: NULL for a PROCESS_PRIVATE futex,
160 * &current->mm->mmap_sem for a PROCESS_SHARED futex
161 * @key: address where result is stored.
162 *
163 * Returns a negative error code or 0
164 * The key words are stored in *key on success.
143 * 165 *
144 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 166 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
145 * offset_within_page). For private mappings, it's (uaddr, current->mm). 167 * offset_within_page). For private mappings, it's (uaddr, current->mm).
146 * We can usually work out the index without swapping in the page. 168 * We can usually work out the index without swapping in the page.
147 * 169 *
148 * Returns: 0, or negative error code. 170 * fshared is NULL for PROCESS_PRIVATE futexes
149 * The key words are stored in *key on success. 171 * For other futexes, it points to &current->mm->mmap_sem and
150 * 172 * the caller must have taken the reader lock, but NOT any spinlocks.
151 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
152 */ 173 */
153int get_futex_key(u32 __user *uaddr, union futex_key *key) 174int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
175 union futex_key *key)
154{ 176{
155 unsigned long address = (unsigned long)uaddr; 177 unsigned long address = (unsigned long)uaddr;
156 struct mm_struct *mm = current->mm; 178 struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
162 * The futex address must be "naturally" aligned. 184 * The futex address must be "naturally" aligned.
163 */ 185 */
164 key->both.offset = address % PAGE_SIZE; 186 key->both.offset = address % PAGE_SIZE;
165 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 187 if (unlikely((address % sizeof(u32)) != 0))
166 return -EINVAL; 188 return -EINVAL;
167 address -= key->both.offset; 189 address -= key->both.offset;
168 190
169 /* 191 /*
192 * PROCESS_PRIVATE futexes are fast.
193 * As the mm cannot disappear under us and the 'key' only needs
194 * virtual address, we dont even have to find the underlying vma.
195 * Note : We do have to check 'uaddr' is a valid user address,
196 * but access_ok() should be faster than find_vma()
197 */
198 if (!fshared) {
199 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
200 return -EFAULT;
201 key->private.mm = mm;
202 key->private.address = address;
203 return 0;
204 }
205 /*
170 * The futex is hashed differently depending on whether 206 * The futex is hashed differently depending on whether
171 * it's in a shared or private mapping. So check vma first. 207 * it's in a shared or private mapping. So check vma first.
172 */ 208 */
@@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
180 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 216 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
181 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 217 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
182 218
219 /* Save the user address in the key */
220 key->uaddr = uaddr;
221
183 /* 222 /*
184 * Private mappings are handled in a simple way. 223 * Private mappings are handled in a simple way.
185 * 224 *
@@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
190 * mappings of _writable_ handles. 229 * mappings of _writable_ handles.
191 */ 230 */
192 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 231 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
232 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
193 key->private.mm = mm; 233 key->private.mm = mm;
194 key->private.address = address; 234 key->private.address = address;
195 return 0; 235 return 0;
@@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
199 * Linear file mappings are also simple. 239 * Linear file mappings are also simple.
200 */ 240 */
201 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 241 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
202 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 242 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
203 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 243 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
204 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 244 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
205 + vma->vm_pgoff); 245 + vma->vm_pgoff);
@@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
227 * Take a reference to the resource addressed by a key. 267 * Take a reference to the resource addressed by a key.
228 * Can be called while holding spinlocks. 268 * Can be called while holding spinlocks.
229 * 269 *
230 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
231 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
232 */ 270 */
233inline void get_futex_key_refs(union futex_key *key) 271inline void get_futex_key_refs(union futex_key *key)
234{ 272{
235 if (key->both.ptr != 0) { 273 if (key->both.ptr == 0)
236 if (key->both.offset & 1) 274 return;
275 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
276 case FUT_OFF_INODE:
237 atomic_inc(&key->shared.inode->i_count); 277 atomic_inc(&key->shared.inode->i_count);
238 else 278 break;
279 case FUT_OFF_MMSHARED:
239 atomic_inc(&key->private.mm->mm_count); 280 atomic_inc(&key->private.mm->mm_count);
281 break;
240 } 282 }
241} 283}
242EXPORT_SYMBOL_GPL(get_futex_key_refs); 284EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
247 */ 289 */
248void drop_futex_key_refs(union futex_key *key) 290void drop_futex_key_refs(union futex_key *key)
249{ 291{
250 if (key->both.ptr != 0) { 292 if (key->both.ptr == 0)
251 if (key->both.offset & 1) 293 return;
294 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
295 case FUT_OFF_INODE:
252 iput(key->shared.inode); 296 iput(key->shared.inode);
253 else 297 break;
298 case FUT_OFF_MMSHARED:
254 mmdrop(key->private.mm); 299 mmdrop(key->private.mm);
300 break;
255 } 301 }
256} 302}
257EXPORT_SYMBOL_GPL(drop_futex_key_refs); 303EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
268} 314}
269 315
270/* 316/*
271 * Fault handling. Called with current->mm->mmap_sem held. 317 * Fault handling.
318 * if fshared is non NULL, current->mm->mmap_sem is already held
272 */ 319 */
273static int futex_handle_fault(unsigned long address, int attempt) 320static int futex_handle_fault(unsigned long address,
321 struct rw_semaphore *fshared, int attempt)
274{ 322{
275 struct vm_area_struct * vma; 323 struct vm_area_struct * vma;
276 struct mm_struct *mm = current->mm; 324 struct mm_struct *mm = current->mm;
325 int ret = -EFAULT;
277 326
278 if (attempt > 2 || !(vma = find_vma(mm, address)) || 327 if (attempt > 2)
279 vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) 328 return ret;
280 return -EFAULT;
281 329
282 switch (handle_mm_fault(mm, vma, address, 1)) { 330 if (!fshared)
283 case VM_FAULT_MINOR: 331 down_read(&mm->mmap_sem);
284 current->min_flt++; 332 vma = find_vma(mm, address);
285 break; 333 if (vma && address >= vma->vm_start &&
286 case VM_FAULT_MAJOR: 334 (vma->vm_flags & VM_WRITE)) {
287 current->maj_flt++; 335 switch (handle_mm_fault(mm, vma, address, 1)) {
288 break; 336 case VM_FAULT_MINOR:
289 default: 337 ret = 0;
290 return -EFAULT; 338 current->min_flt++;
339 break;
340 case VM_FAULT_MAJOR:
341 ret = 0;
342 current->maj_flt++;
343 break;
344 }
291 } 345 }
292 return 0; 346 if (!fshared)
347 up_read(&mm->mmap_sem);
348 return ret;
293} 349}
294 350
295/* 351/*
@@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
439} 495}
440 496
441static int 497static int
442lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) 498lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
499 union futex_key *key, struct futex_pi_state **ps)
443{ 500{
444 struct futex_pi_state *pi_state = NULL; 501 struct futex_pi_state *pi_state = NULL;
445 struct futex_q *this, *next; 502 struct futex_q *this, *next;
446 struct list_head *head; 503 struct plist_head *head;
447 struct task_struct *p; 504 struct task_struct *p;
448 pid_t pid; 505 pid_t pid;
449 506
450 head = &hb->chain; 507 head = &hb->chain;
451 508
452 list_for_each_entry_safe(this, next, head, list) { 509 plist_for_each_entry_safe(this, next, head, list) {
453 if (match_futex(&this->key, &me->key)) { 510 if (match_futex(&this->key, key)) {
454 /* 511 /*
455 * Another waiter already exists - bump up 512 * Another waiter already exists - bump up
456 * the refcount and return its pi_state: 513 * the refcount and return its pi_state:
@@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
465 WARN_ON(!atomic_read(&pi_state->refcount)); 522 WARN_ON(!atomic_read(&pi_state->refcount));
466 523
467 atomic_inc(&pi_state->refcount); 524 atomic_inc(&pi_state->refcount);
468 me->pi_state = pi_state; 525 *ps = pi_state;
469 526
470 return 0; 527 return 0;
471 } 528 }
@@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
492 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 549 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
493 550
494 /* Store the key for possible exit cleanups: */ 551 /* Store the key for possible exit cleanups: */
495 pi_state->key = me->key; 552 pi_state->key = *key;
496 553
497 spin_lock_irq(&p->pi_lock); 554 spin_lock_irq(&p->pi_lock);
498 WARN_ON(!list_empty(&pi_state->list)); 555 WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
502 559
503 put_task_struct(p); 560 put_task_struct(p);
504 561
505 me->pi_state = pi_state; 562 *ps = pi_state;
506 563
507 return 0; 564 return 0;
508} 565}
@@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
513 */ 570 */
514static void wake_futex(struct futex_q *q) 571static void wake_futex(struct futex_q *q)
515{ 572{
516 list_del_init(&q->list); 573 plist_del(&q->list, &q->list.plist);
517 if (q->filp) 574 if (q->filp)
518 send_sigio(&q->filp->f_owner, q->fd, POLL_IN); 575 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
519 /* 576 /*
520 * The lock in wake_up_all() is a crucial memory barrier after the 577 * The lock in wake_up_all() is a crucial memory barrier after the
521 * list_del_init() and also before assigning to q->lock_ptr. 578 * plist_del() and also before assigning to q->lock_ptr.
522 */ 579 */
523 wake_up_all(&q->waiters); 580 wake_up_all(&q->waiters);
524 /* 581 /*
@@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
562 */ 619 */
563 if (!(uval & FUTEX_OWNER_DIED)) { 620 if (!(uval & FUTEX_OWNER_DIED)) {
564 newval = FUTEX_WAITERS | new_owner->pid; 621 newval = FUTEX_WAITERS | new_owner->pid;
622 /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
623 newval |= (uval & FUTEX_WAITER_REQUEUED);
565 624
566 pagefault_disable(); 625 pagefault_disable();
567 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 626 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
629 * Wake up all waiters hashed on the physical page that is mapped 688 * Wake up all waiters hashed on the physical page that is mapped
630 * to this virtual address: 689 * to this virtual address:
631 */ 690 */
632static int futex_wake(u32 __user *uaddr, int nr_wake) 691static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
692 int nr_wake)
633{ 693{
634 struct futex_hash_bucket *hb; 694 struct futex_hash_bucket *hb;
635 struct futex_q *this, *next; 695 struct futex_q *this, *next;
636 struct list_head *head; 696 struct plist_head *head;
637 union futex_key key; 697 union futex_key key;
638 int ret; 698 int ret;
639 699
640 down_read(&current->mm->mmap_sem); 700 if (fshared)
701 down_read(fshared);
641 702
642 ret = get_futex_key(uaddr, &key); 703 ret = get_futex_key(uaddr, fshared, &key);
643 if (unlikely(ret != 0)) 704 if (unlikely(ret != 0))
644 goto out; 705 goto out;
645 706
@@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
647 spin_lock(&hb->lock); 708 spin_lock(&hb->lock);
648 head = &hb->chain; 709 head = &hb->chain;
649 710
650 list_for_each_entry_safe(this, next, head, list) { 711 plist_for_each_entry_safe(this, next, head, list) {
651 if (match_futex (&this->key, &key)) { 712 if (match_futex (&this->key, &key)) {
652 if (this->pi_state) { 713 if (this->pi_state) {
653 ret = -EINVAL; 714 ret = -EINVAL;
@@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
661 722
662 spin_unlock(&hb->lock); 723 spin_unlock(&hb->lock);
663out: 724out:
664 up_read(&current->mm->mmap_sem); 725 if (fshared)
726 up_read(fshared);
727 return ret;
728}
729
730/*
731 * Called from futex_requeue_pi.
732 * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
733 * PI-futex value; search its associated pi_state if an owner exists
734 * or create a new one without an owner.
735 */
736static inline int
737lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
738 union futex_key *key,
739 struct futex_pi_state **pi_state)
740{
741 u32 curval, uval, newval;
742
743retry:
744 /*
745 * We can't handle a fault cleanly because we can't
746 * release the locks here. Simply return the fault.
747 */
748 if (get_futex_value_locked(&curval, uaddr))
749 return -EFAULT;
750
751 /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
752 if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
753 != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
754 /*
755 * No waiters yet; prepare the futex to have some waiters.
756 */
757
758 uval = curval;
759 newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
760
761 pagefault_disable();
762 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
763 pagefault_enable();
764
765 if (unlikely(curval == -EFAULT))
766 return -EFAULT;
767 if (unlikely(curval != uval))
768 goto retry;
769 }
770
771 if (!(curval & FUTEX_TID_MASK)
772 || lookup_pi_state(curval, hb, key, pi_state)) {
773 /* the futex has no owner (yet) or the lookup failed:
774 allocate one pi_state without owner */
775
776 *pi_state = alloc_pi_state();
777
778 /* Already stores the key: */
779 (*pi_state)->key = *key;
780
781 /* init the mutex without owner */
782 __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
783 }
784
785 return 0;
786}
787
788/*
789 * Keep the first nr_wake waiter from futex1, wake up one,
790 * and requeue the next nr_requeue waiters following hashed on
791 * one physical page to another physical page (PI-futex uaddr2)
792 */
793static int futex_requeue_pi(u32 __user *uaddr1,
794 struct rw_semaphore *fshared,
795 u32 __user *uaddr2,
796 int nr_wake, int nr_requeue, u32 *cmpval)
797{
798 union futex_key key1, key2;
799 struct futex_hash_bucket *hb1, *hb2;
800 struct plist_head *head1;
801 struct futex_q *this, *next;
802 struct futex_pi_state *pi_state2 = NULL;
803 struct rt_mutex_waiter *waiter, *top_waiter = NULL;
804 struct rt_mutex *lock2 = NULL;
805 int ret, drop_count = 0;
806
807 if (refill_pi_state_cache())
808 return -ENOMEM;
809
810retry:
811 /*
812 * First take all the futex related locks:
813 */
814 if (fshared)
815 down_read(fshared);
816
817 ret = get_futex_key(uaddr1, fshared, &key1);
818 if (unlikely(ret != 0))
819 goto out;
820 ret = get_futex_key(uaddr2, fshared, &key2);
821 if (unlikely(ret != 0))
822 goto out;
823
824 hb1 = hash_futex(&key1);
825 hb2 = hash_futex(&key2);
826
827 double_lock_hb(hb1, hb2);
828
829 if (likely(cmpval != NULL)) {
830 u32 curval;
831
832 ret = get_futex_value_locked(&curval, uaddr1);
833
834 if (unlikely(ret)) {
835 spin_unlock(&hb1->lock);
836 if (hb1 != hb2)
837 spin_unlock(&hb2->lock);
838
839 /*
840 * If we would have faulted, release mmap_sem, fault
841 * it in and start all over again.
842 */
843 if (fshared)
844 up_read(fshared);
845
846 ret = get_user(curval, uaddr1);
847
848 if (!ret)
849 goto retry;
850
851 return ret;
852 }
853 if (curval != *cmpval) {
854 ret = -EAGAIN;
855 goto out_unlock;
856 }
857 }
858
859 head1 = &hb1->chain;
860 plist_for_each_entry_safe(this, next, head1, list) {
861 if (!match_futex (&this->key, &key1))
862 continue;
863 if (++ret <= nr_wake) {
864 wake_futex(this);
865 } else {
866 /*
867 * FIRST: get and set the pi_state
868 */
869 if (!pi_state2) {
870 int s;
871 /* do this only the first time we requeue someone */
872 s = lookup_pi_state_for_requeue(uaddr2, hb2,
873 &key2, &pi_state2);
874 if (s) {
875 ret = s;
876 goto out_unlock;
877 }
878
879 lock2 = &pi_state2->pi_mutex;
880 spin_lock(&lock2->wait_lock);
881
882 /* Save the top waiter of the wait_list */
883 if (rt_mutex_has_waiters(lock2))
884 top_waiter = rt_mutex_top_waiter(lock2);
885 } else
886 atomic_inc(&pi_state2->refcount);
887
888
889 this->pi_state = pi_state2;
890
891 /*
892 * SECOND: requeue futex_q to the correct hashbucket
893 */
894
895 /*
896 * If key1 and key2 hash to the same bucket, no need to
897 * requeue.
898 */
899 if (likely(head1 != &hb2->chain)) {
900 plist_del(&this->list, &hb1->chain);
901 plist_add(&this->list, &hb2->chain);
902 this->lock_ptr = &hb2->lock;
903#ifdef CONFIG_DEBUG_PI_LIST
904 this->list.plist.lock = &hb2->lock;
905#endif
906 }
907 this->key = key2;
908 get_futex_key_refs(&key2);
909 drop_count++;
910
911
912 /*
913 * THIRD: queue it to lock2
914 */
915 spin_lock_irq(&this->task->pi_lock);
916 waiter = &this->waiter;
917 waiter->task = this->task;
918 waiter->lock = lock2;
919 plist_node_init(&waiter->list_entry, this->task->prio);
920 plist_node_init(&waiter->pi_list_entry, this->task->prio);
921 plist_add(&waiter->list_entry, &lock2->wait_list);
922 this->task->pi_blocked_on = waiter;
923 spin_unlock_irq(&this->task->pi_lock);
924
925 if (ret - nr_wake >= nr_requeue)
926 break;
927 }
928 }
929
930 /* If we've requeued some tasks and the top_waiter of the rt_mutex
931 * has changed, we must adjust the priority of the owner, if any */
932 if (drop_count) {
933 struct task_struct *owner = rt_mutex_owner(lock2);
934 if (owner &&
935 (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
936 int chain_walk = 0;
937
938 spin_lock_irq(&owner->pi_lock);
939 if (top_waiter)
940 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
941 else
942 /*
943 * There was no waiters before the requeue,
944 * the flag must be updated
945 */
946 mark_rt_mutex_waiters(lock2);
947
948 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
949 __rt_mutex_adjust_prio(owner);
950 if (owner->pi_blocked_on) {
951 chain_walk = 1;
952 get_task_struct(owner);
953 }
954
955 spin_unlock_irq(&owner->pi_lock);
956 spin_unlock(&lock2->wait_lock);
957
958 if (chain_walk)
959 rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
960 current);
961 } else {
962 /* No owner or the top_waiter does not change */
963 mark_rt_mutex_waiters(lock2);
964 spin_unlock(&lock2->wait_lock);
965 }
966 }
967
968out_unlock:
969 spin_unlock(&hb1->lock);
970 if (hb1 != hb2)
971 spin_unlock(&hb2->lock);
972
973 /* drop_futex_key_refs() must be called outside the spinlocks. */
974 while (--drop_count >= 0)
975 drop_futex_key_refs(&key1);
976
977out:
978 if (fshared)
979 up_read(fshared);
665 return ret; 980 return ret;
666} 981}
667 982
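
Every futex primitive now takes the fshared semaphore pointer introduced by get_futex_key(): NULL selects the PROCESS_PRIVATE fast path (no mmap_sem, no vma walk), while shared callers pass &current->mm->mmap_sem and the primitive takes and drops it around the operation. A sketch of the calling convention from within kernel/futex.c (my_wake is illustrative, not the actual dispatcher):

/* inside kernel/futex.c, where the static futex_wake() is visible */
static long my_wake(u32 __user *uaddr, int nr_wake, int shared)
{
        struct rw_semaphore *fshared = NULL;

        if (shared)
                fshared = &current->mm->mmap_sem;

        /* futex_wake() down_read()s/up_read()s fshared when non-NULL */
        return futex_wake(uaddr, fshared, nr_wake);
}
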
@@ -670,22 +985,24 @@ out:
670 * to this virtual address: 985 * to this virtual address:
671 */ 986 */
672static int 987static int
673futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, 988futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
989 u32 __user *uaddr2,
674 int nr_wake, int nr_wake2, int op) 990 int nr_wake, int nr_wake2, int op)
675{ 991{
676 union futex_key key1, key2; 992 union futex_key key1, key2;
677 struct futex_hash_bucket *hb1, *hb2; 993 struct futex_hash_bucket *hb1, *hb2;
678 struct list_head *head; 994 struct plist_head *head;
679 struct futex_q *this, *next; 995 struct futex_q *this, *next;
680 int ret, op_ret, attempt = 0; 996 int ret, op_ret, attempt = 0;
681 997
682retryfull: 998retryfull:
683 down_read(&current->mm->mmap_sem); 999 if (fshared)
1000 down_read(fshared);
684 1001
685 ret = get_futex_key(uaddr1, &key1); 1002 ret = get_futex_key(uaddr1, fshared, &key1);
686 if (unlikely(ret != 0)) 1003 if (unlikely(ret != 0))
687 goto out; 1004 goto out;
688 ret = get_futex_key(uaddr2, &key2); 1005 ret = get_futex_key(uaddr2, fshared, &key2);
689 if (unlikely(ret != 0)) 1006 if (unlikely(ret != 0))
690 goto out; 1007 goto out;
691 1008
@@ -725,11 +1042,10 @@ retry:
725 * still holding the mmap_sem. 1042 * still holding the mmap_sem.
726 */ 1043 */
727 if (attempt++) { 1044 if (attempt++) {
728 if (futex_handle_fault((unsigned long)uaddr2, 1045 ret = futex_handle_fault((unsigned long)uaddr2,
729 attempt)) { 1046 fshared, attempt);
730 ret = -EFAULT; 1047 if (ret)
731 goto out; 1048 goto out;
732 }
733 goto retry; 1049 goto retry;
734 } 1050 }
735 1051
@@ -737,7 +1053,8 @@ retry:
737 * If we would have faulted, release mmap_sem, 1053 * If we would have faulted, release mmap_sem,
738 * fault it in and start all over again. 1054 * fault it in and start all over again.
739 */ 1055 */
740 up_read(&current->mm->mmap_sem); 1056 if (fshared)
1057 up_read(fshared);
741 1058
742 ret = get_user(dummy, uaddr2); 1059 ret = get_user(dummy, uaddr2);
743 if (ret) 1060 if (ret)
@@ -748,7 +1065,7 @@ retry:
748 1065
749 head = &hb1->chain; 1066 head = &hb1->chain;
750 1067
751 list_for_each_entry_safe(this, next, head, list) { 1068 plist_for_each_entry_safe(this, next, head, list) {
752 if (match_futex (&this->key, &key1)) { 1069 if (match_futex (&this->key, &key1)) {
753 wake_futex(this); 1070 wake_futex(this);
754 if (++ret >= nr_wake) 1071 if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@ retry:
760 head = &hb2->chain; 1077 head = &hb2->chain;
761 1078
762 op_ret = 0; 1079 op_ret = 0;
763 list_for_each_entry_safe(this, next, head, list) { 1080 plist_for_each_entry_safe(this, next, head, list) {
764 if (match_futex (&this->key, &key2)) { 1081 if (match_futex (&this->key, &key2)) {
765 wake_futex(this); 1082 wake_futex(this);
766 if (++op_ret >= nr_wake2) 1083 if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@ retry:
774 if (hb1 != hb2) 1091 if (hb1 != hb2)
775 spin_unlock(&hb2->lock); 1092 spin_unlock(&hb2->lock);
776out: 1093out:
777 up_read(&current->mm->mmap_sem); 1094 if (fshared)
1095 up_read(fshared);
778 return ret; 1096 return ret;
779} 1097}
780 1098
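futex_wake_op() services FUTEX_WAKE_OP: atomically modify *uaddr2, wake up to nr_wake waiters on uaddr1 and, if the old value of *uaddr2 satisfies the encoded comparison, up to nr_wake2 waiters on uaddr2. A minimal userspace sketch, assuming only the documented syscall ABI and the FUTEX_OP() macro from <linux/futex.h> (glibc has no wrapper):

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Set *f2 to 0; wake one waiter on f1, and one on f2 if the old
     * value of *f2 was greater than 0. */
    static long wake_op(int *f1, int *f2)
    {
            int op = FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 0);

            return syscall(SYS_futex, f1, FUTEX_WAKE_OP, 1,
                           (void *)1UL /* nr_wake2 */, f2, op);
    }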
@@ -782,22 +1100,24 @@ out:
782 * Requeue all waiters hashed on one physical page to another 1100 * Requeue all waiters hashed on one physical page to another
783 * physical page. 1101 * physical page.
784 */ 1102 */
785static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, 1103static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
1104 u32 __user *uaddr2,
786 int nr_wake, int nr_requeue, u32 *cmpval) 1105 int nr_wake, int nr_requeue, u32 *cmpval)
787{ 1106{
788 union futex_key key1, key2; 1107 union futex_key key1, key2;
789 struct futex_hash_bucket *hb1, *hb2; 1108 struct futex_hash_bucket *hb1, *hb2;
790 struct list_head *head1; 1109 struct plist_head *head1;
791 struct futex_q *this, *next; 1110 struct futex_q *this, *next;
792 int ret, drop_count = 0; 1111 int ret, drop_count = 0;
793 1112
794 retry: 1113 retry:
795 down_read(&current->mm->mmap_sem); 1114 if (fshared)
1115 down_read(fshared);
796 1116
797 ret = get_futex_key(uaddr1, &key1); 1117 ret = get_futex_key(uaddr1, fshared, &key1);
798 if (unlikely(ret != 0)) 1118 if (unlikely(ret != 0))
799 goto out; 1119 goto out;
800 ret = get_futex_key(uaddr2, &key2); 1120 ret = get_futex_key(uaddr2, fshared, &key2);
801 if (unlikely(ret != 0)) 1121 if (unlikely(ret != 0))
802 goto out; 1122 goto out;
803 1123
@@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
820 * If we would have faulted, release mmap_sem, fault 1140 * If we would have faulted, release mmap_sem, fault
821 * it in and start all over again. 1141 * it in and start all over again.
822 */ 1142 */
823 up_read(&current->mm->mmap_sem); 1143 if (fshared)
1144 up_read(fshared);
824 1145
825 ret = get_user(curval, uaddr1); 1146 ret = get_user(curval, uaddr1);
826 1147
@@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
836 } 1157 }
837 1158
838 head1 = &hb1->chain; 1159 head1 = &hb1->chain;
839 list_for_each_entry_safe(this, next, head1, list) { 1160 plist_for_each_entry_safe(this, next, head1, list) {
840 if (!match_futex (&this->key, &key1)) 1161 if (!match_futex (&this->key, &key1))
841 continue; 1162 continue;
842 if (++ret <= nr_wake) { 1163 if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
847 * requeue. 1168 * requeue.
848 */ 1169 */
849 if (likely(head1 != &hb2->chain)) { 1170 if (likely(head1 != &hb2->chain)) {
850 list_move_tail(&this->list, &hb2->chain); 1171 plist_del(&this->list, &hb1->chain);
1172 plist_add(&this->list, &hb2->chain);
851 this->lock_ptr = &hb2->lock; 1173 this->lock_ptr = &hb2->lock;
852 } 1174#ifdef CONFIG_DEBUG_PI_LIST
1175 this->list.plist.lock = &hb2->lock;
1176#endif
1177 }
853 this->key = key2; 1178 this->key = key2;
854 get_futex_key_refs(&key2); 1179 get_futex_key_refs(&key2);
855 drop_count++; 1180 drop_count++;
@@ -869,7 +1194,8 @@ out_unlock:
869 drop_futex_key_refs(&key1); 1194 drop_futex_key_refs(&key1);
870 1195
871out: 1196out:
872 up_read(&current->mm->mmap_sem); 1197 if (fshared)
1198 up_read(fshared);
873 return ret; 1199 return ret;
874} 1200}
875 1201
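futex_requeue() backs FUTEX_CMP_REQUEUE, the building block of pthread_cond_broadcast(): wake nr_wake waiters on uaddr1 and move up to nr_requeue more onto uaddr2, but only while *uaddr1 still holds cmpval. A minimal userspace sketch, assuming the documented ABI (the kernel fails the compare with EAGAIN and the caller retries):

    #include <limits.h>
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Wake one waiter on cond; requeue the rest onto mutex so they
     * contend there instead of stampeding. */
    static long cond_broadcast(int *cond, int *mutex, int expected)
    {
            return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE, 1,
                           (void *)(unsigned long)INT_MAX /* nr_requeue */,
                           mutex, expected);
    }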
@@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
894 1220
895static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1221static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
896{ 1222{
897 list_add_tail(&q->list, &hb->chain); 1223 int prio;
1224
1225 /*
1226 * The priority used to register this element is
1227 * - either the real thread-priority for the real-time threads
1228 * (i.e. threads with a priority lower than MAX_RT_PRIO)
1229 * - or MAX_RT_PRIO for non-RT threads.
1230 * Thus, all RT-threads are woken first in priority order, and
1231 * the others are woken last, in FIFO order.
1232 */
1233 prio = min(current->normal_prio, MAX_RT_PRIO);
1234
1235 plist_node_init(&q->list, prio);
1236#ifdef CONFIG_DEBUG_PI_LIST
1237 q->list.plist.lock = &hb->lock;
1238#endif
1239 plist_add(&q->list, &hb->chain);
898 q->task = current; 1240 q->task = current;
899 spin_unlock(&hb->lock); 1241 spin_unlock(&hb->lock);
900} 1242}
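Since __queue_me() clamps the plist priority to min(normal_prio, MAX_RT_PRIO), all SCHED_OTHER waiters share one priority (MAX_RT_PRIO) and are woken FIFO, while RT waiters sort ahead of them. A toy illustration of the resulting wake order (assumes MAX_RT_PRIO == 100, as in kernels of this era):

    #include <stdio.h>

    #define MAX_RT_PRIO 100

    static int plist_prio(int normal_prio)
    {
            return normal_prio < MAX_RT_PRIO ? normal_prio : MAX_RT_PRIO;
    }

    int main(void)
    {
            /* arrival order: RT 40, SCHED_OTHER 120, RT 10, SCHED_OTHER 110 */
            int waiters[] = { 40, 120, 10, 110 };
            int i;

            for (i = 0; i < 4; i++)
                    printf("waiter %d: normal_prio %3d -> plist prio %3d\n",
                           i, waiters[i], plist_prio(waiters[i]));
            /* Wake order: waiter 2 (10), waiter 0 (40), then waiters 1
             * and 3 in arrival order (both clamped to 100). */
            return 0;
    }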
@@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
949 spin_unlock(lock_ptr); 1291 spin_unlock(lock_ptr);
950 goto retry; 1292 goto retry;
951 } 1293 }
952 WARN_ON(list_empty(&q->list)); 1294 WARN_ON(plist_node_empty(&q->list));
953 list_del(&q->list); 1295 plist_del(&q->list, &q->list.plist);
954 1296
955 BUG_ON(q->pi_state); 1297 BUG_ON(q->pi_state);
956 1298
@@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)
964 1306
965/* 1307/*
966 * PI futexes cannot be requeued and must remove themselves from the 1308 * PI futexes cannot be requeued and must remove themselves from the
967 * hash bucket. The hash bucket lock is held on entry and dropped here. 1309 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1310 * and dropped here.
968 */ 1311 */
969static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) 1312static void unqueue_me_pi(struct futex_q *q)
970{ 1313{
971 WARN_ON(list_empty(&q->list)); 1314 WARN_ON(plist_node_empty(&q->list));
972 list_del(&q->list); 1315 plist_del(&q->list, &q->list.plist);
973 1316
974 BUG_ON(!q->pi_state); 1317 BUG_ON(!q->pi_state);
975 free_pi_state(q->pi_state); 1318 free_pi_state(q->pi_state);
976 q->pi_state = NULL; 1319 q->pi_state = NULL;
977 1320
978 spin_unlock(&hb->lock); 1321 spin_unlock(q->lock_ptr);
979 1322
980 drop_futex_key_refs(&q->key); 1323 drop_futex_key_refs(&q->key);
981} 1324}
982 1325
1326/*
1327 * Fixup the pi_state owner with current.
1328 *
1329 * The cur->mm semaphore must be held; it is released when this
1330 * function returns.
1331 */
1332static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 struct futex_q *q,
1334 struct futex_hash_bucket *hb,
1335 struct task_struct *curr)
1336{
1337 u32 newtid = curr->pid | FUTEX_WAITERS;
1338 struct futex_pi_state *pi_state = q->pi_state;
1339 u32 uval, curval, newval;
1340 int ret;
1341
1342 /* Owner died? */
1343 if (pi_state->owner != NULL) {
1344 spin_lock_irq(&pi_state->owner->pi_lock);
1345 WARN_ON(list_empty(&pi_state->list));
1346 list_del_init(&pi_state->list);
1347 spin_unlock_irq(&pi_state->owner->pi_lock);
1348 } else
1349 newtid |= FUTEX_OWNER_DIED;
1350
1351 pi_state->owner = curr;
1352
1353 spin_lock_irq(&curr->pi_lock);
1354 WARN_ON(!list_empty(&pi_state->list));
1355 list_add(&pi_state->list, &curr->pi_state_list);
1356 spin_unlock_irq(&curr->pi_lock);
1357
1358 /* Unqueue and drop the lock */
1359 unqueue_me_pi(q);
1360 if (fshared)
1361 up_read(fshared);
1362 /*
1363 * We own it, so we have to replace the pending owner
1364 * TID. This must be atomic as we have to preserve the
1365 * owner died bit here.
1366 */
1367 ret = get_user(uval, uaddr);
1368 while (!ret) {
1369 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1370 newval |= (uval & FUTEX_WAITER_REQUEUED);
1371 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1372 uval, newval);
1373 if (curval == -EFAULT)
1374 ret = -EFAULT;
1375 if (curval == uval)
1376 break;
1377 uval = curval;
1378 }
1379 return ret;
1380}
1381
1382/*
1383 * In case we must use restart_block to restart a futex_wait,
1384 * we encode the 'shared' capability in 'arg3'
1385 */
1386#define ARG3_SHARED 1
1387
983static long futex_wait_restart(struct restart_block *restart); 1388static long futex_wait_restart(struct restart_block *restart);
984static int futex_wait_abstime(u32 __user *uaddr, u32 val, 1389static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
985 int timed, unsigned long abs_time) 1390 u32 val, ktime_t *abs_time)
986{ 1391{
987 struct task_struct *curr = current; 1392 struct task_struct *curr = current;
988 DECLARE_WAITQUEUE(wait, curr); 1393 DECLARE_WAITQUEUE(wait, curr);
989 struct futex_hash_bucket *hb; 1394 struct futex_hash_bucket *hb;
990 struct futex_q q; 1395 struct futex_q q;
991 unsigned long time_left = 0;
992 u32 uval; 1396 u32 uval;
993 int ret; 1397 int ret;
1398 struct hrtimer_sleeper t, *to = NULL;
1399 int rem = 0;
994 1400
995 q.pi_state = NULL; 1401 q.pi_state = NULL;
996 retry: 1402 retry:
997 down_read(&curr->mm->mmap_sem); 1403 if (fshared)
1404 down_read(fshared);
998 1405
999 ret = get_futex_key(uaddr, &q.key); 1406 ret = get_futex_key(uaddr, fshared, &q.key);
1000 if (unlikely(ret != 0)) 1407 if (unlikely(ret != 0))
1001 goto out_release_sem; 1408 goto out_release_sem;
1002 1409
@@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1019 * a wakeup when *uaddr != val on entry to the syscall. This is 1426 * a wakeup when *uaddr != val on entry to the syscall. This is
1020 * rare, but normal. 1427 * rare, but normal.
1021 * 1428 *
1022 * We hold the mmap semaphore, so the mapping cannot have changed 1429 * for shared futexes, we hold the mmap semaphore, so the mapping
1023 * since we looked it up in get_futex_key. 1430 * cannot have changed since we looked it up in get_futex_key.
1024 */ 1431 */
1025 ret = get_futex_value_locked(&uval, uaddr); 1432 ret = get_futex_value_locked(&uval, uaddr);
1026 1433
@@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1031 * If we would have faulted, release mmap_sem, fault it in and 1438 * If we would have faulted, release mmap_sem, fault it in and
1032 * start all over again. 1439 * start all over again.
1033 */ 1440 */
1034 up_read(&curr->mm->mmap_sem); 1441 if (fshared)
1442 up_read(fshared);
1035 1443
1036 ret = get_user(uval, uaddr); 1444 ret = get_user(uval, uaddr);
1037 1445
@@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1043 if (uval != val) 1451 if (uval != val)
1044 goto out_unlock_release_sem; 1452 goto out_unlock_release_sem;
1045 1453
1454 /*
1455 * This rt_mutex_waiter structure is prepared here and will
1456 * be used only if this task is requeued from a normal futex to
1457 * a PI-futex with futex_requeue_pi.
1458 */
1459 debug_rt_mutex_init_waiter(&q.waiter);
1460 q.waiter.task = NULL;
1461
1046 /* Only actually queue if *uaddr contained val. */ 1462 /* Only actually queue if *uaddr contained val. */
1047 __queue_me(&q, hb); 1463 __queue_me(&q, hb);
1048 1464
@@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1050 * Now the futex is queued and we have checked the data, we 1466 * Now the futex is queued and we have checked the data, we
1051 * don't want to hold mmap_sem while we sleep. 1467 * don't want to hold mmap_sem while we sleep.
1052 */ 1468 */
1053 up_read(&curr->mm->mmap_sem); 1469 if (fshared)
1470 up_read(fshared);
1054 1471
1055 /* 1472 /*
1056 * There might have been scheduling since the queue_me(), as we 1473 * There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1065 __set_current_state(TASK_INTERRUPTIBLE); 1482 __set_current_state(TASK_INTERRUPTIBLE);
1066 add_wait_queue(&q.waiters, &wait); 1483 add_wait_queue(&q.waiters, &wait);
1067 /* 1484 /*
1068 * !list_empty() is safe here without any lock. 1485 * !plist_node_empty() is safe here without any lock.
1069 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1486 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1070 */ 1487 */
1071 time_left = 0; 1488 if (likely(!plist_node_empty(&q.list))) {
1072 if (likely(!list_empty(&q.list))) { 1489 if (!abs_time)
1073 unsigned long rel_time; 1490 schedule();
1074 1491 else {
1075 if (timed) { 1492 to = &t;
1076 unsigned long now = jiffies; 1493 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1077 if (time_after(now, abs_time)) 1494 hrtimer_init_sleeper(&t, current);
1078 rel_time = 0; 1495 t.timer.expires = *abs_time;
1079 else
1080 rel_time = abs_time - now;
1081 } else
1082 rel_time = MAX_SCHEDULE_TIMEOUT;
1083 1496
1084 time_left = schedule_timeout(rel_time); 1497 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
1498
1499 /*
1500 * The timer could have already expired, in which
1501 * case current would be flagged for rescheduling.
1502 * Don't bother calling schedule.
1503 */
1504 if (likely(t.task))
1505 schedule();
1506
1507 hrtimer_cancel(&t.timer);
1508
1509 /* Flag if a timeout occurred */
1510 rem = (t.task == NULL);
1511 }
1085 } 1512 }
1086 __set_current_state(TASK_RUNNING); 1513 __set_current_state(TASK_RUNNING);
1087 1514
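The hrtimer sleeper above gives FUTEX_WAIT a high-resolution absolute deadline inside the kernel; the userspace ABI is unchanged and still takes a relative timespec, which sys_futex converts with ktime_add(ktime_get(), t). A minimal sketch of the caller side, assuming only the documented ABI:

    #include <errno.h>
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    /* Block until *uaddr != val or the relative timeout expires. */
    static int wait_on(int *uaddr, int val, long timeout_ms)
    {
            struct timespec ts = {
                    .tv_sec  = timeout_ms / 1000,
                    .tv_nsec = (timeout_ms % 1000) * 1000000L,
            };

            if (syscall(SYS_futex, uaddr, FUTEX_WAIT, val, &ts, NULL, 0) == 0)
                    return 0;
            return errno;   /* ETIMEDOUT, EWOULDBLOCK and EINTR are normal */
    }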
@@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1090 * we are the only user of it. 1517 * we are the only user of it.
1091 */ 1518 */
1092 1519
1520 if (q.pi_state) {
1521 /*
1522 * We were woken but have been requeued on a PI-futex.
1523 * We have to complete the lock acquisition by taking
1524 * the rtmutex.
1525 */
1526
1527 struct rt_mutex *lock = &q.pi_state->pi_mutex;
1528
1529 spin_lock(&lock->wait_lock);
1530 if (unlikely(q.waiter.task)) {
1531 remove_waiter(lock, &q.waiter);
1532 }
1533 spin_unlock(&lock->wait_lock);
1534
1535 if (rem)
1536 ret = -ETIMEDOUT;
1537 else
1538 ret = rt_mutex_timed_lock(lock, to, 1);
1539
1540 if (fshared)
1541 down_read(fshared);
1542 spin_lock(q.lock_ptr);
1543
1544 /*
1545 * Got the lock. We might not be the anticipated owner if we
1546 * did a lock-steal - fix up the PI-state in that case.
1547 */
1548 if (!ret && q.pi_state->owner != curr) {
1549 /*
1550 * We MUST play with the futex we were requeued on,
1551 * NOT the current futex.
1552 * We can retrieve it from the key of the pi_state
1553 */
1554 uaddr = q.pi_state->key.uaddr;
1555
1556 /* mmap_sem and the hash bucket lock are released
1557 when this function returns */
1558 ret = fixup_pi_state_owner(uaddr, fshared,
1559 &q, hb, curr);
1560 } else {
1561 /*
1562 * Catch the rare case where the lock was released
1563 * when we were on the way back before we locked
1564 * the hash bucket.
1565 */
1566 if (ret && q.pi_state->owner == curr) {
1567 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1568 ret = 0;
1569 }
1570 /* Unqueue and drop the lock */
1571 unqueue_me_pi(&q);
1572 if (fshared)
1573 up_read(fshared);
1574 }
1575
1576 debug_rt_mutex_free_waiter(&q.waiter);
1577
1578 return ret;
1579 }
1580
1581 debug_rt_mutex_free_waiter(&q.waiter);
1582
1093 /* If we were woken (and unqueued), we succeeded, whatever. */ 1583 /* If we were woken (and unqueued), we succeeded, whatever. */
1094 if (!unqueue_me(&q)) 1584 if (!unqueue_me(&q))
1095 return 0; 1585 return 0;
1096 if (time_left == 0) 1586 if (rem)
1097 return -ETIMEDOUT; 1587 return -ETIMEDOUT;
1098 1588
1099 /* 1589 /*
1100 * We expect signal_pending(current), but another thread may 1590 * We expect signal_pending(current), but another thread may
1101 * have handled it for us already. 1591 * have handled it for us already.
1102 */ 1592 */
1103 if (time_left == MAX_SCHEDULE_TIMEOUT) 1593 if (!abs_time)
1104 return -ERESTARTSYS; 1594 return -ERESTARTSYS;
1105 else { 1595 else {
1106 struct restart_block *restart; 1596 struct restart_block *restart;
@@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1108 restart->fn = futex_wait_restart; 1598 restart->fn = futex_wait_restart;
1109 restart->arg0 = (unsigned long)uaddr; 1599 restart->arg0 = (unsigned long)uaddr;
1110 restart->arg1 = (unsigned long)val; 1600 restart->arg1 = (unsigned long)val;
1111 restart->arg2 = (unsigned long)timed; 1601 restart->arg2 = (unsigned long)abs_time;
1112 restart->arg3 = abs_time; 1602 restart->arg3 = 0;
1603 if (fshared)
1604 restart->arg3 |= ARG3_SHARED;
1113 return -ERESTART_RESTARTBLOCK; 1605 return -ERESTART_RESTARTBLOCK;
1114 } 1606 }
1115 1607
@@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1117 queue_unlock(&q, hb); 1609 queue_unlock(&q, hb);
1118 1610
1119 out_release_sem: 1611 out_release_sem:
1120 up_read(&curr->mm->mmap_sem); 1612 if (fshared)
1613 up_read(fshared);
1121 return ret; 1614 return ret;
1122} 1615}
1123 1616
1124static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
1125{
1126 int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
1127 return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
1128}
1129 1617
1130static long futex_wait_restart(struct restart_block *restart) 1618static long futex_wait_restart(struct restart_block *restart)
1131{ 1619{
1132 u32 __user *uaddr = (u32 __user *)restart->arg0; 1620 u32 __user *uaddr = (u32 __user *)restart->arg0;
1133 u32 val = (u32)restart->arg1; 1621 u32 val = (u32)restart->arg1;
1134 int timed = (int)restart->arg2; 1622 ktime_t *abs_time = (ktime_t *)restart->arg2;
1135 unsigned long abs_time = restart->arg3; 1623 struct rw_semaphore *fshared = NULL;
1136 1624
1137 restart->fn = do_no_restart_syscall; 1625 restart->fn = do_no_restart_syscall;
1138 return (long)futex_wait_abstime(uaddr, val, timed, abs_time); 1626 if (restart->arg3 & ARG3_SHARED)
1627 fshared = &current->mm->mmap_sem;
1628 return (long)futex_wait(uaddr, fshared, val, abs_time);
1139} 1629}
1140 1630
1141 1631
1632static void set_pi_futex_owner(struct futex_hash_bucket *hb,
1633 union futex_key *key, struct task_struct *p)
1634{
1635 struct plist_head *head;
1636 struct futex_q *this, *next;
1637 struct futex_pi_state *pi_state = NULL;
1638 struct rt_mutex *lock;
1639
1640 /* Search for a waiter that should already exist */
1641
1642 head = &hb->chain;
1643
1644 plist_for_each_entry_safe(this, next, head, list) {
1645 if (match_futex (&this->key, key)) {
1646 pi_state = this->pi_state;
1647 break;
1648 }
1649 }
1650
1651 BUG_ON(!pi_state);
1652
1653 /* set p as pi_state's owner */
1654 lock = &pi_state->pi_mutex;
1655
1656 spin_lock(&lock->wait_lock);
1657 spin_lock_irq(&p->pi_lock);
1658
1659 list_add(&pi_state->list, &p->pi_state_list);
1660 pi_state->owner = p;
1661
1662
1663 /* set p as pi_mutex's owner */
1664 debug_rt_mutex_proxy_lock(lock, p);
1665 WARN_ON(rt_mutex_owner(lock));
1666 rt_mutex_set_owner(lock, p, 0);
1667 rt_mutex_deadlock_account_lock(lock, p);
1668
1669 plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
1670 &p->pi_waiters);
1671 __rt_mutex_adjust_prio(p);
1672
1673 spin_unlock_irq(&p->pi_lock);
1674 spin_unlock(&lock->wait_lock);
1675}
1676
1142/* 1677/*
1143 * Userspace tried a 0 -> TID atomic transition of the futex value 1678 * Userspace tried a 0 -> TID atomic transition of the futex value
1144 * and failed. The kernel side here does the whole locking operation: 1679 * and failed. The kernel side here does the whole locking operation:
1145 * if there are waiters then it will block, it does PI, etc. (Due to 1680 * if there are waiters then it will block, it does PI, etc. (Due to
1146 * races the kernel might see a 0 value of the futex too.) 1681 * races the kernel might see a 0 value of the futex too.)
1147 */ 1682 */
1148static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, 1683static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1149 long nsec, int trylock) 1684 int detect, ktime_t *time, int trylock)
1150{ 1685{
1151 struct hrtimer_sleeper timeout, *to = NULL; 1686 struct hrtimer_sleeper timeout, *to = NULL;
1152 struct task_struct *curr = current; 1687 struct task_struct *curr = current;
1153 struct futex_hash_bucket *hb; 1688 struct futex_hash_bucket *hb;
1154 u32 uval, newval, curval; 1689 u32 uval, newval, curval;
1155 struct futex_q q; 1690 struct futex_q q;
1156 int ret, attempt = 0; 1691 int ret, lock_held, attempt = 0;
1157 1692
1158 if (refill_pi_state_cache()) 1693 if (refill_pi_state_cache())
1159 return -ENOMEM; 1694 return -ENOMEM;
1160 1695
1161 if (sec != MAX_SCHEDULE_TIMEOUT) { 1696 if (time) {
1162 to = &timeout; 1697 to = &timeout;
1163 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1698 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1164 hrtimer_init_sleeper(to, current); 1699 hrtimer_init_sleeper(to, current);
1165 to->timer.expires = ktime_set(sec, nsec); 1700 to->timer.expires = *time;
1166 } 1701 }
1167 1702
1168 q.pi_state = NULL; 1703 q.pi_state = NULL;
1169 retry: 1704 retry:
1170 down_read(&curr->mm->mmap_sem); 1705 if (fshared)
1706 down_read(fshared);
1171 1707
1172 ret = get_futex_key(uaddr, &q.key); 1708 ret = get_futex_key(uaddr, fshared, &q.key);
1173 if (unlikely(ret != 0)) 1709 if (unlikely(ret != 0))
1174 goto out_release_sem; 1710 goto out_release_sem;
1175 1711
1176 hb = queue_lock(&q, -1, NULL); 1712 hb = queue_lock(&q, -1, NULL);
1177 1713
1178 retry_locked: 1714 retry_locked:
1715 lock_held = 0;
1716
1179 /* 1717 /*
1180 * To avoid races, we attempt to take the lock here again 1718 * To avoid races, we attempt to take the lock here again
1181 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1719 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1194 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1732 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1195 if (!detect && 0) 1733 if (!detect && 0)
1196 force_sig(SIGKILL, current); 1734 force_sig(SIGKILL, current);
1197 ret = -EDEADLK; 1735 /*
1736 * Normally, this check is done in user space.
1737 * In case of requeue, the owner may attempt to lock this futex,
1738 * even if the ownership has already been given by the previous
1739 * waker.
1740 * Normally this would be a deadlock, but not in the
1741 * REQUEUE_PI case.
1742 */
1743 if (!(curval & FUTEX_WAITER_REQUEUED))
1744 ret = -EDEADLK;
1198 goto out_unlock_release_sem; 1745 goto out_unlock_release_sem;
1199 } 1746 }
1200 1747
@@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1206 goto out_unlock_release_sem; 1753 goto out_unlock_release_sem;
1207 1754
1208 uval = curval; 1755 uval = curval;
1209 newval = uval | FUTEX_WAITERS; 1756 /*
1757 * In case of a requeue, check if there already is an owner
1758 * If not, just take the futex.
1759 */
1760 if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
1761 /* set current as futex owner */
1762 newval = curval | current->pid;
1763 lock_held = 1;
1764 } else
1765 /* Set the WAITERS flag, so the owner will know it has someone
1766 to wake at next unlock */
1767 newval = curval | FUTEX_WAITERS;
1210 1768
1211 pagefault_disable(); 1769 pagefault_disable();
1212 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1770 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
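The 0 -> TID transition retried here is normally won in user space; the kernel path only runs on contention. A minimal sketch of the userspace side of the PI protocol, assuming GCC atomic builtins and the constants from <linux/futex.h>:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Fast path: uncontended 0 -> TID cmpxchg. Slow path: let the
     * kernel queue us, boost the owner and fix up the futex word. */
    static int pi_lock(int *futex)
    {
            int tid = syscall(SYS_gettid);

            if (__sync_val_compare_and_swap(futex, 0, tid) == 0)
                    return 0;
            return syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }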
@@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1217 if (unlikely(curval != uval)) 1775 if (unlikely(curval != uval))
1218 goto retry_locked; 1776 goto retry_locked;
1219 1777
1778 if (lock_held) {
1779 set_pi_futex_owner(hb, &q.key, curr);
1780 goto out_unlock_release_sem;
1781 }
1782
1220 /* 1783 /*
1221 * We dont have the lock. Look up the PI state (or create it if 1784 * We dont have the lock. Look up the PI state (or create it if
1222 * we are the first waiter): 1785 * we are the first waiter):
1223 */ 1786 */
1224 ret = lookup_pi_state(uval, hb, &q); 1787 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1225 1788
1226 if (unlikely(ret)) { 1789 if (unlikely(ret)) {
1227 /* 1790 /*
@@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1263 * Now the futex is queued and we have checked the data, we 1826 * Now the futex is queued and we have checked the data, we
1264 * don't want to hold mmap_sem while we sleep. 1827 * don't want to hold mmap_sem while we sleep.
1265 */ 1828 */
1266 up_read(&curr->mm->mmap_sem); 1829 if (fshared)
1830 up_read(fshared);
1267 1831
1268 WARN_ON(!q.pi_state); 1832 WARN_ON(!q.pi_state);
1269 /* 1833 /*
@@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1277 ret = ret ? 0 : -EWOULDBLOCK; 1841 ret = ret ? 0 : -EWOULDBLOCK;
1278 } 1842 }
1279 1843
1280 down_read(&curr->mm->mmap_sem); 1844 if (fshared)
1845 down_read(fshared);
1281 spin_lock(q.lock_ptr); 1846 spin_lock(q.lock_ptr);
1282 1847
1283 /* 1848 /*
1284 * Got the lock. We might not be the anticipated owner if we 1849 * Got the lock. We might not be the anticipated owner if we
1285 * did a lock-steal - fix up the PI-state in that case. 1850 * did a lock-steal - fix up the PI-state in that case.
1286 */ 1851 */
1287 if (!ret && q.pi_state->owner != curr) { 1852 if (!ret && q.pi_state->owner != curr)
1288 u32 newtid = current->pid | FUTEX_WAITERS; 1853 /* mmap_sem is unlocked at return of this function */
1289 1854 ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
1290 /* Owner died? */ 1855 else {
1291 if (q.pi_state->owner != NULL) {
1292 spin_lock_irq(&q.pi_state->owner->pi_lock);
1293 WARN_ON(list_empty(&q.pi_state->list));
1294 list_del_init(&q.pi_state->list);
1295 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1296 } else
1297 newtid |= FUTEX_OWNER_DIED;
1298
1299 q.pi_state->owner = current;
1300
1301 spin_lock_irq(&current->pi_lock);
1302 WARN_ON(!list_empty(&q.pi_state->list));
1303 list_add(&q.pi_state->list, &current->pi_state_list);
1304 spin_unlock_irq(&current->pi_lock);
1305
1306 /* Unqueue and drop the lock */
1307 unqueue_me_pi(&q, hb);
1308 up_read(&curr->mm->mmap_sem);
1309 /*
1310 * We own it, so we have to replace the pending owner
1311 * TID. This must be atomic as we have preserve the
1312 * owner died bit here.
1313 */
1314 ret = get_user(uval, uaddr);
1315 while (!ret) {
1316 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1317 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1318 uval, newval);
1319 if (curval == -EFAULT)
1320 ret = -EFAULT;
1321 if (curval == uval)
1322 break;
1323 uval = curval;
1324 }
1325 } else {
1326 /* 1856 /*
1327 * Catch the rare case where the lock was released 1857 * Catch the rare case where the lock was released
1328 * when we were on the way back before we locked 1858 * when we were on the way back before we locked
@@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1333 ret = 0; 1863 ret = 0;
1334 } 1864 }
1335 /* Unqueue and drop the lock */ 1865 /* Unqueue and drop the lock */
1336 unqueue_me_pi(&q, hb); 1866 unqueue_me_pi(&q);
1337 up_read(&curr->mm->mmap_sem); 1867 if (fshared)
1868 up_read(fshared);
1338 } 1869 }
1339 1870
1340 if (!detect && ret == -EDEADLK && 0) 1871 if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1346 queue_unlock(&q, hb); 1877 queue_unlock(&q, hb);
1347 1878
1348 out_release_sem: 1879 out_release_sem:
1349 up_read(&curr->mm->mmap_sem); 1880 if (fshared)
1881 up_read(fshared);
1350 return ret; 1882 return ret;
1351 1883
1352 uaddr_faulted: 1884 uaddr_faulted:
@@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1357 * still holding the mmap_sem. 1889 * still holding the mmap_sem.
1358 */ 1890 */
1359 if (attempt++) { 1891 if (attempt++) {
1360 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 1892 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1361 ret = -EFAULT; 1893 attempt);
1894 if (ret)
1362 goto out_unlock_release_sem; 1895 goto out_unlock_release_sem;
1363 }
1364 goto retry_locked; 1896 goto retry_locked;
1365 } 1897 }
1366 1898
1367 queue_unlock(&q, hb); 1899 queue_unlock(&q, hb);
1368 up_read(&curr->mm->mmap_sem); 1900 if (fshared)
1901 up_read(fshared);
1369 1902
1370 ret = get_user(uval, uaddr); 1903 ret = get_user(uval, uaddr);
1371 if (!ret && (uval != -EFAULT)) 1904 if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1379 * This is the in-kernel slowpath: we look up the PI state (if any), 1912 * This is the in-kernel slowpath: we look up the PI state (if any),
1380 * and do the rt-mutex unlock. 1913 * and do the rt-mutex unlock.
1381 */ 1914 */
1382static int futex_unlock_pi(u32 __user *uaddr) 1915static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1383{ 1916{
1384 struct futex_hash_bucket *hb; 1917 struct futex_hash_bucket *hb;
1385 struct futex_q *this, *next; 1918 struct futex_q *this, *next;
1386 u32 uval; 1919 u32 uval;
1387 struct list_head *head; 1920 struct plist_head *head;
1388 union futex_key key; 1921 union futex_key key;
1389 int ret, attempt = 0; 1922 int ret, attempt = 0;
1390 1923
@@ -1399,9 +1932,10 @@ retry:
1399 /* 1932 /*
1400 * First take all the futex related locks: 1933 * First take all the futex related locks:
1401 */ 1934 */
1402 down_read(&current->mm->mmap_sem); 1935 if (fshared)
1936 down_read(fshared);
1403 1937
1404 ret = get_futex_key(uaddr, &key); 1938 ret = get_futex_key(uaddr, fshared, &key);
1405 if (unlikely(ret != 0)) 1939 if (unlikely(ret != 0))
1406 goto out; 1940 goto out;
1407 1941
@@ -1435,7 +1969,7 @@ retry_locked:
1435 */ 1969 */
1436 head = &hb->chain; 1970 head = &hb->chain;
1437 1971
1438 list_for_each_entry_safe(this, next, head, list) { 1972 plist_for_each_entry_safe(this, next, head, list) {
1439 if (!match_futex (&this->key, &key)) 1973 if (!match_futex (&this->key, &key))
1440 continue; 1974 continue;
1441 ret = wake_futex_pi(uaddr, uval, this); 1975 ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@ retry_locked:
1460out_unlock: 1994out_unlock:
1461 spin_unlock(&hb->lock); 1995 spin_unlock(&hb->lock);
1462out: 1996out:
1463 up_read(&current->mm->mmap_sem); 1997 if (fshared)
1998 up_read(fshared);
1464 1999
1465 return ret; 2000 return ret;
1466 2001
@@ -1472,15 +2007,16 @@ pi_faulted:
1472 * still holding the mmap_sem. 2007 * still holding the mmap_sem.
1473 */ 2008 */
1474 if (attempt++) { 2009 if (attempt++) {
1475 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 2010 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1476 ret = -EFAULT; 2011 attempt);
2012 if (ret)
1477 goto out_unlock; 2013 goto out_unlock;
1478 }
1479 goto retry_locked; 2014 goto retry_locked;
1480 } 2015 }
1481 2016
1482 spin_unlock(&hb->lock); 2017 spin_unlock(&hb->lock);
1483 up_read(&current->mm->mmap_sem); 2018 if (fshared)
2019 up_read(fshared);
1484 2020
1485 ret = get_user(uval, uaddr); 2021 ret = get_user(uval, uaddr);
1486 if (!ret && (uval != -EFAULT)) 2022 if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
1509 poll_wait(filp, &q->waiters, wait); 2045 poll_wait(filp, &q->waiters, wait);
1510 2046
1511 /* 2047 /*
1512 * list_empty() is safe here without any lock. 2048 * plist_node_empty() is safe here without any lock.
1513 * q->lock_ptr != 0 is not safe, because of ordering against wakeup. 2049 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1514 */ 2050 */
1515 if (list_empty(&q->list)) 2051 if (plist_node_empty(&q->list))
1516 ret = POLLIN | POLLRDNORM; 2052 ret = POLLIN | POLLRDNORM;
1517 2053
1518 return ret; 2054 return ret;
@@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1532 struct futex_q *q; 2068 struct futex_q *q;
1533 struct file *filp; 2069 struct file *filp;
1534 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1535 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1536 2073
1537 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
1573 } 2110 }
1574 q->pi_state = NULL; 2111 q->pi_state = NULL;
1575 2112
1576 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
1577 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
1578 2116
1579 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
1580 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
1581 kfree(q); 2119 kfree(q);
1582 goto error; 2120 goto error;
1583 } 2121 }
@@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1589 filp->private_data = q; 2127 filp->private_data = q;
1590 2128
1591 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
1592 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
1593 2131
1594 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
1595 fd_install(ret, filp); 2133 fd_install(ret, filp);
@@ -1702,6 +2240,8 @@ retry:
1702 * userspace. 2240 * userspace.
1703 */ 2241 */
1704 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2242 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2243 /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
2244 mval |= (uval & FUTEX_WAITER_REQUEUED);
1705 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2245 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1706 2246
1707 if (nval == -EFAULT) 2247 if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@ retry:
1716 */ 2256 */
1717 if (!pi) { 2257 if (!pi) {
1718 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
1719 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1720 } 2260 }
1721 } 2261 }
1722 return 0; 2262 return 0;
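handle_futex_death() only runs for futexes the exiting task registered on its robust list; the kernel walks that list at exit and stamps FUTEX_OWNER_DIED (now preserving FUTEX_WAITER_REQUEUED too) into each held lock word. A minimal registration sketch, assuming the documented set_robust_list ABI:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* An empty per-thread robust list; real lock words would be linked
     * in at futex_offset bytes from each list entry. */
    static struct robust_list_head head = {
            .list = { &head.list },
            .futex_offset = 0,
    };

    static long register_robust(void)
    {
            return syscall(SYS_set_robust_list, &head, sizeof(head));
    }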
@@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
1772 return; 2312 return;
1773 2313
1774 if (pending) 2314 if (pending)
1775 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
1776 2317
1777 while (entry != &head->list) { 2318 while (entry != &head->list) {
1778 /* 2319 /*
@@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
1798 } 2339 }
1799} 2340}
1800 2341
1801long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, 2342long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1802 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
1803{ 2344{
1804 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
1805 2351
1806 switch (op) { 2352 switch (cmd) {
1807 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
1808 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
1809 break; 2355 break;
1810 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
1811 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
1812 break; 2358 break;
1813 case FUTEX_FD: 2359 case FUTEX_FD:
1814 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
1815 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
1816 break; 2362 break;
1817 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
1818 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
1819 break; 2365 break;
1820 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
1821 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
1822 break; 2368 break;
1823 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
1824 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
1825 break; 2371 break;
1826 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
1827 ret = futex_lock_pi(uaddr, val, timeout, val2, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
1828 break; 2374 break;
1829 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
1830 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
1831 break; 2377 break;
1832 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
1833 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2380 break;
2381 case FUTEX_CMP_REQUEUE_PI:
2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
1834 break; 2383 break;
1835 default: 2384 default:
1836 ret = -ENOSYS; 2385 ret = -ENOSYS;
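With the command word split into cmd and FUTEX_PRIVATE_FLAG, a private futex passes fshared == NULL all the way down and never touches mmap_sem. A minimal caller sketch, assuming the flag value from <linux/futex.h>:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Process-private wake: the kernel may skip mmap_sem and key the
     * futex by (mm, virtual address) alone. */
    static long wake_private(int *uaddr, int nr)
    {
            return syscall(SYS_futex, uaddr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
                           nr, NULL, NULL, 0);
    }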
@@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1843 struct timespec __user *utime, u32 __user *uaddr2, 2392 struct timespec __user *utime, u32 __user *uaddr2,
1844 u32 val3) 2393 u32 val3)
1845{ 2394{
1846 struct timespec t; 2395 struct timespec ts;
1847 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 2396 ktime_t t, *tp = NULL;
1848 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
1849 2399
1850 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
1851 if (copy_from_user(&t, utime, sizeof(t)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1852 return -EFAULT; 2402 return -EFAULT;
1853 if (!timespec_valid(&t)) 2403 if (!timespec_valid(&ts))
1854 return -EINVAL; 2404 return -EINVAL;
1855 if (op == FUTEX_WAIT) 2405
1856 timeout = timespec_to_jiffies(&t) + 1; 2406 t = timespec_to_ktime(ts);
1857 else { 2407 if (cmd == FUTEX_WAIT)
1858 timeout = t.tv_sec; 2408 t = ktime_add(ktime_get(), t);
1859 val2 = t.tv_nsec; 2409 tp = &t;
1860 }
1861 } 2410 }
1862 /* 2411 /*
1863 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
1864 */ 2413 */
1865 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2415 || cmd == FUTEX_CMP_REQUEUE_PI)
1866 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
1867 2417
1868 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
1869} 2419}
1870 2420
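The timespec -> ktime conversion above turns FUTEX_WAIT's relative timeout into an absolute deadline once, so syscall restarts do not stretch the total wait. The same arithmetic as a standalone userspace sketch:

    #include <time.h>

    /* Analog of ktime_add(ktime_get(), t): relative timespec to an
     * absolute CLOCK_MONOTONIC deadline, with nanosecond carry. */
    static struct timespec rel_to_abs(struct timespec rel)
    {
            struct timespec now, abs;

            clock_gettime(CLOCK_MONOTONIC, &now);
            abs.tv_sec  = now.tv_sec + rel.tv_sec;
            abs.tv_nsec = now.tv_nsec + rel.tv_nsec;
            if (abs.tv_nsec >= 1000000000L) {
                    abs.tv_sec++;
                    abs.tv_nsec -= 1000000000L;
            }
            return abs;
    }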
1871static int futexfs_get_sb(struct file_system_type *fs_type, 2421static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@ static int __init init(void)
1895 } 2445 }
1896 2446
1897 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2447 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1898 INIT_LIST_HEAD(&futex_queues[i].chain); 2448 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
1899 spin_lock_init(&futex_queues[i].lock); 2449 spin_lock_init(&futex_queues[i].lock);
1900 } 2450 }
1901 return 0; 2451 return 0;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24eea6cd0..338a9b489fbc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
141 struct compat_timespec __user *utime, u32 __user *uaddr2, 141 struct compat_timespec __user *utime, u32 __user *uaddr2,
142 u32 val3) 142 u32 val3)
143{ 143{
144 struct timespec t; 144 struct timespec ts;
145 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 145 ktime_t t, *tp = NULL;
146 int val2 = 0; 146 int val2 = 0;
147 147
148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
149 if (get_compat_timespec(&t, utime)) 149 if (get_compat_timespec(&ts, utime))
150 return -EFAULT; 150 return -EFAULT;
151 if (!timespec_valid(&t)) 151 if (!timespec_valid(&ts))
152 return -EINVAL; 152 return -EINVAL;
153
154 t = timespec_to_ktime(ts);
153 if (op == FUTEX_WAIT) 155 if (op == FUTEX_WAIT)
154 timeout = timespec_to_jiffies(&t) + 1; 156 t = ktime_add(ktime_get(), t);
155 else { 157 tp = &t;
156 timeout = t.tv_sec;
157 val2 = t.tv_nsec;
158 }
159 } 158 }
160 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 159 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
160 || op == FUTEX_CMP_REQUEUE_PI)
161 val2 = (int) (unsigned long) utime; 161 val2 = (int) (unsigned long) utime;
162 162
163 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 163 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
164} 164}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f044a8a8..23c03f43e196 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1411 switch (action) { 1411 switch (action) {
1412 1412
1413 case CPU_UP_PREPARE: 1413 case CPU_UP_PREPARE:
1414 case CPU_UP_PREPARE_FROZEN:
1414 init_hrtimers_cpu(cpu); 1415 init_hrtimers_cpu(cpu);
1415 break; 1416 break;
1416 1417
1417#ifdef CONFIG_HOTPLUG_CPU 1418#ifdef CONFIG_HOTPLUG_CPU
1418 case CPU_DEAD: 1419 case CPU_DEAD:
1420 case CPU_DEAD_FROZEN:
1419 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1421 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
1420 migrate_hrtimers(cpu); 1422 migrate_hrtimers(cpu);
1421 break; 1423 break;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 32e1ab1477d1..e391cbb1f566 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
22 * handle_bad_irq - handle spurious and unhandled irqs 22 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 23 * @irq: the interrupt number
24 * @desc: description of the interrupt 24 * @desc: description of the interrupt
25 * @regs: pointer to a register structure
26 * 25 *
27 * Handles spurious and unhandled IRQs. It also prints a debug message. 26 * Handles spurious and unhandled IRQs. It also prints a debug message.
28 */ 27 */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 49cc4b9c1a8d..4d32eb077179 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data)
135 135
136 /* Unblock all signals and set the session keyring. */ 136 /* Unblock all signals and set the session keyring. */
137 new_session = key_get(sub_info->ring); 137 new_session = key_get(sub_info->ring);
138 flush_signals(current);
139 spin_lock_irq(&current->sighand->siglock); 138 spin_lock_irq(&current->sighand->siglock);
140 old_session = __install_session_keyring(current, new_session); 139 old_session = __install_session_keyring(current, new_session);
141 flush_signal_handlers(current, 1); 140 flush_signal_handlers(current, 1);
@@ -186,14 +185,9 @@ static int wait_for_helper(void *data)
186{ 185{
187 struct subprocess_info *sub_info = data; 186 struct subprocess_info *sub_info = data;
188 pid_t pid; 187 pid_t pid;
189 struct k_sigaction sa;
190 188
191 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 189 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
192 * populate the status, but will return -ECHILD. */ 190 * populate the status, but will return -ECHILD. */
193 sa.sa.sa_handler = SIG_IGN;
194 sa.sa.sa_flags = 0;
195 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
196 do_sigaction(SIGCHLD, &sa, NULL);
197 allow_signal(SIGCHLD); 191 allow_signal(SIGCHLD);
198 192
199 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 193 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50ccd1d4e..df8a8e8f6ca4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
1/* Kernel thread helper functions. 1/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell. 2 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 * 3 *
4 * Creation is done via keventd, so that we get a clean environment 4 * Creation is done via kthreadd, so that we get a clean environment
5 * even if we're invoked from userspace (think modprobe, hotplug cpu, 5 * even if we're invoked from userspace (think modprobe, hotplug cpu,
6 * etc.). 6 * etc.).
7 */ 7 */
@@ -15,24 +15,22 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18/* 18static DEFINE_SPINLOCK(kthread_create_lock);
19 * We dont want to execute off keventd since it might 19static LIST_HEAD(kthread_create_list);
20 * hold a semaphore our callers hold too: 20struct task_struct *kthreadd_task;
21 */
22static struct workqueue_struct *helper_wq;
23 21
24struct kthread_create_info 22struct kthread_create_info
25{ 23{
26 /* Information passed to kthread() from keventd. */ 24 /* Information passed to kthread() from kthreadd. */
27 int (*threadfn)(void *data); 25 int (*threadfn)(void *data);
28 void *data; 26 void *data;
29 struct completion started; 27 struct completion started;
30 28
31 /* Result passed back to kthread_create() from keventd. */ 29 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 30 struct task_struct *result;
33 struct completion done; 31 struct completion done;
34 32
35 struct work_struct work; 33 struct list_head list;
36}; 34};
37 35
38struct kthread_stop_info 36struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
60} 58}
61EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
62 60
63static void kthread_exit_files(void)
64{
65 struct fs_struct *fs;
66 struct task_struct *tsk = current;
67
68 exit_fs(tsk); /* current->fs->count--; */
69 fs = init_task.fs;
70 tsk->fs = fs;
71 atomic_inc(&fs->count);
72 exit_files(tsk);
73 current->files = init_task.files;
74 atomic_inc(&tsk->files->count);
75}
76
77static int kthread(void *_create) 61static int kthread(void *_create)
78{ 62{
79 struct kthread_create_info *create = _create; 63 struct kthread_create_info *create = _create;
80 int (*threadfn)(void *data); 64 int (*threadfn)(void *data);
81 void *data; 65 void *data;
82 sigset_t blocked;
83 int ret = -EINTR; 66 int ret = -EINTR;
84 67
85 kthread_exit_files(); 68 /* Copy data: it's on kthread's stack */
86
87 /* Copy data: it's on keventd's stack */
88 threadfn = create->threadfn; 69 threadfn = create->threadfn;
89 data = create->data; 70 data = create->data;
90 71
91 /* Block and flush all signals (in case we're not from keventd). */
92 sigfillset(&blocked);
93 sigprocmask(SIG_BLOCK, &blocked, NULL);
94 flush_signals(current);
95
96 /* By default we can run anywhere, unlike keventd. */
97 set_cpus_allowed(current, CPU_MASK_ALL);
98
99 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
100 __set_current_state(TASK_INTERRUPTIBLE); 73 __set_current_state(TASK_INTERRUPTIBLE);
101 complete(&create->started); 74 complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
112 return 0; 85 return 0;
113} 86}
114 87
115/* We are keventd: create a thread. */ 88static void create_kthread(struct kthread_create_info *create)
116static void keventd_create_kthread(struct work_struct *work)
117{ 89{
118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
120 int pid; 90 int pid;
121 91
122 /* We want our own signal handler (we take no signals by default). */ 92 /* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
162 create.data = data; 132 create.data = data;
163 init_completion(&create.started); 133 init_completion(&create.started);
164 init_completion(&create.done); 134 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread); 135
166 136 spin_lock(&kthread_create_lock);
167 /* 137 list_add_tail(&create.list, &kthread_create_list);
168 * The workqueue needs to start up first: 138 wake_up_process(kthreadd_task);
169 */ 139 spin_unlock(&kthread_create_lock);
170 if (!helper_wq) 140
171 create.work.func(&create.work); 141 wait_for_completion(&create.done);
172 else { 142
173 queue_work(helper_wq, &create.work);
174 wait_for_completion(&create.done);
175 }
176 if (!IS_ERR(create.result)) { 143 if (!IS_ERR(create.result)) {
177 va_list args; 144 va_list args;
178 va_start(args, namefmt); 145 va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
180 namefmt, args); 147 namefmt, args);
181 va_end(args); 148 va_end(args);
182 } 149 }
183
184 return create.result; 150 return create.result;
185} 151}
186EXPORT_SYMBOL(kthread_create); 152EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)
245} 211}
246EXPORT_SYMBOL(kthread_stop); 212EXPORT_SYMBOL(kthread_stop);
247 213
248static __init int helper_init(void) 214
215static __init void kthreadd_setup(void)
249{ 216{
250 helper_wq = create_singlethread_workqueue("kthread"); 217 struct task_struct *tsk = current;
251 BUG_ON(!helper_wq);
252 218
253 return 0; 219 set_task_comm(tsk, "kthreadd");
220
221 ignore_signals(tsk);
222
223 set_user_nice(tsk, -5);
224 set_cpus_allowed(tsk, CPU_MASK_ALL);
254} 225}
255 226
256core_initcall(helper_init); 227int kthreadd(void *unused)
228{
229 /* Set up a clean context for our children to inherit. */
230 kthreadd_setup();
231
232 current->flags |= PF_NOFREEZE;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236 if (list_empty(&kthread_create_list))
237 schedule();
238 __set_current_state(TASK_RUNNING);
239
240 spin_lock(&kthread_create_lock);
241 while (!list_empty(&kthread_create_list)) {
242 struct kthread_create_info *create;
243
244 create = list_entry(kthread_create_list.next,
245 struct kthread_create_info, list);
246 list_del_init(&create->list);
247 spin_unlock(&kthread_create_lock);
248
249 create_kthread(create);
250
251 spin_lock(&kthread_create_lock);
252 }
253 spin_unlock(&kthread_create_lock);
254 }
255
256 return 0;
257}
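Callers are unaffected by the keventd -> kthreadd switch: kthread_create() still queues a request and blocks on create.done, only the consumer changed. The usual usage pattern, sketched with an assumed worker function:

    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>

    /* 'my_worker' is a stand-in thread function for illustration. */
    static int my_worker(void *data)
    {
            while (!kthread_should_stop())
                    schedule_timeout_interruptible(HZ);
            return 0;
    }

    static struct task_struct *start_worker(void)
    {
            struct task_struct *t = kthread_create(my_worker, NULL, "my_worker");

            if (!IS_ERR(t))
                    wake_up_process(t);  /* kthread() parks in TASK_INTERRUPTIBLE */
            return t;
    }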
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb82765b..303eab18484b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
133 133
134 debug_mutex_lock_common(lock, &waiter); 134 debug_mutex_lock_common(lock, &waiter);
135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
136 debug_mutex_add_waiter(lock, &waiter, task->thread_info); 136 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
137 137
138 /* add waiting tasks to the end of the waitqueue (FIFO): */ 138 /* add waiting tasks to the end of the waitqueue (FIFO): */
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
159 */ 159 */
160 if (unlikely(state == TASK_INTERRUPTIBLE && 160 if (unlikely(state == TASK_INTERRUPTIBLE &&
161 signal_pending(task))) { 161 signal_pending(task))) {
162 mutex_remove_waiter(lock, &waiter, task->thread_info); 162 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
163 mutex_release(&lock->dep_map, 1, _RET_IP_); 163 mutex_release(&lock->dep_map, 1, _RET_IP_);
164 spin_unlock_mutex(&lock->wait_lock, flags); 164 spin_unlock_mutex(&lock->wait_lock, flags);
165 165
@@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
175 } 175 }
176 176
177 /* got the lock - rejoice! */ 177 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task->thread_info); 178 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task->thread_info); 179 debug_mutex_set_owner(lock, task_thread_info(task));
180 180
181 /* set it to 0 if there are no waiters left: */ 181 /* set it to 0 if there are no waiters left: */
182 if (likely(list_empty(&lock->wait_list))) 182 if (likely(list_empty(&lock->wait_list)))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 06331374d862..b5f0543ed84d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
30dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block; 31sector_t swsusp_resume_block;
32 32
33enum {
34 HIBERNATION_INVALID,
35 HIBERNATION_PLATFORM,
36 HIBERNATION_TEST,
37 HIBERNATION_TESTPROC,
38 HIBERNATION_SHUTDOWN,
39 HIBERNATION_REBOOT,
40 /* keep last */
41 __HIBERNATION_AFTER_LAST
42};
43#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
44#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
45
46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47
48struct hibernation_ops *hibernation_ops;
49
50/**
51 * hibernation_set_ops - set the global hibernate operations
52 * @ops: the hibernation operations to use in subsequent hibernation transitions
53 */
54
55void hibernation_set_ops(struct hibernation_ops *ops)
56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) {
58 WARN_ON(1);
59 return;
60 }
61 mutex_lock(&pm_mutex);
62 hibernation_ops = ops;
63 if (ops)
64 hibernation_mode = HIBERNATION_PLATFORM;
65 else if (hibernation_mode == HIBERNATION_PLATFORM)
66 hibernation_mode = HIBERNATION_SHUTDOWN;
67
68 mutex_unlock(&pm_mutex);
69}
70
71
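A platform driver opts in by registering all three callbacks; hibernation_set_ops() warns and bails if any is missing, and selects HIBERNATION_PLATFORM on success. A minimal registration sketch (the my_* callbacks are assumed stand-ins, and the declaration is assumed to live in <linux/suspend.h> as in mainline):

    #include <linux/suspend.h>

    static int my_prepare(void) { return 0; }   /* ready firmware for S4 */
    static int my_enter(void)   { return 0; }   /* enter the sleep state */
    static void my_finish(void) { }             /* back to the working state */

    static struct hibernation_ops my_hibernation_ops = {
            .prepare = my_prepare,
            .enter   = my_enter,
            .finish  = my_finish,
    };

    static int __init my_pm_init(void)
    {
            hibernation_set_ops(&my_hibernation_ops);
            return 0;
    }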
33/** 72/**
34 * platform_prepare - prepare the machine for hibernation using the 73 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails 74 * platform driver if so configured and return an error code if it fails
36 */ 75 */
37 76
38static inline int platform_prepare(void) 77static int platform_prepare(void)
39{ 78{
40 int error = 0; 79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
80 hibernation_ops->prepare() : 0;
81}
41 82
42 switch (pm_disk_mode) { 83/**
43 case PM_DISK_TEST: 84 * platform_finish - switch the machine to the normal mode of operation
44 case PM_DISK_TESTPROC: 85 * using the platform driver (must be called after platform_prepare())
45 case PM_DISK_SHUTDOWN: 86 */
46 case PM_DISK_REBOOT: 87
47 break; 88static void platform_finish(void)
48 default: 89{
49 if (pm_ops && pm_ops->prepare) 90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
50 error = pm_ops->prepare(PM_SUSPEND_DISK); 91 hibernation_ops->finish();
51 }
52 return error;
53} 92}
54 93
55/** 94/**
56 * power_down - Shut machine down for hibernate. 95 * power_down - Shut the machine down for hibernation.
57 * 96 *
58 * Use the platform driver, if configured so; otherwise try 97 * Use the platform driver, if configured so; otherwise try
59 * to power off or reboot. 98 * to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
61 100
62static void power_down(void) 101static void power_down(void)
63{ 102{
64 switch (pm_disk_mode) { 103 switch (hibernation_mode) {
65 case PM_DISK_TEST: 104 case HIBERNATION_TEST:
66 case PM_DISK_TESTPROC: 105 case HIBERNATION_TESTPROC:
67 break; 106 break;
68 case PM_DISK_SHUTDOWN: 107 case HIBERNATION_SHUTDOWN:
69 kernel_power_off(); 108 kernel_power_off();
70 break; 109 break;
71 case PM_DISK_REBOOT: 110 case HIBERNATION_REBOOT:
72 kernel_restart(NULL); 111 kernel_restart(NULL);
73 break; 112 break;
74 default: 113 case HIBERNATION_PLATFORM:
75 if (pm_ops && pm_ops->enter) { 114 if (hibernation_ops) {
76 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
77 pm_ops->enter(PM_SUSPEND_DISK); 116 hibernation_ops->enter();
78 break; 117 break;
79 } 118 }
80 } 119 }
@@ -87,20 +126,6 @@ static void power_down(void)
87 while(1); 126 while(1);
88} 127}
89 128
90static inline void platform_finish(void)
91{
92 switch (pm_disk_mode) {
93 case PM_DISK_TEST:
94 case PM_DISK_TESTPROC:
95 case PM_DISK_SHUTDOWN:
96 case PM_DISK_REBOOT:
97 break;
98 default:
99 if (pm_ops && pm_ops->finish)
100 pm_ops->finish(PM_SUSPEND_DISK);
101 }
102}
103
104static void unprepare_processes(void) 129static void unprepare_processes(void)
105{ 130{
106 thaw_processes(); 131 thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
120} 145}
121 146
122/** 147/**
123 * pm_suspend_disk - The granpappy of hibernation power management. 148 * hibernate - The granpappy of the built-in hibernation management
124 *
125 * If not, then call swsusp to do its thing, then figure out how
126 * to power down the system.
127 */ 149 */
128 150
129int pm_suspend_disk(void) 151int hibernate(void)
130{ 152{
131 int error; 153 int error;
132 154
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
143 if (error) 165 if (error)
144 goto Finish; 166 goto Finish;
145 167
146 if (pm_disk_mode == PM_DISK_TESTPROC) { 168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) {
147 printk("swsusp debug: Waiting for 5 seconds.\n"); 170 printk("swsusp debug: Waiting for 5 seconds.\n");
148 mdelay(5000); 171 mdelay(5000);
149 goto Thaw; 172 goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
168 if (error) 191 if (error)
169 goto Enable_cpus; 192 goto Enable_cpus;
170 193
171 if (pm_disk_mode == PM_DISK_TEST) { 194 if (hibernation_mode == HIBERNATION_TEST) {
172 printk("swsusp debug: Waiting for 5 seconds.\n"); 195 printk("swsusp debug: Waiting for 5 seconds.\n");
173 mdelay(5000); 196 mdelay(5000);
174 goto Enable_cpus; 197 goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
205 device_resume(); 228 device_resume();
206 resume_console(); 229 resume_console();
207 Thaw: 230 Thaw:
231 mutex_unlock(&pm_mutex);
208 unprepare_processes(); 232 unprepare_processes();
209 Finish: 233 Finish:
210 free_basic_memory_bitmaps(); 234 free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
220 * Called as a late_initcall (so all devices are discovered and 244 * Called as a late_initcall (so all devices are discovered and
221 * initialized), we call swsusp to see if we have a saved image or not. 245 * initialized), we call swsusp to see if we have a saved image or not.
222 * If so, we quiesce devices, then restore the saved image. We will 246 * If so, we quiesce devices, then restore the saved image. We will
223 * return above (in pm_suspend_disk() ) if everything goes well. 247 * return above (in hibernate() ) if everything goes well.
224 * Otherwise, we fail gracefully and return to the normally 248 * Otherwise, we fail gracefully and return to the normally
225 * scheduled program. 249 * scheduled program.
226 * 250 *
@@ -315,25 +339,26 @@ static int software_resume(void)
315late_initcall(software_resume); 339late_initcall(software_resume);
316 340
317 341
318static const char * const pm_disk_modes[] = { 342static const char * const hibernation_modes[] = {
319 [PM_DISK_PLATFORM] = "platform", 343 [HIBERNATION_PLATFORM] = "platform",
320 [PM_DISK_SHUTDOWN] = "shutdown", 344 [HIBERNATION_SHUTDOWN] = "shutdown",
321 [PM_DISK_REBOOT] = "reboot", 345 [HIBERNATION_REBOOT] = "reboot",
322 [PM_DISK_TEST] = "test", 346 [HIBERNATION_TEST] = "test",
323 [PM_DISK_TESTPROC] = "testproc", 347 [HIBERNATION_TESTPROC] = "testproc",
324}; 348};
325 349
326/** 350/**
327 * disk - Control suspend-to-disk mode 351 * disk - Control hibernation mode
328 * 352 *
329 * Suspend-to-disk can be handled in several ways. We have a few options 353 * Suspend-to-disk can be handled in several ways. We have a few options
330 * for putting the system to sleep - using the platform driver (e.g. ACPI 354 * for putting the system to sleep - using the platform driver (e.g. ACPI
331 * or other pm_ops), powering off the system or rebooting the system 355 * or other hibernation_ops), powering off the system or rebooting the
332 * (for testing) as well as the two test modes. 356 * system (for testing) as well as the two test modes.
333 * 357 *
334 * The system can support 'platform', and that is known a priori (and 358 * The system can support 'platform', and that is known a priori (and
335 * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' 359 * encoded by the presence of hibernation_ops). However, the user may
336 * as alternatives, as well as the test modes 'test' and 'testproc'. 360 * choose 'shutdown' or 'reboot' as alternatives, as well as one of the
361 * test modes, 'test' or 'testproc'.
337 * 362 *
338 * show() will display what the mode is currently set to. 363 * show() will display what the mode is currently set to.
339 * store() will accept one of 364 * store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
345 * 'testproc' 370 * 'testproc'
346 * 371 *
347 * It will only change to 'platform' if the system 372 * It will only change to 'platform' if the system
348 * supports it (as determined from pm_ops->pm_disk_mode). 373 * supports it (as determined by having hibernation_ops).
349 */ 374 */
350 375
351static ssize_t disk_show(struct kset *kset, char *buf) 376static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
353 int i; 378 int i;
354 char *start = buf; 379 char *start = buf;
355 380
356 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 381 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
357 if (!pm_disk_modes[i]) 382 if (!hibernation_modes[i])
358 continue; 383 continue;
359 switch (i) { 384 switch (i) {
360 case PM_DISK_SHUTDOWN: 385 case HIBERNATION_SHUTDOWN:
361 case PM_DISK_REBOOT: 386 case HIBERNATION_REBOOT:
362 case PM_DISK_TEST: 387 case HIBERNATION_TEST:
363 case PM_DISK_TESTPROC: 388 case HIBERNATION_TESTPROC:
364 break; 389 break;
365 default: 390 case HIBERNATION_PLATFORM:
366 if (pm_ops && pm_ops->enter && 391 if (hibernation_ops)
367 (i == pm_ops->pm_disk_mode))
368 break; 392 break;
369 /* not a valid mode, continue with loop */ 393 /* not a valid mode, continue with loop */
370 continue; 394 continue;
371 } 395 }
372 if (i == pm_disk_mode) 396 if (i == hibernation_mode)
373 buf += sprintf(buf, "[%s]", pm_disk_modes[i]); 397 buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
374 else 398 else
375 buf += sprintf(buf, "%s", pm_disk_modes[i]); 399 buf += sprintf(buf, "%s ", hibernation_modes[i]);
376 if (i+1 != PM_DISK_MAX)
377 buf += sprintf(buf, " ");
378 } 400 }
379 buf += sprintf(buf, "\n"); 401 buf += sprintf(buf, "\n");
380 return buf-start; 402 return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
387 int i; 409 int i;
388 int len; 410 int len;
389 char *p; 411 char *p;
390 suspend_disk_method_t mode = 0; 412 int mode = HIBERNATION_INVALID;
391 413
392 p = memchr(buf, '\n', n); 414 p = memchr(buf, '\n', n);
393 len = p ? p - buf : n; 415 len = p ? p - buf : n;
394 416
395 mutex_lock(&pm_mutex); 417 mutex_lock(&pm_mutex);
396 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 418 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
397 if (!strncmp(buf, pm_disk_modes[i], len)) { 419 if (!strncmp(buf, hibernation_modes[i], len)) {
398 mode = i; 420 mode = i;
399 break; 421 break;
400 } 422 }
401 } 423 }
402 if (mode) { 424 if (mode != HIBERNATION_INVALID) {
403 switch (mode) { 425 switch (mode) {
404 case PM_DISK_SHUTDOWN: 426 case HIBERNATION_SHUTDOWN:
405 case PM_DISK_REBOOT: 427 case HIBERNATION_REBOOT:
406 case PM_DISK_TEST: 428 case HIBERNATION_TEST:
407 case PM_DISK_TESTPROC: 429 case HIBERNATION_TESTPROC:
408 pm_disk_mode = mode; 430 hibernation_mode = mode;
409 break; 431 break;
410 default: 432 case HIBERNATION_PLATFORM:
411 if (pm_ops && pm_ops->enter && 433 if (hibernation_ops)
412 (mode == pm_ops->pm_disk_mode)) 434 hibernation_mode = mode;
413 pm_disk_mode = mode;
414 else 435 else
415 error = -EINVAL; 436 error = -EINVAL;
416 } 437 }
417 } else { 438 } else
418 error = -EINVAL; 439 error = -EINVAL;
419 }
420 440
421 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 441 if (!error)
422 pm_disk_modes[mode]); 442 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
443 hibernation_modes[mode]);
423 mutex_unlock(&pm_mutex); 444 mutex_unlock(&pm_mutex);
424 return error ? error : n; 445 return error ? error : n;
425} 446}
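
The disk.c hunks above replace every pm_disk_mode switch with a direct check of hibernation_ops, so the 'platform' path exists exactly when a driver has registered the ops. A minimal sketch of how a platform driver would plug in, assuming the hibernation_set_ops() registration helper this series adds alongside struct hibernation_ops, with demo_* placeholder names and empty callback bodies:

#include <linux/init.h>
#include <linux/suspend.h>

/* placeholder callbacks; a real driver talks to firmware here */
static int demo_hibernation_prepare(void)
{
	return 0;	/* 0 on success, -errno aborts hibernation */
}

static int demo_hibernation_enter(void)
{
	return 0;	/* actually enter the platform sleep state */
}

static void demo_hibernation_finish(void)
{
	/* undo whatever prepare() armed */
}

static struct hibernation_ops demo_hibernation_ops = {
	.prepare = demo_hibernation_prepare,
	.enter   = demo_hibernation_enter,
	.finish  = demo_hibernation_finish,
};

static int __init demo_sleep_init(void)
{
	/* registration is what makes 'platform' mode selectable */
	hibernation_set_ops(&demo_hibernation_ops);
	return 0;
}
late_initcall(demo_sleep_init);
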
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda685e7e2..40d56a31245e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
30DEFINE_MUTEX(pm_mutex); 30DEFINE_MUTEX(pm_mutex);
31 31
32struct pm_ops *pm_ops; 32struct pm_ops *pm_ops;
33suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
34 33
35/** 34/**
36 * pm_set_ops - Set the global power method table. 35 * pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
41{ 40{
42 mutex_lock(&pm_mutex); 41 mutex_lock(&pm_mutex);
43 pm_ops = ops; 42 pm_ops = ops;
44 if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
45 pm_disk_mode = ops->pm_disk_mode;
46 } else
47 pm_disk_mode = PM_DISK_SHUTDOWN;
48 mutex_unlock(&pm_mutex); 43 mutex_unlock(&pm_mutex);
49} 44}
50 45
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
184static const char * const pm_states[PM_SUSPEND_MAX] = { 179static const char * const pm_states[PM_SUSPEND_MAX] = {
185 [PM_SUSPEND_STANDBY] = "standby", 180 [PM_SUSPEND_STANDBY] = "standby",
186 [PM_SUSPEND_MEM] = "mem", 181 [PM_SUSPEND_MEM] = "mem",
187 [PM_SUSPEND_DISK] = "disk",
188}; 182};
189 183
190static inline int valid_state(suspend_state_t state) 184static inline int valid_state(suspend_state_t state)
191{ 185{
192 /* Suspend-to-disk does not really need low-level support. 186 /* All states need lowlevel support and need to be valid
193 * It can work with shutdown/reboot if needed. If it isn't 187 * to the lowlevel implementation, no valid callback
194 * configured, then it cannot be supported.
195 */
196 if (state == PM_SUSPEND_DISK)
197#ifdef CONFIG_SOFTWARE_SUSPEND
198 return 1;
199#else
200 return 0;
201#endif
202
203 /* all other states need lowlevel support and need to be
204 * valid to the lowlevel implementation, no valid callback
205 * implies that none are valid. */ 188 * implies that none are valid. */
206 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) 189 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
207 return 0; 190 return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
229 if (!mutex_trylock(&pm_mutex)) 212 if (!mutex_trylock(&pm_mutex))
230 return -EBUSY; 213 return -EBUSY;
231 214
232 if (state == PM_SUSPEND_DISK) {
233 error = pm_suspend_disk();
234 goto Unlock;
235 }
236
237 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 215 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
238 if ((error = suspend_prepare(state))) 216 if ((error = suspend_prepare(state)))
239 goto Unlock; 217 goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
251 229
252/** 230/**
253 * pm_suspend - Externally visible function for suspending system. 231 * pm_suspend - Externally visible function for suspending system.
254 * @state: Enumarted value of state to enter. 232 * @state: Enumerated value of state to enter.
255 * 233 *
256 * Determine whether or not value is within range, get state 234 * Determine whether or not value is within range, get state
257 * structure, and enter (above). 235 * structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
289 if (pm_states[i] && valid_state(i)) 267 if (pm_states[i] && valid_state(i))
290 s += sprintf(s,"%s ", pm_states[i]); 268 s += sprintf(s,"%s ", pm_states[i]);
291 } 269 }
292 s += sprintf(s,"\n"); 270#ifdef CONFIG_SOFTWARE_SUSPEND
271 s += sprintf(s, "%s\n", "disk");
272#else
273 if (s != buf)
274 /* convert the last space to a newline */
275 *(s-1) = '\n';
276#endif
293 return (s - buf); 277 return (s - buf);
294} 278}
295 279
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
304 p = memchr(buf, '\n', n); 288 p = memchr(buf, '\n', n);
305 len = p ? p - buf : n; 289 len = p ? p - buf : n;
306 290
291 /* First, check if we are requested to hibernate */
292 if (!strncmp(buf, "disk", len)) {
293 error = hibernate();
294 return error ? error : n;
295 }
296
307 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 297 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
308 if (*s && !strncmp(buf, *s, len)) 298 if (*s && !strncmp(buf, *s, len))
309 break; 299 break;
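
With enter_state() no longer handling PM_SUSPEND_DISK, writing "disk" to /sys/power/state is intercepted in state_store() and routed straight to hibernate(); the suspend states still go through the pm_states[] lookup. From userspace nothing changes. A minimal test program (hypothetical, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/power/state", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "disk" -> hibernate(); "mem"/"standby" -> enter_state().
	 * The write blocks until the machine resumes or the
	 * transition fails. */
	if (write(fd, "disk", 4) < 0)
		perror("write");
	close(fd);
	return 0;
}
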
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b43542785a..51381487103f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern int pm_suspend_disk(void); 28extern struct hibernation_ops *hibernation_ops;
29#else
30static inline int pm_suspend_disk(void)
31{
32 return -EPERM;
33}
34#endif 29#endif
35 30
36extern int pfn_is_nosave(unsigned long); 31extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b7039772b05c..48383ea72290 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1227,7 +1227,7 @@ asmlinkage int swsusp_save(void)
1227 nr_copy_pages = nr_pages; 1227 nr_copy_pages = nr_pages;
1228 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); 1228 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
1229 1229
1230 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1230 printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
1231 1231
1232 return 0; 1232 return 0;
1233} 1233}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d9c312..24d7d78e6f42 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
130{ 130{
131 int error = 0; 131 int error = 0;
132 132
133 if (pm_ops && pm_ops->prepare) 133 if (hibernation_ops)
134 error = pm_ops->prepare(PM_SUSPEND_DISK); 134 error = hibernation_ops->prepare();
135 135
136 return error; 136 return error;
137} 137}
138 138
139static inline void platform_finish(void) 139static inline void platform_finish(void)
140{ 140{
141 if (pm_ops && pm_ops->finish) 141 if (hibernation_ops)
142 pm_ops->finish(PM_SUSPEND_DISK); 142 hibernation_ops->finish();
143} 143}
144 144
145static inline int snapshot_suspend(int platform_suspend) 145static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
384 switch (arg) { 384 switch (arg) {
385 385
386 case PMOPS_PREPARE: 386 case PMOPS_PREPARE:
387 if (pm_ops && pm_ops->enter) { 387 if (hibernation_ops) {
388 data->platform_suspend = 1; 388 data->platform_suspend = 1;
389 error = 0; 389 error = 0;
390 } else { 390 } else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
395 case PMOPS_ENTER: 395 case PMOPS_ENTER:
396 if (data->platform_suspend) { 396 if (data->platform_suspend) {
397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
398 error = pm_ops->enter(PM_SUSPEND_DISK); 398 error = hibernation_ops->enter();
399 error = 0;
400 } 399 }
401 break; 400 break;
402 401
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb248dd8..cc91b9bf759d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
340 340
341 switch (action) { 341 switch (action) {
342 case CPU_UP_PREPARE: 342 case CPU_UP_PREPARE:
343 case CPU_UP_PREPARE_FROZEN:
343 node = cpu_to_node(cpu); 344 node = cpu_to_node(cpu);
344 per_cpu(cpu_profile_flip, cpu) = 0; 345 per_cpu(cpu_profile_flip, cpu) = 0;
345 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 346 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
365 __free_page(page); 366 __free_page(page);
366 return NOTIFY_BAD; 367 return NOTIFY_BAD;
367 case CPU_ONLINE: 368 case CPU_ONLINE:
369 case CPU_ONLINE_FROZEN:
368 cpu_set(cpu, prof_cpu_mask); 370 cpu_set(cpu, prof_cpu_mask);
369 break; 371 break;
370 case CPU_UP_CANCELED: 372 case CPU_UP_CANCELED:
373 case CPU_UP_CANCELED_FROZEN:
371 case CPU_DEAD: 374 case CPU_DEAD:
375 case CPU_DEAD_FROZEN:
372 cpu_clear(cpu, prof_cpu_mask); 376 cpu_clear(cpu, prof_cpu_mask);
373 if (per_cpu(cpu_profile_hits, cpu)[0]) { 377 if (per_cpu(cpu_profile_hits, cpu)[0]) {
374 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 378 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
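
The profile.c hunk is the first of many in this series applying one mechanical change: every CPU hotplug action gains a _FROZEN twin that is emitted when the transition happens during suspend or resume, and callbacks that do not care about the difference simply list both cases. A sketch of the resulting notifier shape, with a hypothetical callback:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit example_cpu_callback(struct notifier_block *nb,
					  unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* allocate per-cpu state for 'cpu'; the _FROZEN case
		 * fires when a CPU comes up during resume */
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* free per-cpu state for 'cpu' */
		break;
	}
	return NOTIFY_OK;
}
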
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76da84c..2c2dd8410dc4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
558 long cpu = (long)hcpu; 558 long cpu = (long)hcpu;
559 switch (action) { 559 switch (action) {
560 case CPU_UP_PREPARE: 560 case CPU_UP_PREPARE:
561 case CPU_UP_PREPARE_FROZEN:
561 rcu_online_cpu(cpu); 562 rcu_online_cpu(cpu);
562 break; 563 break;
563 case CPU_DEAD: 564 case CPU_DEAD:
565 case CPU_DEAD_FROZEN:
564 rcu_offline_cpu(cpu); 566 rcu_offline_cpu(cpu);
565 break; 567 break;
566 default: 568 default:
diff --git a/kernel/relay.c b/kernel/relay.c
index d24395e8b6e5..4311101b0ca7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
310 310
311/** 311/**
312 * wakeup_readers - wake up readers waiting on a channel 312 * wakeup_readers - wake up readers waiting on a channel
313 * @work: work struct that contains the channel buffer 313 * @data: contains the channel buffer
314 * 314 *
315 * This is the work function used to defer reader waking. The 315 * This is the timer function used to defer reader waking.
316 * reason waking is deferred is that calling directly from write
317 * causes problems if you're writing from say the scheduler.
318 */ 316 */
319static void wakeup_readers(struct work_struct *work) 317static void wakeup_readers(unsigned long data)
320{ 318{
321 struct rchan_buf *buf = 319 struct rchan_buf *buf = (struct rchan_buf *)data;
322 container_of(work, struct rchan_buf, wake_readers.work);
323 wake_up_interruptible(&buf->read_wait); 320 wake_up_interruptible(&buf->read_wait);
324} 321}
325 322
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
337 if (init) { 334 if (init) {
338 init_waitqueue_head(&buf->read_wait); 335 init_waitqueue_head(&buf->read_wait);
339 kref_init(&buf->kref); 336 kref_init(&buf->kref);
340 INIT_DELAYED_WORK(&buf->wake_readers, NULL); 337 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
341 } else { 338 } else
342 cancel_delayed_work(&buf->wake_readers); 339 del_timer_sync(&buf->timer);
343 flush_scheduled_work();
344 }
345 340
346 buf->subbufs_produced = 0; 341 buf->subbufs_produced = 0;
347 buf->subbufs_consumed = 0; 342 buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
447static void relay_close_buf(struct rchan_buf *buf) 442static void relay_close_buf(struct rchan_buf *buf)
448{ 443{
449 buf->finalized = 1; 444 buf->finalized = 1;
450 cancel_delayed_work(&buf->wake_readers); 445 del_timer_sync(&buf->timer);
451 flush_scheduled_work();
452 kref_put(&buf->kref, relay_remove_buf); 446 kref_put(&buf->kref, relay_remove_buf);
453} 447}
454 448
@@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
490 484
491 switch(action) { 485 switch(action) {
492 case CPU_UP_PREPARE: 486 case CPU_UP_PREPARE:
487 case CPU_UP_PREPARE_FROZEN:
493 mutex_lock(&relay_channels_mutex); 488 mutex_lock(&relay_channels_mutex);
494 list_for_each_entry(chan, &relay_channels, list) { 489 list_for_each_entry(chan, &relay_channels, list) {
495 if (chan->buf[hotcpu]) 490 if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
506 mutex_unlock(&relay_channels_mutex); 501 mutex_unlock(&relay_channels_mutex);
507 break; 502 break;
508 case CPU_DEAD: 503 case CPU_DEAD:
504 case CPU_DEAD_FROZEN:
509 /* No need to flush the cpu : will be flushed upon 505 /* No need to flush the cpu : will be flushed upon
510 * final relay_flush() call. */ 506 * final relay_flush() call. */
511 break; 507 break;
@@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
608 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 604 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
609 buf->padding[old_subbuf]; 605 buf->padding[old_subbuf];
610 smp_mb(); 606 smp_mb();
611 if (waitqueue_active(&buf->read_wait)) { 607 if (waitqueue_active(&buf->read_wait))
612 PREPARE_DELAYED_WORK(&buf->wake_readers, 608 /*
613 wakeup_readers); 609 * Calling wake_up_interruptible() from here
614 schedule_delayed_work(&buf->wake_readers, 1); 610 * will deadlock if we happen to be logging
615 } 611 * from the scheduler (trying to re-grab
612 * rq->lock), so defer it.
613 */
614 __mod_timer(&buf->timer, jiffies + 1);
616 } 615 }
617 616
618 old = buf->data; 617 old = buf->data;
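
The relay.c change swaps the delayed-work-based reader wakeup for a plain kernel timer, because queueing work from relay_switch_subbuf() can recurse on rq->lock when the scheduler itself is being logged. The idiom in isolation, using the 2007-era timer API, a made-up demo_buf structure, and mod_timer() as the public entry point where the hunk calls __mod_timer():

#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/wait.h>

struct demo_buf {
	struct timer_list timer;
	wait_queue_head_t read_wait;
};

static void demo_wake(unsigned long data)
{
	struct demo_buf *buf = (struct demo_buf *)data;

	/* runs in timer (softirq) context, outside rq->lock */
	wake_up_interruptible(&buf->read_wait);
}

static void demo_init(struct demo_buf *buf)
{
	init_waitqueue_head(&buf->read_wait);
	setup_timer(&buf->timer, demo_wake, (unsigned long)buf);
}

static void demo_data_ready(struct demo_buf *buf)
{
	/* defer the wakeup by one tick instead of waking inline */
	mod_timer(&buf->timer, jiffies + 1);
}
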
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f75..12879f6c1ec3 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
56 * state. 56 * state.
57 */ 57 */
58 58
59static void 59void
60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
61 unsigned long mask) 61 unsigned long mask)
62{ 62{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
81} 81}
82 82
83/* 83/*
84 * We can speed up the acquire/release, if the architecture
85 * supports cmpxchg and if there's no debugging state to be set up
86 */
87#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
88# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
89static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90{
91 unsigned long owner, *p = (unsigned long *) &lock->owner;
92
93 do {
94 owner = *p;
95 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
96}
97#else
98# define rt_mutex_cmpxchg(l,c,n) (0)
99static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
100{
101 lock->owner = (struct task_struct *)
102 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
103}
104#endif
105
106/*
107 * Calculate task priority from the waiter list priority 84 * Calculate task priority from the waiter list priority
108 * 85 *
109 * Return task->normal_prio when the waiter list is empty or when 86 * Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
123 * 100 *
124 * This can be both boosting and unboosting. task->pi_lock must be held. 101 * This can be both boosting and unboosting. task->pi_lock must be held.
125 */ 102 */
126static void __rt_mutex_adjust_prio(struct task_struct *task) 103void __rt_mutex_adjust_prio(struct task_struct *task)
127{ 104{
128 int prio = rt_mutex_getprio(task); 105 int prio = rt_mutex_getprio(task);
129 106
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
159 * Decreases task's usage by one - may thus free the task. 136 * Decreases task's usage by one - may thus free the task.
160 * Returns 0 or -EDEADLK. 137 * Returns 0 or -EDEADLK.
161 */ 138 */
162static int rt_mutex_adjust_prio_chain(struct task_struct *task, 139int rt_mutex_adjust_prio_chain(struct task_struct *task,
163 int deadlock_detect, 140 int deadlock_detect,
164 struct rt_mutex *orig_lock, 141 struct rt_mutex *orig_lock,
165 struct rt_mutex_waiter *orig_waiter, 142 struct rt_mutex_waiter *orig_waiter,
166 struct task_struct *top_task) 143 struct task_struct *top_task)
167{ 144{
168 struct rt_mutex *lock; 145 struct rt_mutex *lock;
169 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 146 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
524 * 501 *
525 * Must be called with lock->wait_lock held 502 * Must be called with lock->wait_lock held
526 */ 503 */
527static void remove_waiter(struct rt_mutex *lock, 504void remove_waiter(struct rt_mutex *lock,
528 struct rt_mutex_waiter *waiter) 505 struct rt_mutex_waiter *waiter)
529{ 506{
530 int first = (waiter == rt_mutex_top_waiter(lock)); 507 int first = (waiter == rt_mutex_top_waiter(lock));
531 struct task_struct *owner = rt_mutex_owner(lock); 508 struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..242ec7ee740b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
113} 113}
114 114
115/* 115/*
116 * We can speed up the acquire/release, if the architecture
117 * supports cmpxchg and if there's no debugging state to be set up
118 */
119#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
120# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
121static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
122{
123 unsigned long owner, *p = (unsigned long *) &lock->owner;
124
125 do {
126 owner = *p;
127 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
128}
129#else
130# define rt_mutex_cmpxchg(l,c,n) (0)
131static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
132{
133 lock->owner = (struct task_struct *)
134 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
135}
136#endif
137
138/*
116 * PI-futex support (proxy locking functions, etc.): 139 * PI-futex support (proxy locking functions, etc.):
117 */ 140 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 141extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 143 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 144extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 145 struct task_struct *proxy_owner);
146
147extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
148 unsigned long mask);
149extern void __rt_mutex_adjust_prio(struct task_struct *task);
150extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
151 int deadlock_detect,
152 struct rt_mutex *orig_lock,
153 struct rt_mutex_waiter *orig_waiter,
154 struct task_struct *top_task);
155extern void remove_waiter(struct rt_mutex *lock,
156 struct rt_mutex_waiter *waiter);
123#endif 157#endif
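
Moving rt_mutex_cmpxchg() and mark_rt_mutex_waiters() into rtmutex_common.h, together with un-static-ing the adjust/remove helpers, makes the lockless fast path available to every user of the common header. The fast path itself is one compare-and-swap; a sketch with a hypothetical wrapper name:

#include <linux/sched.h>
#include "rtmutex_common.h"

/* try to swing owner NULL -> current in a single cmpxchg; a zero
 * return means contention and a trip to the slow path */
static inline int demo_rt_mutex_fasttrylock(struct rt_mutex *lock)
{
	return rt_mutex_cmpxchg(lock, NULL, current);
}
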
diff --git a/kernel/sched.c b/kernel/sched.c
index 66bd7ff23f18..799d23b4e35d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -305,6 +305,7 @@ struct rq {
305}; 305};
306 306
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex);
308 309
309static inline int cpu_of(struct rq *rq) 310static inline int cpu_of(struct rq *rq)
310{ 311{
@@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4520 struct task_struct *p; 4521 struct task_struct *p;
4521 int retval; 4522 int retval;
4522 4523
4523 lock_cpu_hotplug(); 4524 mutex_lock(&sched_hotcpu_mutex);
4524 read_lock(&tasklist_lock); 4525 read_lock(&tasklist_lock);
4525 4526
4526 p = find_process_by_pid(pid); 4527 p = find_process_by_pid(pid);
4527 if (!p) { 4528 if (!p) {
4528 read_unlock(&tasklist_lock); 4529 read_unlock(&tasklist_lock);
4529 unlock_cpu_hotplug(); 4530 mutex_unlock(&sched_hotcpu_mutex);
4530 return -ESRCH; 4531 return -ESRCH;
4531 } 4532 }
4532 4533
@@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4553 4554
4554out_unlock: 4555out_unlock:
4555 put_task_struct(p); 4556 put_task_struct(p);
4556 unlock_cpu_hotplug(); 4557 mutex_unlock(&sched_hotcpu_mutex);
4557 return retval; 4558 return retval;
4558} 4559}
4559 4560
@@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4610 struct task_struct *p; 4611 struct task_struct *p;
4611 int retval; 4612 int retval;
4612 4613
4613 lock_cpu_hotplug(); 4614 mutex_lock(&sched_hotcpu_mutex);
4614 read_lock(&tasklist_lock); 4615 read_lock(&tasklist_lock);
4615 4616
4616 retval = -ESRCH; 4617 retval = -ESRCH;
@@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4626 4627
4627out_unlock: 4628out_unlock:
4628 read_unlock(&tasklist_lock); 4629 read_unlock(&tasklist_lock);
4629 unlock_cpu_hotplug(); 4630 mutex_unlock(&sched_hotcpu_mutex);
4630 if (retval) 4631 if (retval)
4631 return retval; 4632 return retval;
4632 4633
@@ -5388,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5388 struct rq *rq; 5389 struct rq *rq;
5389 5390
5390 switch (action) { 5391 switch (action) {
5392 case CPU_LOCK_ACQUIRE:
5393 mutex_lock(&sched_hotcpu_mutex);
5394 break;
5395
5391 case CPU_UP_PREPARE: 5396 case CPU_UP_PREPARE:
5397 case CPU_UP_PREPARE_FROZEN:
5392 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5398 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5393 if (IS_ERR(p)) 5399 if (IS_ERR(p))
5394 return NOTIFY_BAD; 5400 return NOTIFY_BAD;
@@ -5402,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5402 break; 5408 break;
5403 5409
5404 case CPU_ONLINE: 5410 case CPU_ONLINE:
5411 case CPU_ONLINE_FROZEN:
5405 /* Strictly unnecessary, as first user will wake it. */ 5412 /* Strictly unnecessary, as first user will wake it. */
5406 wake_up_process(cpu_rq(cpu)->migration_thread); 5413 wake_up_process(cpu_rq(cpu)->migration_thread);
5407 break; 5414 break;
5408 5415
5409#ifdef CONFIG_HOTPLUG_CPU 5416#ifdef CONFIG_HOTPLUG_CPU
5410 case CPU_UP_CANCELED: 5417 case CPU_UP_CANCELED:
5418 case CPU_UP_CANCELED_FROZEN:
5411 if (!cpu_rq(cpu)->migration_thread) 5419 if (!cpu_rq(cpu)->migration_thread)
5412 break; 5420 break;
5413 /* Unbind it from offline cpu so it can run. Fall thru. */ 5421 /* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5418,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5418 break; 5426 break;
5419 5427
5420 case CPU_DEAD: 5428 case CPU_DEAD:
5429 case CPU_DEAD_FROZEN:
5421 migrate_live_tasks(cpu); 5430 migrate_live_tasks(cpu);
5422 rq = cpu_rq(cpu); 5431 rq = cpu_rq(cpu);
5423 kthread_stop(rq->migration_thread); 5432 kthread_stop(rq->migration_thread);
@@ -5433,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5433 BUG_ON(rq->nr_running != 0); 5442 BUG_ON(rq->nr_running != 0);
5434 5443
5435 /* No need to migrate the tasks: it was best-effort if 5444 /* No need to migrate the tasks: it was best-effort if
5436 * they didn't do lock_cpu_hotplug(). Just wake up 5445 * they didn't take sched_hotcpu_mutex. Just wake up
5437 * the requestors. */ 5446 * the requestors. */
5438 spin_lock_irq(&rq->lock); 5447 spin_lock_irq(&rq->lock);
5439 while (!list_empty(&rq->migration_queue)) { 5448 while (!list_empty(&rq->migration_queue)) {
@@ -5447,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5447 spin_unlock_irq(&rq->lock); 5456 spin_unlock_irq(&rq->lock);
5448 break; 5457 break;
5449#endif 5458#endif
5459 case CPU_LOCK_RELEASE:
5460 mutex_unlock(&sched_hotcpu_mutex);
5461 break;
5450 } 5462 }
5451 return NOTIFY_OK; 5463 return NOTIFY_OK;
5452} 5464}
@@ -6822,10 +6834,10 @@ int arch_reinit_sched_domains(void)
6822{ 6834{
6823 int err; 6835 int err;
6824 6836
6825 lock_cpu_hotplug(); 6837 mutex_lock(&sched_hotcpu_mutex);
6826 detach_destroy_domains(&cpu_online_map); 6838 detach_destroy_domains(&cpu_online_map);
6827 err = arch_init_sched_domains(&cpu_online_map); 6839 err = arch_init_sched_domains(&cpu_online_map);
6828 unlock_cpu_hotplug(); 6840 mutex_unlock(&sched_hotcpu_mutex);
6829 6841
6830 return err; 6842 return err;
6831} 6843}
@@ -6904,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
6904{ 6916{
6905 switch (action) { 6917 switch (action) {
6906 case CPU_UP_PREPARE: 6918 case CPU_UP_PREPARE:
6919 case CPU_UP_PREPARE_FROZEN:
6907 case CPU_DOWN_PREPARE: 6920 case CPU_DOWN_PREPARE:
6921 case CPU_DOWN_PREPARE_FROZEN:
6908 detach_destroy_domains(&cpu_online_map); 6922 detach_destroy_domains(&cpu_online_map);
6909 return NOTIFY_OK; 6923 return NOTIFY_OK;
6910 6924
6911 case CPU_UP_CANCELED: 6925 case CPU_UP_CANCELED:
6926 case CPU_UP_CANCELED_FROZEN:
6912 case CPU_DOWN_FAILED: 6927 case CPU_DOWN_FAILED:
6928 case CPU_DOWN_FAILED_FROZEN:
6913 case CPU_ONLINE: 6929 case CPU_ONLINE:
6930 case CPU_ONLINE_FROZEN:
6914 case CPU_DEAD: 6931 case CPU_DEAD:
6932 case CPU_DEAD_FROZEN:
6915 /* 6933 /*
6916 * Fall through and re-initialise the domains. 6934 * Fall through and re-initialise the domains.
6917 */ 6935 */
@@ -6930,12 +6948,12 @@ void __init sched_init_smp(void)
6930{ 6948{
6931 cpumask_t non_isolated_cpus; 6949 cpumask_t non_isolated_cpus;
6932 6950
6933 lock_cpu_hotplug(); 6951 mutex_lock(&sched_hotcpu_mutex);
6934 arch_init_sched_domains(&cpu_online_map); 6952 arch_init_sched_domains(&cpu_online_map);
6935 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6953 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6936 if (cpus_empty(non_isolated_cpus)) 6954 if (cpus_empty(non_isolated_cpus))
6937 cpu_set(smp_processor_id(), non_isolated_cpus); 6955 cpu_set(smp_processor_id(), non_isolated_cpus);
6938 unlock_cpu_hotplug(); 6956 mutex_unlock(&sched_hotcpu_mutex);
6939 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6957 /* XXX: Theoretical race here - CPU may be hotplugged now */
6940 hotcpu_notifier(update_sched_domains, 0); 6958 hotcpu_notifier(update_sched_domains, 0);
6941 6959
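
sched.c stops using the global lock_cpu_hotplug() and serializes on its own sched_hotcpu_mutex instead, taken and released from the new CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE notifier events that bracket every hotplug operation. The pattern reduced to a hypothetical subsystem:

#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/notifier.h>

static DEFINE_MUTEX(demo_hotcpu_mutex);

static int demo_cpu_callback(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_LOCK_ACQUIRE:
		/* emitted before any other CPU_* event */
		mutex_lock(&demo_hotcpu_mutex);
		break;
	case CPU_LOCK_RELEASE:
		/* emitted after the hotplug operation completes */
		mutex_unlock(&demo_hotcpu_mutex);
		break;
	}
	return NOTIFY_OK;
}

Code paths that previously called lock_cpu_hotplug() now take demo_hotcpu_mutex directly, as the sched_setaffinity()/sched_getaffinity() hunks above do.
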
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67c8482..2ac3a668d9dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
38 38
39static struct kmem_cache *sigqueue_cachep; 39static struct kmem_cache *sigqueue_cachep;
40 40
41/*
42 * In POSIX a signal is sent either to a specific thread (Linux task)
43 * or to the process as a whole (Linux thread group). How the signal
44 * is sent determines whether it's to one thread or the whole group,
45 * which determines which signal mask(s) are involved in blocking it
46 * from being delivered until later. When the signal is delivered,
47 * either it's caught or ignored by a user handler or it has a default
48 * effect that applies to the whole thread group (POSIX process).
49 *
50 * The possible effects an unblocked signal set to SIG_DFL can have are:
51 * ignore - Nothing Happens
52 * terminate - kill the process, i.e. all threads in the group,
53 * similar to exit_group. The group leader (only) reports
54 * WIFSIGNALED status to its parent.
55 * coredump - write a core dump file describing all threads using
56 * the same mm and then kill all those threads
57 * stop - stop all the threads in the group, i.e. TASK_STOPPED state
58 *
59 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
60 * Other signals when not blocked and set to SIG_DFL behaves as follows.
61 * The job control signals also have other special effects.
62 *
63 * +--------------------+------------------+
64 * | POSIX signal | default action |
65 * +--------------------+------------------+
66 * | SIGHUP | terminate |
67 * | SIGINT | terminate |
68 * | SIGQUIT | coredump |
69 * | SIGILL | coredump |
70 * | SIGTRAP | coredump |
71 * | SIGABRT/SIGIOT | coredump |
72 * | SIGBUS | coredump |
73 * | SIGFPE | coredump |
74 * | SIGKILL | terminate(+) |
75 * | SIGUSR1 | terminate |
76 * | SIGSEGV | coredump |
77 * | SIGUSR2 | terminate |
78 * | SIGPIPE | terminate |
79 * | SIGALRM | terminate |
80 * | SIGTERM | terminate |
81 * | SIGCHLD | ignore |
82 * | SIGCONT | ignore(*) |
83 * | SIGSTOP | stop(*)(+) |
84 * | SIGTSTP | stop(*) |
85 * | SIGTTIN | stop(*) |
86 * | SIGTTOU | stop(*) |
87 * | SIGURG | ignore |
88 * | SIGXCPU | coredump |
89 * | SIGXFSZ | coredump |
90 * | SIGVTALRM | terminate |
91 * | SIGPROF | terminate |
92 * | SIGPOLL/SIGIO | terminate |
93 * | SIGSYS/SIGUNUSED | coredump |
94 * | SIGSTKFLT | terminate |
95 * | SIGWINCH | ignore |
96 * | SIGPWR | terminate |
97 * | SIGRTMIN-SIGRTMAX | terminate |
98 * +--------------------+------------------+
99 * | non-POSIX signal | default action |
100 * +--------------------+------------------+
101 * | SIGEMT | coredump |
102 * +--------------------+------------------+
103 *
104 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
105 * (*) Special job control effects:
106 * When SIGCONT is sent, it resumes the process (all threads in the group)
107 * from TASK_STOPPED state and also clears any pending/queued stop signals
108 * (any of those marked with "stop(*)"). This happens regardless of blocking,
109 * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
110 * any pending/queued SIGCONT signals; this happens regardless of blocking,
111 * catching, or ignored the stop signal, though (except for SIGSTOP) the
112 * default action of stopping the process may happen later or never.
113 */
114
115#ifdef SIGEMT
116#define M_SIGEMT M(SIGEMT)
117#else
118#define M_SIGEMT 0
119#endif
120
121#if SIGRTMIN > BITS_PER_LONG
122#define M(sig) (1ULL << ((sig)-1))
123#else
124#define M(sig) (1UL << ((sig)-1))
125#endif
126#define T(sig, mask) (M(sig) & (mask))
127
128#define SIG_KERNEL_ONLY_MASK (\
129 M(SIGKILL) | M(SIGSTOP) )
130
131#define SIG_KERNEL_STOP_MASK (\
132 M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
133
134#define SIG_KERNEL_COREDUMP_MASK (\
135 M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
136 M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
137 M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
138
139#define SIG_KERNEL_IGNORE_MASK (\
140 M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
141
142#define sig_kernel_only(sig) \
143 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
144#define sig_kernel_coredump(sig) \
145 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
146#define sig_kernel_ignore(sig) \
147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
148#define sig_kernel_stop(sig) \
149 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
150
151#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
152
153#define sig_user_defined(t, signr) \
154 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
155 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
156
157#define sig_fatal(t, signr) \
158 (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
159 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
160 41
161static int sig_ignored(struct task_struct *t, int sig) 42static int sig_ignored(struct task_struct *t, int sig)
162{ 43{
@@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t)
328 spin_unlock_irqrestore(&t->sighand->siglock, flags); 209 spin_unlock_irqrestore(&t->sighand->siglock, flags);
329} 210}
330 211
212void ignore_signals(struct task_struct *t)
213{
214 int i;
215
216 for (i = 0; i < _NSIG; ++i)
217 t->sighand->action[i].sa.sa_handler = SIG_IGN;
218
219 flush_signals(t);
220}
221
331/* 222/*
332 * Flush all handlers for a task. 223 * Flush all handlers for a task.
333 */ 224 */
@@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p)
1032 if (t->exit_state) 923 if (t->exit_state)
1033 continue; 924 continue;
1034 925
1035 /*
1036 * We don't want to notify the parent, since we are
1037 * killed as part of a thread group due to another
1038 * thread doing an execve() or similar. So set the
1039 * exit signal to -1 to allow immediate reaping of
1040 * the process. But don't detach the thread group
1041 * leader.
1042 */
1043 if (t != p->group_leader)
1044 t->exit_signal = -1;
1045
1046 /* SIGKILL will be handled before any pending SIGSTOP */ 926 /* SIGKILL will be handled before any pending SIGSTOP */
1047 sigaddset(&t->pending.signal, SIGKILL); 927 sigaddset(&t->pending.signal, SIGKILL);
1048 signal_wake_up(t, 1); 928 signal_wake_up(t, 1);
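
Besides dropping the large default-action table (which moves to a shared header elsewhere in this series), signal.c gains a small ignore_signals() helper for kernel threads that must never act on a signal. A hypothetical thread function would use it like this (sketch; the prototype lives wherever this series declares it):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int demo_thread(void *unused)
{
	/* set every handler to SIG_IGN and flush anything pending */
	ignore_signals(current);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}
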
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008e2bd8..0b9886a00e74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
593 593
594 switch (action) { 594 switch (action) {
595 case CPU_UP_PREPARE: 595 case CPU_UP_PREPARE:
596 case CPU_UP_PREPARE_FROZEN:
596 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 597 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
597 if (IS_ERR(p)) { 598 if (IS_ERR(p)) {
598 printk("ksoftirqd for %i failed\n", hotcpu); 599 printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
602 per_cpu(ksoftirqd, hotcpu) = p; 603 per_cpu(ksoftirqd, hotcpu) = p;
603 break; 604 break;
604 case CPU_ONLINE: 605 case CPU_ONLINE:
606 case CPU_ONLINE_FROZEN:
605 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 607 wake_up_process(per_cpu(ksoftirqd, hotcpu));
606 break; 608 break;
607#ifdef CONFIG_HOTPLUG_CPU 609#ifdef CONFIG_HOTPLUG_CPU
608 case CPU_UP_CANCELED: 610 case CPU_UP_CANCELED:
611 case CPU_UP_CANCELED_FROZEN:
609 if (!per_cpu(ksoftirqd, hotcpu)) 612 if (!per_cpu(ksoftirqd, hotcpu))
610 break; 613 break;
611 /* Unbind so it can run. Fall thru. */ 614 /* Unbind so it can run. Fall thru. */
612 kthread_bind(per_cpu(ksoftirqd, hotcpu), 615 kthread_bind(per_cpu(ksoftirqd, hotcpu),
613 any_online_cpu(cpu_online_map)); 616 any_online_cpu(cpu_online_map));
614 case CPU_DEAD: 617 case CPU_DEAD:
618 case CPU_DEAD_FROZEN:
615 p = per_cpu(ksoftirqd, hotcpu); 619 p = per_cpu(ksoftirqd, hotcpu);
616 per_cpu(ksoftirqd, hotcpu) = NULL; 620 per_cpu(ksoftirqd, hotcpu) = NULL;
617 kthread_stop(p); 621 kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040247ad..0131e296ffb4 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
146 146
147 switch (action) { 147 switch (action) {
148 case CPU_UP_PREPARE: 148 case CPU_UP_PREPARE:
149 case CPU_UP_PREPARE_FROZEN:
149 BUG_ON(per_cpu(watchdog_task, hotcpu)); 150 BUG_ON(per_cpu(watchdog_task, hotcpu));
150 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 151 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
151 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
@@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
157 kthread_bind(p, hotcpu); 158 kthread_bind(p, hotcpu);
158 break; 159 break;
159 case CPU_ONLINE: 160 case CPU_ONLINE:
161 case CPU_ONLINE_FROZEN:
160 wake_up_process(per_cpu(watchdog_task, hotcpu)); 162 wake_up_process(per_cpu(watchdog_task, hotcpu));
161 break; 163 break;
162#ifdef CONFIG_HOTPLUG_CPU 164#ifdef CONFIG_HOTPLUG_CPU
163 case CPU_UP_CANCELED: 165 case CPU_UP_CANCELED:
166 case CPU_UP_CANCELED_FROZEN:
164 if (!per_cpu(watchdog_task, hotcpu)) 167 if (!per_cpu(watchdog_task, hotcpu))
165 break; 168 break;
166 /* Unbind so it can run. Fall thru. */ 169 /* Unbind so it can run. Fall thru. */
167 kthread_bind(per_cpu(watchdog_task, hotcpu), 170 kthread_bind(per_cpu(watchdog_task, hotcpu),
168 any_online_cpu(cpu_online_map)); 171 any_online_cpu(cpu_online_map));
169 case CPU_DEAD: 172 case CPU_DEAD:
173 case CPU_DEAD_FROZEN:
170 p = per_cpu(watchdog_task, hotcpu); 174 p = per_cpu(watchdog_task, hotcpu);
171 per_cpu(watchdog_task, hotcpu) = NULL; 175 per_cpu(watchdog_task, hotcpu) = NULL;
172 kthread_stop(p); 176 kthread_stop(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0742c938dfa7..cdb7e9457ba6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
134 return -ENOENT; 134 return -ENOENT;
135} 135}
136 136
137/**
138 * notifier_call_chain - Informs the registered notifiers about an event.
139 * @nl: Pointer to head of the blocking notifier chain
140 * @val: Value passed unmodified to notifier function
141 * @v: Pointer passed unmodified to notifier function
142 * @nr_to_call: Number of notifier functions to be called. Pass -1
143 * to call every function on the chain.
144 * @nr_calls: Records the number of notifications sent. Pass NULL
145 * if the count is not needed.
146 * @returns: notifier_call_chain returns the value returned by the
147 * last notifier function called.
148 */
149
137static int __kprobes notifier_call_chain(struct notifier_block **nl, 150static int __kprobes notifier_call_chain(struct notifier_block **nl,
138 unsigned long val, void *v) 151 unsigned long val, void *v,
152 int nr_to_call, int *nr_calls)
139{ 153{
140 int ret = NOTIFY_DONE; 154 int ret = NOTIFY_DONE;
141 struct notifier_block *nb, *next_nb; 155 struct notifier_block *nb, *next_nb;
142 156
143 nb = rcu_dereference(*nl); 157 nb = rcu_dereference(*nl);
144 while (nb) { 158
159 while (nb && nr_to_call) {
145 next_nb = rcu_dereference(nb->next); 160 next_nb = rcu_dereference(nb->next);
146 ret = nb->notifier_call(nb, val, v); 161 ret = nb->notifier_call(nb, val, v);
162
163 if (nr_calls)
164 (*nr_calls)++;
165
147 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 166 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
148 break; 167 break;
149 nb = next_nb; 168 nb = next_nb;
169 nr_to_call--;
150 } 170 }
151 return ret; 171 return ret;
152} 172}
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
205EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); 225EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
206 226
207/** 227/**
208 * atomic_notifier_call_chain - Call functions in an atomic notifier chain 228 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
209 * @nh: Pointer to head of the atomic notifier chain 229 * @nh: Pointer to head of the atomic notifier chain
210 * @val: Value passed unmodified to notifier function 230 * @val: Value passed unmodified to notifier function
211 * @v: Pointer passed unmodified to notifier function 231 * @v: Pointer passed unmodified to notifier function
232 * @nr_to_call: See the comment for notifier_call_chain.
233 * @nr_calls: See the comment for notifier_call_chain.
212 * 234 *
213 * Calls each function in a notifier chain in turn. The functions 235 * Calls each function in a notifier chain in turn. The functions
214 * run in an atomic context, so they must not block. 236 * run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
222 * of the last notifier function called. 244 * of the last notifier function called.
223 */ 245 */
224 246
225int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 247int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
226 unsigned long val, void *v) 248 unsigned long val, void *v,
249 int nr_to_call, int *nr_calls)
227{ 250{
228 int ret; 251 int ret;
229 252
230 rcu_read_lock(); 253 rcu_read_lock();
231 ret = notifier_call_chain(&nh->head, val, v); 254 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
232 rcu_read_unlock(); 255 rcu_read_unlock();
233 return ret; 256 return ret;
234} 257}
235 258
236EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 259EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
260
261int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
262 unsigned long val, void *v)
263{
264 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
265}
237 266
267EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
238/* 268/*
239 * Blocking notifier chain routines. All access to the chain is 269 * Blocking notifier chain routines. All access to the chain is
240 * synchronized by an rwsem. 270 * synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
304EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); 334EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
305 335
306/** 336/**
307 * blocking_notifier_call_chain - Call functions in a blocking notifier chain 337 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
308 * @nh: Pointer to head of the blocking notifier chain 338 * @nh: Pointer to head of the blocking notifier chain
309 * @val: Value passed unmodified to notifier function 339 * @val: Value passed unmodified to notifier function
310 * @v: Pointer passed unmodified to notifier function 340 * @v: Pointer passed unmodified to notifier function
341 * @nr_to_call: See comment for notifier_call_chain.
342 * @nr_calls: See comment for notifier_call_chain.
311 * 343 *
312 * Calls each function in a notifier chain in turn. The functions 344 * Calls each function in a notifier chain in turn. The functions
313 * run in a process context, so they are allowed to block. 345 * run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
320 * of the last notifier function called. 352 * of the last notifier function called.
321 */ 353 */
322 354
323int blocking_notifier_call_chain(struct blocking_notifier_head *nh, 355int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
324 unsigned long val, void *v) 356 unsigned long val, void *v,
357 int nr_to_call, int *nr_calls)
325{ 358{
326 int ret = NOTIFY_DONE; 359 int ret = NOTIFY_DONE;
327 360
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
332 */ 365 */
333 if (rcu_dereference(nh->head)) { 366 if (rcu_dereference(nh->head)) {
334 down_read(&nh->rwsem); 367 down_read(&nh->rwsem);
335 ret = notifier_call_chain(&nh->head, val, v); 368 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
369 nr_calls);
336 up_read(&nh->rwsem); 370 up_read(&nh->rwsem);
337 } 371 }
338 return ret; 372 return ret;
339} 373}
374EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
340 375
376int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
377 unsigned long val, void *v)
378{
379 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
380}
341EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); 381EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
342 382
343/* 383/*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
383EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); 423EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
384 424
385/** 425/**
386 * raw_notifier_call_chain - Call functions in a raw notifier chain 426 * __raw_notifier_call_chain - Call functions in a raw notifier chain
387 * @nh: Pointer to head of the raw notifier chain 427 * @nh: Pointer to head of the raw notifier chain
388 * @val: Value passed unmodified to notifier function 428 * @val: Value passed unmodified to notifier function
389 * @v: Pointer passed unmodified to notifier function 429 * @v: Pointer passed unmodified to notifier function
430 * @nr_to_call: See comment for notifier_call_chain.
431 * @nr_calls: See comment for notifier_call_chain
390 * 432 *
391 * Calls each function in a notifier chain in turn. The functions 433 * Calls each function in a notifier chain in turn. The functions
392 * run in an undefined context. 434 * run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
400 * of the last notifier function called. 442 * of the last notifier function called.
401 */ 443 */
402 444
445int __raw_notifier_call_chain(struct raw_notifier_head *nh,
446 unsigned long val, void *v,
447 int nr_to_call, int *nr_calls)
448{
449 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
450}
451
452EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
453
403int raw_notifier_call_chain(struct raw_notifier_head *nh, 454int raw_notifier_call_chain(struct raw_notifier_head *nh,
404 unsigned long val, void *v) 455 unsigned long val, void *v)
405{ 456{
406 return notifier_call_chain(&nh->head, val, v); 457 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
407} 458}
408 459
409EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 460EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
478EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); 529EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
479 530
480/** 531/**
481 * srcu_notifier_call_chain - Call functions in an SRCU notifier chain 532 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
482 * @nh: Pointer to head of the SRCU notifier chain 533 * @nh: Pointer to head of the SRCU notifier chain
483 * @val: Value passed unmodified to notifier function 534 * @val: Value passed unmodified to notifier function
484 * @v: Pointer passed unmodified to notifier function 535 * @v: Pointer passed unmodified to notifier function
536 * @nr_to_call: See comment for notifier_call_chain.
537 * @nr_calls: See comment for notifier_call_chain
485 * 538 *
486 * Calls each function in a notifier chain in turn. The functions 539 * Calls each function in a notifier chain in turn. The functions
487 * run in a process context, so they are allowed to block. 540 * run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
494 * of the last notifier function called. 547 * of the last notifier function called.
495 */ 548 */
496 549
497int srcu_notifier_call_chain(struct srcu_notifier_head *nh, 550int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
498 unsigned long val, void *v) 551 unsigned long val, void *v,
552 int nr_to_call, int *nr_calls)
499{ 553{
500 int ret; 554 int ret;
501 int idx; 555 int idx;
502 556
503 idx = srcu_read_lock(&nh->srcu); 557 idx = srcu_read_lock(&nh->srcu);
504 ret = notifier_call_chain(&nh->head, val, v); 558 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
505 srcu_read_unlock(&nh->srcu, idx); 559 srcu_read_unlock(&nh->srcu, idx);
506 return ret; 560 return ret;
507} 561}
562EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
508 563
564int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
565 unsigned long val, void *v)
566{
567 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
568}
509EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); 569EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
510 570
511/** 571/**
@@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
881#ifdef CONFIG_SOFTWARE_SUSPEND 941#ifdef CONFIG_SOFTWARE_SUSPEND
882 case LINUX_REBOOT_CMD_SW_SUSPEND: 942 case LINUX_REBOOT_CMD_SW_SUSPEND:
883 { 943 {
884 int ret = pm_suspend(PM_SUSPEND_DISK); 944 int ret = hibernate();
885 unlock_kernel(); 945 unlock_kernel();
886 return ret; 946 return ret;
887 } 947 }
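
The notifier core grows counted variants here: __*_notifier_call_chain() stops after nr_to_call callbacks and reports how many actually ran through nr_calls. The point is rollback on partial failure, which is how the CPU hotplug rework elsewhere in this series uses it. A sketch with a made-up chain and event codes:

#include <linux/errno.h>
#include <linux/notifier.h>

static RAW_NOTIFIER_HEAD(demo_chain);

#define DEMO_PREPARE	0x0001	/* hypothetical event codes */
#define DEMO_FAILED	0x0002

static int demo_prepare(void *v)
{
	int nr_calls = 0;
	int ret;

	ret = __raw_notifier_call_chain(&demo_chain, DEMO_PREPARE, v,
					-1, &nr_calls);
	if (ret & NOTIFY_STOP_MASK) {
		/* the failing callback cleans up after itself, so
		 * replay the cancel event only to the ones before it */
		nr_calls--;
		__raw_notifier_call_chain(&demo_chain, DEMO_FAILED, v,
					  nr_calls, NULL);
		return -EAGAIN;
	}
	return 0;
}
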
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0664bd5011c..4073353abd4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,7 @@ extern int sysctl_drop_caches;
77extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect; 79extern int maps_protect;
80extern int sysctl_stat_interval;
80 81
81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
82static int maxolduid = 65535; 83static int maxolduid = 65535;
@@ -857,6 +858,17 @@ static ctl_table vm_table[] = {
857 .extra2 = &one_hundred, 858 .extra2 = &one_hundred,
858 }, 859 },
859#endif 860#endif
861#ifdef CONFIG_SMP
862 {
863 .ctl_name = CTL_UNNUMBERED,
864 .procname = "stat_interval",
865 .data = &sysctl_stat_interval,
866 .maxlen = sizeof(sysctl_stat_interval),
867 .mode = 0644,
868 .proc_handler = &proc_dointvec_jiffies,
869 .strategy = &sysctl_jiffies,
870 },
871#endif
860#if defined(CONFIG_X86_32) || \ 872#if defined(CONFIG_X86_32) || \
861 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 873 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
862 { 874 {
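
The new vm.stat_interval entry pairs proc_dointvec_jiffies with the sysctl_jiffies strategy, so the kernel variable holds jiffies while the proc file is read and written in seconds. A hypothetical userspace check:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/stat_interval", "r+");
	int secs;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &secs) == 1)
		printf("vmstat refresh interval: %d s\n", secs);
	/* slow the per-cpu vmstat refresh down to 10 seconds */
	rewind(f);
	fprintf(f, "10\n");
	fclose(f);
	return 0;
}
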
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db24247..3db5c3c460d7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@ static struct clocksource *watchdog;
74static struct timer_list watchdog_timer; 74static struct timer_list watchdog_timer;
75static DEFINE_SPINLOCK(watchdog_lock); 75static DEFINE_SPINLOCK(watchdog_lock);
76static cycle_t watchdog_last; 76static cycle_t watchdog_last;
77static int watchdog_resumed;
78
77/* 79/*
78 * Interval: 0.5sec Treshold: 0.0625s 80 * Interval: 0.5sec Threshold: 0.0625s
79 */ 81 */
80#define WATCHDOG_INTERVAL (HZ >> 1) 82#define WATCHDOG_INTERVAL (HZ >> 1)
81#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) 83#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
82 84
83static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 85static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
84{ 86{
85 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) 87 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
86 return; 88 return;
87 89
88 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 90 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
98 struct clocksource *cs, *tmp; 100 struct clocksource *cs, *tmp;
99 cycle_t csnow, wdnow; 101 cycle_t csnow, wdnow;
100 int64_t wd_nsec, cs_nsec; 102 int64_t wd_nsec, cs_nsec;
103 int resumed;
101 104
102 spin_lock(&watchdog_lock); 105 spin_lock(&watchdog_lock);
103 106
107 resumed = watchdog_resumed;
108 if (unlikely(resumed))
109 watchdog_resumed = 0;
110
104 wdnow = watchdog->read(); 111 wdnow = watchdog->read();
105 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 112 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
106 watchdog_last = wdnow; 113 watchdog_last = wdnow;
107 114
108 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 115 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
109 csnow = cs->read(); 116 csnow = cs->read();
117
118 if (unlikely(resumed)) {
119 cs->wd_last = csnow;
120 continue;
121 }
122
110 /* Initialized ? */ 123 /* Initialized ? */
111 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 124 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
112 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 125 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
136 } 149 }
137 spin_unlock(&watchdog_lock); 150 spin_unlock(&watchdog_lock);
138} 151}
152static void clocksource_resume_watchdog(void)
153{
154 spin_lock(&watchdog_lock);
155 watchdog_resumed = 1;
156 spin_unlock(&watchdog_lock);
157}
158
139static void clocksource_check_watchdog(struct clocksource *cs) 159static void clocksource_check_watchdog(struct clocksource *cs)
140{ 160{
141 struct clocksource *cse; 161 struct clocksource *cse;
@@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)
182 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 202 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
183 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 203 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
184} 204}
205
206static inline void clocksource_resume_watchdog(void) { }
185#endif 207#endif
186 208
187/** 209/**
210 * clocksource_resume - resume the clocksource(s)
211 */
212void clocksource_resume(void)
213{
214 struct list_head *tmp;
215 unsigned long flags;
216
217 spin_lock_irqsave(&clocksource_lock, flags);
218
219 list_for_each(tmp, &clocksource_list) {
220 struct clocksource *cs;
221
222 cs = list_entry(tmp, struct clocksource, list);
223 if (cs->resume)
224 cs->resume();
225 }
226
227 clocksource_resume_watchdog();
228
229 spin_unlock_irqrestore(&clocksource_lock, flags);
230}
231
232/**
188 * clocksource_get_next - Returns the selected clocksource 233 * clocksource_get_next - Returns the selected clocksource
189 * 234 *
190 */ 235 */
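
clocksource_resume() above walks every registered clocksource and invokes its optional ->resume hook, then flags the watchdog as resumed so the first post-resume tick only resynchronizes wd_last instead of declaring sources unstable. A hedged driver-side sketch of providing the new hook; my_timer_* and the mmio register are illustrative assumptions, not a real device:

#include <linux/clocksource.h>
#include <linux/io.h>

static void __iomem *my_timer_base;     /* mapped elsewhere (assumption) */

static cycle_t my_timer_read(void)
{
        return (cycle_t)readl(my_timer_base);
}

static void my_timer_resume(void)
{
        /* Restart a counter the firmware stopped during suspend, so the
         * watchdog's first post-resume delta is not garbage. */
        writel(0, my_timer_base);
}

static struct clocksource my_clocksource = {
        .name   = "my_timer",
        .rating = 200,
        .read   = my_timer_read,
        .resume = my_timer_resume,      /* the new optional hook */
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};
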
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b734ca4bc75e..8bbcfb77f7d2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 66#endif
67 SEQ_printf(m, "\n"); 67 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", 68 SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 69 (unsigned long long)ktime_to_ns(timer->expires),
70 (unsigned long long)(ktime_to_ns(timer->expires) - now)); 70 (unsigned long long)(ktime_to_ns(timer->expires) - now));
71} 71}
@@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 111{
112 SEQ_printf(m, " .index: %d\n", 112 SEQ_printf(m, " .index: %d\n",
113 base->index); 113 base->index);
114 SEQ_printf(m, " .resolution: %Ld nsecs\n", 114 SEQ_printf(m, " .resolution: %Lu nsecs\n",
115 (unsigned long long)ktime_to_ns(base->resolution)); 115 (unsigned long long)ktime_to_ns(base->resolution));
116 SEQ_printf(m, " .get_time: "); 116 SEQ_printf(m, " .get_time: ");
117 print_name_offset(m, base->get_time); 117 print_name_offset(m, base->get_time);
118 SEQ_printf(m, "\n"); 118 SEQ_printf(m, "\n");
119#ifdef CONFIG_HIGH_RES_TIMERS 119#ifdef CONFIG_HIGH_RES_TIMERS
120 SEQ_printf(m, " .offset: %Ld nsecs\n", 120 SEQ_printf(m, " .offset: %Lu nsecs\n",
121 ktime_to_ns(base->offset)); 121 (unsigned long long) ktime_to_ns(base->offset));
122#endif 122#endif
123 SEQ_printf(m, "active timers:\n"); 123 SEQ_printf(m, "active timers:\n");
124 print_active_timers(m, base, now); 124 print_active_timers(m, base, now);
@@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
135 print_base(m, cpu_base->clock_base + i, now); 135 print_base(m, cpu_base->clock_base + i, now);
136 } 136 }
137#define P(x) \ 137#define P(x) \
138 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) 138 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
139 (unsigned long long)(cpu_base->x))
139#define P_ns(x) \ 140#define P_ns(x) \
140 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 141 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
141 (u64)(ktime_to_ns(cpu_base->x))) 142 (unsigned long long)(ktime_to_ns(cpu_base->x)))
142 143
143#ifdef CONFIG_HIGH_RES_TIMERS 144#ifdef CONFIG_HIGH_RES_TIMERS
144 P_ns(expires_next); 145 P_ns(expires_next);
@@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 151
151#ifdef CONFIG_TICK_ONESHOT 152#ifdef CONFIG_TICK_ONESHOT
152# define P(x) \ 153# define P(x) \
153 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) 154 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
155 (unsigned long long)(ts->x))
154# define P_ns(x) \ 156# define P_ns(x) \
155 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 157 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
156 (u64)(ktime_to_ns(ts->x))) 158 (unsigned long long)(ktime_to_ns(ts->x)))
157 { 159 {
158 struct tick_sched *ts = tick_get_tick_sched(cpu); 160 struct tick_sched *ts = tick_get_tick_sched(cpu);
159 P(nohz_mode); 161 P(nohz_mode);
@@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 P(last_jiffies); 169 P(last_jiffies);
168 P(next_jiffies); 170 P(next_jiffies);
169 P_ns(idle_expires); 171 P_ns(idle_expires);
170 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); 172 SEQ_printf(m, "jiffies: %Lu\n",
173 (unsigned long long)jiffies);
171 } 174 }
172#endif 175#endif
173 176
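
All of the format-string churn above follows one portability rule: ktime_to_ns() and friends yield u64, which is 'long' on some 64-bit architectures and 'long long' elsewhere, so the value must be cast to (unsigned long long) before a %Lu/%llu conversion (and the conversion made unsigned, since these quantities are). The whole idiom in one line, as a sketch:

#include <linux/kernel.h>
#include <linux/types.h>

static void print_ns(const char *tag, u64 ns)
{
        /* The cast keeps gcc quiet on every arch; %llu matches the cast. */
        printk(KERN_DEBUG "%s: %llu nsecs\n", tag, (unsigned long long)ns);
}
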
diff --git a/kernel/timer.c b/kernel/timer.c
index 7a6448340f90..59a28b1752f8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
92/* Functions below help us manage 'deferrable' flag */ 92/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 93static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
94{ 94{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 95 return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
96} 96}
97 97
98static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 98static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
99{ 99{
100 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 100 return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
101} 101}
102 102
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = (tvec_base_t *)((unsigned long)timer->base |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG);
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)new_base |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
@@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1293 long cpu = (long)hcpu; 1293 long cpu = (long)hcpu;
1294 switch(action) { 1294 switch(action) {
1295 case CPU_UP_PREPARE: 1295 case CPU_UP_PREPARE:
1296 case CPU_UP_PREPARE_FROZEN:
1296 if (init_timers_cpu(cpu) < 0) 1297 if (init_timers_cpu(cpu) < 0)
1297 return NOTIFY_BAD; 1298 return NOTIFY_BAD;
1298 break; 1299 break;
1299#ifdef CONFIG_HOTPLUG_CPU 1300#ifdef CONFIG_HOTPLUG_CPU
1300 case CPU_DEAD: 1301 case CPU_DEAD:
1302 case CPU_DEAD_FROZEN:
1301 migrate_timers(cpu); 1303 migrate_timers(cpu);
1302 break; 1304 break;
1303#endif 1305#endif
@@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
1497 prev = &curr->next; 1499 prev = &curr->next;
1498 } 1500 }
1499 1501
1502 clocksource_resume();
1503
1500 write_seqlock_irqsave(&xtime_lock, flags); 1504 write_seqlock_irqsave(&xtime_lock, flags);
1501 if (ti == time_interpolator) { 1505 if (ti == time_interpolator) {
1502 /* we lost the best time-interpolator: */ 1506 /* we lost the best time-interpolator: */
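
The cast cleanups above touch the tbase_*() helpers, whose whole trick is to smuggle TBASE_DEFERRABLE_FLAG into the low bit of a tvec_base_t pointer: the base is aligned, so that bit is otherwise always zero. A self-contained user-space sketch of the same pointer-tagging technique (names are illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define TAG_FLAG 0x1UL  /* fits in the alignment slack of the pointer */

static void *tag_set(void *p)   { return (void *)((uintptr_t)p | TAG_FLAG); }
static int   tag_test(void *p)  { return (int)((uintptr_t)p & TAG_FLAG); }
static void *tag_strip(void *p) { return (void *)((uintptr_t)p & ~TAG_FLAG); }

int main(void)
{
        long *obj = malloc(sizeof(*obj));       /* malloc result is aligned */
        void *tagged = tag_set(obj);

        assert(tag_test(tagged));
        assert(tag_strip(tagged) == (void *)obj);
        free(obj);
        return 0;
}
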
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e63085d..fb56fedd5c02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
36/* 36/*
37 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
38 * possible cpu). 38 * possible cpu).
39 *
40 * The sequence counters are for flush_scheduled_work(). It wants to wait
41 * until all currently-scheduled works are completed, but it doesn't
42 * want to be livelocked by new, incoming ones. So it waits until
43 * remove_sequence is >= the insert_sequence which pertained when
44 * flush_scheduled_work() was called.
45 */ 39 */
46struct cpu_workqueue_struct { 40struct cpu_workqueue_struct {
47 41
48 spinlock_t lock; 42 spinlock_t lock;
49 43
50 long remove_sequence; /* Least-recently added (next to run) */
51 long insert_sequence; /* Next to add */
52
53 struct list_head worklist; 44 struct list_head worklist;
54 wait_queue_head_t more_work; 45 wait_queue_head_t more_work;
55 wait_queue_head_t work_done; 46 struct work_struct *current_work;
56 47
57 struct workqueue_struct *wq; 48 struct workqueue_struct *wq;
58 struct task_struct *thread; 49 struct task_struct *thread;
50 int should_stop;
59 51
60 int run_depth; /* Detect run_workqueue() recursion depth */ 52 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
63} ____cacheline_aligned; 53} ____cacheline_aligned;
64 54
65/* 55/*
@@ -68,8 +58,10 @@ struct cpu_workqueue_struct {
68 */ 58 */
69struct workqueue_struct { 59struct workqueue_struct {
70 struct cpu_workqueue_struct *cpu_wq; 60 struct cpu_workqueue_struct *cpu_wq;
61 struct list_head list;
71 const char *name; 62 const char *name;
72 struct list_head list; /* Empty if single thread */ 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */
73}; 65};
74 66
75/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 67/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,106 +69,68 @@ struct workqueue_struct {
77static DEFINE_MUTEX(workqueue_mutex); 69static DEFINE_MUTEX(workqueue_mutex);
78static LIST_HEAD(workqueues); 70static LIST_HEAD(workqueues);
79 71
80static int singlethread_cpu; 72static int singlethread_cpu __read_mostly;
73static cpumask_t cpu_singlethread_map __read_mostly;
74/* optimization, we could use cpu_possible_map */
75static cpumask_t cpu_populated_map __read_mostly;
81 76
82/* If it's single threaded, it isn't in the list of workqueues. */ 77/* If it's single threaded, it isn't in the list of workqueues. */
83static inline int is_single_threaded(struct workqueue_struct *wq) 78static inline int is_single_threaded(struct workqueue_struct *wq)
84{ 79{
85 return list_empty(&wq->list); 80 return wq->singlethread;
81}
82
83static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
84{
85 return is_single_threaded(wq)
86 ? &cpu_singlethread_map : &cpu_populated_map;
87}
88
89static
90struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
91{
92 if (unlikely(is_single_threaded(wq)))
93 cpu = singlethread_cpu;
94 return per_cpu_ptr(wq->cpu_wq, cpu);
86} 95}
87 96
88/* 97/*
89 * Set the workqueue on which a work item is to be run 98 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set 99 * - Must *only* be called if the pending flag is set
91 */ 100 */
92static inline void set_wq_data(struct work_struct *work, void *wq) 101static inline void set_wq_data(struct work_struct *work,
102 struct cpu_workqueue_struct *cwq)
93{ 103{
94 unsigned long new; 104 unsigned long new;
95 105
96 BUG_ON(!work_pending(work)); 106 BUG_ON(!work_pending(work));
97 107
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); 108 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
99 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 109 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
100 atomic_long_set(&work->data, new); 110 atomic_long_set(&work->data, new);
101} 111}
102 112
103static inline void *get_wq_data(struct work_struct *work) 113static inline
114struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
104{ 115{
105 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 116 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
106} 117}
107 118
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) 119static void insert_work(struct cpu_workqueue_struct *cwq,
120 struct work_struct *work, int tail)
109{ 121{
110 int ret = 0; 122 set_wq_data(work, cwq);
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /* 123 /*
115 * We need to re-validate the work info after we've gotten 124 * Ensure that we get the right work->data if we see the
116 * the cpu_workqueue lock. We can run the work now iff: 125 * result of list_add() below, see try_to_grab_pending().
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */ 126 */
129 if (get_wq_data(work) == cwq 127 smp_wmb();
130 && work_pending(work) 128 if (tail)
131 && !list_empty(&work->entry)) { 129 list_add_tail(&work->entry, &cwq->worklist);
132 work_func_t f = work->func; 130 else
133 list_del_init(&work->entry); 131 list_add(&work->entry, &cwq->worklist);
134 spin_unlock_irqrestore(&cwq->lock, flags); 132 wake_up(&cwq->more_work);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178} 133}
179EXPORT_SYMBOL(run_scheduled_work);
180 134
181/* Preempt must be disabled. */ 135/* Preempt must be disabled. */
182static void __queue_work(struct cpu_workqueue_struct *cwq, 136static void __queue_work(struct cpu_workqueue_struct *cwq,
@@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
185 unsigned long flags; 139 unsigned long flags;
186 140
187 spin_lock_irqsave(&cwq->lock, flags); 141 spin_lock_irqsave(&cwq->lock, flags);
188 set_wq_data(work, cwq); 142 insert_work(cwq, work, 1);
189 list_add_tail(&work->entry, &cwq->worklist);
190 cwq->insert_sequence++;
191 wake_up(&cwq->more_work);
192 spin_unlock_irqrestore(&cwq->lock, flags); 143 spin_unlock_irqrestore(&cwq->lock, flags);
193} 144}
194 145
@@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
204 */ 155 */
205int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) 156int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
206{ 157{
207 int ret = 0, cpu = get_cpu(); 158 int ret = 0;
208 159
209 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 160 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
210 if (unlikely(is_single_threaded(wq)))
211 cpu = singlethread_cpu;
212 BUG_ON(!list_empty(&work->entry)); 161 BUG_ON(!list_empty(&work->entry));
213 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 162 __queue_work(wq_per_cpu(wq, get_cpu()), work);
163 put_cpu();
214 ret = 1; 164 ret = 1;
215 } 165 }
216 put_cpu();
217 return ret; 166 return ret;
218} 167}
219EXPORT_SYMBOL_GPL(queue_work); 168EXPORT_SYMBOL_GPL(queue_work);
@@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);
221void delayed_work_timer_fn(unsigned long __data) 170void delayed_work_timer_fn(unsigned long __data)
222{ 171{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 172 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 173 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
225 int cpu = smp_processor_id(); 174 struct workqueue_struct *wq = cwq->wq;
226 175
227 if (unlikely(is_single_threaded(wq))) 176 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
228 cpu = singlethread_cpu;
229
230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
231} 177}
232 178
233/** 179/**
@@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)
241int fastcall queue_delayed_work(struct workqueue_struct *wq, 187int fastcall queue_delayed_work(struct workqueue_struct *wq,
242 struct delayed_work *dwork, unsigned long delay) 188 struct delayed_work *dwork, unsigned long delay)
243{ 189{
244 int ret = 0; 190 timer_stats_timer_set_start_info(&dwork->timer);
245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 timer_stats_timer_set_start_info(timer);
249 if (delay == 0) 191 if (delay == 0)
250 return queue_work(wq, work); 192 return queue_work(wq, &dwork->work);
251
252 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
253 BUG_ON(timer_pending(timer));
254 BUG_ON(!list_empty(&work->entry));
255 193
256 /* This stores wq for the moment, for the timer_fn */ 194 return queue_delayed_work_on(-1, wq, dwork, delay);
257 set_wq_data(work, wq);
258 timer->expires = jiffies + delay;
259 timer->data = (unsigned long)dwork;
260 timer->function = delayed_work_timer_fn;
261 add_timer(timer);
262 ret = 1;
263 }
264 return ret;
265} 195}
266EXPORT_SYMBOL_GPL(queue_delayed_work); 196EXPORT_SYMBOL_GPL(queue_delayed_work);
267 197
@@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
285 BUG_ON(timer_pending(timer)); 215 BUG_ON(timer_pending(timer));
286 BUG_ON(!list_empty(&work->entry)); 216 BUG_ON(!list_empty(&work->entry));
287 217
288 /* This stores wq for the moment, for the timer_fn */ 218 /* This stores cwq for the moment, for the timer_fn */
289 set_wq_data(work, wq); 219 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
290 timer->expires = jiffies + delay; 220 timer->expires = jiffies + delay;
291 timer->data = (unsigned long)dwork; 221 timer->data = (unsigned long)dwork;
292 timer->function = delayed_work_timer_fn; 222 timer->function = delayed_work_timer_fn;
293 add_timer_on(timer, cpu); 223
224 if (unlikely(cpu >= 0))
225 add_timer_on(timer, cpu);
226 else
227 add_timer(timer);
294 ret = 1; 228 ret = 1;
295 } 229 }
296 return ret; 230 return ret;
@@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
299 233
300static void run_workqueue(struct cpu_workqueue_struct *cwq) 234static void run_workqueue(struct cpu_workqueue_struct *cwq)
301{ 235{
302 unsigned long flags; 236 spin_lock_irq(&cwq->lock);
303
304 /*
305 * Keep taking off work from the queue until
306 * done.
307 */
308 spin_lock_irqsave(&cwq->lock, flags);
309 cwq->run_depth++; 237 cwq->run_depth++;
310 if (cwq->run_depth > 3) { 238 if (cwq->run_depth > 3) {
311 /* morton gets to eat his hat */ 239 /* morton gets to eat his hat */
@@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
318 struct work_struct, entry); 246 struct work_struct, entry);
319 work_func_t f = work->func; 247 work_func_t f = work->func;
320 248
249 cwq->current_work = work;
321 list_del_init(cwq->worklist.next); 250 list_del_init(cwq->worklist.next);
322 spin_unlock_irqrestore(&cwq->lock, flags); 251 spin_unlock_irq(&cwq->lock);
323 252
324 BUG_ON(get_wq_data(work) != cwq); 253 BUG_ON(get_wq_data(work) != cwq);
325 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) 254 work_clear_pending(work);
326 work_release(work);
327 f(work); 255 f(work);
328 256
329 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 257 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
337 dump_stack(); 265 dump_stack();
338 } 266 }
339 267
340 spin_lock_irqsave(&cwq->lock, flags); 268 spin_lock_irq(&cwq->lock);
341 cwq->remove_sequence++; 269 cwq->current_work = NULL;
342 wake_up(&cwq->work_done);
343 } 270 }
344 cwq->run_depth--; 271 cwq->run_depth--;
345 spin_unlock_irqrestore(&cwq->lock, flags); 272 spin_unlock_irq(&cwq->lock);
273}
274
275/*
276 * NOTE: the caller must not touch *cwq if this func returns true
277 */
278static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
279{
280 int should_stop = cwq->should_stop;
281
282 if (unlikely(should_stop)) {
283 spin_lock_irq(&cwq->lock);
284 should_stop = cwq->should_stop && list_empty(&cwq->worklist);
285 if (should_stop)
286 cwq->thread = NULL;
287 spin_unlock_irq(&cwq->lock);
288 }
289
290 return should_stop;
346} 291}
347 292
348static int worker_thread(void *__cwq) 293static int worker_thread(void *__cwq)
349{ 294{
350 struct cpu_workqueue_struct *cwq = __cwq; 295 struct cpu_workqueue_struct *cwq = __cwq;
351 DECLARE_WAITQUEUE(wait, current); 296 DEFINE_WAIT(wait);
352 struct k_sigaction sa;
353 sigset_t blocked;
354 297
355 if (!cwq->freezeable) 298 if (!cwq->wq->freezeable)
356 current->flags |= PF_NOFREEZE; 299 current->flags |= PF_NOFREEZE;
357 300
358 set_user_nice(current, -5); 301 set_user_nice(current, -5);
359 302
360 /* Block and flush all signals */ 303 for (;;) {
361 sigfillset(&blocked); 304 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
362 sigprocmask(SIG_BLOCK, &blocked, NULL); 305 if (!freezing(current) && !cwq->should_stop
363 flush_signals(current); 306 && list_empty(&cwq->worklist))
364 307 schedule();
365 /* 308 finish_wait(&cwq->more_work, &wait);
366 * We inherited MPOL_INTERLEAVE from the booting kernel.
367 * Set MPOL_DEFAULT to insure node local allocations.
368 */
369 numa_default_policy();
370
371 /* SIG_IGN makes children autoreap: see do_notify_parent(). */
372 sa.sa.sa_handler = SIG_IGN;
373 sa.sa.sa_flags = 0;
374 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
375 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
376 309
377 set_current_state(TASK_INTERRUPTIBLE); 310 try_to_freeze();
378 while (!kthread_should_stop()) {
379 if (cwq->freezeable)
380 try_to_freeze();
381 311
382 add_wait_queue(&cwq->more_work, &wait); 312 if (cwq_should_stop(cwq))
383 if (list_empty(&cwq->worklist)) 313 break;
384 schedule();
385 else
386 __set_current_state(TASK_RUNNING);
387 remove_wait_queue(&cwq->more_work, &wait);
388 314
389 if (!list_empty(&cwq->worklist)) 315 run_workqueue(cwq);
390 run_workqueue(cwq);
391 set_current_state(TASK_INTERRUPTIBLE);
392 } 316 }
393 __set_current_state(TASK_RUNNING); 317
394 return 0; 318 return 0;
395} 319}
396 320
321struct wq_barrier {
322 struct work_struct work;
323 struct completion done;
324};
325
326static void wq_barrier_func(struct work_struct *work)
327{
328 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
329 complete(&barr->done);
330}
331
332static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
333 struct wq_barrier *barr, int tail)
334{
335 INIT_WORK(&barr->work, wq_barrier_func);
336 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
337
338 init_completion(&barr->done);
339
340 insert_work(cwq, &barr->work, tail);
341}
342
397static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 343static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
398{ 344{
399 if (cwq->thread == current) { 345 if (cwq->thread == current) {
@@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
403 */ 349 */
404 run_workqueue(cwq); 350 run_workqueue(cwq);
405 } else { 351 } else {
406 DEFINE_WAIT(wait); 352 struct wq_barrier barr;
407 long sequence_needed; 353 int active = 0;
408 354
409 spin_lock_irq(&cwq->lock); 355 spin_lock_irq(&cwq->lock);
410 sequence_needed = cwq->insert_sequence; 356 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
411 357 insert_wq_barrier(cwq, &barr, 1);
412 while (sequence_needed - cwq->remove_sequence > 0) { 358 active = 1;
413 prepare_to_wait(&cwq->work_done, &wait,
414 TASK_UNINTERRUPTIBLE);
415 spin_unlock_irq(&cwq->lock);
416 schedule();
417 spin_lock_irq(&cwq->lock);
418 } 359 }
419 finish_wait(&cwq->work_done, &wait);
420 spin_unlock_irq(&cwq->lock); 360 spin_unlock_irq(&cwq->lock);
361
362 if (active)
363 wait_for_completion(&barr.done);
421 } 364 }
422} 365}
423 366
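
The barrier scheme above is what replaces the old insert/remove sequence counters: instead of polling counters, flush queues a dummy work item carrying a completion behind everything already on the list, then sleeps on the completion. A hedged miniature of the idea using only the public workqueue API; my_* names are illustrative:

#include <linux/workqueue.h>
#include <linux/completion.h>

struct my_barrier {
        struct work_struct work;
        struct completion done;
};

static void my_barrier_func(struct work_struct *work)
{
        struct my_barrier *b = container_of(work, struct my_barrier, work);
        complete(&b->done);
}

static void my_flush(struct workqueue_struct *wq)
{
        struct my_barrier b;

        INIT_WORK(&b.work, my_barrier_func);
        init_completion(&b.done);
        queue_work(wq, &b.work);        /* lands behind all earlier entries */
        wait_for_completion(&b.done);   /* everything before it has run */
}
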
@@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
428 * Forces execution of the workqueue and blocks until its completion. 371 * Forces execution of the workqueue and blocks until its completion.
429 * This is typically used in driver shutdown handlers. 372 * This is typically used in driver shutdown handlers.
430 * 373 *
431 * This function will sample each workqueue's current insert_sequence number and 374 * We sleep until all works which were queued on entry have been handled,
432 * will sleep until the head sequence is greater than or equal to that. This 375 * but we are not livelocked by new incoming ones.
433 * means that we sleep until all works which were queued on entry have been
434 * handled, but we are not livelocked by new incoming ones.
435 * 376 *
436 * This function used to run the workqueues itself. Now we just wait for the 377 * This function used to run the workqueues itself. Now we just wait for the
437 * helper threads to do it. 378 * helper threads to do it.
438 */ 379 */
439void fastcall flush_workqueue(struct workqueue_struct *wq) 380void fastcall flush_workqueue(struct workqueue_struct *wq)
440{ 381{
382 const cpumask_t *cpu_map = wq_cpu_map(wq);
383 int cpu;
384
441 might_sleep(); 385 might_sleep();
386 for_each_cpu_mask(cpu, *cpu_map)
387 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
388}
389EXPORT_SYMBOL_GPL(flush_workqueue);
442 390
443 if (is_single_threaded(wq)) { 391/*
444 /* Always use first cpu's area. */ 392 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
445 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); 393 * so this work can't be re-armed in any way.
446 } else { 394 */
447 int cpu; 395static int try_to_grab_pending(struct work_struct *work)
396{
397 struct cpu_workqueue_struct *cwq;
398 int ret = 0;
448 399
449 mutex_lock(&workqueue_mutex); 400 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
450 for_each_online_cpu(cpu) 401 return 1;
451 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 402
452 mutex_unlock(&workqueue_mutex); 403 /*
404 * The queueing is in progress, or it is already queued. Try to
405 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
406 */
407
408 cwq = get_wq_data(work);
409 if (!cwq)
410 return ret;
411
412 spin_lock_irq(&cwq->lock);
413 if (!list_empty(&work->entry)) {
414 /*
415 * This work is queued, but perhaps we locked the wrong cwq.
416 * In that case we must see the new value after rmb(), see
417 * insert_work()->wmb().
418 */
419 smp_rmb();
420 if (cwq == get_wq_data(work)) {
421 list_del_init(&work->entry);
422 ret = 1;
423 }
453 } 424 }
425 spin_unlock_irq(&cwq->lock);
426
427 return ret;
454} 428}
455EXPORT_SYMBOL_GPL(flush_workqueue);
456 429
457static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 430static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
458 int cpu, int freezeable) 431 struct work_struct *work)
459{ 432{
460 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 433 struct wq_barrier barr;
461 struct task_struct *p; 434 int running = 0;
462 435
463 spin_lock_init(&cwq->lock); 436 spin_lock_irq(&cwq->lock);
464 cwq->wq = wq; 437 if (unlikely(cwq->current_work == work)) {
465 cwq->thread = NULL; 438 insert_wq_barrier(cwq, &barr, 0);
466 cwq->insert_sequence = 0; 439 running = 1;
467 cwq->remove_sequence = 0; 440 }
468 cwq->freezeable = freezeable; 441 spin_unlock_irq(&cwq->lock);
469 INIT_LIST_HEAD(&cwq->worklist);
470 init_waitqueue_head(&cwq->more_work);
471 init_waitqueue_head(&cwq->work_done);
472 442
473 if (is_single_threaded(wq)) 443 if (unlikely(running))
474 p = kthread_create(worker_thread, cwq, "%s", wq->name); 444 wait_for_completion(&barr.done);
475 else
476 p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
477 if (IS_ERR(p))
478 return NULL;
479 cwq->thread = p;
480 return p;
481} 445}
482 446
483struct workqueue_struct *__create_workqueue(const char *name, 447static void wait_on_work(struct work_struct *work)
484 int singlethread, int freezeable)
485{ 448{
486 int cpu, destroy = 0; 449 struct cpu_workqueue_struct *cwq;
487 struct workqueue_struct *wq; 450 struct workqueue_struct *wq;
488 struct task_struct *p; 451 const cpumask_t *cpu_map;
452 int cpu;
489 453
490 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 454 might_sleep();
491 if (!wq)
492 return NULL;
493 455
494 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 456 cwq = get_wq_data(work);
495 if (!wq->cpu_wq) { 457 if (!cwq)
496 kfree(wq); 458 return;
497 return NULL;
498 }
499 459
500 wq->name = name; 460 wq = cwq->wq;
501 mutex_lock(&workqueue_mutex); 461 cpu_map = wq_cpu_map(wq);
502 if (singlethread) {
503 INIT_LIST_HEAD(&wq->list);
504 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
505 if (!p)
506 destroy = 1;
507 else
508 wake_up_process(p);
509 } else {
510 list_add(&wq->list, &workqueues);
511 for_each_online_cpu(cpu) {
512 p = create_workqueue_thread(wq, cpu, freezeable);
513 if (p) {
514 kthread_bind(p, cpu);
515 wake_up_process(p);
516 } else
517 destroy = 1;
518 }
519 }
520 mutex_unlock(&workqueue_mutex);
521 462
522 /* 463 for_each_cpu_mask(cpu, *cpu_map)
523 * Was there any error during startup? If yes then clean up: 464 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
524 */
525 if (destroy) {
526 destroy_workqueue(wq);
527 wq = NULL;
528 }
529 return wq;
530} 465}
531EXPORT_SYMBOL_GPL(__create_workqueue);
532 466
533static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) 467/**
468 * cancel_work_sync - block until a work_struct's callback has terminated
469 * @work: the work which is to be flushed
470 *
471 * cancel_work_sync() will cancel the work if it is queued. If the work's
472 * callback appears to be running, cancel_work_sync() will block until it
473 * has completed.
474 *
475 * It is possible to use this function if the work re-queues itself. It can
476 * cancel the work even if it migrates to another workqueue, however in that
477 * case it only guarantees that work->func() has completed on the last queued
478 * workqueue.
479 *
480 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
481 * pending, otherwise it goes into a busy-wait loop until the timer expires.
482 *
483 * The caller must ensure that workqueue_struct on which this work was last
484 * queued can't be destroyed before this function returns.
485 */
486void cancel_work_sync(struct work_struct *work)
534{ 487{
535 struct cpu_workqueue_struct *cwq; 488 while (!try_to_grab_pending(work))
536 unsigned long flags; 489 cpu_relax();
537 struct task_struct *p; 490 wait_on_work(work);
538 491 work_clear_pending(work);
539 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
540 spin_lock_irqsave(&cwq->lock, flags);
541 p = cwq->thread;
542 cwq->thread = NULL;
543 spin_unlock_irqrestore(&cwq->lock, flags);
544 if (p)
545 kthread_stop(p);
546} 492}
493EXPORT_SYMBOL_GPL(cancel_work_sync);
547 494
548/** 495/**
549 * destroy_workqueue - safely terminate a workqueue 496 * cancel_rearming_delayed_work - reliably kill off a delayed work.
550 * @wq: target workqueue 497 * @dwork: the delayed work struct
551 * 498 *
552 * Safely destroy a workqueue. All work currently pending will be done first. 499 * It is possible to use this function if @dwork rearms itself via queue_work()
500 * or queue_delayed_work(). See also the comment for cancel_work_sync().
553 */ 501 */
554void destroy_workqueue(struct workqueue_struct *wq) 502void cancel_rearming_delayed_work(struct delayed_work *dwork)
555{ 503{
556 int cpu; 504 while (!del_timer(&dwork->timer) &&
557 505 !try_to_grab_pending(&dwork->work))
558 flush_workqueue(wq); 506 cpu_relax();
559 507 wait_on_work(&dwork->work);
560 /* We don't need the distraction of CPUs appearing and vanishing. */ 508 work_clear_pending(&dwork->work);
561 mutex_lock(&workqueue_mutex);
562 if (is_single_threaded(wq))
563 cleanup_workqueue_thread(wq, singlethread_cpu);
564 else {
565 for_each_online_cpu(cpu)
566 cleanup_workqueue_thread(wq, cpu);
567 list_del(&wq->list);
568 }
569 mutex_unlock(&workqueue_mutex);
570 free_percpu(wq->cpu_wq);
571 kfree(wq);
572} 509}
573EXPORT_SYMBOL_GPL(destroy_workqueue); 510EXPORT_SYMBOL(cancel_rearming_delayed_work);
574 511
575static struct workqueue_struct *keventd_wq; 512static struct workqueue_struct *keventd_wq __read_mostly;
576 513
577/** 514/**
578 * schedule_work - put work task in global workqueue 515 * schedule_work - put work task in global workqueue
@@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)
638 if (!works) 575 if (!works)
639 return -ENOMEM; 576 return -ENOMEM;
640 577
641 mutex_lock(&workqueue_mutex); 578 preempt_disable(); /* CPU hotplug */
642 for_each_online_cpu(cpu) { 579 for_each_online_cpu(cpu) {
643 struct work_struct *work = per_cpu_ptr(works, cpu); 580 struct work_struct *work = per_cpu_ptr(works, cpu);
644 581
@@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)
646 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 583 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
647 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 584 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
648 } 585 }
649 mutex_unlock(&workqueue_mutex); 586 preempt_enable();
650 flush_workqueue(keventd_wq); 587 flush_workqueue(keventd_wq);
651 free_percpu(works); 588 free_percpu(works);
652 return 0; 589 return 0;
@@ -659,29 +596,6 @@ void flush_scheduled_work(void)
659EXPORT_SYMBOL(flush_scheduled_work); 596EXPORT_SYMBOL(flush_scheduled_work);
660 597
661/** 598/**
662 * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
663 * @wq: the controlling workqueue structure
664 * @dwork: the delayed work struct
665 */
666void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
667 struct delayed_work *dwork)
668{
669 while (!cancel_delayed_work(dwork))
670 flush_workqueue(wq);
671}
672EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
673
674/**
675 * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
676 * @dwork: the delayed work struct
677 */
678void cancel_rearming_delayed_work(struct delayed_work *dwork)
679{
680 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
681}
682EXPORT_SYMBOL(cancel_rearming_delayed_work);
683
684/**
685 * execute_in_process_context - reliably execute the routine with user context 599 * execute_in_process_context - reliably execute the routine with user context
686 * @fn: the function to execute 600 * @fn: the function to execute
687 * @ew: guaranteed storage for the execute work structure (must 601 * @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@ int current_is_keventd(void)
728 642
729} 643}
730 644
731/* Take the work from this (downed) CPU. */ 645static struct cpu_workqueue_struct *
732static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 646init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
733{ 647{
734 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 648 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
735 struct list_head list;
736 struct work_struct *work;
737 649
738 spin_lock_irq(&cwq->lock); 650 cwq->wq = wq;
739 list_replace_init(&cwq->worklist, &list); 651 spin_lock_init(&cwq->lock);
652 INIT_LIST_HEAD(&cwq->worklist);
653 init_waitqueue_head(&cwq->more_work);
654
655 return cwq;
656}
657
658static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
659{
660 struct workqueue_struct *wq = cwq->wq;
661 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
662 struct task_struct *p;
663
664 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
665 /*
666 * Nobody can add the work_struct to this cwq,
667 * if (caller is __create_workqueue)
668 * nobody should see this wq
669 * else // caller is CPU_UP_PREPARE
670 * cpu is not on cpu_online_map
671 * so we can abort safely.
672 */
673 if (IS_ERR(p))
674 return PTR_ERR(p);
675
676 cwq->thread = p;
677 cwq->should_stop = 0;
678
679 return 0;
680}
681
682static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
683{
684 struct task_struct *p = cwq->thread;
740 685
741 while (!list_empty(&list)) { 686 if (p != NULL) {
742 printk("Taking work for %s\n", wq->name); 687 if (cpu >= 0)
743 work = list_entry(list.next,struct work_struct,entry); 688 kthread_bind(p, cpu);
744 list_del(&work->entry); 689 wake_up_process(p);
745 __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
746 } 690 }
747 spin_unlock_irq(&cwq->lock);
748} 691}
749 692
750/* We're holding the cpucontrol mutex here */ 693struct workqueue_struct *__create_workqueue(const char *name,
751static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 694 int singlethread, int freezeable)
752 unsigned long action,
753 void *hcpu)
754{ 695{
755 unsigned int hotcpu = (unsigned long)hcpu;
756 struct workqueue_struct *wq; 696 struct workqueue_struct *wq;
697 struct cpu_workqueue_struct *cwq;
698 int err = 0, cpu;
757 699
758 switch (action) { 700 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
759 case CPU_UP_PREPARE: 701 if (!wq)
760 mutex_lock(&workqueue_mutex); 702 return NULL;
761 /* Create a new workqueue thread for it. */
762 list_for_each_entry(wq, &workqueues, list) {
763 if (!create_workqueue_thread(wq, hotcpu, 0)) {
764 printk("workqueue for %i failed\n", hotcpu);
765 return NOTIFY_BAD;
766 }
767 }
768 break;
769 703
770 case CPU_ONLINE: 704 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
771 /* Kick off worker threads. */ 705 if (!wq->cpu_wq) {
772 list_for_each_entry(wq, &workqueues, list) { 706 kfree(wq);
773 struct cpu_workqueue_struct *cwq; 707 return NULL;
708 }
774 709
775 cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); 710 wq->name = name;
776 kthread_bind(cwq->thread, hotcpu); 711 wq->singlethread = singlethread;
777 wake_up_process(cwq->thread); 712 wq->freezeable = freezeable;
778 } 713 INIT_LIST_HEAD(&wq->list);
779 mutex_unlock(&workqueue_mutex);
780 break;
781 714
782 case CPU_UP_CANCELED: 715 if (singlethread) {
783 list_for_each_entry(wq, &workqueues, list) { 716 cwq = init_cpu_workqueue(wq, singlethread_cpu);
784 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) 717 err = create_workqueue_thread(cwq, singlethread_cpu);
718 start_workqueue_thread(cwq, -1);
719 } else {
720 mutex_lock(&workqueue_mutex);
721 list_add(&wq->list, &workqueues);
722
723 for_each_possible_cpu(cpu) {
724 cwq = init_cpu_workqueue(wq, cpu);
725 if (err || !cpu_online(cpu))
785 continue; 726 continue;
786 /* Unbind so it can run. */ 727 err = create_workqueue_thread(cwq, cpu);
787 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 728 start_workqueue_thread(cwq, cpu);
788 any_online_cpu(cpu_online_map));
789 cleanup_workqueue_thread(wq, hotcpu);
790 } 729 }
791 mutex_unlock(&workqueue_mutex); 730 mutex_unlock(&workqueue_mutex);
792 break; 731 }
732
733 if (err) {
734 destroy_workqueue(wq);
735 wq = NULL;
736 }
737 return wq;
738}
739EXPORT_SYMBOL_GPL(__create_workqueue);
740
741static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
742{
743 struct wq_barrier barr;
744 int alive = 0;
745
746 spin_lock_irq(&cwq->lock);
747 if (cwq->thread != NULL) {
748 insert_wq_barrier(cwq, &barr, 1);
749 cwq->should_stop = 1;
750 alive = 1;
751 }
752 spin_unlock_irq(&cwq->lock);
753
754 if (alive) {
755 wait_for_completion(&barr.done);
793 756
794 case CPU_DOWN_PREPARE: 757 while (unlikely(cwq->thread != NULL))
758 cpu_relax();
759 /*
760 * Wait until cwq->thread unlocks cwq->lock,
761 * it won't touch *cwq after that.
762 */
763 smp_rmb();
764 spin_unlock_wait(&cwq->lock);
765 }
766}
767
768/**
769 * destroy_workqueue - safely terminate a workqueue
770 * @wq: target workqueue
771 *
772 * Safely destroy a workqueue. All work currently pending will be done first.
773 */
774void destroy_workqueue(struct workqueue_struct *wq)
775{
776 const cpumask_t *cpu_map = wq_cpu_map(wq);
777 struct cpu_workqueue_struct *cwq;
778 int cpu;
779
780 mutex_lock(&workqueue_mutex);
781 list_del(&wq->list);
782 mutex_unlock(&workqueue_mutex);
783
784 for_each_cpu_mask(cpu, *cpu_map) {
785 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
786 cleanup_workqueue_thread(cwq, cpu);
787 }
788
789 free_percpu(wq->cpu_wq);
790 kfree(wq);
791}
792EXPORT_SYMBOL_GPL(destroy_workqueue);
793
794static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
795 unsigned long action,
796 void *hcpu)
797{
798 unsigned int cpu = (unsigned long)hcpu;
799 struct cpu_workqueue_struct *cwq;
800 struct workqueue_struct *wq;
801
802 action &= ~CPU_TASKS_FROZEN;
803
804 switch (action) {
805 case CPU_LOCK_ACQUIRE:
795 mutex_lock(&workqueue_mutex); 806 mutex_lock(&workqueue_mutex);
796 break; 807 return NOTIFY_OK;
797 808
798 case CPU_DOWN_FAILED: 809 case CPU_LOCK_RELEASE:
799 mutex_unlock(&workqueue_mutex); 810 mutex_unlock(&workqueue_mutex);
800 break; 811 return NOTIFY_OK;
801 812
802 case CPU_DEAD: 813 case CPU_UP_PREPARE:
803 list_for_each_entry(wq, &workqueues, list) 814 cpu_set(cpu, cpu_populated_map);
804 cleanup_workqueue_thread(wq, hotcpu); 815 }
805 list_for_each_entry(wq, &workqueues, list) 816
806 take_over_work(wq, hotcpu); 817 list_for_each_entry(wq, &workqueues, list) {
807 mutex_unlock(&workqueue_mutex); 818 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
808 break; 819
820 switch (action) {
821 case CPU_UP_PREPARE:
822 if (!create_workqueue_thread(cwq, cpu))
823 break;
824 printk(KERN_ERR "workqueue for %i failed\n", cpu);
825 return NOTIFY_BAD;
826
827 case CPU_ONLINE:
828 start_workqueue_thread(cwq, cpu);
829 break;
830
831 case CPU_UP_CANCELED:
832 start_workqueue_thread(cwq, -1);
833 case CPU_DEAD:
834 cleanup_workqueue_thread(cwq, cpu);
835 break;
836 }
809 } 837 }
810 838
811 return NOTIFY_OK; 839 return NOTIFY_OK;
812} 840}
813 841
814void init_workqueues(void) 842void __init init_workqueues(void)
815{ 843{
844 cpu_populated_map = cpu_online_map;
816 singlethread_cpu = first_cpu(cpu_possible_map); 845 singlethread_cpu = first_cpu(cpu_possible_map);
846 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
817 hotcpu_notifier(workqueue_cpu_callback, 0); 847 hotcpu_notifier(workqueue_cpu_callback, 0);
818 keventd_wq = create_workqueue("events"); 848 keventd_wq = create_workqueue("events");
819 BUG_ON(!keventd_wq); 849 BUG_ON(!keventd_wq);
820} 850}
821
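
Taken together, the rework gives drivers a much simpler lifecycle contract: queue from anywhere, re-arm from the handler (work_clear_pending() now runs before the callback), and tear down with cancel_rearming_delayed_work() plus destroy_workqueue(). A hedged end-to-end sketch of that contract; my_* names are illustrative, not from the patch:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *my_wq;
static struct delayed_work my_work;

static void my_work_fn(struct work_struct *work)
{
        /* ... periodic housekeeping ... */
        queue_delayed_work(my_wq, &my_work, HZ);        /* re-arms itself */
}

static int __init my_init(void)
{
        my_wq = create_singlethread_workqueue("mywq");
        if (!my_wq)
                return -ENOMEM;
        INIT_DELAYED_WORK(&my_work, my_work_fn);
        queue_delayed_work(my_wq, &my_work, HZ);
        return 0;
}

static void __exit my_exit(void)
{
        /* Kills the timer and the work; waits if the handler is running. */
        cancel_rearming_delayed_work(&my_work);
        destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
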
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d69ddbe43865..402eb4eb6b23 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1004,7 +1004,7 @@ static int radix_tree_callback(struct notifier_block *nfb,
1004 struct radix_tree_preload *rtp; 1004 struct radix_tree_preload *rtp;
1005 1005
1006 /* Free per-cpu pool of preloaded nodes */ 1006 /* Free per-cpu pool of preloaded nodes */
1007 if (action == CPU_DEAD) { 1007 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1008 rtp = &per_cpu(radix_tree_preloads, cpu); 1008 rtp = &per_cpu(radix_tree_preloads, cpu);
1009 while (rtp->nr) { 1009 while (rtp->nr) {
1010 kmem_cache_free(radix_tree_node_cachep, 1010 kmem_cache_free(radix_tree_node_cachep,
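
The CPU_DEAD_FROZEN case above is part of a tree-wide pattern in this patch: suspend/resume now drives CPU hotplug, and every event gained a *_FROZEN twin delivered during the frozen phase. Callbacks either enumerate both values or strip the flag up front, as workqueue_cpu_callback does earlier. A sketch of both styles; my_* names are illustrative:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nfb,
                           unsigned long action, void *hcpu)
{
        switch (action) {
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:           /* style 1: list both variants */
                /* ... free this cpu's state ... */
                break;
        }
        return NOTIFY_OK;
}

static int my_cpu_callback_masked(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
{
        action &= ~CPU_TASKS_FROZEN;    /* style 2: strip the flag once */
        if (action == CPU_DEAD) {
                /* ... free this cpu's state ... */
        }
        return NOTIFY_OK;
}
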
diff --git a/mm/filemap.c b/mm/filemap.c
index 9cbf4fea4a59..9e56fd158fa3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -750,6 +750,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
750 read_unlock_irq(&mapping->tree_lock); 750 read_unlock_irq(&mapping->tree_lock);
751 return i; 751 return i;
752} 752}
753EXPORT_SYMBOL(find_get_pages_contig);
753 754
754/** 755/**
755 * find_get_pages_tag - find and return pages that match @tag 756 * find_get_pages_tag - find and return pages that match @tag
@@ -778,6 +779,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
778 read_unlock_irq(&mapping->tree_lock); 779 read_unlock_irq(&mapping->tree_lock);
779 return ret; 780 return ret;
780} 781}
782EXPORT_SYMBOL(find_get_pages_tag);
781 783
782/** 784/**
783 * grab_cache_page_nowait - returns locked page at given index in given cache 785 * grab_cache_page_nowait - returns locked page at given index in given cache
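
The two EXPORT_SYMBOL additions make the gang-lookup helpers usable from modules; each returned page carries a reference the caller must drop. A hedged sketch of walking a mapping's dirty pages with the newly exported find_get_pages_tag(), which advances the index past the last page it returned; my_scan_dirty is an illustrative name:

#include <linux/kernel.h>
#include <linux/pagemap.h>

static void my_scan_dirty(struct address_space *mapping)
{
        struct page *pages[16];
        pgoff_t index = 0;
        unsigned int i, nr;

        while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
                                        ARRAY_SIZE(pages), pages)) != 0) {
                for (i = 0; i < nr; i++) {
                        /* ... inspect pages[i] ... */
                        page_cache_release(pages[i]);   /* drop gang ref */
                }
        }
}
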
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb335813ec0..1b49dab9b25d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -434,7 +434,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
434 unsigned blocksize; 434 unsigned blocksize;
435 unsigned length; 435 unsigned length;
436 struct page *page; 436 struct page *page;
437 void *kaddr;
438 437
439 BUG_ON(!mapping->a_ops->get_xip_page); 438 BUG_ON(!mapping->a_ops->get_xip_page);
440 439
@@ -458,11 +457,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
458 else 457 else
459 return PTR_ERR(page); 458 return PTR_ERR(page);
460 } 459 }
461 kaddr = kmap_atomic(page, KM_USER0); 460 zero_user_page(page, offset, length, KM_USER0);
462 memset(kaddr + offset, 0, length);
463 kunmap_atomic(kaddr, KM_USER0);
464
465 flush_dcache_page(page);
466 return 0; 461 return 0;
467} 462}
468EXPORT_SYMBOL_GPL(xip_truncate_page); 463EXPORT_SYMBOL_GPL(xip_truncate_page);
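
zero_user_page() folds the four-step sequence deleted above into one call. For reference, a hedged open-coded equivalent of what the helper is expected to do, under this era's two-argument kmap_atomic(); my_zero_user_page is an illustrative name:

#include <linux/highmem.h>
#include <linux/string.h>

static inline void my_zero_user_page(struct page *page, unsigned int offset,
                                     unsigned int length, enum km_type km)
{
        void *kaddr = kmap_atomic(page, km);    /* short-lived mapping */

        memset(kaddr + offset, 0, length);
        kunmap_atomic(kaddr, km);
        flush_dcache_page(page);        /* keep d-cache coherent with userspace */
}
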
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 36db012b38dd..eb7180db3033 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -140,6 +140,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
140 return page; 140 return page;
141 141
142fail: 142fail:
143 if (vma->vm_flags & VM_MAYSHARE)
144 resv_huge_pages++;
143 spin_unlock(&hugetlb_lock); 145 spin_unlock(&hugetlb_lock);
144 return NULL; 146 return NULL;
145} 147}
@@ -172,6 +174,17 @@ static int __init hugetlb_setup(char *s)
172} 174}
173__setup("hugepages=", hugetlb_setup); 175__setup("hugepages=", hugetlb_setup);
174 176
177static unsigned int cpuset_mems_nr(unsigned int *array)
178{
179 int node;
180 unsigned int nr = 0;
181
182 for_each_node_mask(node, cpuset_current_mems_allowed)
183 nr += array[node];
184
185 return nr;
186}
187
175#ifdef CONFIG_SYSCTL 188#ifdef CONFIG_SYSCTL
176static void update_and_free_page(struct page *page) 189static void update_and_free_page(struct page *page)
177{ 190{
@@ -817,6 +830,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
817 chg = region_chg(&inode->i_mapping->private_list, from, to); 830 chg = region_chg(&inode->i_mapping->private_list, from, to);
818 if (chg < 0) 831 if (chg < 0)
819 return chg; 832 return chg;
833 /*
834 * When cpuset is configured, it breaks the strict hugetlb page
835 * reservation as the accounting is done on a global variable. Such
836 * reservation is completely rubbish in the presence of cpuset because
837 * the reservation is not checked against page availability for the
 838 * current cpuset. An application can still potentially be OOM'ed by
 839 * the kernel for lack of free htlb pages in the cpuset the task is in.
 840 * Attempting to enforce strict accounting with cpusets is almost
 841 * impossible (or too ugly) because cpusets are so fluid that
 842 * tasks or memory nodes can be dynamically moved between cpusets.
843 *
844 * The change of semantics for shared hugetlb mapping with cpuset is
845 * undesirable. However, in order to preserve some of the semantics,
846 * we fall back to check against current free page availability as
847 * a best attempt and hopefully to minimize the impact of changing
848 * semantics that cpuset has.
849 */
850 if (chg > cpuset_mems_nr(free_huge_pages_node))
851 return -ENOMEM;
852
820 ret = hugetlb_acct_memory(chg); 853 ret = hugetlb_acct_memory(chg);
821 if (ret < 0) 854 if (ret < 0)
822 return ret; 855 return ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6fd0b7455b0b..f9b5d6d5f4d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
691 691
692#ifdef CONFIG_NUMA 692#ifdef CONFIG_NUMA
693/* 693/*
694 * Called from the slab reaper to drain pagesets on a particular node that 694 * Called from the vmstat counter updater to drain pagesets of this
695 * belongs to the currently executing processor. 695 * currently executing processor on remote nodes after they have
696 * expired.
697 *
696 * Note that this function must be called with the thread pinned to 698 * Note that this function must be called with the thread pinned to
697 * a single processor. 699 * a single processor.
698 */ 700 */
699void drain_node_pages(int nodeid) 701void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
700{ 702{
701 int i;
702 enum zone_type z;
703 unsigned long flags; 703 unsigned long flags;
704 int to_drain;
704 705
705 for (z = 0; z < MAX_NR_ZONES; z++) { 706 local_irq_save(flags);
706 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 707 if (pcp->count >= pcp->batch)
707 struct per_cpu_pageset *pset; 708 to_drain = pcp->batch;
708 709 else
709 if (!populated_zone(zone)) 710 to_drain = pcp->count;
710 continue; 711 free_pages_bulk(zone, to_drain, &pcp->list, 0);
711 712 pcp->count -= to_drain;
712 pset = zone_pcp(zone, smp_processor_id()); 713 local_irq_restore(flags);
713 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
714 struct per_cpu_pages *pcp;
715
716 pcp = &pset->pcp[i];
717 if (pcp->count) {
718 int to_drain;
719
720 local_irq_save(flags);
721 if (pcp->count >= pcp->batch)
722 to_drain = pcp->batch;
723 else
724 to_drain = pcp->count;
725 free_pages_bulk(zone, to_drain, &pcp->list, 0);
726 pcp->count -= to_drain;
727 local_irq_restore(flags);
728 }
729 }
730 }
731} 714}
732#endif 715#endif
733 716
@@ -2148,11 +2131,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2148 2131
2149 switch (action) { 2132 switch (action) {
2150 case CPU_UP_PREPARE: 2133 case CPU_UP_PREPARE:
2134 case CPU_UP_PREPARE_FROZEN:
2151 if (process_zones(cpu)) 2135 if (process_zones(cpu))
2152 ret = NOTIFY_BAD; 2136 ret = NOTIFY_BAD;
2153 break; 2137 break;
2154 case CPU_UP_CANCELED: 2138 case CPU_UP_CANCELED:
2139 case CPU_UP_CANCELED_FROZEN:
2155 case CPU_DEAD: 2140 case CPU_DEAD:
2141 case CPU_DEAD_FROZEN:
2156 free_zone_pagesets(cpu); 2142 free_zone_pagesets(cpu);
2157 break; 2143 break;
2158 default: 2144 default:
@@ -3012,7 +2998,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
3012{ 2998{
3013 int cpu = (unsigned long)hcpu; 2999 int cpu = (unsigned long)hcpu;
3014 3000
3015 if (action == CPU_DEAD) { 3001 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3016 local_irq_disable(); 3002 local_irq_disable();
3017 __drain_pages(cpu); 3003 __drain_pages(cpu);
3018 vm_events_fold_cpu(cpu); 3004 vm_events_fold_cpu(cpu);
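
drain_zone_pages() above distills the old per-node walk into a single batch-limited drain of one pcp list; the intended caller is the per-cpu vmstat updater, firing once a remote-node pageset has sat idle. A hedged sketch of such a caller — the ->expire countdown and the pcp[0] hot-list layout are assumptions about the companion vmstat change, not guaranteed by this hunk:

#include <linux/mmzone.h>
#include <linux/gfp.h>

/* Hypothetical caller: drain a remote zone's hot pcp list once its
 * idle countdown (->expire, an assumed field) reaches zero. */
static void my_drain_if_idle(struct zone *zone, struct per_cpu_pageset *p)
{
        if (!p->expire)
                return;
        if (--p->expire == 0 && p->pcp[0].count)
                drain_zone_pages(zone, &p->pcp[0]);
}
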
diff --git a/mm/slab.c b/mm/slab.c
index acda7e2d66e4..944b20581f8c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
928{ 928{
929 int node = __get_cpu_var(reap_node); 929 int node = __get_cpu_var(reap_node);
930 930
931 /*
932 * Also drain per cpu pages on remote zones
933 */
934 if (node != numa_node_id())
935 drain_node_pages(node);
936
937 node = next_node(node, node_online_map); 931 node = next_node(node, node_online_map);
938 if (unlikely(node >= MAX_NUMNODES)) 932 if (unlikely(node >= MAX_NUMNODES))
939 node = first_node(node_online_map); 933 node = first_node(node_online_map);
@@ -1186,8 +1180,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1186 int memsize = sizeof(struct kmem_list3); 1180 int memsize = sizeof(struct kmem_list3);
1187 1181
1188 switch (action) { 1182 switch (action) {
1189 case CPU_UP_PREPARE: 1183 case CPU_LOCK_ACQUIRE:
1190 mutex_lock(&cache_chain_mutex); 1184 mutex_lock(&cache_chain_mutex);
1185 break;
1186 case CPU_UP_PREPARE:
1187 case CPU_UP_PREPARE_FROZEN:
1191 /* 1188 /*
1192 * We need to do this right in the beginning since 1189 * We need to do this right in the beginning since
1193 * alloc_arraycache's are going to use this list. 1190 * alloc_arraycache's are going to use this list.
@@ -1274,17 +1271,28 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1274 } 1271 }
1275 break; 1272 break;
1276 case CPU_ONLINE: 1273 case CPU_ONLINE:
1277 mutex_unlock(&cache_chain_mutex); 1274 case CPU_ONLINE_FROZEN:
1278 start_cpu_timer(cpu); 1275 start_cpu_timer(cpu);
1279 break; 1276 break;
1280#ifdef CONFIG_HOTPLUG_CPU 1277#ifdef CONFIG_HOTPLUG_CPU
1281 case CPU_DOWN_PREPARE: 1278 case CPU_DOWN_PREPARE:
1282 mutex_lock(&cache_chain_mutex); 1279 case CPU_DOWN_PREPARE_FROZEN:
1283 break; 1280 /*
1284 case CPU_DOWN_FAILED: 1281 * Shutdown cache reaper. Note that the cache_chain_mutex is
1285 mutex_unlock(&cache_chain_mutex); 1282 * held so that if cache_reap() is invoked it cannot do
1286 break; 1283 * anything expensive but will only modify reap_work
1284 * and reschedule the timer.
1285 */
1286 cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
1287 /* Now the cache_reaper is guaranteed to be not running. */
1288 per_cpu(reap_work, cpu).work.func = NULL;
1289 break;
1290 case CPU_DOWN_FAILED:
1291 case CPU_DOWN_FAILED_FROZEN:
1292 start_cpu_timer(cpu);
1293 break;
1287 case CPU_DEAD: 1294 case CPU_DEAD:
1295 case CPU_DEAD_FROZEN:
1288 /* 1296 /*
1289 * Even if all the cpus of a node are down, we don't free the 1297 * Even if all the cpus of a node are down, we don't free the
1290 * kmem_list3 of any cache. This to avoid a race between 1298 * kmem_list3 of any cache. This to avoid a race between
@@ -1296,6 +1304,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1296 /* fall thru */ 1304 /* fall thru */
1297#endif 1305#endif
1298 case CPU_UP_CANCELED: 1306 case CPU_UP_CANCELED:
1307 case CPU_UP_CANCELED_FROZEN:
1299 list_for_each_entry(cachep, &cache_chain, next) { 1308 list_for_each_entry(cachep, &cache_chain, next) {
1300 struct array_cache *nc; 1309 struct array_cache *nc;
1301 struct array_cache *shared; 1310 struct array_cache *shared;
@@ -1354,6 +1363,8 @@ free_array_cache:
1354 continue; 1363 continue;
1355 drain_freelist(cachep, l3, l3->free_objects); 1364 drain_freelist(cachep, l3, l3->free_objects);
1356 } 1365 }
1366 break;
1367 case CPU_LOCK_RELEASE:
1357 mutex_unlock(&cache_chain_mutex); 1368 mutex_unlock(&cache_chain_mutex);
1358 break; 1369 break;
1359 } 1370 }
@@ -3742,7 +3753,6 @@ EXPORT_SYMBOL(__kmalloc);
3742 3753
3743/** 3754/**
3744 * krealloc - reallocate memory. The contents will remain unchanged. 3755 * krealloc - reallocate memory. The contents will remain unchanged.
3745 *
3746 * @p: object to reallocate memory for. 3756 * @p: object to reallocate memory for.
3747 * @new_size: how many bytes of memory are required. 3757 * @new_size: how many bytes of memory are required.
3748 * @flags: the type of memory to allocate. 3758 * @flags: the type of memory to allocate.
@@ -4140,7 +4150,6 @@ next:
4140 check_irq_on(); 4150 check_irq_on();
4141 mutex_unlock(&cache_chain_mutex); 4151 mutex_unlock(&cache_chain_mutex);
4142 next_reap_node(); 4152 next_reap_node();
4143 refresh_cpu_vm_stats(smp_processor_id());
4144out: 4153out:
4145 /* Set up the next iteration */ 4154 /* Set up the next iteration */
4146 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4155 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
diff --git a/mm/slub.c b/mm/slub.c
index 5db3da5a60bf..bd2efae02bcd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -66,11 +66,11 @@
66 * SLUB assigns one slab for allocation to each processor. 66 * SLUB assigns one slab for allocation to each processor.
67 * Allocations only occur from these slabs called cpu slabs. 67 * Allocations only occur from these slabs called cpu slabs.
68 * 68 *
69 * Slabs with free elements are kept on a partial list. 69 * Slabs with free elements are kept on a partial list and during regular
70 * There is no list for full slabs. If an object in a full slab is 70 * operations no list for full slabs is used. If an object in a full slab is
71 * freed then the slab will show up again on the partial lists. 71 * freed then the slab will show up again on the partial lists.
72 * Otherwise there is no need to track full slabs unless we have to 72 * We track full slabs for debugging purposes though because otherwise we
73 * track full slabs for debugging purposes. 73 * cannot scan all objects.
74 * 74 *
75 * Slabs are freed when they become empty. Teardown and setup is 75 * Slabs are freed when they become empty. Teardown and setup is
76 * minimal so we rely on the page allocators per cpu caches for 76 * minimal so we rely on the page allocators per cpu caches for
@@ -87,13 +87,36 @@
87 * the fast path. 87 * the fast path.
88 */ 88 */
89 89
90static inline int SlabDebug(struct page *page)
91{
92#ifdef CONFIG_SLUB_DEBUG
93 return PageError(page);
94#else
95 return 0;
96#endif
97}
98
99static inline void SetSlabDebug(struct page *page)
100{
101#ifdef CONFIG_SLUB_DEBUG
102 SetPageError(page);
103#endif
104}
105
106static inline void ClearSlabDebug(struct page *page)
107{
108#ifdef CONFIG_SLUB_DEBUG
109 ClearPageError(page);
110#endif
111}
112
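These wrappers replace the direct PageError() tests used further down; with CONFIG_SLUB_DEBUG disabled they collapse to constants, so callers need no conditional compilation of their own. The idiom in isolation, as a runnable sketch with a simplified page struct and an invented flag bit:

#include <stdio.h>

#define CONFIG_SLUB_DEBUG	/* remove this line to get the no-op variant */

struct page { unsigned long flags; };
#define PG_DEBUG (1UL << 0)	/* invented stand-in for the page flag */

static inline int SlabDebug(struct page *page)
{
#ifdef CONFIG_SLUB_DEBUG
	return (page->flags & PG_DEBUG) != 0;
#else
	return 0;	/* the compiler can then drop every debug branch */
#endif
}

int main(void)
{
	struct page p = { PG_DEBUG };

	printf("debug flag: %d\n", SlabDebug(&p));	/* 1 here, 0 if disabled */
	return 0;
}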
90/* 113/*
91 * Issues still to be resolved: 114 * Issues still to be resolved:
92 * 115 *
 93 * - The per cpu array is updated for each new slab and is a remote 116 * - The per cpu array is updated for each new slab and is a remote
 94 * cacheline for most nodes. This could become a bouncing cacheline given 117 * cacheline for most nodes. This could become a bouncing cacheline given
 95 * enough frequent updates. There are 16 pointers in a cacheline.so at 118 * enough frequent updates. There are 16 pointers in a cacheline, so at
 96 * max 16 cpus could compete. Likely okay. 119 * max 16 cpus could compete for the cacheline which may be okay.
97 * 120 *
98 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 121 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
99 * 122 *
@@ -137,6 +160,7 @@
137 160
138#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 161#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
139 SLAB_POISON | SLAB_STORE_USER) 162 SLAB_POISON | SLAB_STORE_USER)
163
140/* 164/*
141 * Set of flags that will prevent slab merging 165 * Set of flags that will prevent slab merging
142 */ 166 */
@@ -157,6 +181,11 @@
157/* Internal SLUB flags */ 181/* Internal SLUB flags */
158#define __OBJECT_POISON 0x80000000 /* Poison object */ 182#define __OBJECT_POISON 0x80000000 /* Poison object */
159 183
184/* Not all arches define cache_line_size */
185#ifndef cache_line_size
186#define cache_line_size() L1_CACHE_BYTES
187#endif
188
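The fallback above is the usual guard-macro idiom: an architecture that provides its own cache_line_size() defines the macro first and wins; everyone else gets the build-time constant. A minimal illustration with an assumed 64-byte default:

#include <stdio.h>

#define L1_CACHE_BYTES 64	/* illustrative compile-time default */

#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
#endif

int main(void)
{
	printf("cache line: %d bytes\n", cache_line_size());
	return 0;
}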
160static int kmem_size = sizeof(struct kmem_cache); 189static int kmem_size = sizeof(struct kmem_cache);
161 190
162#ifdef CONFIG_SMP 191#ifdef CONFIG_SMP
@@ -166,7 +195,7 @@ static struct notifier_block slab_notifier;
166static enum { 195static enum {
167 DOWN, /* No slab functionality available */ 196 DOWN, /* No slab functionality available */
168 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 197 PARTIAL, /* kmem_cache_open() works but kmalloc does not */
169 UP, /* Everything works */ 198 UP, /* Everything works but does not show up in sysfs */
170 SYSFS /* Sysfs up */ 199 SYSFS /* Sysfs up */
171} slab_state = DOWN; 200} slab_state = DOWN;
172 201
@@ -174,7 +203,19 @@ static enum {
174static DECLARE_RWSEM(slub_lock); 203static DECLARE_RWSEM(slub_lock);
175LIST_HEAD(slab_caches); 204LIST_HEAD(slab_caches);
176 205
177#ifdef CONFIG_SYSFS 206/*
207 * Tracking user of a slab.
208 */
209struct track {
210 void *addr; /* Called from address */
211 int cpu; /* Was running on cpu */
212 int pid; /* Pid context */
213 unsigned long when; /* When did the operation occur */
214};
215
216enum track_item { TRACK_ALLOC, TRACK_FREE };
217
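Moving struct track and track_item ahead of the sysfs declarations lets the debug and sysfs code share them. With SLAB_STORE_USER each object carries one track record per event kind, indexed by the enum. A toy version of that addressing, with a hypothetical object_meta container standing in for the per-object metadata area:

#include <stdio.h>
#include <string.h>

struct track {
	void *addr;		/* caller address of the alloc/free */
	int cpu, pid;
	unsigned long when;	/* timestamp (jiffies in the kernel) */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

/* hypothetical per-object metadata holding both records */
struct object_meta { struct track tracks[2]; };

static struct track *get_track(struct object_meta *m, enum track_item alloc)
{
	return &m->tracks[alloc];	/* TRACK_ALLOC = 0, TRACK_FREE = 1 */
}

int main(void)
{
	struct object_meta m;

	memset(&m, 0, sizeof(m));
	get_track(&m, TRACK_FREE)->pid = 42;
	printf("pid of last free: %d\n", m.tracks[TRACK_FREE].pid);
	return 0;
}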
218#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
178static int sysfs_slab_add(struct kmem_cache *); 219static int sysfs_slab_add(struct kmem_cache *);
179static int sysfs_slab_alias(struct kmem_cache *, const char *); 220static int sysfs_slab_alias(struct kmem_cache *, const char *);
180static void sysfs_slab_remove(struct kmem_cache *); 221static void sysfs_slab_remove(struct kmem_cache *);
@@ -202,6 +243,63 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
202#endif 243#endif
203} 244}
204 245
246static inline int check_valid_pointer(struct kmem_cache *s,
247 struct page *page, const void *object)
248{
249 void *base;
250
251 if (!object)
252 return 1;
253
254 base = page_address(page);
255 if (object < base || object >= base + s->objects * s->size ||
256 (object - base) % s->size) {
257 return 0;
258 }
259
260 return 1;
261}
262
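check_valid_pointer moves up unchanged in substance: an object pointer is valid only if it falls inside the slab and lands on an exact object-size stride from the base. The same test as a self-contained program with invented object counts and sizes:

#include <stdio.h>
#include <stddef.h>

static int check_valid(const char *base, size_t objects, size_t size,
		       const char *p)
{
	if (p < base || p >= base + objects * size)
		return 0;			/* outside the slab */
	return (p - base) % size == 0;		/* must sit on an object boundary */
}

int main(void)
{
	char slab[4 * 64];			/* 4 objects of 64 bytes, invented */

	printf("%d\n", check_valid(slab, 4, 64, slab + 128));	/* 1: slot 2 */
	printf("%d\n", check_valid(slab, 4, 64, slab + 130));	/* 0: mid-object */
	return 0;
}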
263/*
264 * Slow version of get and set free pointer.
265 *
266 * This version requires touching the cache lines of kmem_cache which
267 * we avoid to do in the fast alloc free paths. There we obtain the offset
268 * from the page struct.
269 */
270static inline void *get_freepointer(struct kmem_cache *s, void *object)
271{
272 return *(void **)(object + s->offset);
273}
274
275static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
276{
277 *(void **)(object + s->offset) = fp;
278}
279
280/* Loop over all objects in a slab */
281#define for_each_object(__p, __s, __addr) \
282 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
283 __p += (__s)->size)
284
285/* Scan freelist */
286#define for_each_free_object(__p, __s, __free) \
287 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
288
289/* Determine object index from a given position */
290static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
291{
292 return (p - addr) / s->size;
293}
294
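for_each_object walks every slot by stride, for_each_free_object chases the freelist links, and slab_index converts a pointer back to its slot number. A toy slab wired up the same way (all sizes invented):

#include <stdio.h>

#define SIZE	32	/* invented object size */
#define OBJECTS	 4

static void *get_freepointer(void *object)
{
	return *(void **)object;	/* link stored at offset 0 of a free object */
}

int main(void)
{
	char slab[OBJECTS * SIZE];
	void *p;
	int i;

	/* chain every slot into a freelist; the last one ends in NULL */
	for (i = 0; i < OBJECTS; i++)
		*(void **)(slab + i * SIZE) =
			i + 1 < OBJECTS ? (void *)(slab + (i + 1) * SIZE) : NULL;

	for (p = slab; p; p = get_freepointer(p))	/* for_each_free_object */
		printf("free slot %ld\n",
		       (long)(((char *)p - slab) / SIZE));	/* slab_index */
	return 0;
}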
295#ifdef CONFIG_SLUB_DEBUG
296/*
297 * Debug settings:
298 */
299static int slub_debug;
300
301static char *slub_debug_slabs;
302
205/* 303/*
206 * Object debugging 304 * Object debugging
207 */ 305 */
@@ -237,35 +335,6 @@ static void print_section(char *text, u8 *addr, unsigned int length)
237 } 335 }
238} 336}
239 337
240/*
241 * Slow version of get and set free pointer.
242 *
243 * This requires touching the cache lines of kmem_cache.
244 * The offset can also be obtained from the page. In that
245 * case it is in the cacheline that we already need to touch.
246 */
247static void *get_freepointer(struct kmem_cache *s, void *object)
248{
249 return *(void **)(object + s->offset);
250}
251
252static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
253{
254 *(void **)(object + s->offset) = fp;
255}
256
257/*
258 * Tracking user of a slab.
259 */
260struct track {
261 void *addr; /* Called from address */
262 int cpu; /* Was running on cpu */
263 int pid; /* Pid context */
264 unsigned long when; /* When did the operation occur */
265};
266
267enum track_item { TRACK_ALLOC, TRACK_FREE };
268
269static struct track *get_track(struct kmem_cache *s, void *object, 338static struct track *get_track(struct kmem_cache *s, void *object,
270 enum track_item alloc) 339 enum track_item alloc)
271{ 340{
@@ -400,24 +469,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
400 return 1; 469 return 1;
401} 470}
402 471
403
404static int check_valid_pointer(struct kmem_cache *s, struct page *page,
405 void *object)
406{
407 void *base;
408
409 if (!object)
410 return 1;
411
412 base = page_address(page);
413 if (object < base || object >= base + s->objects * s->size ||
414 (object - base) % s->size) {
415 return 0;
416 }
417
418 return 1;
419}
420
421/* 472/*
422 * Object layout: 473 * Object layout:
423 * 474 *
@@ -425,26 +476,34 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page,
425 * Bytes of the object to be managed. 476 * Bytes of the object to be managed.
426 * If the freepointer may overlay the object then the free 477 * If the freepointer may overlay the object then the free
427 * pointer is the first word of the object. 478 * pointer is the first word of the object.
479 *
428 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 480 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
429 * 0xa5 (POISON_END) 481 * 0xa5 (POISON_END)
430 * 482 *
431 * object + s->objsize 483 * object + s->objsize
432 * Padding to reach word boundary. This is also used for Redzoning. 484 * Padding to reach word boundary. This is also used for Redzoning.
433 * Padding is extended to word size if Redzoning is enabled 485 * Padding is extended by another word if Redzoning is enabled and
434 * and objsize == inuse. 486 * objsize == inuse.
487 *
435 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 488 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
436 * 0xcc (RED_ACTIVE) for objects in use. 489 * 0xcc (RED_ACTIVE) for objects in use.
437 * 490 *
438 * object + s->inuse 491 * object + s->inuse
492 * Meta data starts here.
493 *
439 * A. Free pointer (if we cannot overwrite object on free) 494 * A. Free pointer (if we cannot overwrite object on free)
440 * B. Tracking data for SLAB_STORE_USER 495 * B. Tracking data for SLAB_STORE_USER
 441 * C. Padding to reach required alignment boundary 496 * C. Padding to reach required alignment boundary or at minimum
 442 * Padding is done using 0x5a (POISON_INUSE) 497 * one word if debugging is on to be able to detect writes
498 * before the word boundary.
499 *
500 * Padding is done using 0x5a (POISON_INUSE)
443 * 501 *
444 * object + s->size 502 * object + s->size
503 * Nothing is used beyond s->size.
445 * 504 *
446 * If slabcaches are merged then the objsize and inuse boundaries are to 505 * If slabcaches are merged then the objsize and inuse boundaries are mostly
447 * be ignored. And therefore no slab options that rely on these boundaries 506 * ignored. And therefore no slab options that rely on these boundaries
448 * may be used with merged slabcaches. 507 * may be used with merged slabcaches.
449 */ 508 */
450 509
@@ -570,8 +629,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
570 /* 629 /*
 571 * No choice but to zap it and thus lose the remainder 630 * No choice but to zap it and thus lose the remainder
 572 * of the free objects in this slab. May cause 631 * of the free objects in this slab. May cause
 573 * another error because the object count maybe 632 * another error because the object count is now wrong.
 574 * wrong now.
575 */ 633 */
576 set_freepointer(s, p, NULL); 634 set_freepointer(s, p, NULL);
577 return 0; 635 return 0;
@@ -611,9 +669,8 @@ static int check_slab(struct kmem_cache *s, struct page *page)
611} 669}
612 670
613/* 671/*
614 * Determine if a certain object on a page is on the freelist and 672 * Determine if a certain object on a page is on the freelist. Must hold the
615 * therefore free. Must hold the slab lock for cpu slabs to 673 * slab lock to guarantee that the chains are in a consistent state.
616 * guarantee that the chains are consistent.
617 */ 674 */
618static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 675static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
619{ 676{
@@ -659,7 +716,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
659} 716}
660 717
661/* 718/*
662 * Tracking of fully allocated slabs for debugging 719 * Tracking of fully allocated slabs for debugging purposes.
663 */ 720 */
664static void add_full(struct kmem_cache_node *n, struct page *page) 721static void add_full(struct kmem_cache_node *n, struct page *page)
665{ 722{
@@ -710,7 +767,7 @@ bad:
710 /* 767 /*
711 * If this is a slab page then lets do the best we can 768 * If this is a slab page then lets do the best we can
712 * to avoid issues in the future. Marking all objects 769 * to avoid issues in the future. Marking all objects
713 * as used avoids touching the remainder. 770 * as used avoids touching the remaining objects.
714 */ 771 */
715 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", 772 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
716 s->name, page); 773 s->name, page);
@@ -764,6 +821,113 @@ fail:
764 return 0; 821 return 0;
765} 822}
766 823
824static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
825{
826 if (s->flags & SLAB_TRACE) {
827 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
828 s->name,
829 alloc ? "alloc" : "free",
830 object, page->inuse,
831 page->freelist);
832
833 if (!alloc)
834 print_section("Object", (void *)object, s->objsize);
835
836 dump_stack();
837 }
838}
839
840static int __init setup_slub_debug(char *str)
841{
842 if (!str || *str != '=')
843 slub_debug = DEBUG_DEFAULT_FLAGS;
844 else {
845 str++;
846 if (*str == 0 || *str == ',')
847 slub_debug = DEBUG_DEFAULT_FLAGS;
848 else
849 for( ;*str && *str != ','; str++)
850 switch (*str) {
851 case 'f' : case 'F' :
852 slub_debug |= SLAB_DEBUG_FREE;
853 break;
854 case 'z' : case 'Z' :
855 slub_debug |= SLAB_RED_ZONE;
856 break;
857 case 'p' : case 'P' :
858 slub_debug |= SLAB_POISON;
859 break;
860 case 'u' : case 'U' :
861 slub_debug |= SLAB_STORE_USER;
862 break;
863 case 't' : case 'T' :
864 slub_debug |= SLAB_TRACE;
865 break;
866 default:
867 printk(KERN_ERR "slub_debug option '%c' "
868 "unknown. skipped\n",*str);
869 }
870 }
871
872 if (*str == ',')
873 slub_debug_slabs = str + 1;
874 return 1;
875}
876
877__setup("slub_debug", setup_slub_debug);
878
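setup_slub_debug (moved up into the CONFIG_SLUB_DEBUG block) parses the boot option in two parts: letters before an optional comma select debug flags, anything after the comma names the slabs they apply to. A userspace sketch of the same parsing loop with placeholder flag values:

#include <stdio.h>

#define DBG_FREE    1	/* stand-in for SLAB_DEBUG_FREE */
#define DBG_REDZONE 2	/* stand-in for SLAB_RED_ZONE   */
#define DBG_POISON  4	/* stand-in for SLAB_POISON     */

int main(void)
{
	char arg[] = "=ZP,kmalloc-64";	/* as in booting with slub_debug=ZP,kmalloc-64 */
	char *str = arg;
	int flags = 0;

	if (*str == '=')
		str++;
	for (; *str && *str != ','; str++)
		switch (*str) {
		case 'f': case 'F': flags |= DBG_FREE; break;
		case 'z': case 'Z': flags |= DBG_REDZONE; break;
		case 'p': case 'P': flags |= DBG_POISON; break;
		default: printf("option '%c' unknown, skipped\n", *str);
		}
	printf("flags=%d slabs=%s\n", flags, *str == ',' ? str + 1 : "(all)");
	return 0;
}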
879static void kmem_cache_open_debug_check(struct kmem_cache *s)
880{
881 /*
882 * The page->offset field is only 16 bit wide. This is an offset
883 * in units of words from the beginning of an object. If the slab
884 * size is bigger then we cannot move the free pointer behind the
885 * object anymore.
886 *
887 * On 32 bit platforms the limit is 256k. On 64bit platforms
888 * the limit is 512k.
889 *
890 * Debugging or ctor/dtors may create a need to move the free
891 * pointer. Fail if this happens.
892 */
893 if (s->size >= 65535 * sizeof(void *)) {
894 BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
895 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
896 BUG_ON(s->ctor || s->dtor);
897 }
898 else
899 /*
900 * Enable debugging if selected on the kernel commandline.
901 */
902 if (slub_debug && (!slub_debug_slabs ||
903 strncmp(slub_debug_slabs, s->name,
904 strlen(slub_debug_slabs)) == 0))
905 s->flags |= slub_debug;
906}
907#else
908
909static inline int alloc_object_checks(struct kmem_cache *s,
910 struct page *page, void *object) { return 0; }
911
912static inline int free_object_checks(struct kmem_cache *s,
913 struct page *page, void *object) { return 0; }
914
915static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
916static inline void remove_full(struct kmem_cache *s, struct page *page) {}
917static inline void trace(struct kmem_cache *s, struct page *page,
918 void *object, int alloc) {}
919static inline void init_object(struct kmem_cache *s,
920 void *object, int active) {}
921static inline void init_tracking(struct kmem_cache *s, void *object) {}
922static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
923 { return 1; }
924static inline int check_object(struct kmem_cache *s, struct page *page,
925 void *object, int active) { return 1; }
926static inline void set_track(struct kmem_cache *s, void *object,
927 enum track_item alloc, void *addr) {}
928static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
929#define slub_debug 0
930#endif
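kmem_cache_open_debug_check centralizes the 16-bit page->offset constraint: the offset counts words, so the free pointer can only be relocated within 65535 * sizeof(void *) bytes of the object base. A quick standalone check of the arithmetic behind the quoted 256k/512k limits:

#include <stdio.h>

int main(void)
{
	/* page->offset is 16 bits wide and counts words, not bytes */
	printf("32-bit reach: %lu KB\n", 65535UL * 4 / 1024);	/* ~256 KB */
	printf("64-bit reach: %lu KB\n", 65535UL * 8 / 1024);	/* ~512 KB */
	return 0;
}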
767/* 931/*
768 * Slab allocation and freeing 932 * Slab allocation and freeing
769 */ 933 */
@@ -797,7 +961,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
797static void setup_object(struct kmem_cache *s, struct page *page, 961static void setup_object(struct kmem_cache *s, struct page *page,
798 void *object) 962 void *object)
799{ 963{
800 if (PageError(page)) { 964 if (SlabDebug(page)) {
801 init_object(s, object, 0); 965 init_object(s, object, 0);
802 init_tracking(s, object); 966 init_tracking(s, object);
803 } 967 }
@@ -832,7 +996,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
832 page->flags |= 1 << PG_slab; 996 page->flags |= 1 << PG_slab;
833 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 997 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
834 SLAB_STORE_USER | SLAB_TRACE)) 998 SLAB_STORE_USER | SLAB_TRACE))
835 page->flags |= 1 << PG_error; 999 SetSlabDebug(page);
836 1000
837 start = page_address(page); 1001 start = page_address(page);
838 end = start + s->objects * s->size; 1002 end = start + s->objects * s->size;
@@ -841,7 +1005,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
841 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1005 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
842 1006
843 last = start; 1007 last = start;
844 for (p = start + s->size; p < end; p += s->size) { 1008 for_each_object(p, s, start) {
845 setup_object(s, page, last); 1009 setup_object(s, page, last);
846 set_freepointer(s, last, p); 1010 set_freepointer(s, last, p);
847 last = p; 1011 last = p;
@@ -861,13 +1025,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
861{ 1025{
862 int pages = 1 << s->order; 1026 int pages = 1 << s->order;
863 1027
864 if (unlikely(PageError(page) || s->dtor)) { 1028 if (unlikely(SlabDebug(page) || s->dtor)) {
865 void *start = page_address(page);
866 void *end = start + (pages << PAGE_SHIFT);
867 void *p; 1029 void *p;
868 1030
869 slab_pad_check(s, page); 1031 slab_pad_check(s, page);
870 for (p = start; p <= end - s->size; p += s->size) { 1032 for_each_object(p, s, page_address(page)) {
871 if (s->dtor) 1033 if (s->dtor)
872 s->dtor(p, s, 0); 1034 s->dtor(p, s, 0);
873 check_object(s, page, p, 0); 1035 check_object(s, page, p, 0);
@@ -910,7 +1072,8 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
910 1072
911 atomic_long_dec(&n->nr_slabs); 1073 atomic_long_dec(&n->nr_slabs);
912 reset_page_mapcount(page); 1074 reset_page_mapcount(page);
913 page->flags &= ~(1 << PG_slab | 1 << PG_error); 1075 ClearSlabDebug(page);
1076 __ClearPageSlab(page);
914 free_slab(s, page); 1077 free_slab(s, page);
915} 1078}
916 1079
@@ -966,9 +1129,9 @@ static void remove_partial(struct kmem_cache *s,
966} 1129}
967 1130
968/* 1131/*
969 * Lock page and remove it from the partial list 1132 * Lock slab and remove from the partial list.
970 * 1133 *
971 * Must hold list_lock 1134 * Must hold list_lock.
972 */ 1135 */
973static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) 1136static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
974{ 1137{
@@ -981,7 +1144,7 @@ static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
981} 1144}
982 1145
983/* 1146/*
984 * Try to get a partial slab from a specific node 1147 * Try to allocate a partial slab from a specific node.
985 */ 1148 */
986static struct page *get_partial_node(struct kmem_cache_node *n) 1149static struct page *get_partial_node(struct kmem_cache_node *n)
987{ 1150{
@@ -990,7 +1153,8 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
990 /* 1153 /*
991 * Racy check. If we mistakenly see no partial slabs then we 1154 * Racy check. If we mistakenly see no partial slabs then we
992 * just allocate an empty slab. If we mistakenly try to get a 1155 * just allocate an empty slab. If we mistakenly try to get a
993 * partial slab then get_partials() will return NULL. 1156 * partial slab and there is none available then get_partials()
1157 * will return NULL.
994 */ 1158 */
995 if (!n || !n->nr_partial) 1159 if (!n || !n->nr_partial)
996 return NULL; 1160 return NULL;
@@ -1006,8 +1170,7 @@ out:
1006} 1170}
1007 1171
1008/* 1172/*
1009 * Get a page from somewhere. Search in increasing NUMA 1173 * Get a page from somewhere. Search in increasing NUMA distances.
1010 * distances.
1011 */ 1174 */
1012static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1175static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1013{ 1176{
@@ -1017,24 +1180,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1017 struct page *page; 1180 struct page *page;
1018 1181
1019 /* 1182 /*
1020 * The defrag ratio allows to configure the tradeoffs between 1183 * The defrag ratio allows a configuration of the tradeoffs between
1021 * inter node defragmentation and node local allocations. 1184 * inter node defragmentation and node local allocations. A lower
1022 * A lower defrag_ratio increases the tendency to do local 1185 * defrag_ratio increases the tendency to do local allocations
1023 * allocations instead of scanning throught the partial 1186 * instead of attempting to obtain partial slabs from other nodes.
1024 * lists on other nodes.
1025 *
1026 * If defrag_ratio is set to 0 then kmalloc() always
1027 * returns node local objects. If its higher then kmalloc()
1028 * may return off node objects in order to avoid fragmentation.
1029 * 1187 *
1030 * A higher ratio means slabs may be taken from other nodes 1188 * If the defrag_ratio is set to 0 then kmalloc() always
1031 * thus reducing the number of partial slabs on those nodes. 1189 * returns node local objects. If the ratio is higher then kmalloc()
1190 * may return off node objects because partial slabs are obtained
1191 * from other nodes and filled up.
1032 * 1192 *
1033 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1193 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1034 * defrag_ratio = 1000) then every (well almost) allocation 1194 * defrag_ratio = 1000) then every (well almost) allocation will
1035 * will first attempt to defrag slab caches on other nodes. This 1195 * first attempt to defrag slab caches on other nodes. This means
1036 * means scanning over all nodes to look for partial slabs which 1196 * scanning over all nodes to look for partial slabs which may be
1037 * may be a bit expensive to do on every slab allocation. 1197 * expensive if we do it every time we are trying to find a slab
1198 * with available objects.
1038 */ 1199 */
1039 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1200 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1040 return NULL; 1201 return NULL;
@@ -1087,18 +1248,19 @@ static void putback_slab(struct kmem_cache *s, struct page *page)
1087 1248
1088 if (page->freelist) 1249 if (page->freelist)
1089 add_partial(n, page); 1250 add_partial(n, page);
1090 else if (PageError(page) && (s->flags & SLAB_STORE_USER)) 1251 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1091 add_full(n, page); 1252 add_full(n, page);
1092 slab_unlock(page); 1253 slab_unlock(page);
1093 1254
1094 } else { 1255 } else {
1095 if (n->nr_partial < MIN_PARTIAL) { 1256 if (n->nr_partial < MIN_PARTIAL) {
1096 /* 1257 /*
1097 * Adding an empty page to the partial slabs in order 1258 * Adding an empty slab to the partial slabs in order
1098 * to avoid page allocator overhead. This page needs to 1259 * to avoid page allocator overhead. This slab needs
1099 * come after all the others that are not fully empty 1260 * to come after the other slabs with objects in
1100 * in order to make sure that we do maximum 1261 * order to fill them up. That way the size of the
1101 * defragmentation. 1262 * partial list stays small. kmem_cache_shrink can
1263 * reclaim empty slabs from the partial list.
1102 */ 1264 */
1103 add_partial_tail(n, page); 1265 add_partial_tail(n, page);
1104 slab_unlock(page); 1266 slab_unlock(page);
@@ -1166,11 +1328,11 @@ static void flush_all(struct kmem_cache *s)
1166 * 1. The page struct 1328 * 1. The page struct
1167 * 2. The first cacheline of the object to be allocated. 1329 * 2. The first cacheline of the object to be allocated.
1168 * 1330 *
1169 * The only cache lines that are read (apart from code) is the 1331 * The only other cache lines that are read (apart from code) is the
1170 * per cpu array in the kmem_cache struct. 1332 * per cpu array in the kmem_cache struct.
1171 * 1333 *
1172 * Fastpath is not possible if we need to get a new slab or have 1334 * Fastpath is not possible if we need to get a new slab or have
1173 * debugging enabled (which means all slabs are marked with PageError) 1335 * debugging enabled (which means all slabs are marked with SlabDebug)
1174 */ 1336 */
1175static void *slab_alloc(struct kmem_cache *s, 1337static void *slab_alloc(struct kmem_cache *s,
1176 gfp_t gfpflags, int node, void *addr) 1338 gfp_t gfpflags, int node, void *addr)
@@ -1193,7 +1355,7 @@ redo:
1193 object = page->freelist; 1355 object = page->freelist;
1194 if (unlikely(!object)) 1356 if (unlikely(!object))
1195 goto another_slab; 1357 goto another_slab;
1196 if (unlikely(PageError(page))) 1358 if (unlikely(SlabDebug(page)))
1197 goto debug; 1359 goto debug;
1198 1360
1199have_object: 1361have_object:
@@ -1220,9 +1382,11 @@ have_slab:
1220 cpu = smp_processor_id(); 1382 cpu = smp_processor_id();
1221 if (s->cpu_slab[cpu]) { 1383 if (s->cpu_slab[cpu]) {
1222 /* 1384 /*
1223 * Someone else populated the cpu_slab while we enabled 1385 * Someone else populated the cpu_slab while we
1224 * interrupts, or we have got scheduled on another cpu. 1386 * enabled interrupts, or we have gotten scheduled
1225 * The page may not be on the requested node. 1387 * on another cpu. The page may not be on the
1388 * requested node even if __GFP_THISNODE was
1389 * specified. So we need to recheck.
1226 */ 1390 */
1227 if (node == -1 || 1391 if (node == -1 ||
1228 page_to_nid(s->cpu_slab[cpu]) == node) { 1392 page_to_nid(s->cpu_slab[cpu]) == node) {
@@ -1235,7 +1399,7 @@ have_slab:
1235 slab_lock(page); 1399 slab_lock(page);
1236 goto redo; 1400 goto redo;
1237 } 1401 }
1238 /* Dump the current slab */ 1402 /* New slab does not fit our expectations */
1239 flush_slab(s, s->cpu_slab[cpu], cpu); 1403 flush_slab(s, s->cpu_slab[cpu], cpu);
1240 } 1404 }
1241 slab_lock(page); 1405 slab_lock(page);
@@ -1248,12 +1412,7 @@ debug:
1248 goto another_slab; 1412 goto another_slab;
1249 if (s->flags & SLAB_STORE_USER) 1413 if (s->flags & SLAB_STORE_USER)
1250 set_track(s, object, TRACK_ALLOC, addr); 1414 set_track(s, object, TRACK_ALLOC, addr);
1251 if (s->flags & SLAB_TRACE) { 1415 trace(s, page, object, 1);
1252 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
1253 s->name, object, page->inuse,
1254 page->freelist);
1255 dump_stack();
1256 }
1257 init_object(s, object, 1); 1416 init_object(s, object, 1);
1258 goto have_object; 1417 goto have_object;
1259} 1418}
@@ -1276,7 +1435,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
1276 * The fastpath only writes the cacheline of the page struct and the first 1435 * The fastpath only writes the cacheline of the page struct and the first
1277 * cacheline of the object. 1436 * cacheline of the object.
1278 * 1437 *
1279 * No special cachelines need to be read 1438 * We read the cpu_slab cacheline to check if the slab is the per cpu
1439 * slab for this processor.
1280 */ 1440 */
1281static void slab_free(struct kmem_cache *s, struct page *page, 1441static void slab_free(struct kmem_cache *s, struct page *page,
1282 void *x, void *addr) 1442 void *x, void *addr)
@@ -1288,7 +1448,7 @@ static void slab_free(struct kmem_cache *s, struct page *page,
1288 local_irq_save(flags); 1448 local_irq_save(flags);
1289 slab_lock(page); 1449 slab_lock(page);
1290 1450
1291 if (unlikely(PageError(page))) 1451 if (unlikely(SlabDebug(page)))
1292 goto debug; 1452 goto debug;
1293checks_ok: 1453checks_ok:
1294 prior = object[page->offset] = page->freelist; 1454 prior = object[page->offset] = page->freelist;
@@ -1321,7 +1481,7 @@ out_unlock:
1321slab_empty: 1481slab_empty:
1322 if (prior) 1482 if (prior)
1323 /* 1483 /*
1324 * Slab on the partial list. 1484 * Slab still on the partial list.
1325 */ 1485 */
1326 remove_partial(s, page); 1486 remove_partial(s, page);
1327 1487
@@ -1337,13 +1497,7 @@ debug:
1337 remove_full(s, page); 1497 remove_full(s, page);
1338 if (s->flags & SLAB_STORE_USER) 1498 if (s->flags & SLAB_STORE_USER)
1339 set_track(s, x, TRACK_FREE, addr); 1499 set_track(s, x, TRACK_FREE, addr);
1340 if (s->flags & SLAB_TRACE) { 1500 trace(s, page, object, 0);
1341 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
1342 s->name, object, page->inuse,
1343 page->freelist);
1344 print_section("Object", (void *)object, s->objsize);
1345 dump_stack();
1346 }
1347 init_object(s, object, 0); 1501 init_object(s, object, 0);
1348 goto checks_ok; 1502 goto checks_ok;
1349} 1503}
@@ -1370,22 +1524,16 @@ static struct page *get_object_page(const void *x)
1370} 1524}
1371 1525
1372/* 1526/*
1373 * kmem_cache_open produces objects aligned at "size" and the first object 1527 * Object placement in a slab is made very easy because we always start at
1374 * is placed at offset 0 in the slab (We have no metainformation on the 1528 * offset 0. If we tune the size of the object to the alignment then we can
1375 * slab, all slabs are in essence "off slab"). 1529 * get the required alignment by putting one properly sized object after
1376 * 1530 * another.
1377 * In order to get the desired alignment one just needs to align the
1378 * size.
1379 * 1531 *
1380 * Notice that the allocation order determines the sizes of the per cpu 1532 * Notice that the allocation order determines the sizes of the per cpu
1381 * caches. Each processor has always one slab available for allocations. 1533 * caches. Each processor has always one slab available for allocations.
1382 * Increasing the allocation order reduces the number of times that slabs 1534 * Increasing the allocation order reduces the number of times that slabs
1383 * must be moved on and off the partial lists and therefore may influence 1535 * must be moved on and off the partial lists and is therefore a factor in
1384 * locking overhead. 1536 * locking overhead.
1385 *
1386 * The offset is used to relocate the free list link in each object. It is
1387 * therefore possible to move the free list link behind the object. This
1388 * is necessary for RCU to work properly and also useful for debugging.
1389 */ 1537 */
1390 1538
1391/* 1539/*
@@ -1396,76 +1544,110 @@ static struct page *get_object_page(const void *x)
1396 */ 1544 */
1397static int slub_min_order; 1545static int slub_min_order;
1398static int slub_max_order = DEFAULT_MAX_ORDER; 1546static int slub_max_order = DEFAULT_MAX_ORDER;
1399
1400/*
1401 * Minimum number of objects per slab. This is necessary in order to
1402 * reduce locking overhead. Similar to the queue size in SLAB.
1403 */
1404static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1547static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1405 1548
1406/* 1549/*
1407 * Merge control. If this is set then no merging of slab caches will occur. 1550 * Merge control. If this is set then no merging of slab caches will occur.
1551 * (Could be removed. This was introduced to pacify the merge skeptics.)
1408 */ 1552 */
1409static int slub_nomerge; 1553static int slub_nomerge;
1410 1554
1411/* 1555/*
1412 * Debug settings:
1413 */
1414static int slub_debug;
1415
1416static char *slub_debug_slabs;
1417
1418/*
 1419 * Calculate the order of allocation given a slab object size. 1556 * Calculate the order of allocation given a slab object size.
1420 * 1557 *
1421 * The order of allocation has significant impact on other elements 1558 * The order of allocation has significant impact on performance and other
1422 * of the system. Generally order 0 allocations should be preferred 1559 * system components. Generally order 0 allocations should be preferred since
1423 * since they do not cause fragmentation in the page allocator. Larger 1560 * order 0 does not cause fragmentation in the page allocator. Larger objects
 1424 * objects may have problems with order 0 because there may be too much 1561 * can be problematic to put into order 0 slabs because there may be too much
1425 * space left unused in a slab. We go to a higher order if more than 1/8th 1562 * unused space left. We go to a higher order if more than 1/8th of the slab
1426 * of the slab would be wasted. 1563 * would be wasted.
1427 * 1564 *
1428 * In order to reach satisfactory performance we must ensure that 1565 * In order to reach satisfactory performance we must ensure that a minimum
1429 * a minimum number of objects is in one slab. Otherwise we may 1566 * number of objects is in one slab. Otherwise we may generate too much
1430 * generate too much activity on the partial lists. This is less a 1567 * activity on the partial lists which requires taking the list_lock. This is
1431 * concern for large slabs though. slub_max_order specifies the order 1568 * less a concern for large slabs though which are rarely used.
1432 * where we begin to stop considering the number of objects in a slab.
1433 * 1569 *
1434 * Higher order allocations also allow the placement of more objects 1570 * slub_max_order specifies the order where we begin to stop considering the
1435 * in a slab and thereby reduce object handling overhead. If the user 1571 * number of objects in a slab as critical. If we reach slub_max_order then
1436 * has requested a higher mininum order then we start with that one 1572 * we try to keep the page order as low as possible. So we accept more waste
1437 * instead of zero. 1573 * of space in favor of a small page order.
1574 *
1575 * Higher order allocations also allow the placement of more objects in a
1576 * slab and thereby reduce object handling overhead. If the user has
 1577 * requested a higher minimum order then we start with that one instead of
1578 * the smallest order which will fit the object.
1438 */ 1579 */
1439static int calculate_order(int size) 1580static inline int slab_order(int size, int min_objects,
1581 int max_order, int fract_leftover)
1440{ 1582{
1441 int order; 1583 int order;
1442 int rem; 1584 int rem;
1443 1585
1444 for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); 1586 for (order = max(slub_min_order,
1445 order < MAX_ORDER; order++) { 1587 fls(min_objects * size - 1) - PAGE_SHIFT);
1446 unsigned long slab_size = PAGE_SIZE << order; 1588 order <= max_order; order++) {
1447 1589
1448 if (slub_max_order > order && 1590 unsigned long slab_size = PAGE_SIZE << order;
1449 slab_size < slub_min_objects * size)
1450 continue;
1451 1591
1452 if (slab_size < size) 1592 if (slab_size < min_objects * size)
1453 continue; 1593 continue;
1454 1594
1455 rem = slab_size % size; 1595 rem = slab_size % size;
1456 1596
1457 if (rem <= (PAGE_SIZE << order) / 8) 1597 if (rem <= slab_size / fract_leftover)
1458 break; 1598 break;
1459 1599
1460 } 1600 }
1461 if (order >= MAX_ORDER) 1601
1462 return -E2BIG;
1463 return order; 1602 return order;
1464} 1603}
1465 1604
1605static inline int calculate_order(int size)
1606{
1607 int order;
1608 int min_objects;
1609 int fraction;
1610
1611 /*
1612 * Attempt to find best configuration for a slab. This
1613 * works by first attempting to generate a layout with
1614 * the best configuration and backing off gradually.
1615 *
1616 * First we reduce the acceptable waste in a slab. Then
1617 * we reduce the minimum objects required in a slab.
1618 */
1619 min_objects = slub_min_objects;
1620 while (min_objects > 1) {
1621 fraction = 8;
1622 while (fraction >= 4) {
1623 order = slab_order(size, min_objects,
1624 slub_max_order, fraction);
1625 if (order <= slub_max_order)
1626 return order;
1627 fraction /= 2;
1628 }
1629 min_objects /= 2;
1630 }
1631
1632 /*
1633 * We were unable to place multiple objects in a slab. Now
1634 * lets see if we can place a single object there.
1635 */
1636 order = slab_order(size, 1, slub_max_order, 1);
1637 if (order <= slub_max_order)
1638 return order;
1639
1640 /*
1641 * Doh this slab cannot be placed using slub_max_order.
1642 */
1643 order = slab_order(size, 1, MAX_ORDER, 1);
1644 if (order <= MAX_ORDER)
1645 return order;
1646 return -ENOSYS;
1647}
1648
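slab_order is now a parameterized search and calculate_order drives it with a back-off: first relax the acceptable waste (1/8th, then 1/4th of the slab), then halve the required object count, and finally accept a single object per slab. A userspace re-creation of the inner search; PAGE_SHIFT and the sample numbers are illustrative assumptions:

#include <stdio.h>

#define PAGE_SHIFT 12			/* typical 4 KB pages, illustrative */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int fls(unsigned long x)		/* position of highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static int slab_order(int size, int min_objects, int max_order, int fract)
{
	int order = fls((unsigned long)min_objects * size - 1) - PAGE_SHIFT;

	if (order < 0)
		order = 0;
	for (; order <= max_order; order++) {
		unsigned long slab_size = PAGE_SIZE << order;

		if (slab_size < (unsigned long)min_objects * size)
			continue;		/* too few objects fit */
		if (slab_size % size <= slab_size / fract)
			break;			/* waste is acceptable */
	}
	return order;				/* > max_order means: no fit */
}

int main(void)
{
	/* 700-byte objects, at least 8 per slab, at most 1/8 wasted */
	printf("chosen order: %d\n", slab_order(700, 8, 3, 8));
	return 0;
}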
1466/* 1649/*
1467 * Function to figure out which alignment to use from the 1650 * Figure out what the alignment of the objects will be.
1468 * various ways of specifying it.
1469 */ 1651 */
1470static unsigned long calculate_alignment(unsigned long flags, 1652static unsigned long calculate_alignment(unsigned long flags,
1471 unsigned long align, unsigned long size) 1653 unsigned long align, unsigned long size)
@@ -1480,8 +1662,8 @@ static unsigned long calculate_alignment(unsigned long flags,
1480 * then use it. 1662 * then use it.
1481 */ 1663 */
1482 if ((flags & SLAB_HWCACHE_ALIGN) && 1664 if ((flags & SLAB_HWCACHE_ALIGN) &&
1483 size > L1_CACHE_BYTES / 2) 1665 size > cache_line_size() / 2)
1484 return max_t(unsigned long, align, L1_CACHE_BYTES); 1666 return max_t(unsigned long, align, cache_line_size());
1485 1667
1486 if (align < ARCH_SLAB_MINALIGN) 1668 if (align < ARCH_SLAB_MINALIGN)
1487 return ARCH_SLAB_MINALIGN; 1669 return ARCH_SLAB_MINALIGN;
@@ -1619,22 +1801,23 @@ static int calculate_sizes(struct kmem_cache *s)
1619 */ 1801 */
1620 size = ALIGN(size, sizeof(void *)); 1802 size = ALIGN(size, sizeof(void *));
1621 1803
1804#ifdef CONFIG_SLUB_DEBUG
1622 /* 1805 /*
1623 * If we are redzoning then check if there is some space between the 1806 * If we are Redzoning then check if there is some space between the
1624 * end of the object and the free pointer. If not then add an 1807 * end of the object and the free pointer. If not then add an
1625 * additional word, so that we can establish a redzone between 1808 * additional word to have some bytes to store Redzone information.
1626 * the object and the freepointer to be able to check for overwrites.
1627 */ 1809 */
1628 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1810 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1629 size += sizeof(void *); 1811 size += sizeof(void *);
1812#endif
1630 1813
1631 /* 1814 /*
1632 * With that we have determined how much of the slab is in actual 1815 * With that we have determined the number of bytes in actual use
1633 * use by the object. This is the potential offset to the free 1816 * by the object. This is the potential offset to the free pointer.
1634 * pointer.
1635 */ 1817 */
1636 s->inuse = size; 1818 s->inuse = size;
1637 1819
1820#ifdef CONFIG_SLUB_DEBUG
1638 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 1821 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1639 s->ctor || s->dtor)) { 1822 s->ctor || s->dtor)) {
1640 /* 1823 /*
@@ -1656,7 +1839,7 @@ static int calculate_sizes(struct kmem_cache *s)
1656 */ 1839 */
1657 size += 2 * sizeof(struct track); 1840 size += 2 * sizeof(struct track);
1658 1841
1659 if (flags & DEBUG_DEFAULT_FLAGS) 1842 if (flags & SLAB_RED_ZONE)
1660 /* 1843 /*
1661 * Add some empty padding so that we can catch 1844 * Add some empty padding so that we can catch
1662 * overwrites from earlier objects rather than let 1845 * overwrites from earlier objects rather than let
@@ -1665,10 +1848,12 @@ static int calculate_sizes(struct kmem_cache *s)
1665 * of the object. 1848 * of the object.
1666 */ 1849 */
1667 size += sizeof(void *); 1850 size += sizeof(void *);
1851#endif
1852
1668 /* 1853 /*
1669 * Determine the alignment based on various parameters that the 1854 * Determine the alignment based on various parameters that the
1670 * user specified (this is unecessarily complex due to the attempt 1855 * user specified and the dynamic determination of cache line size
1671 * to be compatible with SLAB. Should be cleaned up some day). 1856 * on bootup.
1672 */ 1857 */
1673 align = calculate_alignment(flags, align, s->objsize); 1858 align = calculate_alignment(flags, align, s->objsize);
1674 1859
@@ -1700,23 +1885,6 @@ static int calculate_sizes(struct kmem_cache *s)
1700 1885
1701} 1886}
1702 1887
1703static int __init finish_bootstrap(void)
1704{
1705 struct list_head *h;
1706 int err;
1707
1708 slab_state = SYSFS;
1709
1710 list_for_each(h, &slab_caches) {
1711 struct kmem_cache *s =
1712 container_of(h, struct kmem_cache, list);
1713
1714 err = sysfs_slab_add(s);
1715 BUG_ON(err);
1716 }
1717 return 0;
1718}
1719
1720static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 1888static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1721 const char *name, size_t size, 1889 const char *name, size_t size,
1722 size_t align, unsigned long flags, 1890 size_t align, unsigned long flags,
@@ -1730,32 +1898,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1730 s->objsize = size; 1898 s->objsize = size;
1731 s->flags = flags; 1899 s->flags = flags;
1732 s->align = align; 1900 s->align = align;
1733 1901 kmem_cache_open_debug_check(s);
1734 /*
1735 * The page->offset field is only 16 bit wide. This is an offset
1736 * in units of words from the beginning of an object. If the slab
1737 * size is bigger then we cannot move the free pointer behind the
1738 * object anymore.
1739 *
1740 * On 32 bit platforms the limit is 256k. On 64bit platforms
1741 * the limit is 512k.
1742 *
1743 * Debugging or ctor/dtors may create a need to move the free
1744 * pointer. Fail if this happens.
1745 */
1746 if (s->size >= 65535 * sizeof(void *)) {
1747 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1748 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1749 BUG_ON(ctor || dtor);
1750 }
1751 else
1752 /*
1753 * Enable debugging if selected on the kernel commandline.
1754 */
1755 if (slub_debug && (!slub_debug_slabs ||
1756 strncmp(slub_debug_slabs, name,
1757 strlen(slub_debug_slabs)) == 0))
1758 s->flags |= slub_debug;
1759 1902
1760 if (!calculate_sizes(s)) 1903 if (!calculate_sizes(s))
1761 goto error; 1904 goto error;
@@ -1783,7 +1926,6 @@ EXPORT_SYMBOL(kmem_cache_open);
1783int kmem_ptr_validate(struct kmem_cache *s, const void *object) 1926int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1784{ 1927{
1785 struct page * page; 1928 struct page * page;
1786 void *addr;
1787 1929
1788 page = get_object_page(object); 1930 page = get_object_page(object);
1789 1931
@@ -1791,13 +1933,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1791 /* No slab or wrong slab */ 1933 /* No slab or wrong slab */
1792 return 0; 1934 return 0;
1793 1935
1794 addr = page_address(page); 1936 if (!check_valid_pointer(s, page, object))
1795 if (object < addr || object >= addr + s->objects * s->size)
1796 /* Out of bounds */
1797 return 0;
1798
1799 if ((object - addr) % s->size)
1800 /* Improperly aligned */
1801 return 0; 1937 return 0;
1802 1938
1803 /* 1939 /*
@@ -1826,7 +1962,8 @@ const char *kmem_cache_name(struct kmem_cache *s)
1826EXPORT_SYMBOL(kmem_cache_name); 1962EXPORT_SYMBOL(kmem_cache_name);
1827 1963
1828/* 1964/*
1829 * Attempt to free all slabs on a node 1965 * Attempt to free all slabs on a node. Return the number of slabs we
1966 * were unable to free.
1830 */ 1967 */
1831static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 1968static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1832 struct list_head *list) 1969 struct list_head *list)
@@ -1847,7 +1984,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1847} 1984}
1848 1985
1849/* 1986/*
1850 * Release all resources used by slab cache 1987 * Release all resources used by a slab cache.
1851 */ 1988 */
1852static int kmem_cache_close(struct kmem_cache *s) 1989static int kmem_cache_close(struct kmem_cache *s)
1853{ 1990{
@@ -1932,45 +2069,6 @@ static int __init setup_slub_nomerge(char *str)
1932 2069
1933__setup("slub_nomerge", setup_slub_nomerge); 2070__setup("slub_nomerge", setup_slub_nomerge);
1934 2071
1935static int __init setup_slub_debug(char *str)
1936{
1937 if (!str || *str != '=')
1938 slub_debug = DEBUG_DEFAULT_FLAGS;
1939 else {
1940 str++;
1941 if (*str == 0 || *str == ',')
1942 slub_debug = DEBUG_DEFAULT_FLAGS;
1943 else
1944 for( ;*str && *str != ','; str++)
1945 switch (*str) {
1946 case 'f' : case 'F' :
1947 slub_debug |= SLAB_DEBUG_FREE;
1948 break;
1949 case 'z' : case 'Z' :
1950 slub_debug |= SLAB_RED_ZONE;
1951 break;
1952 case 'p' : case 'P' :
1953 slub_debug |= SLAB_POISON;
1954 break;
1955 case 'u' : case 'U' :
1956 slub_debug |= SLAB_STORE_USER;
1957 break;
1958 case 't' : case 'T' :
1959 slub_debug |= SLAB_TRACE;
1960 break;
1961 default:
1962 printk(KERN_ERR "slub_debug option '%c' "
1963 "unknown. skipped\n",*str);
1964 }
1965 }
1966
1967 if (*str == ',')
1968 slub_debug_slabs = str + 1;
1969 return 1;
1970}
1971
1972__setup("slub_debug", setup_slub_debug);
1973
1974static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2072static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1975 const char *name, int size, gfp_t gfp_flags) 2073 const char *name, int size, gfp_t gfp_flags)
1976{ 2074{
@@ -2108,13 +2206,14 @@ void kfree(const void *x)
2108EXPORT_SYMBOL(kfree); 2206EXPORT_SYMBOL(kfree);
2109 2207
2110/* 2208/*
2111 * kmem_cache_shrink removes empty slabs from the partial lists 2209 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2112 * and then sorts the partially allocated slabs by the number 2210 * the remaining slabs by the number of items in use. The slabs with the
2113 * of items in use. The slabs with the most items in use 2211 * most items in use come first. New allocations will then fill those up
2114 * come first. New allocations will remove these from the 2212 * and thus they can be removed from the partial lists.
2115 * partial list because they are full. The slabs with the 2213 *
2116 * least items are placed last. If it happens that the objects 2214 * The slabs with the least items are placed last. This results in them
 2117 * are freed then the page can be returned to the page allocator. 2215 * being allocated from last, increasing the chance that the last objects
2216 * are freed in them.
2118 */ 2217 */
2119int kmem_cache_shrink(struct kmem_cache *s) 2218int kmem_cache_shrink(struct kmem_cache *s)
2120{ 2219{
@@ -2143,12 +2242,10 @@ int kmem_cache_shrink(struct kmem_cache *s)
2143 spin_lock_irqsave(&n->list_lock, flags); 2242 spin_lock_irqsave(&n->list_lock, flags);
2144 2243
2145 /* 2244 /*
2146 * Build lists indexed by the items in use in 2245 * Build lists indexed by the items in use in each slab.
2147 * each slab or free slabs if empty.
2148 * 2246 *
2149 * Note that concurrent frees may occur while 2247 * Note that concurrent frees may occur while we hold the
2150 * we hold the list_lock. page->inuse here is 2248 * list_lock. page->inuse here is the upper limit.
2151 * the upper limit.
2152 */ 2249 */
2153 list_for_each_entry_safe(page, t, &n->partial, lru) { 2250 list_for_each_entry_safe(page, t, &n->partial, lru) {
2154 if (!page->inuse && slab_trylock(page)) { 2251 if (!page->inuse && slab_trylock(page)) {
@@ -2172,8 +2269,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
2172 goto out; 2269 goto out;
2173 2270
2174 /* 2271 /*
2175 * Rebuild the partial list with the slabs filled up 2272 * Rebuild the partial list with the slabs filled up most
2176 * most first and the least used slabs at the end. 2273 * first and the least used slabs at the end.
2177 */ 2274 */
2178 for (i = s->objects - 1; i >= 0; i--) 2275 for (i = s->objects - 1; i >= 0; i--)
2179 list_splice(slabs_by_inuse + i, n->partial.prev); 2276 list_splice(slabs_by_inuse + i, n->partial.prev);
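The rewritten comment describes a bucket sort: slabs are filed into per-inuse-count lists and spliced back most-used-first, so new allocations drain the nearly-full slabs while the emptiest ones stay reclaimable at the tail. The idea sketched here with plain arrays instead of the kernel's list_head splicing; all counts are invented:

#include <stdio.h>

#define NSLABS	  6
#define MAX_INUSE 4	/* s->objects in the kernel; invented here */

int main(void)
{
	int inuse[NSLABS] = { 1, 3, 0, 2, 3, 1 };	/* items in use per slab */
	int buckets[MAX_INUSE + 1][NSLABS];
	int count[MAX_INUSE + 1] = { 0 };
	int i, u;

	for (i = 0; i < NSLABS; i++)		/* file each slab by inuse count */
		buckets[inuse[i]][count[inuse[i]]++] = i;

	/* splice back fullest-first; bucket 0 (empty slabs) is freed instead */
	for (u = MAX_INUSE; u >= 1; u--)
		for (i = 0; i < count[u]; i++)
			printf("slab %d (inuse=%d)\n", buckets[u][i], u);
	return 0;
}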
@@ -2189,7 +2286,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2189 2286
2190/** 2287/**
2191 * krealloc - reallocate memory. The contents will remain unchanged. 2288 * krealloc - reallocate memory. The contents will remain unchanged.
2192 *
2193 * @p: object to reallocate memory for. 2289 * @p: object to reallocate memory for.
2194 * @new_size: how many bytes of memory are required. 2290 * @new_size: how many bytes of memory are required.
2195 * @flags: the type of memory to allocate. 2291 * @flags: the type of memory to allocate.
@@ -2201,9 +2297,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2201 */ 2297 */
2202void *krealloc(const void *p, size_t new_size, gfp_t flags) 2298void *krealloc(const void *p, size_t new_size, gfp_t flags)
2203{ 2299{
2204 struct kmem_cache *new_cache;
2205 void *ret; 2300 void *ret;
2206 struct page *page; 2301 size_t ks;
2207 2302
2208 if (unlikely(!p)) 2303 if (unlikely(!p))
2209 return kmalloc(new_size, flags); 2304 return kmalloc(new_size, flags);
@@ -2213,19 +2308,13 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
2213 return NULL; 2308 return NULL;
2214 } 2309 }
2215 2310
2216 page = virt_to_head_page(p); 2311 ks = ksize(p);
2217 2312 if (ks >= new_size)
2218 new_cache = get_slab(new_size, flags);
2219
2220 /*
2221 * If new size fits in the current cache, bail out.
2222 */
2223 if (likely(page->slab == new_cache))
2224 return (void *)p; 2313 return (void *)p;
2225 2314
2226 ret = kmalloc(new_size, flags); 2315 ret = kmalloc(new_size, flags);
2227 if (ret) { 2316 if (ret) {
2228 memcpy(ret, p, min(new_size, ksize(p))); 2317 memcpy(ret, p, min(new_size, ks));
2229 kfree(p); 2318 kfree(p);
2230 } 2319 }
2231 return ret; 2320 return ret;
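The new krealloc relies on ksize() alone instead of comparing kmalloc caches: if the existing allocation is already big enough the pointer is returned as-is, otherwise a fresh block is allocated and the old bytes copied over. A userspace analogue assuming glibc's malloc_usable_size in the role of ksize():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>

static void *my_krealloc(void *p, size_t new_size)
{
	size_t ks;
	void *ret;

	if (!p)
		return malloc(new_size);

	ks = malloc_usable_size(p);	/* plays the role of ksize() */
	if (ks >= new_size)
		return p;		/* current block already fits */

	ret = malloc(new_size);
	if (ret) {
		memcpy(ret, p, ks);	/* ks < new_size in this branch */
		free(p);
	}
	return ret;
}

int main(void)
{
	char *s = my_krealloc(NULL, 8);

	strcpy(s, "hi");
	s = my_krealloc(s, 4);		/* likely reuses the same block */
	printf("%s\n", s);
	free(s);
	return 0;
}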
@@ -2243,7 +2332,7 @@ void __init kmem_cache_init(void)
2243#ifdef CONFIG_NUMA 2332#ifdef CONFIG_NUMA
2244 /* 2333 /*
2245 * Must first have the slab cache available for the allocations of the 2334 * Must first have the slab cache available for the allocations of the
2246 * struct kmalloc_cache_node's. There is special bootstrap code in 2335 * struct kmem_cache_node's. There is special bootstrap code in
2247 * kmem_cache_open for slab_state == DOWN. 2336 * kmem_cache_open for slab_state == DOWN.
2248 */ 2337 */
2249 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2338 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
@@ -2280,7 +2369,7 @@ void __init kmem_cache_init(void)
2280 2369
2281 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2370 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2282 " Processors=%d, Nodes=%d\n", 2371 " Processors=%d, Nodes=%d\n",
2283 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, 2372 KMALLOC_SHIFT_HIGH, cache_line_size(),
2284 slub_min_order, slub_max_order, slub_min_objects, 2373 slub_min_order, slub_max_order, slub_min_objects,
2285 nr_cpu_ids, nr_node_ids); 2374 nr_cpu_ids, nr_node_ids);
2286} 2375}
@@ -2415,8 +2504,8 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2415} 2504}
2416 2505
2417/* 2506/*
 2418 * Use the cpu notifier to ensure that the slab are flushed 2507 * Use the cpu notifier to ensure that the cpu slabs are flushed when
2419 * when necessary. 2508 * necessary.
2420 */ 2509 */
2421static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 2510static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2422 unsigned long action, void *hcpu) 2511 unsigned long action, void *hcpu)
@@ -2425,7 +2514,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2425 2514
2426 switch (action) { 2515 switch (action) {
2427 case CPU_UP_CANCELED: 2516 case CPU_UP_CANCELED:
2517 case CPU_UP_CANCELED_FROZEN:
2428 case CPU_DEAD: 2518 case CPU_DEAD:
2519 case CPU_DEAD_FROZEN:
2429 for_all_slabs(__flush_cpu_slab, cpu); 2520 for_all_slabs(__flush_cpu_slab, cpu);
2430 break; 2521 break;
2431 default: 2522 default:
@@ -2439,153 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
2439 2530
2440#endif 2531#endif
2441 2532
2442#ifdef CONFIG_NUMA
2443
2444/*****************************************************************
2445 * Generic reaper used to support the page allocator
2446 * (the cpu slabs are reaped by a per slab workqueue).
2447 *
2448 * Maybe move this to the page allocator?
2449 ****************************************************************/
2450
2451static DEFINE_PER_CPU(unsigned long, reap_node);
2452
2453static void init_reap_node(int cpu)
2454{
2455 int node;
2456
2457 node = next_node(cpu_to_node(cpu), node_online_map);
2458 if (node == MAX_NUMNODES)
2459 node = first_node(node_online_map);
2460
2461 __get_cpu_var(reap_node) = node;
2462}
2463
2464static void next_reap_node(void)
2465{
2466 int node = __get_cpu_var(reap_node);
2467
2468 /*
2469 * Also drain per cpu pages on remote zones
2470 */
2471 if (node != numa_node_id())
2472 drain_node_pages(node);
2473
2474 node = next_node(node, node_online_map);
2475 if (unlikely(node >= MAX_NUMNODES))
2476 node = first_node(node_online_map);
2477 __get_cpu_var(reap_node) = node;
2478}
2479#else
2480#define init_reap_node(cpu) do { } while (0)
2481#define next_reap_node(void) do { } while (0)
2482#endif
2483
2484#define REAPTIMEOUT_CPUC (2*HZ)
2485
2486#ifdef CONFIG_SMP
2487static DEFINE_PER_CPU(struct delayed_work, reap_work);
2488
2489static void cache_reap(struct work_struct *unused)
2490{
2491 next_reap_node();
2492 refresh_cpu_vm_stats(smp_processor_id());
2493 schedule_delayed_work(&__get_cpu_var(reap_work),
2494 REAPTIMEOUT_CPUC);
2495}
2496
2497static void __devinit start_cpu_timer(int cpu)
2498{
2499 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2500
2501 /*
2502 * When this gets called from do_initcalls via cpucache_init(),
2503 * init_workqueues() has already run, so keventd will be setup
2504 * at that time.
2505 */
2506 if (keventd_up() && reap_work->work.func == NULL) {
2507 init_reap_node(cpu);
2508 INIT_DELAYED_WORK(reap_work, cache_reap);
2509 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2510 }
2511}
2512
2513static int __init cpucache_init(void)
2514{
2515 int cpu;
2516
2517 /*
2518 * Register the timers that drain pcp pages and update vm statistics
2519 */
2520 for_each_online_cpu(cpu)
2521 start_cpu_timer(cpu);
2522 return 0;
2523}
2524__initcall(cpucache_init);
2525#endif
2526
2527#ifdef SLUB_RESILIENCY_TEST
2528static unsigned long validate_slab_cache(struct kmem_cache *s);
2529
2530static void resiliency_test(void)
2531{
2532 u8 *p;
2533
2534 printk(KERN_ERR "SLUB resiliency testing\n");
2535 printk(KERN_ERR "-----------------------\n");
2536 printk(KERN_ERR "A. Corruption after allocation\n");
2537
2538 p = kzalloc(16, GFP_KERNEL);
2539 p[16] = 0x12;
2540 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2541 " 0x12->0x%p\n\n", p + 16);
2542
2543 validate_slab_cache(kmalloc_caches + 4);
2544
2545 /* Hmmm... The next two are dangerous */
2546 p = kzalloc(32, GFP_KERNEL);
2547 p[32 + sizeof(void *)] = 0x34;
2548 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2549 " 0x34 -> -0x%p\n", p);
2550 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2551
2552 validate_slab_cache(kmalloc_caches + 5);
2553 p = kzalloc(64, GFP_KERNEL);
2554 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2555 *p = 0x56;
2556 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2557 p);
2558 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2559 validate_slab_cache(kmalloc_caches + 6);
2560
2561 printk(KERN_ERR "\nB. Corruption after free\n");
2562 p = kzalloc(128, GFP_KERNEL);
2563 kfree(p);
2564 *p = 0x78;
2565 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2566 validate_slab_cache(kmalloc_caches + 7);
2567
2568 p = kzalloc(256, GFP_KERNEL);
2569 kfree(p);
2570 p[50] = 0x9a;
2571 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2572 validate_slab_cache(kmalloc_caches + 8);
2573
2574 p = kzalloc(512, GFP_KERNEL);
2575 kfree(p);
2576 p[512] = 0xab;
2577 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2578 validate_slab_cache(kmalloc_caches + 9);
2579}
2580#else
2581static void resiliency_test(void) {};
2582#endif
2583
2584/*
2585 * These are not as efficient as kmalloc for the non debug case.
2586 * We do not have the page struct available so we have to touch one
2587 * cacheline in struct kmem_cache to check slab flags.
2588 */
2589void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2533void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2590{ 2534{
2591 struct kmem_cache *s = get_slab(size, gfpflags); 2535 struct kmem_cache *s = get_slab(size, gfpflags);
@@ -2607,13 +2551,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2607 return slab_alloc(s, gfpflags, node, caller); 2551 return slab_alloc(s, gfpflags, node, caller);
2608} 2552}
2609 2553
2610#ifdef CONFIG_SYSFS 2554#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
2611
2612static int validate_slab(struct kmem_cache *s, struct page *page) 2555static int validate_slab(struct kmem_cache *s, struct page *page)
2613{ 2556{
2614 void *p; 2557 void *p;
2615 void *addr = page_address(page); 2558 void *addr = page_address(page);
2616 unsigned long map[BITS_TO_LONGS(s->objects)]; 2559 DECLARE_BITMAP(map, s->objects);
2617 2560
2618 if (!check_slab(s, page) || 2561 if (!check_slab(s, page) ||
2619 !on_freelist(s, page, NULL)) 2562 !on_freelist(s, page, NULL))
@@ -2622,14 +2565,14 @@ static int validate_slab(struct kmem_cache *s, struct page *page)
2622 /* Now we know that a valid freelist exists */ 2565 /* Now we know that a valid freelist exists */
2623 bitmap_zero(map, s->objects); 2566 bitmap_zero(map, s->objects);
2624 2567
2625 for(p = page->freelist; p; p = get_freepointer(s, p)) { 2568 for_each_free_object(p, s, page->freelist) {
2626 set_bit((p - addr) / s->size, map); 2569 set_bit(slab_index(p, s, addr), map);
2627 if (!check_object(s, page, p, 0)) 2570 if (!check_object(s, page, p, 0))
2628 return 0; 2571 return 0;
2629 } 2572 }
2630 2573
2631 for(p = addr; p < addr + s->objects * s->size; p += s->size) 2574 for_each_object(p, s, addr)
2632 if (!test_bit((p - addr) / s->size, map)) 2575 if (!test_bit(slab_index(p, s, addr), map))
2633 if (!check_object(s, page, p, 1)) 2576 if (!check_object(s, page, p, 1))
2634 return 0; 2577 return 0;
2635 return 1; 2578 return 1;
@@ -2645,12 +2588,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2645 s->name, page); 2588 s->name, page);
2646 2589
2647 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2590 if (s->flags & DEBUG_DEFAULT_FLAGS) {
2648 if (!PageError(page)) 2591 if (!SlabDebug(page))
2649 printk(KERN_ERR "SLUB %s: PageError not set " 2592 printk(KERN_ERR "SLUB %s: SlabDebug not set "
2650 "on slab 0x%p\n", s->name, page); 2593 "on slab 0x%p\n", s->name, page);
2651 } else { 2594 } else {
2652 if (PageError(page)) 2595 if (SlabDebug(page))
2653 printk(KERN_ERR "SLUB %s: PageError set on " 2596 printk(KERN_ERR "SLUB %s: SlabDebug set on "
2654 "slab 0x%p\n", s->name, page); 2597 "slab 0x%p\n", s->name, page);
2655 } 2598 }
2656} 2599}
@@ -2702,14 +2645,76 @@ static unsigned long validate_slab_cache(struct kmem_cache *s)
2702 return count; 2645 return count;
2703} 2646}
2704 2647
2648#ifdef SLUB_RESILIENCY_TEST
2649static void resiliency_test(void)
2650{
2651 u8 *p;
2652
2653 printk(KERN_ERR "SLUB resiliency testing\n");
2654 printk(KERN_ERR "-----------------------\n");
2655 printk(KERN_ERR "A. Corruption after allocation\n");
2656
2657 p = kzalloc(16, GFP_KERNEL);
2658 p[16] = 0x12;
2659 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2660 " 0x12->0x%p\n\n", p + 16);
2661
2662 validate_slab_cache(kmalloc_caches + 4);
2663
2664 /* Hmmm... The next two are dangerous */
2665 p = kzalloc(32, GFP_KERNEL);
2666 p[32 + sizeof(void *)] = 0x34;
2667 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2668 " 0x34 -> -0x%p\n", p);
2669 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2670
2671 validate_slab_cache(kmalloc_caches + 5);
2672 p = kzalloc(64, GFP_KERNEL);
2673 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2674 *p = 0x56;
2675 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2676 p);
2677 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2678 validate_slab_cache(kmalloc_caches + 6);
2679
2680 printk(KERN_ERR "\nB. Corruption after free\n");
2681 p = kzalloc(128, GFP_KERNEL);
2682 kfree(p);
2683 *p = 0x78;
2684 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2685 validate_slab_cache(kmalloc_caches + 7);
2686
2687 p = kzalloc(256, GFP_KERNEL);
2688 kfree(p);
2689 p[50] = 0x9a;
2690 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2691 validate_slab_cache(kmalloc_caches + 8);
2692
2693 p = kzalloc(512, GFP_KERNEL);
2694 kfree(p);
2695 p[512] = 0xab;
2696 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2697 validate_slab_cache(kmalloc_caches + 9);
2698}
2699#else
2700static void resiliency_test(void) {};
2701#endif
2702
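Each case above follows one recipe: allocate, deliberately corrupt a specific byte, then run the validator and confirm it complains. A hand-rolled version of case A.1, hypothetical and only meaningful when the cache has redzoning enabled (e.g. booted with slub_debug):

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>

/* Hypothetical demo, not part of the patch: reproduce test A.1. */
static void __init clobber_redzone_demo(void)
{
        u8 *p = kmalloc(16, GFP_KERNEL);

        if (!p)
                return;
        p[16] = 0x12;   /* one byte past the object: lands in the red zone */
        kfree(p);       /* the free-path checks should now report it */
}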
2705/* 2703/*
2706 * Generate lists of locations where slabcache objects are allocated 2704 * Generate lists of code addresses where slabcache objects are allocated
2707 * and freed. 2705 * and freed.
2708 */ 2706 */
2709 2707
2710struct location { 2708struct location {
2711 unsigned long count; 2709 unsigned long count;
2712 void *addr; 2710 void *addr;
2711 long long sum_time;
2712 long min_time;
2713 long max_time;
2714 long min_pid;
2715 long max_pid;
2716 cpumask_t cpus;
2717 nodemask_t nodes;
2713}; 2718};
2714 2719
2715struct loc_track { 2720struct loc_track {
@@ -2750,11 +2755,12 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max)
2750} 2755}
2751 2756
2752static int add_location(struct loc_track *t, struct kmem_cache *s, 2757static int add_location(struct loc_track *t, struct kmem_cache *s,
2753 void *addr) 2758 const struct track *track)
2754{ 2759{
2755 long start, end, pos; 2760 long start, end, pos;
2756 struct location *l; 2761 struct location *l;
2757 void *caddr; 2762 void *caddr;
2763 unsigned long age = jiffies - track->when;
2758 2764
2759 start = -1; 2765 start = -1;
2760 end = t->count; 2766 end = t->count;
@@ -2770,19 +2776,36 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
2770 break; 2776 break;
2771 2777
2772 caddr = t->loc[pos].addr; 2778 caddr = t->loc[pos].addr;
2773 if (addr == caddr) { 2779 if (track->addr == caddr) {
2774 t->loc[pos].count++; 2780
2781 l = &t->loc[pos];
2782 l->count++;
2783 if (track->when) {
2784 l->sum_time += age;
2785 if (age < l->min_time)
2786 l->min_time = age;
2787 if (age > l->max_time)
2788 l->max_time = age;
2789
2790 if (track->pid < l->min_pid)
2791 l->min_pid = track->pid;
2792 if (track->pid > l->max_pid)
2793 l->max_pid = track->pid;
2794
2795 cpu_set(track->cpu, l->cpus);
2796 }
2797 node_set(page_to_nid(virt_to_page(track)), l->nodes);
2775 return 1; 2798 return 1;
2776 } 2799 }
2777 2800
2778 if (addr < caddr) 2801 if (track->addr < caddr)
2779 end = pos; 2802 end = pos;
2780 else 2803 else
2781 start = pos; 2804 start = pos;
2782 } 2805 }
2783 2806
2784 /* 2807 /*
2785 * Not found. Insert new tracking element 2808 * Not found. Insert new tracking element.
2786 */ 2809 */
2787 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) 2810 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2788 return 0; 2811 return 0;
@@ -2793,7 +2816,16 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
2793 (t->count - pos) * sizeof(struct location)); 2816 (t->count - pos) * sizeof(struct location));
2794 t->count++; 2817 t->count++;
2795 l->count = 1; 2818 l->count = 1;
2796 l->addr = addr; 2819 l->addr = track->addr;
2820 l->sum_time = age;
2821 l->min_time = age;
2822 l->max_time = age;
2823 l->min_pid = track->pid;
2824 l->max_pid = track->pid;
2825 cpus_clear(l->cpus);
2826 cpu_set(track->cpu, l->cpus);
2827 nodes_clear(l->nodes);
2828 node_set(page_to_nid(virt_to_page(track)), l->nodes);
2797 return 1; 2829 return 1;
2798} 2830}
2799 2831
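add_location() above keeps t->loc sorted by caller address, so repeated allocations from one call site collapse into a single element whose count, age, pid and cpu/node statistics are folded together. A simplified, count-only userspace model of the search-or-insert scheme (same binary-search shape, bookkeeping omitted):

#include <string.h>

struct loc {
        unsigned long count;
        void *addr;
};

/* Returns 0 when the array is full and must be grown by the caller. */
static int record(struct loc *a, long *nr, long max, void *addr)
{
        long start = -1, end = *nr, pos;

        for ( ; ; ) {
                pos = start + (end - start + 1) / 2;
                if (pos == end)
                        break;                  /* no equal element exists */
                if (a[pos].addr == addr) {
                        a[pos].count++;         /* hit: merge into this slot */
                        return 1;
                }
                if (addr < a[pos].addr)
                        end = pos;
                else
                        start = pos;
        }
        if (*nr >= max)
                return 0;
        memmove(a + pos + 1, a + pos, (*nr - pos) * sizeof(*a));
        (*nr)++;
        a[pos].count = 1;
        a[pos].addr = addr;
        return 1;
}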
@@ -2801,19 +2833,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
2801 struct page *page, enum track_item alloc) 2833 struct page *page, enum track_item alloc)
2802{ 2834{
2803 void *addr = page_address(page); 2835 void *addr = page_address(page);
2804 unsigned long map[BITS_TO_LONGS(s->objects)]; 2836 DECLARE_BITMAP(map, s->objects);
2805 void *p; 2837 void *p;
2806 2838
2807 bitmap_zero(map, s->objects); 2839 bitmap_zero(map, s->objects);
2808 for (p = page->freelist; p; p = get_freepointer(s, p)) 2840 for_each_free_object(p, s, page->freelist)
2809 set_bit((p - addr) / s->size, map); 2841 set_bit(slab_index(p, s, addr), map);
2810
2811 for (p = addr; p < addr + s->objects * s->size; p += s->size)
2812 if (!test_bit((p - addr) / s->size, map)) {
2813 void *addr = get_track(s, p, alloc)->addr;
2814 2842
2815 add_location(t, s, addr); 2843 for_each_object(p, s, addr)
2816 } 2844 if (!test_bit(slab_index(p, s, addr), map))
2845 add_location(t, s, get_track(s, p, alloc));
2817} 2846}
2818 2847
2819static int list_locations(struct kmem_cache *s, char *buf, 2848static int list_locations(struct kmem_cache *s, char *buf,
@@ -2847,15 +2876,47 @@ static int list_locations(struct kmem_cache *s, char *buf,
2847 } 2876 }
2848 2877
2849 for (i = 0; i < t.count; i++) { 2878 for (i = 0; i < t.count; i++) {
2850 void *addr = t.loc[i].addr; 2879 struct location *l = &t.loc[i];
2851 2880
2852 if (n > PAGE_SIZE - 100) 2881 if (n > PAGE_SIZE - 100)
2853 break; 2882 break;
2854 n += sprintf(buf + n, "%7ld ", t.loc[i].count); 2883 n += sprintf(buf + n, "%7ld ", l->count);
2855 if (addr) 2884
2856 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); 2885 if (l->addr)
2886 n += sprint_symbol(buf + n, (unsigned long)l->addr);
2857 else 2887 else
2858 n += sprintf(buf + n, "<not-available>"); 2888 n += sprintf(buf + n, "<not-available>");
2889
2890 if (l->sum_time != l->min_time) {
2891 unsigned long remainder;
2892
2893 n += sprintf(buf + n, " age=%ld/%ld/%ld",
2894 l->min_time,
2895 div_long_long_rem(l->sum_time, l->count, &remainder),
2896 l->max_time);
2897 } else
2898 n += sprintf(buf + n, " age=%ld",
2899 l->min_time);
2900
2901 if (l->min_pid != l->max_pid)
2902 n += sprintf(buf + n, " pid=%ld-%ld",
2903 l->min_pid, l->max_pid);
2904 else
2905 n += sprintf(buf + n, " pid=%ld",
2906 l->min_pid);
2907
2908 if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) {
2909 n += sprintf(buf + n, " cpus=");
2910 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
2911 l->cpus);
2912 }
2913
2914 if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) {
2915 n += sprintf(buf + n, " nodes=");
2916 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
2917 l->nodes);
2918 }
2919
2859 n += sprintf(buf + n, "\n"); 2920 n += sprintf(buf + n, "\n");
2860 } 2921 }
2861 2922
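With those fields in place, one line of the resulting alloc_calls/free_calls file reads roughly as follows (illustrative values, not from a real run):

4504 alloc_buffer_head+0x20/0x60 age=2/1594/2950 pid=1-1655 cpus=0-1 nodes=0

that is: hit count, caller symbol, min/avg/max object age in jiffies, the pid range of the allocating tasks, and the CPUs and NUMA nodes involved; per the num_online_cpus()/num_online_nodes() guards above, the cpus= and nodes= fields are suppressed on single-CPU and single-node machines.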
@@ -3491,6 +3552,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3491 3552
3492static int __init slab_sysfs_init(void) 3553static int __init slab_sysfs_init(void)
3493{ 3554{
3555 struct list_head *h;
3494 int err; 3556 int err;
3495 3557
3496 err = subsystem_register(&slab_subsys); 3558 err = subsystem_register(&slab_subsys);
@@ -3499,7 +3561,15 @@ static int __init slab_sysfs_init(void)
3499 return -ENOSYS; 3561 return -ENOSYS;
3500 } 3562 }
3501 3563
3502 finish_bootstrap(); 3564 slab_state = SYSFS;
3565
3566 list_for_each(h, &slab_caches) {
3567 struct kmem_cache *s =
3568 container_of(h, struct kmem_cache, list);
3569
3570 err = sysfs_slab_add(s);
3571 BUG_ON(err);
3572 }
3503 3573
3504 while (alias_list) { 3574 while (alias_list) {
3505 struct saved_alias *al = alias_list; 3575 struct saved_alias *al = alias_list;
@@ -3515,6 +3585,4 @@ static int __init slab_sysfs_init(void)
3515} 3585}
3516 3586
3517__initcall(slab_sysfs_init); 3587__initcall(slab_sysfs_init);
3518#else
3519__initcall(finish_bootstrap);
3520#endif 3588#endif
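The open-coded list_for_each()/container_of() walk in slab_sysfs_init() above is the long form of the usual list iterator; the equivalent idiomatic spelling (an equivalent rewrite, not part of the patch) would be:

        struct kmem_cache *s;

        list_for_each_entry(s, &slab_caches, list) {
                err = sysfs_slab_add(s);
                BUG_ON(err);
        }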
diff --git a/mm/swap.c b/mm/swap.c
index 218c52a24a21..d3cb966fe992 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
488 long *committed; 488 long *committed;
489 489
490 committed = &per_cpu(committed_space, (long)hcpu); 490 committed = &per_cpu(committed_space, (long)hcpu);
491 if (action == CPU_DEAD) { 491 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
492 atomic_add(*committed, &vm_committed_space); 492 atomic_add(*committed, &vm_committed_space);
493 *committed = 0; 493 *committed = 0;
494 __lru_add_drain((long)hcpu); 494 __lru_add_drain((long)hcpu);
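This and most of the following hunks are one mechanical change: the CPU hotplug events gained _FROZEN twins that fire on the suspend/resume path, and every notifier must now accept both spellings. Since each _FROZEN value is just the base event with the CPU_TASKS_FROZEN bit set, a notifier that does not care about the distinction can also mask the bit off, as in this hypothetical handler (masking idiom, rather than the explicit || these patches use):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nfb,
                           unsigned long action, void *hcpu)
{
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DEAD:
                /* tear down per-CPU state for CPU (long)hcpu */
                break;
        }
        return NOTIFY_OK;
}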
diff --git a/mm/truncate.c b/mm/truncate.c
index 0f4b6d18ab0e..4fbe1a2da5fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,6 +12,7 @@
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/highmem.h>
15#include <linux/pagevec.h> 16#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
17#include <linux/buffer_head.h> /* grr. try_to_release_page, 18#include <linux/buffer_head.h> /* grr. try_to_release_page,
@@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
46 47
47static inline void truncate_partial_page(struct page *page, unsigned partial) 48static inline void truncate_partial_page(struct page *page, unsigned partial)
48{ 49{
49 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 50 zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
50 if (PagePrivate(page)) 51 if (PagePrivate(page))
51 do_invalidatepage(page, partial); 52 do_invalidatepage(page, partial);
52} 53}
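zero_user_page() (hence the new linux/highmem.h include) folds the old open-coded clear-and-flush into one helper. Roughly what the call expands to, inferred from its call site here rather than quoted from the header:

#include <linux/highmem.h>
#include <linux/string.h>

/* Approximate expansion of zero_user_page(page, offset, size, KM_USER0). */
static inline void zero_partial_page(struct page *page,
                                     unsigned int offset, unsigned int size)
{
        void *kaddr = kmap_atomic(page, KM_USER0);

        memset(kaddr + offset, 0, size);
        flush_dcache_page(page);
        kunmap_atomic(kaddr, KM_USER0);
}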
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c8e75a1cfcd..1be5a6376ef0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1528,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1528 pg_data_t *pgdat; 1528 pg_data_t *pgdat;
1529 cpumask_t mask; 1529 cpumask_t mask;
1530 1530
1531 if (action == CPU_ONLINE) { 1531 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1532 for_each_online_pgdat(pgdat) { 1532 for_each_online_pgdat(pgdat) {
1533 mask = node_to_cpumask(pgdat->node_id); 1533 mask = node_to_cpumask(pgdat->node_id);
1534 if (any_online_cpu(mask) != NR_CPUS) 1534 if (any_online_cpu(mask) != NR_CPUS)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6c488d6ac425..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
281 281
282/* 282/*
283 * Update the zone counters for one cpu. 283 * Update the zone counters for one cpu.
284 *
285 * Note that refresh_cpu_vm_stats strives to only access
286 * node local memory. The per cpu pagesets on remote zones are placed
287 * in the memory local to the processor using that pageset. So the
288 * loop over all zones will access a series of cachelines local to
289 * the processor.
290 *
291 * The call to zone_page_state_add updates the cachelines with the
292 * statistics in the remote zone struct as well as the global cachelines
293 * with the global counters. These could cause remote node cache line
294 * bouncing and will have to be only done when necessary.
284 */ 295 */
285void refresh_cpu_vm_stats(int cpu) 296void refresh_cpu_vm_stats(int cpu)
286{ 297{
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
289 unsigned long flags; 300 unsigned long flags;
290 301
291 for_each_zone(zone) { 302 for_each_zone(zone) {
292 struct per_cpu_pageset *pcp; 303 struct per_cpu_pageset *p;
293 304
294 if (!populated_zone(zone)) 305 if (!populated_zone(zone))
295 continue; 306 continue;
296 307
297 pcp = zone_pcp(zone, cpu); 308 p = zone_pcp(zone, cpu);
298 309
299 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 310 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
300 if (pcp->vm_stat_diff[i]) { 311 if (p->vm_stat_diff[i]) {
301 local_irq_save(flags); 312 local_irq_save(flags);
302 zone_page_state_add(pcp->vm_stat_diff[i], 313 zone_page_state_add(p->vm_stat_diff[i],
303 zone, i); 314 zone, i);
304 pcp->vm_stat_diff[i] = 0; 315 p->vm_stat_diff[i] = 0;
316#ifdef CONFIG_NUMA
317 /* 3 seconds idle till flush */
318 p->expire = 3;
319#endif
305 local_irq_restore(flags); 320 local_irq_restore(flags);
306 } 321 }
322#ifdef CONFIG_NUMA
323 /*
324 * Deal with draining the remote pageset of this
325 * processor
326 *
327 * Check if there are pages remaining in this pageset
328 * if not then there is nothing to expire.
329 */
330 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
331 continue;
332
333 /*
334 * We never drain zones local to this processor.
335 */
336 if (zone_to_nid(zone) == numa_node_id()) {
337 p->expire = 0;
338 continue;
339 }
340
341 p->expire--;
342 if (p->expire)
343 continue;
344
345 if (p->pcp[0].count)
346 drain_zone_pages(zone, p->pcp + 0);
347
348 if (p->pcp[1].count)
349 drain_zone_pages(zone, p->pcp + 1);
350#endif
307 } 351 }
308} 352}
309 353
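The NUMA branch above is a small countdown machine: any local stat activity re-arms p->expire to 3, pagesets local to this node are never drained, and a remote pageset is drained only once the counter runs out, i.e. after roughly three quiet intervals. A compact userspace model of that decision flow (illustrative; mirrors the code above):

struct pageset_model { int expire; int count; };

/* One timer tick; returns 1 when the remote pageset gets drained. */
static int tick(struct pageset_model *p, int is_local, int had_diffs)
{
        if (had_diffs)
                p->expire = 3;          /* activity: re-arm the countdown */
        if (!p->expire || !p->count)
                return 0;               /* nothing armed, or nothing cached */
        if (is_local) {
                p->expire = 0;          /* never drain local pagesets */
                return 0;
        }
        if (--p->expire)
                return 0;               /* not idle for long enough yet */
        p->count = 0;                   /* drain_zone_pages() equivalent */
        return 1;
}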
@@ -640,6 +684,24 @@ const struct seq_operations vmstat_op = {
640#endif /* CONFIG_PROC_FS */ 684#endif /* CONFIG_PROC_FS */
641 685
642#ifdef CONFIG_SMP 686#ifdef CONFIG_SMP
687static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
688int sysctl_stat_interval __read_mostly = HZ;
689
690static void vmstat_update(struct work_struct *w)
691{
692 refresh_cpu_vm_stats(smp_processor_id());
693 schedule_delayed_work(&__get_cpu_var(vmstat_work),
694 sysctl_stat_interval);
695}
696
697static void __devinit start_cpu_timer(int cpu)
698{
699 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
700
701 INIT_DELAYED_WORK(vmstat_work, vmstat_update);
702 schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
703}
704
643/* 705/*
644 * Use the cpu notifier to insure that the thresholds are recalculated 706 * Use the cpu notifier to insure that the thresholds are recalculated
645 * when necessary. 707 * when necessary.
@@ -648,10 +710,24 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
648 unsigned long action, 710 unsigned long action,
649 void *hcpu) 711 void *hcpu)
650{ 712{
713 long cpu = (long)hcpu;
714
651 switch (action) { 715 switch (action) {
652 case CPU_UP_PREPARE: 716 case CPU_ONLINE:
653 case CPU_UP_CANCELED: 717 case CPU_ONLINE_FROZEN:
718 start_cpu_timer(cpu);
719 break;
720 case CPU_DOWN_PREPARE:
721 case CPU_DOWN_PREPARE_FROZEN:
722 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
723 per_cpu(vmstat_work, cpu).work.func = NULL;
724 break;
725 case CPU_DOWN_FAILED:
726 case CPU_DOWN_FAILED_FROZEN:
727 start_cpu_timer(cpu);
728 break;
654 case CPU_DEAD: 729 case CPU_DEAD:
730 case CPU_DEAD_FROZEN:
655 refresh_zone_stat_thresholds(); 731 refresh_zone_stat_thresholds();
656 break; 732 break;
657 default: 733 default:
@@ -665,8 +741,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier =
665 741
666int __init setup_vmstat(void) 742int __init setup_vmstat(void)
667{ 743{
744 int cpu;
745
668 refresh_zone_stat_thresholds(); 746 refresh_zone_stat_thresholds();
669 register_cpu_notifier(&vmstat_notifier); 747 register_cpu_notifier(&vmstat_notifier);
748
749 for_each_online_cpu(cpu)
750 start_cpu_timer(cpu);
670 return 0; 751 return 0;
671} 752}
672module_init(setup_vmstat) 753module_init(setup_vmstat)
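The CPU_DOWN_PREPARE leg above is the delicate part: the self-rearming work must be cancelled before the CPU disappears so it cannot re-arm itself onto another CPU, and clearing work.func matches the "never started" test that slab's start_cpu_timer() performs earlier in this patch. As a stand-alone helper the teardown idiom would read (hypothetical wrapper around the exact calls shown above):

static void stop_vmstat_timer(int cpu)
{
        struct delayed_work *w = &per_cpu(vmstat_work, cpu);

        /* Waits for a running instance and prevents it re-arming. */
        cancel_rearming_delayed_work(w);
        w->work.func = NULL;    /* mark "not started" so a restart is clean */
}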
diff --git a/net/core/dev.c b/net/core/dev.c
index 4317c1be4d3f..8301e2ac747f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3450,7 +3450,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
3450 unsigned int cpu, oldcpu = (unsigned long)ocpu; 3450 unsigned int cpu, oldcpu = (unsigned long)ocpu;
3451 struct softnet_data *sd, *oldsd; 3451 struct softnet_data *sd, *oldsd;
3452 3452
3453 if (action != CPU_DEAD) 3453 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
3454 return NOTIFY_OK; 3454 return NOTIFY_OK;
3455 3455
3456 local_irq_disable(); 3456 local_irq_disable();
diff --git a/net/core/flow.c b/net/core/flow.c
index 5d25697920b1..051430545a05 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -338,7 +338,7 @@ static int flow_cache_cpu(struct notifier_block *nfb,
338 unsigned long action, 338 unsigned long action,
339 void *hcpu) 339 void *hcpu)
340{ 340{
341 if (action == CPU_DEAD) 341 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
342 __flow_cache_shrink((unsigned long)hcpu, 0); 342 __flow_cache_shrink((unsigned long)hcpu, 0);
343 return NOTIFY_OK; 343 return NOTIFY_OK;
344} 344}
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index b3050a6817e7..68fe1d4d0210 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -2387,6 +2387,7 @@ void ip_vs_control_cleanup(void)
2387 EnterFunction(2); 2387 EnterFunction(2);
2388 ip_vs_trash_cleanup(); 2388 ip_vs_trash_cleanup();
2389 cancel_rearming_delayed_work(&defense_work); 2389 cancel_rearming_delayed_work(&defense_work);
2390 cancel_work_sync(&defense_work.work);
2390 ip_vs_kill_estimator(&ip_vs_stats); 2391 ip_vs_kill_estimator(&ip_vs_stats);
2391 unregister_sysctl_table(sysctl_header); 2392 unregister_sysctl_table(sysctl_header);
2392 proc_net_remove("ip_vs_stats"); 2393 proc_net_remove("ip_vs_stats");
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index fb3faf72e850..b7333061016d 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -556,6 +556,7 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self,
556 556
557 switch (action) { 557 switch (action) {
558 case CPU_UP_PREPARE: 558 case CPU_UP_PREPARE:
559 case CPU_UP_PREPARE_FROZEN:
559 if (!percpu_populate(iucv_irq_data, 560 if (!percpu_populate(iucv_irq_data,
560 sizeof(struct iucv_irq_data), 561 sizeof(struct iucv_irq_data),
561 GFP_KERNEL|GFP_DMA, cpu)) 562 GFP_KERNEL|GFP_DMA, cpu))
@@ -567,15 +568,20 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self,
567 } 568 }
568 break; 569 break;
569 case CPU_UP_CANCELED: 570 case CPU_UP_CANCELED:
571 case CPU_UP_CANCELED_FROZEN:
570 case CPU_DEAD: 572 case CPU_DEAD:
573 case CPU_DEAD_FROZEN:
571 percpu_depopulate(iucv_param, cpu); 574 percpu_depopulate(iucv_param, cpu);
572 percpu_depopulate(iucv_irq_data, cpu); 575 percpu_depopulate(iucv_irq_data, cpu);
573 break; 576 break;
574 case CPU_ONLINE: 577 case CPU_ONLINE:
578 case CPU_ONLINE_FROZEN:
575 case CPU_DOWN_FAILED: 579 case CPU_DOWN_FAILED:
580 case CPU_DOWN_FAILED_FROZEN:
576 smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu); 581 smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu);
577 break; 582 break;
578 case CPU_DOWN_PREPARE: 583 case CPU_DOWN_PREPARE:
584 case CPU_DOWN_PREPARE_FROZEN:
579 cpumask = iucv_buffer_cpumask; 585 cpumask = iucv_buffer_cpumask;
580 cpu_clear(cpu, cpumask); 586 cpu_clear(cpu, cpumask);
581 if (cpus_empty(cpumask)) 587 if (cpus_empty(cpumask))
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index db298b501c81..099a983797da 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -924,6 +924,7 @@ static inline int
924gss_write_init_verf(struct svc_rqst *rqstp, struct rsi *rsip) 924gss_write_init_verf(struct svc_rqst *rqstp, struct rsi *rsip)
925{ 925{
926 struct rsc *rsci; 926 struct rsc *rsci;
927 int rc;
927 928
928 if (rsip->major_status != GSS_S_COMPLETE) 929 if (rsip->major_status != GSS_S_COMPLETE)
929 return gss_write_null_verf(rqstp); 930 return gss_write_null_verf(rqstp);
@@ -932,7 +933,9 @@ gss_write_init_verf(struct svc_rqst *rqstp, struct rsi *rsip)
932 rsip->major_status = GSS_S_NO_CONTEXT; 933 rsip->major_status = GSS_S_NO_CONTEXT;
933 return gss_write_null_verf(rqstp); 934 return gss_write_null_verf(rqstp);
934 } 935 }
935 return gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN); 936 rc = gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN);
937 cache_put(&rsci->h, &rsc_cache);
938 return rc;
936} 939}
937 940
938/* 941/*
@@ -1089,6 +1092,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1089 } 1092 }
1090 goto complete; 1093 goto complete;
1091 case RPC_GSS_PROC_DESTROY: 1094 case RPC_GSS_PROC_DESTROY:
1095 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
1096 goto auth_err;
1092 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1097 set_bit(CACHE_NEGATIVE, &rsci->h.flags);
1093 if (resv->iov_len + 4 > PAGE_SIZE) 1098 if (resv->iov_len + 4 > PAGE_SIZE)
1094 goto drop; 1099 goto drop;
@@ -1196,13 +1201,7 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
1196 if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, 1201 if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
1197 integ_len)) 1202 integ_len))
1198 BUG(); 1203 BUG();
1199 if (resbuf->page_len == 0 1204 if (resbuf->tail[0].iov_base == NULL) {
1200 && resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE
1201 < PAGE_SIZE) {
1202 BUG_ON(resbuf->tail[0].iov_len);
1203 /* Use head for everything */
1204 resv = &resbuf->head[0];
1205 } else if (resbuf->tail[0].iov_base == NULL) {
1206 if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) 1205 if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE)
1207 goto out_err; 1206 goto out_err;
1208 resbuf->tail[0].iov_base = resbuf->head[0].iov_base 1207 resbuf->tail[0].iov_base = resbuf->head[0].iov_base
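The cache_put() added in the first hunk above closes a reference leak: gss_svc_searchbyctx() returns a counted reference that the success path never dropped. The invariant in a self-contained userspace model, where lookup()/put_ref() stand in for cache_get()/cache_put():

#include <assert.h>

struct obj { int refs; };

static struct obj *lookup(struct obj *o) { o->refs++; return o; }
static void put_ref(struct obj *o) { o->refs--; }

static int write_verf_model(struct obj *o)
{
        struct obj *r = lookup(o);
        int rc = 0;             /* ... work done while holding r ... */

        put_ref(r);             /* the fix: drop the reference on success too */
        return rc;
}

int main(void)
{
        struct obj o = { 0 };

        write_verf_model(&o);
        assert(o.refs == 0);    /* nothing leaked */
        return 0;
}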
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index ad39b47e05bc..a2f1893bde53 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -845,6 +845,8 @@ init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
845 845
846int register_rpc_pipefs(void) 846int register_rpc_pipefs(void)
847{ 847{
848 int err;
849
848 rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", 850 rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
849 sizeof(struct rpc_inode), 851 sizeof(struct rpc_inode),
850 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 852 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -852,7 +854,12 @@ int register_rpc_pipefs(void)
852 init_once, NULL); 854 init_once, NULL);
853 if (!rpc_inode_cachep) 855 if (!rpc_inode_cachep)
854 return -ENOMEM; 856 return -ENOMEM;
855 register_filesystem(&rpc_pipe_fs_type); 857 err = register_filesystem(&rpc_pipe_fs_type);
858 if (err) {
859 kmem_cache_destroy(rpc_inode_cachep);
860 return err;
861 }
862
856 return 0; 863 return 0;
857} 864}
858 865
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 43ecf62f12ef..0d35bc796d00 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -146,9 +146,11 @@ init_sunrpc(void)
146 int err = register_rpc_pipefs(); 146 int err = register_rpc_pipefs();
147 if (err) 147 if (err)
148 goto out; 148 goto out;
149 err = rpc_init_mempool() != 0; 149 err = rpc_init_mempool();
150 if (err) 150 if (err) {
151 unregister_rpc_pipefs();
151 goto out; 152 goto out;
153 }
152#ifdef RPC_DEBUG 154#ifdef RPC_DEBUG
153 rpc_register_sysctl(); 155 rpc_register_sysctl();
154#endif 156#endif
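Both sunrpc fixes above are the same error-unwinding pattern: register_rpc_pipefs() previously ignored register_filesystem()'s return value, and init_sunrpc()'s old "err = rpc_init_mempool() != 0" collapsed the real errno to 1 and left the pipefs registered on failure. The corrected shape, sketched with stand-in helpers (step_a/step_b/undo_a are hypothetical names):

static int step_a(void) { return 0; }   /* e.g. register_rpc_pipefs() */
static void undo_a(void) { }            /* e.g. unregister_rpc_pipefs() */
static int step_b(void) { return 0; }   /* e.g. rpc_init_mempool() */

static int my_init(void)
{
        int err;

        err = step_a();
        if (err)
                return err;
        err = step_b();
        if (err) {
                undo_a();               /* unwind the steps already done */
                return err;             /* and propagate the real errno */
        }
        return 0;
}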
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b7503c103ae8..e673ef993904 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -907,7 +907,7 @@ svc_process(struct svc_rqst *rqstp)
907 * better idea of reply size 907 * better idea of reply size
908 */ 908 */
909 if (procp->pc_xdrressize) 909 if (procp->pc_xdrressize)
910 svc_reserve(rqstp, procp->pc_xdrressize<<2); 910 svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
911 911
912 /* Call the function that processes the request. */ 912 /* Call the function that processes the request. */
913 if (!versp->vs_dispatch) { 913 if (!versp->vs_dispatch) {
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2bd23ea2aa8b..07dcd20cbee4 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -385,7 +385,7 @@ ip_map_cached_get(struct svc_rqst *rqstp)
385{ 385{
386 struct ip_map *ipm; 386 struct ip_map *ipm;
387 struct svc_sock *svsk = rqstp->rq_sock; 387 struct svc_sock *svsk = rqstp->rq_sock;
388 spin_lock_bh(&svsk->sk_defer_lock); 388 spin_lock(&svsk->sk_lock);
389 ipm = svsk->sk_info_authunix; 389 ipm = svsk->sk_info_authunix;
390 if (ipm != NULL) { 390 if (ipm != NULL) {
391 if (!cache_valid(&ipm->h)) { 391 if (!cache_valid(&ipm->h)) {
@@ -395,13 +395,13 @@ ip_map_cached_get(struct svc_rqst *rqstp)
395 * same IP address. 395 * same IP address.
396 */ 396 */
397 svsk->sk_info_authunix = NULL; 397 svsk->sk_info_authunix = NULL;
398 spin_unlock_bh(&svsk->sk_defer_lock); 398 spin_unlock(&svsk->sk_lock);
399 cache_put(&ipm->h, &ip_map_cache); 399 cache_put(&ipm->h, &ip_map_cache);
400 return NULL; 400 return NULL;
401 } 401 }
402 cache_get(&ipm->h); 402 cache_get(&ipm->h);
403 } 403 }
404 spin_unlock_bh(&svsk->sk_defer_lock); 404 spin_unlock(&svsk->sk_lock);
405 return ipm; 405 return ipm;
406} 406}
407 407
@@ -410,14 +410,14 @@ ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
410{ 410{
411 struct svc_sock *svsk = rqstp->rq_sock; 411 struct svc_sock *svsk = rqstp->rq_sock;
412 412
413 spin_lock_bh(&svsk->sk_defer_lock); 413 spin_lock(&svsk->sk_lock);
414 if (svsk->sk_sock->type == SOCK_STREAM && 414 if (svsk->sk_sock->type == SOCK_STREAM &&
415 svsk->sk_info_authunix == NULL) { 415 svsk->sk_info_authunix == NULL) {
416 /* newly cached, keep the reference */ 416 /* newly cached, keep the reference */
417 svsk->sk_info_authunix = ipm; 417 svsk->sk_info_authunix = ipm;
418 ipm = NULL; 418 ipm = NULL;
419 } 419 }
420 spin_unlock_bh(&svsk->sk_defer_lock); 420 spin_unlock(&svsk->sk_lock);
421 if (ipm) 421 if (ipm)
422 cache_put(&ipm->h, &ip_map_cache); 422 cache_put(&ipm->h, &ip_map_cache);
423} 423}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 22f61aee4824..5baf48de2558 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -53,7 +53,8 @@
53 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 53 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
54 * when both need to be taken (rare), svc_serv->sv_lock is first. 54 * when both need to be taken (rare), svc_serv->sv_lock is first.
55 * BKL protects svc_serv->sv_nrthread. 55 * BKL protects svc_serv->sv_nrthread.
56 * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list 56 * svc_sock->sk_lock protects the svc_sock->sk_deferred list
57 * and the ->sk_info_authunix cache.
57 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. 58 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
58 * 59 *
59 * Some flags can be set to certain values at any time 60 * Some flags can be set to certain values at any time
@@ -787,15 +788,20 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
787 } 788 }
788 789
789 clear_bit(SK_DATA, &svsk->sk_flags); 790 clear_bit(SK_DATA, &svsk->sk_flags);
790 while ((err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 791 skb = NULL;
791 0, 0, MSG_PEEK | MSG_DONTWAIT)) < 0 || 792 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
792 (skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { 793 0, 0, MSG_PEEK | MSG_DONTWAIT);
793 if (err == -EAGAIN) { 794 if (err >= 0)
794 svc_sock_received(svsk); 795 skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
795 return err; 796
797 if (skb == NULL) {
798 if (err != -EAGAIN) {
799 /* possibly an icmp error */
800 dprintk("svc: recvfrom returned error %d\n", -err);
801 set_bit(SK_DATA, &svsk->sk_flags);
796 } 802 }
797 /* possibly an icmp error */ 803 svc_sock_received(svsk);
798 dprintk("svc: recvfrom returned error %d\n", -err); 804 return -EAGAIN;
799 } 805 }
800 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 806 rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
801 if (skb->tstamp.tv64 == 0) { 807 if (skb->tstamp.tv64 == 0) {
@@ -1633,7 +1639,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1633 svsk->sk_server = serv; 1639 svsk->sk_server = serv;
1634 atomic_set(&svsk->sk_inuse, 1); 1640 atomic_set(&svsk->sk_inuse, 1);
1635 svsk->sk_lastrecv = get_seconds(); 1641 svsk->sk_lastrecv = get_seconds();
1636 spin_lock_init(&svsk->sk_defer_lock); 1642 spin_lock_init(&svsk->sk_lock);
1637 INIT_LIST_HEAD(&svsk->sk_deferred); 1643 INIT_LIST_HEAD(&svsk->sk_deferred);
1638 INIT_LIST_HEAD(&svsk->sk_ready); 1644 INIT_LIST_HEAD(&svsk->sk_ready);
1639 mutex_init(&svsk->sk_mutex); 1645 mutex_init(&svsk->sk_mutex);
@@ -1857,9 +1863,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
1857 dprintk("revisit queued\n"); 1863 dprintk("revisit queued\n");
1858 svsk = dr->svsk; 1864 svsk = dr->svsk;
1859 dr->svsk = NULL; 1865 dr->svsk = NULL;
1860 spin_lock_bh(&svsk->sk_defer_lock); 1866 spin_lock(&svsk->sk_lock);
1861 list_add(&dr->handle.recent, &svsk->sk_deferred); 1867 list_add(&dr->handle.recent, &svsk->sk_deferred);
1862 spin_unlock_bh(&svsk->sk_defer_lock); 1868 spin_unlock(&svsk->sk_lock);
1863 set_bit(SK_DEFERRED, &svsk->sk_flags); 1869 set_bit(SK_DEFERRED, &svsk->sk_flags);
1864 svc_sock_enqueue(svsk); 1870 svc_sock_enqueue(svsk);
1865 svc_sock_put(svsk); 1871 svc_sock_put(svsk);
@@ -1925,7 +1931,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1925 1931
1926 if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) 1932 if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
1927 return NULL; 1933 return NULL;
1928 spin_lock_bh(&svsk->sk_defer_lock); 1934 spin_lock(&svsk->sk_lock);
1929 clear_bit(SK_DEFERRED, &svsk->sk_flags); 1935 clear_bit(SK_DEFERRED, &svsk->sk_flags);
1930 if (!list_empty(&svsk->sk_deferred)) { 1936 if (!list_empty(&svsk->sk_deferred)) {
1931 dr = list_entry(svsk->sk_deferred.next, 1937 dr = list_entry(svsk->sk_deferred.next,
@@ -1934,6 +1940,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1934 list_del_init(&dr->handle.recent); 1940 list_del_init(&dr->handle.recent);
1935 set_bit(SK_DEFERRED, &svsk->sk_flags); 1941 set_bit(SK_DEFERRED, &svsk->sk_flags);
1936 } 1942 }
1937 spin_unlock_bh(&svsk->sk_defer_lock); 1943 spin_unlock(&svsk->sk_lock);
1938 return dr; 1944 return dr;
1939} 1945}
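The rename from sk_defer_lock to sk_lock reflects that one per-socket lock now guards two things, as the updated comment block states, and the _bh variants are dropped because these paths only ever run in process context. Schematically (a fragment of struct svc_sock showing only the relevant members, layout illustrative):

struct svc_sock {
        /* ... */
        spinlock_t              sk_lock;        /* protects sk_deferred and
                                                 * sk_info_authunix */
        struct list_head        sk_deferred;    /* requests to revisit */
        struct ip_map           *sk_info_authunix; /* cached auth result */
        /* ... */
};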
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index a325a0c890dc..e5bf649e516a 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -337,6 +337,7 @@ sub get_kernel_version() {
337 } 337 }
338 return $version; 338 return $version;
339} 339}
340my $kernelversion = get_kernel_version();
340 341
341# generate a sequence of code that will splice in highlighting information 342# generate a sequence of code that will splice in highlighting information
342# using the s// operator. 343# using the s// operator.
@@ -610,7 +611,7 @@ sub output_function_xml(%) {
610 print "<refmeta>\n"; 611 print "<refmeta>\n";
611 print " <refentrytitle><phrase>".$args{'function'}."</phrase></refentrytitle>\n"; 612 print " <refentrytitle><phrase>".$args{'function'}."</phrase></refentrytitle>\n";
612 print " <manvolnum>9</manvolnum>\n"; 613 print " <manvolnum>9</manvolnum>\n";
613 print " <refmiscinfo class=\"version\">" . get_kernel_version() . "</refmiscinfo>\n"; 614 print " <refmiscinfo class=\"version\">" . $kernelversion . "</refmiscinfo>\n";
614 print "</refmeta>\n"; 615 print "</refmeta>\n";
615 print "<refnamediv>\n"; 616 print "<refnamediv>\n";
616 print " <refname>".$args{'function'}."</refname>\n"; 617 print " <refname>".$args{'function'}."</refname>\n";
@@ -687,7 +688,7 @@ sub output_struct_xml(%) {
687 print "<refmeta>\n"; 688 print "<refmeta>\n";
688 print " <refentrytitle><phrase>".$args{'type'}." ".$args{'struct'}."</phrase></refentrytitle>\n"; 689 print " <refentrytitle><phrase>".$args{'type'}." ".$args{'struct'}."</phrase></refentrytitle>\n";
689 print " <manvolnum>9</manvolnum>\n"; 690 print " <manvolnum>9</manvolnum>\n";
690 print " <refmiscinfo class=\"version\">" . get_kernel_version() . "</refmiscinfo>\n"; 691 print " <refmiscinfo class=\"version\">" . $kernelversion . "</refmiscinfo>\n";
691 print "</refmeta>\n"; 692 print "</refmeta>\n";
692 print "<refnamediv>\n"; 693 print "<refnamediv>\n";
693 print " <refname>".$args{'type'}." ".$args{'struct'}."</refname>\n"; 694 print " <refname>".$args{'type'}." ".$args{'struct'}."</refname>\n";
@@ -772,7 +773,7 @@ sub output_enum_xml(%) {
772 print "<refmeta>\n"; 773 print "<refmeta>\n";
773 print " <refentrytitle><phrase>enum ".$args{'enum'}."</phrase></refentrytitle>\n"; 774 print " <refentrytitle><phrase>enum ".$args{'enum'}."</phrase></refentrytitle>\n";
774 print " <manvolnum>9</manvolnum>\n"; 775 print " <manvolnum>9</manvolnum>\n";
775 print " <refmiscinfo class=\"version\">" . get_kernel_version() . "</refmiscinfo>\n"; 776 print " <refmiscinfo class=\"version\">" . $kernelversion . "</refmiscinfo>\n";
776 print "</refmeta>\n"; 777 print "</refmeta>\n";
777 print "<refnamediv>\n"; 778 print "<refnamediv>\n";
778 print " <refname>enum ".$args{'enum'}."</refname>\n"; 779 print " <refname>enum ".$args{'enum'}."</refname>\n";
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 480e18b00aa6..113dc77b9f60 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1343,6 +1343,7 @@ static void add_header(struct buffer *b, struct module *mod)
1343 buf_printf(b, "#ifdef CONFIG_MODULE_UNLOAD\n" 1343 buf_printf(b, "#ifdef CONFIG_MODULE_UNLOAD\n"
1344 " .exit = cleanup_module,\n" 1344 " .exit = cleanup_module,\n"
1345 "#endif\n"); 1345 "#endif\n");
1346 buf_printf(b, " .arch = MODULE_ARCH_INIT,\n");
1346 buf_printf(b, "};\n"); 1347 buf_printf(b, "};\n");
1347} 1348}
1348 1349
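With that line added, the initializer modpost emits into each *.mod.c reads roughly as below (reconstructed from the buf_printf() calls; MODULE_ARCH_INIT lets an architecture inject extra arch-specific fields):

struct module __this_module
__attribute__((section(".gnu.linkonce.this_module"))) = {
        .name = KBUILD_MODNAME,
        .init = init_module,
#ifdef CONFIG_MODULE_UNLOAD
        .exit = cleanup_module,
#endif
        .arch = MODULE_ARCH_INIT,
};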